Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / legacyheaders / gmx_math_x86_avx_128_fma_double.h
index 07087fd599a3cbedef772103406c6d44630a216a..b47f577be08ea8cdcfc4b85cac49a84aed430e60 100644 (file)
@@ -681,6 +681,199 @@ gmx_mm_erfc_pd(__m128d x)
 
 
 
+/* Calculate the force correction due to PME analytically.
+ *
+ * This routine is meant to enable analytical evaluation of the
+ * direct-space PME electrostatic force to avoid tables.
+ *
+ * The direct-space potential should be Erfc(beta*r)/r, but there
+ * are some problems evaluating that:
+ *
+ * First, the error function is difficult (read: expensive) to
+ * approxmiate accurately for intermediate to large arguments, and
+ * this happens already in ranges of beta*r that occur in simulations.
+ * Second, we now try to avoid calculating potentials in Gromacs but
+ * use forces directly.
+ *
+ * We can simply things slight by noting that the PME part is really
+ * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
+ *
+ * V= 1/r - Erf(beta*r)/r
+ *
+ * The first term we already have from the inverse square root, so
+ * that we can leave out of this routine.
+ *
+ * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
+ * the argument beta*r will be in the range 0.15 to ~4. Use your
+ * favorite plotting program to realize how well-behaved Erf(z)/z is
+ * in this range!
+ *
+ * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
+ * However, it turns out it is more efficient to approximate f(z)/z and
+ * then only use even powers. This is another minor optimization, since
+ * we actually WANT f(z)/z, because it is going to be multiplied by
+ * the vector between the two atoms to get the vectorial force. The
+ * fastest flops are the ones we can avoid calculating!
+ *
+ * So, here's how it should be used:
+ *
+ * 1. Calculate r^2.
+ * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
+ * 3. Evaluate this routine with z^2 as the argument.
+ * 4. The return value is the expression:
+ *
+ *
+ *       2*exp(-z^2)     erf(z)
+ *       ------------ - --------
+ *       sqrt(Pi)*z^2      z^3
+ *
+ * 5. Multiply the entire expression by beta^3. This will get you
+ *
+ *       beta^3*2*exp(-z^2)     beta^3*erf(z)
+ *       ------------------  - ---------------
+ *          sqrt(Pi)*z^2            z^3
+ *
+ *    or, switching back to r (z=r*beta):
+ *
+ *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
+ *       ----------------------- - -----------
+ *            sqrt(Pi)*r^2            r^3
+ *
+ *
+ *    With a bit of math exercise you should be able to confirm that
+ *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
+ *
+ * 6. Add the result to 1/r^3, multiply by the product of the charges,
+ *    and you have your force (divided by r). A final multiplication
+ *    with the vector connecting the two particles and you have your
+ *    vectorial force to add to the particles.
+ *
+ */
+static __m128d
+gmx_mm_pmecorrF_pd(__m128d z2)
+{
+    const __m128d  FN10     = _mm_set1_pd(-8.0072854618360083154e-14);
+    const __m128d  FN9      = _mm_set1_pd(1.1859116242260148027e-11);
+    const __m128d  FN8      = _mm_set1_pd(-8.1490406329798423616e-10);
+    const __m128d  FN7      = _mm_set1_pd(3.4404793543907847655e-8);
+    const __m128d  FN6      = _mm_set1_pd(-9.9471420832602741006e-7);
+    const __m128d  FN5      = _mm_set1_pd(0.000020740315999115847456);
+    const __m128d  FN4      = _mm_set1_pd(-0.00031991745139313364005);
+    const __m128d  FN3      = _mm_set1_pd(0.0035074449373659008203);
+    const __m128d  FN2      = _mm_set1_pd(-0.031750380176100813405);
+    const __m128d  FN1      = _mm_set1_pd(0.13884101728898463426);
+    const __m128d  FN0      = _mm_set1_pd(-0.75225277815249618847);
+    
+    const __m128d  FD5      = _mm_set1_pd(0.000016009278224355026701);
+    const __m128d  FD4      = _mm_set1_pd(0.00051055686934806966046);
+    const __m128d  FD3      = _mm_set1_pd(0.0081803507497974289008);
+    const __m128d  FD2      = _mm_set1_pd(0.077181146026670287235);
+    const __m128d  FD1      = _mm_set1_pd(0.41543303143712535988);
+    const __m128d  FD0      = _mm_set1_pd(1.0);
+    
+    __m128d z4;
+    __m128d polyFN0,polyFN1,polyFD0,polyFD1;
+    
+    z4             = _mm_mul_pd(z2,z2);
+    
+    polyFD1        = _mm_macc_pd(FD5,z4,FD3);
+    polyFD1        = _mm_macc_pd(polyFD1,z4,FD1);
+    polyFD1        = _mm_mul_pd(polyFD1,z2);
+    polyFD0        = _mm_macc_pd(FD4,z4,FD2);
+    polyFD0        = _mm_macc_pd(polyFD0,z4,FD0);
+    polyFD0        = _mm_add_pd(polyFD0,polyFD1);
+    
+    polyFD0        = gmx_mm_inv_pd(polyFD0);
+    
+    polyFN0        = _mm_macc_pd(FN10,z4,FN8);
+    polyFN0        = _mm_macc_pd(polyFN0,z4,FN6);
+    polyFN0        = _mm_macc_pd(polyFN0,z4,FN4);
+    polyFN0        = _mm_macc_pd(polyFN0,z4,FN2);
+    polyFN0        = _mm_macc_pd(polyFN0,z4,FN0);
+    polyFN1        = _mm_macc_pd(FN9,z4,FN7);
+    polyFN1        = _mm_macc_pd(polyFN1,z4,FN5);
+    polyFN1        = _mm_macc_pd(polyFN1,z4,FN3);
+    polyFN1        = _mm_macc_pd(polyFN1,z4,FN1);
+    polyFN0        = _mm_macc_pd(polyFN1,z2,polyFN0);
+    
+    return   _mm_mul_pd(polyFN0,polyFD0);
+}
+
+
+/* Calculate the potential correction due to PME analytically.
+ *
+ * This routine calculates Erf(z)/z, although you should provide z^2
+ * as the input argument.
+ *
+ * Here's how it should be used:
+ *
+ * 1. Calculate r^2.
+ * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
+ * 3. Evaluate this routine with z^2 as the argument.
+ * 4. The return value is the expression:
+ *
+ *
+ *        erf(z)
+ *       --------
+ *          z
+ *
+ * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
+ *
+ *       erf(r*beta)
+ *       -----------
+ *           r
+ *
+ * 6. Subtract the result from 1/r, multiply by the product of the charges,
+ *    and you have your potential.
+ *
+ */
+static __m128d
+gmx_mm_pmecorrV_pd(__m128d z2)
+{
+    const __m128d  VN9      = _mm_set1_pd(-9.3723776169321855475e-13);
+    const __m128d  VN8      = _mm_set1_pd(1.2280156762674215741e-10);
+    const __m128d  VN7      = _mm_set1_pd(-7.3562157912251309487e-9);
+    const __m128d  VN6      = _mm_set1_pd(2.6215886208032517509e-7);
+    const __m128d  VN5      = _mm_set1_pd(-4.9532491651265819499e-6);
+    const __m128d  VN4      = _mm_set1_pd(0.00025907400778966060389);
+    const __m128d  VN3      = _mm_set1_pd(0.0010585044856156469792);
+    const __m128d  VN2      = _mm_set1_pd(0.045247661136833092885);
+    const __m128d  VN1      = _mm_set1_pd(0.11643931522926034421);
+    const __m128d  VN0      = _mm_set1_pd(1.1283791671726767970);
+    
+    const __m128d  VD5      = _mm_set1_pd(0.000021784709867336150342);
+    const __m128d  VD4      = _mm_set1_pd(0.00064293662010911388448);
+    const __m128d  VD3      = _mm_set1_pd(0.0096311444822588683504);
+    const __m128d  VD2      = _mm_set1_pd(0.085608012351550627051);
+    const __m128d  VD1      = _mm_set1_pd(0.43652499166614811084);
+    const __m128d  VD0      = _mm_set1_pd(1.0);
+    
+    __m128d z4;
+    __m128d polyVN0,polyVN1,polyVD0,polyVD1;
+    
+    z4             = _mm_mul_pd(z2,z2);
+    
+    polyVD1        = _mm_macc_pd(VD5,z4,VD3);
+    polyVD0        = _mm_macc_pd(VD4,z4,VD2);
+    polyVD1        = _mm_macc_pd(polyVD1,z4,VD1);
+    polyVD0        = _mm_macc_pd(polyVD0,z4,VD0);
+    polyVD0        = _mm_macc_pd(polyVD1,z2,polyVD0);
+    
+    polyVD0        = gmx_mm_inv_pd(polyVD0);
+    
+    polyVN1        = _mm_macc_pd(VN9,z4,VN7);
+    polyVN0        = _mm_macc_pd(VN8,z4,VN6);
+    polyVN1        = _mm_macc_pd(polyVN1,z4,VN5);
+    polyVN0        = _mm_macc_pd(polyVN0,z4,VN4);
+    polyVN1        = _mm_macc_pd(polyVN1,z4,VN3);
+    polyVN0        = _mm_macc_pd(polyVN0,z4,VN2);
+    polyVN1        = _mm_macc_pd(polyVN1,z4,VN1);
+    polyVN0        = _mm_macc_pd(polyVN0,z4,VN0);
+    polyVN0        = _mm_macc_pd(polyVN1,z2,polyVN0);
+    
+    return   _mm_mul_pd(polyVN0,polyVD0);
+}
+
 
 static int
 gmx_mm_sincos_pd(__m128d x,