+/* Calculate the force correction due to PME analytically.
+ *
+ * This routine is meant to enable analytical evaluation of the
+ * direct-space PME electrostatic force to avoid tables.
+ *
+ * The direct-space potential should be Erfc(beta*r)/r, but there
+ * are some problems evaluating that:
+ *
+ * First, the error function is difficult (read: expensive) to
+ * approxmiate accurately for intermediate to large arguments, and
+ * this happens already in ranges of beta*r that occur in simulations.
+ * Second, we now try to avoid calculating potentials in Gromacs but
+ * use forces directly.
+ *
+ * We can simply things slight by noting that the PME part is really
+ * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
+ *
+ * V= 1/r - Erf(beta*r)/r
+ *
+ * The first term we already have from the inverse square root, so
+ * that we can leave out of this routine.
+ *
+ * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
+ * the argument beta*r will be in the range 0.15 to ~4. Use your
+ * favorite plotting program to realize how well-behaved Erf(z)/z is
+ * in this range!
+ *
+ * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
+ * However, it turns out it is more efficient to approximate f(z)/z and
+ * then only use even powers. This is another minor optimization, since
+ * we actually WANT f(z)/z, because it is going to be multiplied by
+ * the vector between the two atoms to get the vectorial force. The
+ * fastest flops are the ones we can avoid calculating!
+ *
+ * So, here's how it should be used:
+ *
+ * 1. Calculate r^2.
+ * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
+ * 3. Evaluate this routine with z^2 as the argument.
+ * 4. The return value is the expression:
+ *
+ *
+ * 2*exp(-z^2) erf(z)
+ * ------------ - --------
+ * sqrt(Pi)*z^2 z^3
+ *
+ * 5. Multiply the entire expression by beta^3. This will get you
+ *
+ * beta^3*2*exp(-z^2) beta^3*erf(z)
+ * ------------------ - ---------------
+ * sqrt(Pi)*z^2 z^3
+ *
+ * or, switching back to r (z=r*beta):
+ *
+ * 2*beta*exp(-r^2*beta^2) erf(r*beta)
+ * ----------------------- - -----------
+ * sqrt(Pi)*r^2 r^3
+ *
+ *
+ * With a bit of math exercise you should be able to confirm that
+ * this is exactly D[Erf[beta*r]/r,r] divided by r another time.
+ *
+ * 6. Add the result to 1/r^3, multiply by the product of the charges,
+ * and you have your force (divided by r). A final multiplication
+ * with the vector connecting the two particles and you have your
+ * vectorial force to add to the particles.
+ *
+ */
+static __m128d
+gmx_mm_pmecorrF_pd(__m128d z2)
+{
+ const __m128d FN10 = _mm_set1_pd(-8.0072854618360083154e-14);
+ const __m128d FN9 = _mm_set1_pd(1.1859116242260148027e-11);
+ const __m128d FN8 = _mm_set1_pd(-8.1490406329798423616e-10);
+ const __m128d FN7 = _mm_set1_pd(3.4404793543907847655e-8);
+ const __m128d FN6 = _mm_set1_pd(-9.9471420832602741006e-7);
+ const __m128d FN5 = _mm_set1_pd(0.000020740315999115847456);
+ const __m128d FN4 = _mm_set1_pd(-0.00031991745139313364005);
+ const __m128d FN3 = _mm_set1_pd(0.0035074449373659008203);
+ const __m128d FN2 = _mm_set1_pd(-0.031750380176100813405);
+ const __m128d FN1 = _mm_set1_pd(0.13884101728898463426);
+ const __m128d FN0 = _mm_set1_pd(-0.75225277815249618847);
+
+ const __m128d FD5 = _mm_set1_pd(0.000016009278224355026701);
+ const __m128d FD4 = _mm_set1_pd(0.00051055686934806966046);
+ const __m128d FD3 = _mm_set1_pd(0.0081803507497974289008);
+ const __m128d FD2 = _mm_set1_pd(0.077181146026670287235);
+ const __m128d FD1 = _mm_set1_pd(0.41543303143712535988);
+ const __m128d FD0 = _mm_set1_pd(1.0);
+
+ __m128d z4;
+ __m128d polyFN0,polyFN1,polyFD0,polyFD1;
+
+ z4 = _mm_mul_pd(z2,z2);
+
+ polyFD1 = _mm_macc_pd(FD5,z4,FD3);
+ polyFD1 = _mm_macc_pd(polyFD1,z4,FD1);
+ polyFD1 = _mm_mul_pd(polyFD1,z2);
+ polyFD0 = _mm_macc_pd(FD4,z4,FD2);
+ polyFD0 = _mm_macc_pd(polyFD0,z4,FD0);
+ polyFD0 = _mm_add_pd(polyFD0,polyFD1);
+
+ polyFD0 = gmx_mm_inv_pd(polyFD0);
+
+ polyFN0 = _mm_macc_pd(FN10,z4,FN8);
+ polyFN0 = _mm_macc_pd(polyFN0,z4,FN6);
+ polyFN0 = _mm_macc_pd(polyFN0,z4,FN4);
+ polyFN0 = _mm_macc_pd(polyFN0,z4,FN2);
+ polyFN0 = _mm_macc_pd(polyFN0,z4,FN0);
+ polyFN1 = _mm_macc_pd(FN9,z4,FN7);
+ polyFN1 = _mm_macc_pd(polyFN1,z4,FN5);
+ polyFN1 = _mm_macc_pd(polyFN1,z4,FN3);
+ polyFN1 = _mm_macc_pd(polyFN1,z4,FN1);
+ polyFN0 = _mm_macc_pd(polyFN1,z2,polyFN0);
+
+ return _mm_mul_pd(polyFN0,polyFD0);
+}
+
+
+/* Calculate the potential correction due to PME analytically.
+ *
+ * This routine calculates Erf(z)/z, although you should provide z^2
+ * as the input argument.
+ *
+ * Here's how it should be used:
+ *
+ * 1. Calculate r^2.
+ * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
+ * 3. Evaluate this routine with z^2 as the argument.
+ * 4. The return value is the expression:
+ *
+ *
+ * erf(z)
+ * --------
+ * z
+ *
+ * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
+ *
+ * erf(r*beta)
+ * -----------
+ * r
+ *
+ * 6. Subtract the result from 1/r, multiply by the product of the charges,
+ * and you have your potential.
+ *
+ */
+static __m128d
+gmx_mm_pmecorrV_pd(__m128d z2)
+{
+ const __m128d VN9 = _mm_set1_pd(-9.3723776169321855475e-13);
+ const __m128d VN8 = _mm_set1_pd(1.2280156762674215741e-10);
+ const __m128d VN7 = _mm_set1_pd(-7.3562157912251309487e-9);
+ const __m128d VN6 = _mm_set1_pd(2.6215886208032517509e-7);
+ const __m128d VN5 = _mm_set1_pd(-4.9532491651265819499e-6);
+ const __m128d VN4 = _mm_set1_pd(0.00025907400778966060389);
+ const __m128d VN3 = _mm_set1_pd(0.0010585044856156469792);
+ const __m128d VN2 = _mm_set1_pd(0.045247661136833092885);
+ const __m128d VN1 = _mm_set1_pd(0.11643931522926034421);
+ const __m128d VN0 = _mm_set1_pd(1.1283791671726767970);
+
+ const __m128d VD5 = _mm_set1_pd(0.000021784709867336150342);
+ const __m128d VD4 = _mm_set1_pd(0.00064293662010911388448);
+ const __m128d VD3 = _mm_set1_pd(0.0096311444822588683504);
+ const __m128d VD2 = _mm_set1_pd(0.085608012351550627051);
+ const __m128d VD1 = _mm_set1_pd(0.43652499166614811084);
+ const __m128d VD0 = _mm_set1_pd(1.0);
+
+ __m128d z4;
+ __m128d polyVN0,polyVN1,polyVD0,polyVD1;
+
+ z4 = _mm_mul_pd(z2,z2);
+
+ polyVD1 = _mm_macc_pd(VD5,z4,VD3);
+ polyVD0 = _mm_macc_pd(VD4,z4,VD2);
+ polyVD1 = _mm_macc_pd(polyVD1,z4,VD1);
+ polyVD0 = _mm_macc_pd(polyVD0,z4,VD0);
+ polyVD0 = _mm_macc_pd(polyVD1,z2,polyVD0);
+
+ polyVD0 = gmx_mm_inv_pd(polyVD0);
+
+ polyVN1 = _mm_macc_pd(VN9,z4,VN7);
+ polyVN0 = _mm_macc_pd(VN8,z4,VN6);
+ polyVN1 = _mm_macc_pd(polyVN1,z4,VN5);
+ polyVN0 = _mm_macc_pd(polyVN0,z4,VN4);
+ polyVN1 = _mm_macc_pd(polyVN1,z4,VN3);
+ polyVN0 = _mm_macc_pd(polyVN0,z4,VN2);
+ polyVN1 = _mm_macc_pd(polyVN1,z4,VN1);
+ polyVN0 = _mm_macc_pd(polyVN0,z4,VN0);
+ polyVN0 = _mm_macc_pd(polyVN1,z2,polyVN0);
+
+ return _mm_mul_pd(polyVN0,polyVD0);
+}
+
static int
gmx_mm_sincos_pd(__m128d x,