src/gromacs/simd/math_single.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef GMX_SIMD_MATH_SINGLE_H_
  36 #define GMX_SIMD_MATH_SINGLE_H_
  37
  38
  39 /* 1.0/sqrt(x) */
  40 static gmx_inline gmx_simd_real_t
  41 gmx_simd_invsqrt_r(gmx_simd_real_t x)
  42 {
  43     /* This is one of the few cases where FMA adds a FLOP, but ends up with
  44      * less instructions in total when FMA is available in hardware.
  45      * Usually we would not optimize this far, but invsqrt is used often.
  46      */
  47 #ifdef GMX_SIMD_HAVE_FMA
  48     const gmx_simd_real_t half  = gmx_simd_set1_r(0.5);
  49     const gmx_simd_real_t one   = gmx_simd_set1_r(1.0);
  50
  51     gmx_simd_real_t       lu = gmx_simd_rsqrt_r(x);
  52
  53     return gmx_simd_fmadd_r(gmx_simd_fnmadd_r(x, gmx_simd_mul_r(lu, lu), one), gmx_simd_mul_r(lu, half), lu);
  54 #else
  55     const gmx_simd_real_t half  = gmx_simd_set1_r(0.5);
  56     const gmx_simd_real_t three = gmx_simd_set1_r(3.0);
  57
  58     gmx_simd_real_t       lu = gmx_simd_rsqrt_r(x);
  59
  60     return gmx_simd_mul_r(half, gmx_simd_mul_r(gmx_simd_sub_r(three, gmx_simd_mul_r(gmx_simd_mul_r(lu, lu), x)), lu));
  61 #endif
  62 }
  63
  64
  65 /* 1.0/x */
  66 static gmx_inline gmx_simd_real_t
  67 gmx_simd_inv_r(gmx_simd_real_t x)
  68 {
  69     const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
  70
  71     gmx_simd_real_t       lu = gmx_simd_rcp_r(x);
  72
  73     return gmx_simd_mul_r(lu, gmx_simd_fnmadd_r(lu, x, two));
  74 }
  75
  76
  77 /* Calculate the force correction due to PME analytically.
  78  *
  79  * This routine is meant to enable analytical evaluation of the
  80  * direct-space PME electrostatic force to avoid tables.
  81  *
  82  * The direct-space potential should be Erfc(beta*r)/r, but there
  83  * are some problems evaluating that:
  84  *
  85  * First, the error function is difficult (read: expensive) to
  86  * approxmiate accurately for intermediate to large arguments, and
  87  * this happens already in ranges of beta*r that occur in simulations.
  88  * Second, we now try to avoid calculating potentials in Gromacs but
  89  * use forces directly.
  90  *
  91  * We can simply things slight by noting that the PME part is really
  92  * a correction to the normal Coulomb force since Erfc(z)=1-Erf(z), i.e.
  93  *
  94  * V= 1/r - Erf(beta*r)/r
  95  *
  96  * The first term we already have from the inverse square root, so
  97  * that we can leave out of this routine.
  98  *
  99  * For pme tolerances of 1e-3 to 1e-8 and cutoffs of 0.5nm to 1.8nm,
 100  * the argument beta*r will be in the range 0.15 to ~4. Use your
 101  * favorite plotting program to realize how well-behaved Erf(z)/z is
 102  * in this range!
 103  *
 104  * We approximate f(z)=erf(z)/z with a rational minimax polynomial.
 105  * However, it turns out it is more efficient to approximate f(z)/z and
 106  * then only use even powers. This is another minor optimization, since
 107  * we actually WANT f(z)/z, because it is going to be multiplied by
 108  * the vector between the two atoms to get the vectorial force. The
 109  * fastest flops are the ones we can avoid calculating!
 110  *
 111  * So, here's how it should be used:
 112  *
 113  * 1. Calculate r^2.
 114  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 115  * 3. Evaluate this routine with z^2 as the argument.
 116  * 4. The return value is the expression:
 117  *
 118  *
 119  *       2*exp(-z^2)     erf(z)
 120  *       ------------ - --------
 121  *       sqrt(Pi)*z^2      z^3
 122  *
 123  * 5. Multiply the entire expression by beta^3. This will get you
 124  *
 125  *       beta^3*2*exp(-z^2)     beta^3*erf(z)
 126  *       ------------------  - ---------------
 127  *          sqrt(Pi)*z^2            z^3
 128  *
 129  *    or, switching back to r (z=r*beta):
 130  *
 131  *       2*beta*exp(-r^2*beta^2)   erf(r*beta)
 132  *       ----------------------- - -----------
 133  *            sqrt(Pi)*r^2            r^3
 134  *
 135  *
 136  *    With a bit of math exercise you should be able to confirm that
 137  *    this is exactly D[Erf[beta*r]/r,r] divided by r another time.
 138  *
 139  * 6. Add the result to 1/r^3, multiply by the product of the charges,
 140  *    and you have your force (divided by r). A final multiplication
 141  *    with the vector connecting the two particles and you have your
 142  *    vectorial force to add to the particles.
 143  *
 144  */
 145 static gmx_simd_real_t
 146 gmx_simd_pmecorrF_r(gmx_simd_real_t z2)
 147 {
 148     const gmx_simd_real_t  FN6      = gmx_simd_set1_r(-1.7357322914161492954e-8f);
 149     const gmx_simd_real_t  FN5      = gmx_simd_set1_r(1.4703624142580877519e-6f);
 150     const gmx_simd_real_t  FN4      = gmx_simd_set1_r(-0.000053401640219807709149f);
 151     const gmx_simd_real_t  FN3      = gmx_simd_set1_r(0.0010054721316683106153f);
 152     const gmx_simd_real_t  FN2      = gmx_simd_set1_r(-0.019278317264888380590f);
 153     const gmx_simd_real_t  FN1      = gmx_simd_set1_r(0.069670166153766424023f);
 154     const gmx_simd_real_t  FN0      = gmx_simd_set1_r(-0.75225204789749321333f);
 155
 156     const gmx_simd_real_t  FD4      = gmx_simd_set1_r(0.0011193462567257629232f);
 157     const gmx_simd_real_t  FD3      = gmx_simd_set1_r(0.014866955030185295499f);
 158     const gmx_simd_real_t  FD2      = gmx_simd_set1_r(0.11583842382862377919f);
 159     const gmx_simd_real_t  FD1      = gmx_simd_set1_r(0.50736591960530292870f);
 160     const gmx_simd_real_t  FD0      = gmx_simd_set1_r(1.0f);
 161
 162     gmx_simd_real_t        z4;
 163     gmx_simd_real_t        polyFN0, polyFN1, polyFD0, polyFD1;
 164
 165     z4             = gmx_simd_mul_r(z2, z2);
 166
 167     polyFD0        = gmx_simd_fmadd_r(FD4, z4, FD2);
 168     polyFD1        = gmx_simd_fmadd_r(FD3, z4, FD1);
 169     polyFD0        = gmx_simd_fmadd_r(polyFD0, z4, FD0);
 170     polyFD0        = gmx_simd_fmadd_r(polyFD1, z2, polyFD0);
 171
 172     polyFD0        = gmx_simd_inv_r(polyFD0);
 173
 174     polyFN0        = gmx_simd_fmadd_r(FN6, z4, FN4);
 175     polyFN1        = gmx_simd_fmadd_r(FN5, z4, FN3);
 176     polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN2);
 177     polyFN1        = gmx_simd_fmadd_r(polyFN1, z4, FN1);
 178     polyFN0        = gmx_simd_fmadd_r(polyFN0, z4, FN0);
 179     polyFN0        = gmx_simd_fmadd_r(polyFN1, z2, polyFN0);
 180
 181     return gmx_simd_mul_r(polyFN0, polyFD0);
 182 }
 183
 184
 185 /* Calculate the potential correction due to PME analytically.
 186  *
 187  * See gmx_simd_pmecorrF_r() for details about the approximation.
 188  *
 189  * This routine calculates Erf(z)/z, although you should provide z^2
 190  * as the input argument.
 191  *
 192  * Here's how it should be used:
 193  *
 194  * 1. Calculate r^2.
 195  * 2. Multiply by beta^2, so you get z^2=beta^2*r^2.
 196  * 3. Evaluate this routine with z^2 as the argument.
 197  * 4. The return value is the expression:
 198  *
 199  *
 200  *        erf(z)
 201  *       --------
 202  *          z
 203  *
 204  * 5. Multiply the entire expression by beta and switching back to r (z=r*beta):
 205  *
 206  *       erf(r*beta)
 207  *       -----------
 208  *           r
 209  *
 210  * 6. Add the result to 1/r, multiply by the product of the charges,
 211  *    and you have your potential.
 212  */
 213 static gmx_simd_real_t
 214 gmx_simd_pmecorrV_r(gmx_simd_real_t z2)
 215 {
 216     const gmx_simd_real_t  VN6      = gmx_simd_set1_r(1.9296833005951166339e-8f);
 217     const gmx_simd_real_t  VN5      = gmx_simd_set1_r(-1.4213390571557850962e-6f);
 218     const gmx_simd_real_t  VN4      = gmx_simd_set1_r(0.000041603292906656984871f);
 219     const gmx_simd_real_t  VN3      = gmx_simd_set1_r(-0.00013134036773265025626f);
 220     const gmx_simd_real_t  VN2      = gmx_simd_set1_r(0.038657983986041781264f);
 221     const gmx_simd_real_t  VN1      = gmx_simd_set1_r(0.11285044772717598220f);
 222     const gmx_simd_real_t  VN0      = gmx_simd_set1_r(1.1283802385263030286f);
 223
 224     const gmx_simd_real_t  VD3      = gmx_simd_set1_r(0.0066752224023576045451f);
 225     const gmx_simd_real_t  VD2      = gmx_simd_set1_r(0.078647795836373922256f);
 226     const gmx_simd_real_t  VD1      = gmx_simd_set1_r(0.43336185284710920150f);
 227     const gmx_simd_real_t  VD0      = gmx_simd_set1_r(1.0f);
 228
 229     gmx_simd_real_t        z4;
 230     gmx_simd_real_t        polyVN0, polyVN1, polyVD0, polyVD1;
 231
 232     z4             = gmx_simd_mul_r(z2, z2);
 233
 234     polyVD1        = gmx_simd_fmadd_r(VD3, z4, VD1);
 235     polyVD0        = gmx_simd_fmadd_r(VD2, z4, VD0);
 236     polyVD0        = gmx_simd_fmadd_r(polyVD1, z2, polyVD0);
 237
 238     polyVD0        = gmx_simd_inv_r(polyVD0);
 239
 240     polyVN0        = gmx_simd_fmadd_r(VN6, z4, VN4);
 241     polyVN1        = gmx_simd_fmadd_r(VN5, z4, VN3);
 242     polyVN0        = gmx_simd_fmadd_r(polyVN0, z4, VN2);
 243     polyVN1        = gmx_simd_fmadd_r(polyVN1, z4, VN1);
 244     polyVN0        = gmx_simd_fmadd_r(polyVN0, z4, VN0);
 245     polyVN0        = gmx_simd_fmadd_r(polyVN1, z2, polyVN0);
 246
 247     return gmx_simd_mul_r(polyVN0, polyVD0);
 248 }
 249
 250
 251 #endif