More SIMD preparations in the FE calculations
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_free_energy.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7  * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9  * and including many others, as listed in the AUTHORS file in the
10  * top-level source directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38 #include "gmxpre.h"
39
40 #include "nb_free_energy.h"
41
42 #include <cmath>
43
44 #include <algorithm>
45
46 #include "gromacs/gmxlib/nrnb.h"
47 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
48 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
49 #include "gromacs/math/functions.h"
50 #include "gromacs/math/vec.h"
51 #include "gromacs/mdtypes/forceoutput.h"
52 #include "gromacs/mdtypes/forcerec.h"
53 #include "gromacs/mdtypes/md_enums.h"
54 #include "gromacs/simd/simd.h"
55 #include "gromacs/utility/fatalerror.h"
56
57
58 //! Scalar (non-SIMD) data types.
59 struct ScalarDataTypes
60 {
61     using RealType                     = real; //!< The data type to use as real.
62     using IntType                      = int;  //!< The data type to use as int.
63     static constexpr int simdRealWidth = 1;    //!< The width of the RealType.
64     static constexpr int simdIntWidth  = 1;    //!< The width of the IntType.
65 };
66
67 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS
68 //! SIMD data types.
69 struct SimdDataTypes
70 {
71     using RealType                     = gmx::SimdReal;         //!< The data type to use as real.
72     using IntType                      = gmx::SimdInt32;        //!< The data type to use as int.
73     static constexpr int simdRealWidth = GMX_SIMD_REAL_WIDTH;   //!< The width of the RealType.
74     static constexpr int simdIntWidth  = GMX_SIMD_FINT32_WIDTH; //!< The width of the IntType.
75 };
76 #endif
77
78 //! Computes r^(1/p) and 1/r^(1/p) for the standard p=6
79 template<class RealType>
80 static inline void pthRoot(const RealType r, RealType* pthRoot, RealType* invPthRoot)
81 {
82     *invPthRoot = gmx::invsqrt(std::cbrt(r));
83     *pthRoot    = 1 / (*invPthRoot);
84 }
85
86 template<class RealType>
87 static inline RealType calculateRinv6(const RealType rinvV)
88 {
89     RealType rinv6 = rinvV * rinvV;
90     return (rinv6 * rinv6 * rinv6);
91 }
92
93 template<class RealType>
94 static inline RealType calculateVdw6(const RealType c6, const RealType rinv6)
95 {
96     return (c6 * rinv6);
97 }
98
99 template<class RealType>
100 static inline RealType calculateVdw12(const RealType c12, const RealType rinv6)
101 {
102     return (c12 * rinv6 * rinv6);
103 }
104
105 /* reaction-field electrostatics */
106 template<class RealType>
107 static inline RealType reactionFieldScalarForce(const RealType qq,
108                                                 const RealType rinv,
109                                                 const RealType r,
110                                                 const real     krf,
111                                                 const real     two)
112 {
113     return (qq * (rinv - two * krf * r * r));
114 }
115 template<class RealType>
116 static inline RealType reactionFieldPotential(const RealType qq,
117                                               const RealType rinv,
118                                               const RealType r,
119                                               const real     krf,
120                                               const real     potentialShift)
121 {
122     return (qq * (rinv + krf * r * r - potentialShift));
123 }
124
125 /* Ewald electrostatics */
126 template<class RealType>
127 static inline RealType ewaldScalarForce(const RealType coulomb, const RealType rinv)
128 {
129     return (coulomb * rinv);
130 }
131 template<class RealType>
132 static inline RealType ewaldPotential(const RealType coulomb, const RealType rinv, const real potentialShift)
133 {
134     return (coulomb * (rinv - potentialShift));
135 }
136
137 /* cutoff LJ */
138 template<class RealType>
139 static inline RealType lennardJonesScalarForce(const RealType v6, const RealType v12)
140 {
141     return (v12 - v6);
142 }
143 template<class RealType>
144 static inline RealType lennardJonesPotential(const RealType v6,
145                                              const RealType v12,
146                                              const RealType c6,
147                                              const RealType c12,
148                                              const real     repulsionShift,
149                                              const real     dispersionShift,
150                                              const real     onesixth,
151                                              const real     onetwelfth)
152 {
153     return ((v12 + c12 * repulsionShift) * onetwelfth - (v6 + c6 * dispersionShift) * onesixth);
154 }
155
156 /* Ewald LJ */
157 static inline real ewaldLennardJonesGridSubtract(const real c6grid, const real potentialShift, const real onesixth)
158 {
159     return (c6grid * potentialShift * onesixth);
160 }
161
162 /* LJ Potential switch */
163 template<class RealType>
164 static inline RealType potSwitchScalarForceMod(const RealType fScalarInp,
165                                                const RealType potential,
166                                                const RealType sw,
167                                                const RealType r,
168                                                const RealType rVdw,
169                                                const RealType dsw,
170                                                const real     zero)
171 {
172     if (r < rVdw)
173     {
174         real fScalar = fScalarInp * sw - r * potential * dsw;
175         return (fScalar);
176     }
177     return (zero);
178 }
179 template<class RealType>
180 static inline RealType potSwitchPotentialMod(const RealType potentialInp,
181                                              const RealType sw,
182                                              const RealType r,
183                                              const RealType rVdw,
184                                              const real     zero)
185 {
186     if (r < rVdw)
187     {
188         real potential = potentialInp * sw;
189         return (potential);
190     }
191     return (zero);
192 }
193
194
195 //! Templated free-energy non-bonded kernel
196 template<typename DataTypes, bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
197 static void nb_free_energy_kernel(const t_nblist* gmx_restrict nlist,
198                                   rvec* gmx_restrict         xx,
199                                   gmx::ForceWithShiftForces* forceWithShiftForces,
200                                   const t_forcerec* gmx_restrict fr,
201                                   const t_mdatoms* gmx_restrict mdatoms,
202                                   nb_kernel_data_t* gmx_restrict kernel_data,
203                                   t_nrnb* gmx_restrict nrnb)
204 {
205 #define STATE_A 0
206 #define STATE_B 1
207 #define NSTATES 2
208
209     using RealType = typename DataTypes::RealType;
210     using IntType  = typename DataTypes::IntType;
211
212     /* FIXME: How should these be handled with SIMD? */
213     constexpr real onetwelfth = 1.0 / 12.0;
214     constexpr real onesixth   = 1.0 / 6.0;
215     constexpr real zero       = 0.0;
216     constexpr real half       = 0.5;
217     constexpr real one        = 1.0;
218     constexpr real two        = 2.0;
219     constexpr real six        = 6.0;
220
221     /* Extract pointer to non-bonded interaction constants */
222     const interaction_const_t* ic = fr->ic;
223
224     // Extract pair list data
225     const int  nri    = nlist->nri;
226     const int* iinr   = nlist->iinr;
227     const int* jindex = nlist->jindex;
228     const int* jjnr   = nlist->jjnr;
229     const int* shift  = nlist->shift;
230     const int* gid    = nlist->gid;
231
232     const real* shiftvec      = fr->shift_vec[0];
233     const real* chargeA       = mdatoms->chargeA;
234     const real* chargeB       = mdatoms->chargeB;
235     real*       Vc            = kernel_data->energygrp_elec;
236     const int*  typeA         = mdatoms->typeA;
237     const int*  typeB         = mdatoms->typeB;
238     const int   ntype         = fr->ntype;
239     const real* nbfp          = fr->nbfp.data();
240     const real* nbfp_grid     = fr->ljpme_c6grid;
241     real*       Vv            = kernel_data->energygrp_vdw;
242     const real  lambda_coul   = kernel_data->lambda[efptCOUL];
243     const real  lambda_vdw    = kernel_data->lambda[efptVDW];
244     real*       dvdl          = kernel_data->dvdl;
245     const real  alpha_coul    = fr->sc_alphacoul;
246     const real  alpha_vdw     = fr->sc_alphavdw;
247     const real  lam_power     = fr->sc_power;
248     const real  sigma6_def    = fr->sc_sigma6_def;
249     const real  sigma6_min    = fr->sc_sigma6_min;
250     const bool  doForces      = ((kernel_data->flags & GMX_NONBONDED_DO_FORCE) != 0);
251     const bool  doShiftForces = ((kernel_data->flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0);
252     const bool  doPotential   = ((kernel_data->flags & GMX_NONBONDED_DO_POTENTIAL) != 0);
253
254     // Extract data from interaction_const_t
255     const real facel           = ic->epsfac;
256     const real rcoulomb        = ic->rcoulomb;
257     const real krf             = ic->k_rf;
258     const real crf             = ic->c_rf;
259     const real sh_lj_ewald     = ic->sh_lj_ewald;
260     const real rvdw            = ic->rvdw;
261     const real dispersionShift = ic->dispersion_shift.cpot;
262     const real repulsionShift  = ic->repulsion_shift.cpot;
263
264     // Note that the nbnxm kernels do not support Coulomb potential switching at all
265     GMX_ASSERT(ic->coulomb_modifier != eintmodPOTSWITCH,
266                "Potential switching is not supported for Coulomb with FEP");
267
268     real vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4;
269     if (vdwModifierIsPotSwitch)
270     {
271         const real d = ic->rvdw - ic->rvdw_switch;
272         vdw_swV3     = -10.0 / (d * d * d);
273         vdw_swV4     = 15.0 / (d * d * d * d);
274         vdw_swV5     = -6.0 / (d * d * d * d * d);
275         vdw_swF2     = -30.0 / (d * d * d);
276         vdw_swF3     = 60.0 / (d * d * d * d);
277         vdw_swF4     = -30.0 / (d * d * d * d * d);
278     }
279     else
280     {
281         /* Avoid warnings from stupid compilers (looking at you, Clang!) */
282         vdw_swV3 = vdw_swV4 = vdw_swV5 = vdw_swF2 = vdw_swF3 = vdw_swF4 = 0.0;
283     }
284
285     int icoul;
286     if (ic->eeltype == eelCUT || EEL_RF(ic->eeltype))
287     {
288         icoul = GMX_NBKERNEL_ELEC_REACTIONFIELD;
289     }
290     else
291     {
292         icoul = GMX_NBKERNEL_ELEC_NONE;
293     }
294
295     real rcutoff_max2 = std::max(ic->rcoulomb, ic->rvdw);
296     rcutoff_max2      = rcutoff_max2 * rcutoff_max2;
297
298     const real* tab_ewald_F_lj = nullptr;
299     const real* tab_ewald_V_lj = nullptr;
300     const real* ewtab          = nullptr;
301     real        ewtabscale     = 0;
302     real        ewtabhalfspace = 0;
303     real        sh_ewald       = 0;
304     if (elecInteractionTypeIsEwald || vdwInteractionTypeIsEwald)
305     {
306         const auto& tables = *ic->coulombEwaldTables;
307         sh_ewald           = ic->sh_ewald;
308         ewtab              = tables.tableFDV0.data();
309         ewtabscale         = tables.scale;
310         ewtabhalfspace     = half / ewtabscale;
311         tab_ewald_F_lj     = tables.tableF.data();
312         tab_ewald_V_lj     = tables.tableV.data();
313     }
314
315     /* For Ewald/PME interactions we cannot easily apply the soft-core component to
316      * reciprocal space. When we use non-switched Ewald interactions, we
317      * assume the soft-coring does not significantly affect the grid contribution
318      * and apply the soft-core only to the full 1/r (- shift) pair contribution.
319      *
320      * However, we cannot use this approach for switch-modified since we would then
321      * effectively end up evaluating a significantly different interaction here compared to the
322      * normal (non-free-energy) kernels, either by applying a cutoff at a different
323      * position than what the user requested, or by switching different
324      * things (1/r rather than short-range Ewald). For these settings, we just
325      * use the traditional short-range Ewald interaction in that case.
326      */
327     GMX_RELEASE_ASSERT(!(vdwInteractionTypeIsEwald && vdwModifierIsPotSwitch),
328                        "Can not apply soft-core to switched Ewald potentials");
329
330     real dvdl_coul = 0;
331     real dvdl_vdw  = 0;
332
333     /* Lambda factor for state A, 1-lambda*/
334     real LFC[NSTATES], LFV[NSTATES];
335     LFC[STATE_A] = one - lambda_coul;
336     LFV[STATE_A] = one - lambda_vdw;
337
338     /* Lambda factor for state B, lambda*/
339     LFC[STATE_B] = lambda_coul;
340     LFV[STATE_B] = lambda_vdw;
341
342     /*derivative of the lambda factor for state A and B */
343     real DLF[NSTATES];
344     DLF[STATE_A] = -1;
345     DLF[STATE_B] = 1;
346
347     real           lfac_coul[NSTATES], dlfac_coul[NSTATES], lfac_vdw[NSTATES], dlfac_vdw[NSTATES];
348     constexpr real sc_r_power = 6.0_real;
349     for (int i = 0; i < NSTATES; i++)
350     {
351         lfac_coul[i]  = (lam_power == 2 ? (1 - LFC[i]) * (1 - LFC[i]) : (1 - LFC[i]));
352         dlfac_coul[i] = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFC[i]) : 1);
353         lfac_vdw[i]   = (lam_power == 2 ? (1 - LFV[i]) * (1 - LFV[i]) : (1 - LFV[i]));
354         dlfac_vdw[i]  = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFV[i]) : 1);
355     }
356
357     // TODO: We should get rid of using pointers to real
358     const real* x             = xx[0];
359     real* gmx_restrict f      = &(forceWithShiftForces->force()[0][0]);
360     real* gmx_restrict fshift = &(forceWithShiftForces->shiftForces()[0][0]);
361
362     for (int n = 0; n < nri; n++)
363     {
364         int npair_within_cutoff = 0;
365
366         const int  is3   = 3 * shift[n];
367         const real shX   = shiftvec[is3];
368         const real shY   = shiftvec[is3 + 1];
369         const real shZ   = shiftvec[is3 + 2];
370         const int  nj0   = jindex[n];
371         const int  nj1   = jindex[n + 1];
372         const int  ii    = iinr[n];
373         const int  ii3   = 3 * ii;
374         const real ix    = shX + x[ii3 + 0];
375         const real iy    = shY + x[ii3 + 1];
376         const real iz    = shZ + x[ii3 + 2];
377         const real iqA   = facel * chargeA[ii];
378         const real iqB   = facel * chargeB[ii];
379         const int  ntiA  = 2 * ntype * typeA[ii];
380         const int  ntiB  = 2 * ntype * typeB[ii];
381         real       vctot = 0;
382         real       vvtot = 0;
383         real       fix   = 0;
384         real       fiy   = 0;
385         real       fiz   = 0;
386
387         for (int k = nj0; k < nj1; k++)
388         {
389             int            tj[NSTATES];
390             const int      jnr = jjnr[k];
391             const int      j3  = 3 * jnr;
392             RealType       c6[NSTATES], c12[NSTATES], qq[NSTATES], Vcoul[NSTATES], Vvdw[NSTATES];
393             RealType       r, rinv, rp, rpm2;
394             RealType       alpha_vdw_eff, alpha_coul_eff, sigma6[NSTATES];
395             const RealType dx  = ix - x[j3];
396             const RealType dy  = iy - x[j3 + 1];
397             const RealType dz  = iz - x[j3 + 2];
398             const RealType rsq = dx * dx + dy * dy + dz * dz;
399             RealType       FscalC[NSTATES], FscalV[NSTATES];
400
401             if (rsq >= rcutoff_max2)
402             {
403                 /* We save significant time by skipping all code below.
404                  * Note that with soft-core interactions, the actual cut-off
405                  * check might be different. But since the soft-core distance
406                  * is always larger than r, checking on r here is safe.
407                  */
408                 continue;
409             }
410             npair_within_cutoff++;
411
412             if (rsq > 0)
413             {
414                 /* Note that unlike in the nbnxn kernels, we do not need
415                  * to clamp the value of rsq before taking the invsqrt
416                  * to avoid NaN in the LJ calculation, since here we do
417                  * not calculate LJ interactions when C6 and C12 are zero.
418                  */
419
420                 rinv = gmx::invsqrt(rsq);
421                 r    = rsq * rinv;
422             }
423             else
424             {
425                 /* The force at r=0 is zero, because of symmetry.
426                  * But note that the potential is in general non-zero,
427                  * since the soft-cored r will be non-zero.
428                  */
429                 rinv = 0;
430                 r    = 0;
431             }
432
433             if (useSoftCore)
434             {
435                 rpm2 = rsq * rsq;  /* r4 */
436                 rp   = rpm2 * rsq; /* r6 */
437             }
438             else
439             {
440                 /* The soft-core power p will not affect the results
441                  * with not using soft-core, so we use power of 0 which gives
442                  * the simplest math and cheapest code.
443                  */
444                 rpm2 = rinv * rinv;
445                 rp   = 1;
446             }
447
448             RealType Fscal = 0;
449
450             qq[STATE_A] = iqA * chargeA[jnr];
451             qq[STATE_B] = iqB * chargeB[jnr];
452
453             tj[STATE_A] = ntiA + 2 * typeA[jnr];
454             tj[STATE_B] = ntiB + 2 * typeB[jnr];
455
456             if (nlist->excl_fep == nullptr || nlist->excl_fep[k])
457             {
458                 c6[STATE_A] = nbfp[tj[STATE_A]];
459                 c6[STATE_B] = nbfp[tj[STATE_B]];
460
461                 for (int i = 0; i < NSTATES; i++)
462                 {
463                     c12[i] = nbfp[tj[i] + 1];
464                     if (useSoftCore)
465                     {
466                         if ((c6[i] > 0) && (c12[i] > 0))
467                         {
468                             /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
469                             sigma6[i] = half * c12[i] / c6[i];
470                             if (sigma6[i] < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */
471                             {
472                                 sigma6[i] = sigma6_min;
473                             }
474                         }
475                         else
476                         {
477                             sigma6[i] = sigma6_def;
478                         }
479                     }
480                 }
481
482                 if (useSoftCore)
483                 {
484                     /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
485                     if ((c12[STATE_A] > 0) && (c12[STATE_B] > 0))
486                     {
487                         alpha_vdw_eff  = 0;
488                         alpha_coul_eff = 0;
489                     }
490                     else
491                     {
492                         alpha_vdw_eff  = alpha_vdw;
493                         alpha_coul_eff = alpha_coul;
494                     }
495                 }
496
497                 for (int i = 0; i < NSTATES; i++)
498                 {
499                     FscalC[i] = 0;
500                     FscalV[i] = 0;
501                     Vcoul[i]  = 0;
502                     Vvdw[i]   = 0;
503
504                     RealType rinvC, rinvV, rC, rV, rpinvC, rpinvV;
505
506                     /* Only spend time on A or B state if it is non-zero */
507                     if ((qq[i] != 0) || (c6[i] != 0) || (c12[i] != 0))
508                     {
509                         /* this section has to be inside the loop because of the dependence on sigma6 */
510                         if (useSoftCore)
511                         {
512                             rpinvC = one / (alpha_coul_eff * lfac_coul[i] * sigma6[i] + rp);
513                             pthRoot(rpinvC, &rinvC, &rC);
514                             if (scLambdasOrAlphasDiffer)
515                             {
516                                 rpinvV = one / (alpha_vdw_eff * lfac_vdw[i] * sigma6[i] + rp);
517                                 pthRoot(rpinvV, &rinvV, &rV);
518                             }
519                             else
520                             {
521                                 /* We can avoid one expensive pow and one / operation */
522                                 rpinvV = rpinvC;
523                                 rinvV  = rinvC;
524                                 rV     = rC;
525                             }
526                         }
527                         else
528                         {
529                             rpinvC = 1;
530                             rinvC  = rinv;
531                             rC     = r;
532
533                             rpinvV = 1;
534                             rinvV  = rinv;
535                             rV     = r;
536                         }
537
538                         /* Only process the coulomb interactions if we have charges,
539                          * and if we either include all entries in the list (no cutoff
540                          * used in the kernel), or if we are within the cutoff.
541                          */
542                         bool computeElecInteraction = (elecInteractionTypeIsEwald && r < rcoulomb)
543                                                       || (!elecInteractionTypeIsEwald && rC < rcoulomb);
544
545                         if ((qq[i] != 0) && computeElecInteraction)
546                         {
547                             if (elecInteractionTypeIsEwald)
548                             {
549                                 Vcoul[i]  = ewaldPotential(qq[i], rinvC, sh_ewald);
550                                 FscalC[i] = ewaldScalarForce(qq[i], rinvC);
551                             }
552                             else
553                             {
554                                 Vcoul[i]  = reactionFieldPotential(qq[i], rinvC, rC, krf, crf);
555                                 FscalC[i] = reactionFieldScalarForce(qq[i], rinvC, rC, krf, two);
556                             }
557                         }
558
559                         /* Only process the VDW interactions if we have
560                          * some non-zero parameters, and if we either
561                          * include all entries in the list (no cutoff used
562                          * in the kernel), or if we are within the cutoff.
563                          */
564                         bool computeVdwInteraction = (vdwInteractionTypeIsEwald && r < rvdw)
565                                                      || (!vdwInteractionTypeIsEwald && rV < rvdw);
566                         if ((c6[i] != 0 || c12[i] != 0) && computeVdwInteraction)
567                         {
568                             RealType rinv6;
569                             if (useSoftCore)
570                             {
571                                 rinv6 = rpinvV;
572                             }
573                             else
574                             {
575                                 rinv6 = calculateRinv6(rinvV);
576                             }
577                             RealType Vvdw6  = calculateVdw6(c6[i], rinv6);
578                             RealType Vvdw12 = calculateVdw12(c12[i], rinv6);
579
580                             Vvdw[i] = lennardJonesPotential(Vvdw6, Vvdw12, c6[i], c12[i], repulsionShift,
581                                                             dispersionShift, onesixth, onetwelfth);
582                             FscalV[i] = lennardJonesScalarForce(Vvdw6, Vvdw12);
583
584                             if (vdwInteractionTypeIsEwald)
585                             {
586                                 /* Subtract the grid potential at the cut-off */
587                                 Vvdw[i] += ewaldLennardJonesGridSubtract(nbfp_grid[tj[i]],
588                                                                          sh_lj_ewald, onesixth);
589                             }
590
591                             if (vdwModifierIsPotSwitch)
592                             {
593                                 RealType d        = rV - ic->rvdw_switch;
594                                 d                 = (d > zero) ? d : zero;
595                                 const RealType d2 = d * d;
596                                 const RealType sw =
597                                         one + d2 * d * (vdw_swV3 + d * (vdw_swV4 + d * vdw_swV5));
598                                 const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
599
600                                 FscalV[i] = potSwitchScalarForceMod(FscalV[i], Vvdw[i], sw, rV,
601                                                                     rvdw, dsw, zero);
602                                 Vvdw[i]   = potSwitchPotentialMod(Vvdw[i], sw, rV, rvdw, zero);
603                             }
604                         }
605
606                         /* FscalC (and FscalV) now contain: dV/drC * rC
607                          * Now we multiply by rC^-p, so it will be: dV/drC * rC^1-p
608                          * Further down we first multiply by r^p-2 and then by
609                          * the vector r, which in total gives: dV/drC * (r/rC)^1-p
610                          */
611                         FscalC[i] *= rpinvC;
612                         FscalV[i] *= rpinvV;
613                     }
614                 }
615
616                 /* Assemble A and B states */
617                 for (int i = 0; i < NSTATES; i++)
618                 {
619                     vctot += LFC[i] * Vcoul[i];
620                     vvtot += LFV[i] * Vvdw[i];
621
622                     Fscal += LFC[i] * FscalC[i] * rpm2;
623                     Fscal += LFV[i] * FscalV[i] * rpm2;
624
625                     if (useSoftCore)
626                     {
627                         dvdl_coul += Vcoul[i] * DLF[i]
628                                      + LFC[i] * alpha_coul_eff * dlfac_coul[i] * FscalC[i] * sigma6[i];
629                         dvdl_vdw += Vvdw[i] * DLF[i]
630                                     + LFV[i] * alpha_vdw_eff * dlfac_vdw[i] * FscalV[i] * sigma6[i];
631                     }
632                     else
633                     {
634                         dvdl_coul += Vcoul[i] * DLF[i];
635                         dvdl_vdw += Vvdw[i] * DLF[i];
636                     }
637                 }
638             }
639             else if (icoul == GMX_NBKERNEL_ELEC_REACTIONFIELD)
640             {
641                 /* For excluded pairs, which are only in this pair list when
642                  * using the Verlet scheme, we don't use soft-core.
643                  * As there is no singularity, there is no need for soft-core.
644                  */
645                 const real FF = -two * krf;
646                 RealType   VV = krf * rsq - crf;
647
648                 if (ii == jnr)
649                 {
650                     VV *= half;
651                 }
652
653                 for (int i = 0; i < NSTATES; i++)
654                 {
655                     vctot += LFC[i] * qq[i] * VV;
656                     Fscal += LFC[i] * qq[i] * FF;
657                     dvdl_coul += DLF[i] * qq[i] * VV;
658                 }
659             }
660
661             if (elecInteractionTypeIsEwald && r < rcoulomb)
662             {
663                 /* See comment in the preamble. When using Ewald interactions
664                  * (unless we use a switch modifier) we subtract the reciprocal-space
665                  * Ewald component here which made it possible to apply the free
666                  * energy interaction to 1/r (vanilla coulomb short-range part)
667                  * above. This gets us closer to the ideal case of applying
668                  * the softcore to the entire electrostatic interaction,
669                  * including the reciprocal-space component.
670                  */
671                 real v_lr, f_lr;
672
673                 const RealType ewrt   = r * ewtabscale;
674                 IntType        ewitab = static_cast<IntType>(ewrt);
675                 const RealType eweps  = ewrt - ewitab;
676                 ewitab                = 4 * ewitab;
677                 f_lr                  = ewtab[ewitab] + eweps * ewtab[ewitab + 1];
678                 v_lr = (ewtab[ewitab + 2] - ewtabhalfspace * eweps * (ewtab[ewitab] + f_lr));
679                 f_lr *= rinv;
680
681                 /* Note that any possible Ewald shift has already been applied in
682                  * the normal interaction part above.
683                  */
684
685                 if (ii == jnr)
686                 {
687                     /* If we get here, the i particle (ii) has itself (jnr)
688                      * in its neighborlist. This can only happen with the Verlet
689                      * scheme, and corresponds to a self-interaction that will
690                      * occur twice. Scale it down by 50% to only include it once.
691                      */
692                     v_lr *= half;
693                 }
694
695                 for (int i = 0; i < NSTATES; i++)
696                 {
697                     vctot -= LFC[i] * qq[i] * v_lr;
698                     Fscal -= LFC[i] * qq[i] * f_lr;
699                     dvdl_coul -= (DLF[i] * qq[i]) * v_lr;
700                 }
701             }
702
703             if (vdwInteractionTypeIsEwald && r < rvdw)
704             {
705                 /* See comment in the preamble. When using LJ-Ewald interactions
706                  * (unless we use a switch modifier) we subtract the reciprocal-space
707                  * Ewald component here which made it possible to apply the free
708                  * energy interaction to r^-6 (vanilla LJ6 short-range part)
709                  * above. This gets us closer to the ideal case of applying
710                  * the softcore to the entire VdW interaction,
711                  * including the reciprocal-space component.
712                  */
713                 /* We could also use the analytical form here
714                  * iso a table, but that can cause issues for
715                  * r close to 0 for non-interacting pairs.
716                  */
717
718                 const RealType rs   = rsq * rinv * ewtabscale;
719                 const IntType  ri   = static_cast<IntType>(rs);
720                 const RealType frac = rs - ri;
721                 const RealType f_lr = (1 - frac) * tab_ewald_F_lj[ri] + frac * tab_ewald_F_lj[ri + 1];
722                 /* TODO: Currently the Ewald LJ table does not contain
723                  * the factor 1/6, we should add this.
724                  */
725                 const RealType FF = f_lr * rinv / six;
726                 RealType       VV =
727                         (tab_ewald_V_lj[ri] - ewtabhalfspace * frac * (tab_ewald_F_lj[ri] + f_lr)) / six;
728
729                 if (ii == jnr)
730                 {
731                     /* If we get here, the i particle (ii) has itself (jnr)
732                      * in its neighborlist. This can only happen with the Verlet
733                      * scheme, and corresponds to a self-interaction that will
734                      * occur twice. Scale it down by 50% to only include it once.
735                      */
736                     VV *= half;
737                 }
738
739                 for (int i = 0; i < NSTATES; i++)
740                 {
741                     const real c6grid = nbfp_grid[tj[i]];
742                     vvtot += LFV[i] * c6grid * VV;
743                     Fscal += LFV[i] * c6grid * FF;
744                     dvdl_vdw += (DLF[i] * c6grid) * VV;
745                 }
746             }
747
748             if (doForces)
749             {
750                 const real tx = Fscal * dx;
751                 const real ty = Fscal * dy;
752                 const real tz = Fscal * dz;
753                 fix           = fix + tx;
754                 fiy           = fiy + ty;
755                 fiz           = fiz + tz;
756                 /* OpenMP atomics are expensive, but this kernels is also
757                  * expensive, so we can take this hit, instead of using
758                  * thread-local output buffers and extra reduction.
759                  *
760                  * All the OpenMP regions in this file are trivial and should
761                  * not throw, so no need for try/catch.
762                  */
763 #pragma omp atomic
764                 f[j3] -= tx;
765 #pragma omp atomic
766                 f[j3 + 1] -= ty;
767 #pragma omp atomic
768                 f[j3 + 2] -= tz;
769             }
770         }
771
772         /* The atomics below are expensive with many OpenMP threads.
773          * Here unperturbed i-particles will usually only have a few
774          * (perturbed) j-particles in the list. Thus with a buffered list
775          * we can skip a significant number of i-reductions with a check.
776          */
777         if (npair_within_cutoff > 0)
778         {
779             if (doForces)
780             {
781 #pragma omp atomic
782                 f[ii3] += fix;
783 #pragma omp atomic
784                 f[ii3 + 1] += fiy;
785 #pragma omp atomic
786                 f[ii3 + 2] += fiz;
787             }
788             if (doShiftForces)
789             {
790 #pragma omp atomic
791                 fshift[is3] += fix;
792 #pragma omp atomic
793                 fshift[is3 + 1] += fiy;
794 #pragma omp atomic
795                 fshift[is3 + 2] += fiz;
796             }
797             if (doPotential)
798             {
799                 int ggid = gid[n];
800 #pragma omp atomic
801                 Vc[ggid] += vctot;
802 #pragma omp atomic
803                 Vv[ggid] += vvtot;
804             }
805         }
806     }
807
808 #pragma omp atomic
809     dvdl[efptCOUL] += dvdl_coul;
810 #pragma omp atomic
811     dvdl[efptVDW] += dvdl_vdw;
812
813     /* Estimate flops, average for free energy stuff:
814      * 12  flops per outer iteration
815      * 150 flops per inner iteration
816      */
817 #pragma omp atomic
818     inc_nrnb(nrnb, eNR_NBKERNEL_FREE_ENERGY, nlist->nri * 12 + nlist->jindex[nri] * 150);
819 }
820
821 typedef void (*KernelFunction)(const t_nblist* gmx_restrict nlist,
822                                rvec* gmx_restrict         xx,
823                                gmx::ForceWithShiftForces* forceWithShiftForces,
824                                const t_forcerec* gmx_restrict fr,
825                                const t_mdatoms* gmx_restrict mdatoms,
826                                nb_kernel_data_t* gmx_restrict kernel_data,
827                                t_nrnb* gmx_restrict nrnb);
828
829 template<bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
830 static KernelFunction dispatchKernelOnUseSimd(const bool useSimd)
831 {
832     if (useSimd)
833     {
834 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS
835         /* FIXME: Here SimdDataTypes should be used to enable SIMD. So far, the code in nb_free_energy_kernel is not adapted to SIMD */
836         return (nb_free_energy_kernel<ScalarDataTypes, useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
837                                       elecInteractionTypeIsEwald, vdwModifierIsPotSwitch>);
838 #else
839         return (nb_free_energy_kernel<ScalarDataTypes, useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
840                                       elecInteractionTypeIsEwald, vdwModifierIsPotSwitch>);
841 #endif
842     }
843     else
844     {
845         return (nb_free_energy_kernel<ScalarDataTypes, useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
846                                       elecInteractionTypeIsEwald, vdwModifierIsPotSwitch>);
847     }
848 }
849
850 template<bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
851 static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch, const bool useSimd)
852 {
853     if (vdwModifierIsPotSwitch)
854     {
855         return (dispatchKernelOnUseSimd<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
856                                         elecInteractionTypeIsEwald, true>(useSimd));
857     }
858     else
859     {
860         return (dispatchKernelOnUseSimd<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
861                                         elecInteractionTypeIsEwald, false>(useSimd));
862     }
863 }
864
865 template<bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
866 static KernelFunction dispatchKernelOnElecInteractionType(const bool elecInteractionTypeIsEwald,
867                                                           const bool vdwModifierIsPotSwitch,
868                                                           const bool useSimd)
869 {
870     if (elecInteractionTypeIsEwald)
871     {
872         return (dispatchKernelOnVdwModifier<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
873                 vdwModifierIsPotSwitch, useSimd));
874     }
875     else
876     {
877         return (dispatchKernelOnVdwModifier<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
878                 vdwModifierIsPotSwitch, useSimd));
879     }
880 }
881
882 template<bool useSoftCore, bool scLambdasOrAlphasDiffer>
883 static KernelFunction dispatchKernelOnVdwInteractionType(const bool vdwInteractionTypeIsEwald,
884                                                          const bool elecInteractionTypeIsEwald,
885                                                          const bool vdwModifierIsPotSwitch,
886                                                          const bool useSimd)
887 {
888     if (vdwInteractionTypeIsEwald)
889     {
890         return (dispatchKernelOnElecInteractionType<useSoftCore, scLambdasOrAlphasDiffer, true>(
891                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
892     }
893     else
894     {
895         return (dispatchKernelOnElecInteractionType<useSoftCore, scLambdasOrAlphasDiffer, false>(
896                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
897     }
898 }
899
900 template<bool useSoftCore>
901 static KernelFunction dispatchKernelOnScLambdasOrAlphasDifference(const bool scLambdasOrAlphasDiffer,
902                                                                   const bool vdwInteractionTypeIsEwald,
903                                                                   const bool elecInteractionTypeIsEwald,
904                                                                   const bool vdwModifierIsPotSwitch,
905                                                                   const bool useSimd)
906 {
907     if (scLambdasOrAlphasDiffer)
908     {
909         return (dispatchKernelOnVdwInteractionType<useSoftCore, true>(
910                 vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
911     }
912     else
913     {
914         return (dispatchKernelOnVdwInteractionType<useSoftCore, false>(
915                 vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
916     }
917 }
918
919 static KernelFunction dispatchKernel(const bool        scLambdasOrAlphasDiffer,
920                                      const bool        vdwInteractionTypeIsEwald,
921                                      const bool        elecInteractionTypeIsEwald,
922                                      const bool        vdwModifierIsPotSwitch,
923                                      const bool        useSimd,
924                                      const t_forcerec* fr)
925 {
926     if (fr->sc_alphacoul == 0 && fr->sc_alphavdw == 0)
927     {
928         return (dispatchKernelOnScLambdasOrAlphasDifference<false>(
929                 scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald,
930                 vdwModifierIsPotSwitch, useSimd));
931     }
932     else
933     {
934         return (dispatchKernelOnScLambdasOrAlphasDifference<true>(
935                 scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald,
936                 vdwModifierIsPotSwitch, useSimd));
937     }
938 }
939
940
941 void gmx_nb_free_energy_kernel(const t_nblist*            nlist,
942                                rvec*                      xx,
943                                gmx::ForceWithShiftForces* ff,
944                                const t_forcerec*          fr,
945                                const t_mdatoms*           mdatoms,
946                                nb_kernel_data_t*          kernel_data,
947                                t_nrnb*                    nrnb)
948 {
949     GMX_ASSERT(EEL_PME_EWALD(fr->ic->eeltype) || fr->ic->eeltype == eelCUT || EEL_RF(fr->ic->eeltype),
950                "Unsupported eeltype with free energy");
951
952     const bool vdwInteractionTypeIsEwald  = (EVDW_PME(fr->ic->vdwtype));
953     const bool elecInteractionTypeIsEwald = (EEL_PME_EWALD(fr->ic->eeltype));
954     const bool vdwModifierIsPotSwitch     = (fr->ic->vdw_modifier == eintmodPOTSWITCH);
955     bool       scLambdasOrAlphasDiffer    = true;
956     const bool useSimd                    = fr->use_simd_kernels;
957
958     if (fr->sc_alphacoul == 0 && fr->sc_alphavdw == 0)
959     {
960         scLambdasOrAlphasDiffer = false;
961     }
962     else if (fr->sc_r_power == 6.0_real)
963     {
964         if (kernel_data->lambda[efptCOUL] == kernel_data->lambda[efptVDW] && fr->sc_alphacoul == fr->sc_alphavdw)
965         {
966             scLambdasOrAlphasDiffer = false;
967         }
968     }
969     else
970     {
971         GMX_RELEASE_ASSERT(false, "Unsupported soft-core r-power");
972     }
973
974     KernelFunction kernelFunc;
975     kernelFunc = dispatchKernel(scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald,
976                                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd, fr);
977     kernelFunc(nlist, xx, ff, fr, mdatoms, kernel_data, nrnb);
978 }