2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
/* NOTE(review): This is the body of a GROMACS nbnxm "2xNN" SIMD non-bonded
 * kernel outer loop (a kernel_outer template).  It is NOT a self-contained
 * translation unit: it is textually included into a kernel function and is
 * specialized via preprocessor flags set by the including file
 * (CALC_ENERGIES, CALC_COUL_TAB/EWALD, LJ_CUT, LJ_FORCE_SWITCH, LJ_POT_SWITCH,
 * LJ_EWALD_GEOM, LJ_COMB_GEOM/LB, ENERGY_GROUPS, UNROLLI/UNROLLJ, ...).
 * This excerpt is elided: matching #else/#endif lines, braces and several
 * statements are missing from view, and each visible line carries a leading
 * number fused from the original file's line numbering (extraction residue,
 * deliberately left byte-identical below).  Comments added here document only
 * what the visible lines establish; anything depending on elided lines is
 * marked as an assumption to confirm against the full file.
 */
/* Pair-list j-cluster array and per-i-cluster interaction flags / j range. */
39 const nbnxn_cj_t *l_cj;
42 gmx_bool do_LJ, half_LJ, do_coul;
43 int cjind0, cjind1, cjind;
/* Energy-group bit-field decoding parameters (used by energy-group kernels). */
47 int egps_ishift, egps_imask;
48 int egps_jshift, egps_jmask, egps_jstride;
/* Per-i-atom pointers into the energy-group-resolved VdW energy buffers. */
50 real *vvdwtp[UNROLLI];
/* i-atom coordinates and force accumulators as two half-SIMD register pairs
 * (suffix S0 = first pair of i-atoms, S2 = second pair -- the 2xNN layout). */
57 SimdReal ix_S0, iy_S0, iz_S0;
58 SimdReal ix_S2, iy_S2, iz_S2;
59 SimdReal fix_S0, fiy_S0, fiz_S0;
60 SimdReal fix_S2, fiy_S2, fiz_S2;
/* Masks excluding the self/diagonal i-j pairs within a cluster pair. */
62 SimdReal diagonal_jmi_S;
63 #if UNROLLI == UNROLLJ
64 SimdBool diagonal_mask_S0, diagonal_mask_S2;
66 SimdBool diagonal_mask0_S0, diagonal_mask0_S2;
67 SimdBool diagonal_mask1_S0, diagonal_mask1_S2;
/* Bit filters for topology-based exclusion masking (loaded below). */
70 SimdBitMask filter_S0, filter_S2;
/* i-atom charges, premultiplied by the Coulomb prefactor further down. */
75 SimdReal iq_S0 = setZero();
76 SimdReal iq_S2 = setZero();
/* Reaction-field constants (k_rf / c_rf based; set below). */
81 SimdReal hrc_3_S, moh_rc_S;
86 /* Coulomb table variables */
88 const real *tab_coul_F;
89 #if defined CALC_ENERGIES && !defined TAB_FDV0
90 const real *tab_coul_V;
98 #ifdef CALC_COUL_EWALD
99 SimdReal beta2_S, beta_S;
102 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
106 #if defined LJ_CUT && defined CALC_ENERGIES
107 SimdReal p6_cpot_S, p12_cpot_S;
/* LJ potential-switch polynomial coefficients (c3..c5 for V, derived for F). */
111 SimdReal swV3_S, swV4_S, swV5_S;
112 SimdReal swF2_S, swF3_S, swF4_S;
114 #ifdef LJ_FORCE_SWITCH
/* LJ force-switch coefficients for the r^-6 (p6) and r^-12 (p12) terms. */
116 SimdReal p6_fc2_S, p6_fc3_S;
117 SimdReal p12_fc2_S, p12_fc3_S;
119 SimdReal p6_vc3_S, p6_vc4_S;
120 SimdReal p12_vc3_S, p12_vc4_S;
121 SimdReal p6_6cpot_S, p12_12cpot_S;
/* LJ-PME (Ewald for dispersion) constants. */
125 real lj_ewaldcoeff2, lj_ewaldcoeff6_6;
126 SimdReal mone_S, half_S, lje_c2_S, lje_c6_6_S;
/* LB combination rule: per-i half-sigma and sqrt(eps) registers. */
130 SimdReal hsig_i_S0, seps_i_S0;
131 SimdReal hsig_i_S2, seps_i_S2;
/* FIX_LJ_C path: aligned scratch table of per-lane C6/C12 parameters;
 * pvdw_c12 aliases the second half of pvdw_c6. */
134 alignas(GMX_SIMD_ALIGNMENT) real pvdw_c6[2*UNROLLI*UNROLLJ];
135 real *pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
137 #endif /* LJ_COMB_LB */
141 #ifdef VDW_CUTOFF_CHECK
/* Flat views into the atom-data arrays used throughout the kernel. */
151 const nbnxn_atomdata_t::Params &nbatParams = nbat->params();
153 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
154 const real * gmx_restrict ljc = nbatParams.lj_comb.data();
156 #if !(defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined FIX_LJ_C)
157 /* No combination rule used */
158 const real * gmx_restrict nbfp_ptr = nbatParams.nbfp_aligned.data();
159 const int * gmx_restrict type = nbatParams.type.data();
162 /* Load j-i for the first i */
163 diagonal_jmi_S = load<SimdReal>(nbat->simdMasks.diagonal_2xnn_j_minus_i.data());
164 /* Generate all the diagonal masks as comparison results */
/* Each pair of decrements steps j-i down by 2 (two i-atoms per half-SIMD
 * register in the 2xNN layout), producing the mask for the next i pair. */
165 #if UNROLLI == UNROLLJ
166 diagonal_mask_S0 = (zero_S < diagonal_jmi_S);
167 diagonal_jmi_S = diagonal_jmi_S - one_S;
168 diagonal_jmi_S = diagonal_jmi_S - one_S;
169 diagonal_mask_S2 = (zero_S < diagonal_jmi_S);
171 #if 2*UNROLLI == UNROLLJ
/* j-clusters are twice the i-cluster size: two sets of diagonal masks,
 * one for each i-cluster half that maps onto the same j-cluster. */
172 diagonal_mask0_S0 = (zero_S < diagonal_jmi_S);
173 diagonal_jmi_S = diagonal_jmi_S - one_S;
174 diagonal_jmi_S = diagonal_jmi_S - one_S;
175 diagonal_mask0_S2 = (zero_S < diagonal_jmi_S);
176 diagonal_jmi_S = diagonal_jmi_S - one_S;
177 diagonal_jmi_S = diagonal_jmi_S - one_S;
178 diagonal_mask1_S0 = (zero_S < diagonal_jmi_S);
179 diagonal_jmi_S = diagonal_jmi_S - one_S;
180 diagonal_jmi_S = diagonal_jmi_S - one_S;
181 diagonal_mask1_S2 = (zero_S < diagonal_jmi_S);
185 /* Load masks for topology exclusion masking. filter_stride is
186 static const, so the conditional will be optimized away. */
187 #if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL
188 const std::uint64_t * gmx_restrict exclusion_filter = nbat->simdMasks.exclusion_filter64.data();
190 const std::uint32_t * gmx_restrict exclusion_filter = nbat->simdMasks.exclusion_filter.data();
193 /* Here we cast the exclusion filters from unsigned * to int * or real *.
194 * Since we only check bits, the actual value they represent does not
195 * matter, as long as both filter and mask data are treated the same way.
197 #if GMX_SIMD_HAVE_INT32_LOGICAL
198 filter_S0 = load<SimdBitMask>(reinterpret_cast<const int *>(exclusion_filter + 0*UNROLLJ));
199 filter_S2 = load<SimdBitMask>(reinterpret_cast<const int *>(exclusion_filter + 2*UNROLLJ));
201 filter_S0 = load<SimdBitMask>(reinterpret_cast<const real *>(exclusion_filter + 0*UNROLLJ));
202 filter_S2 = load<SimdBitMask>(reinterpret_cast<const real *>(exclusion_filter + 2*UNROLLJ));
206 /* Reaction-field constants */
207 mrc_3_S = SimdReal(-2*ic->k_rf);
209 hrc_3_S = SimdReal(ic->k_rf);
210 moh_rc_S = SimdReal(-ic->c_rf);
/* Tabulated-Coulomb constants: table scale and -0.5/scale for interpolation. */
216 invtsp_S = SimdReal(ic->tabq_scale);
218 mhalfsp_S = SimdReal(-0.5/ic->tabq_scale);
/* TAB_FDV0 packs F, Delta-F and V in one table; otherwise F and V are
 * separate arrays (the selecting #if/#else lines are elided from this view). */
222 tab_coul_F = ic->tabq_coul_FDV0;
224 tab_coul_F = ic->tabq_coul_F;
226 tab_coul_V = ic->tabq_coul_V;
229 #endif /* CALC_COUL_TAB */
231 #ifdef CALC_COUL_EWALD
232 beta2_S = SimdReal(ic->ewaldcoeff_q*ic->ewaldcoeff_q);
233 beta_S = SimdReal(ic->ewaldcoeff_q);
236 #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
237 sh_ewald_S = SimdReal(ic->sh_ewald);
240 /* LJ function constants */
241 #if defined CALC_ENERGIES || defined LJ_POT_SWITCH
242 SimdReal sixth_S = SimdReal(1.0/6.0);
243 SimdReal twelveth_S = SimdReal(1.0/12.0);
246 #if defined LJ_CUT && defined CALC_ENERGIES
247 /* We shift the potential by cpot, which can be zero */
248 p6_cpot_S = SimdReal(ic->dispersion_shift.cpot);
249 p12_cpot_S = SimdReal(ic->repulsion_shift.cpot);
/* LJ potential-switch setup (presumably under an LJ_POT_SWITCH #ifdef whose
 * opening line is elided -- TODO confirm against the full file). */
252 rswitch_S = SimdReal(ic->rvdw_switch);
253 swV3_S = SimdReal(ic->vdw_switch.c3);
254 swV4_S = SimdReal(ic->vdw_switch.c4);
255 swV5_S = SimdReal(ic->vdw_switch.c5);
256 swF2_S = SimdReal(3*ic->vdw_switch.c3);
257 swF3_S = SimdReal(4*ic->vdw_switch.c4);
258 swF4_S = SimdReal(5*ic->vdw_switch.c5);
260 #ifdef LJ_FORCE_SWITCH
261 rswitch_S = SimdReal(ic->rvdw_switch);
262 p6_fc2_S = SimdReal(ic->dispersion_shift.c2);
263 p6_fc3_S = SimdReal(ic->dispersion_shift.c3);
264 p12_fc2_S = SimdReal(ic->repulsion_shift.c2);
265 p12_fc3_S = SimdReal(ic->repulsion_shift.c3);
/* Energy coefficients derived from the force-switch coefficients:
 * V-coefficients are -1/3 and -1/4 times the corresponding F-coefficients. */
268 SimdReal mthird_S = SimdReal(-1.0/3.0);
269 SimdReal mfourth_S = SimdReal(-1.0/4.0);
271 p6_vc3_S = mthird_S * p6_fc2_S;
272 p6_vc4_S = mfourth_S * p6_fc3_S;
273 p6_6cpot_S = SimdReal(ic->dispersion_shift.cpot/6);
274 p12_vc3_S = mthird_S * p12_fc2_S;
275 p12_vc4_S = mfourth_S * p12_fc3_S;
276 p12_12cpot_S = SimdReal(ic->repulsion_shift.cpot/12);
/* LJ-PME constants: ewaldcoeff_lj^2 and ewaldcoeff_lj^6/6. */
281 mone_S = SimdReal(-1.0);
282 half_S = SimdReal(0.5);
283 lj_ewaldcoeff2 = ic->ewaldcoeff_lj*ic->ewaldcoeff_lj;
284 lj_ewaldcoeff6_6 = lj_ewaldcoeff2*lj_ewaldcoeff2*lj_ewaldcoeff2/6;
285 lje_c2_S = SimdReal(lj_ewaldcoeff2);
286 lje_c6_6_S = SimdReal(lj_ewaldcoeff6_6);
288 /* Determine the grid potential at the cut-off */
289 SimdReal lje_vc_S = SimdReal(ic->sh_lj_ewald);
293 /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
294 rc2_S = SimdReal(ic->rcoulomb*ic->rcoulomb);
295 #ifdef VDW_CUTOFF_CHECK
296 rcvdw2_S = SimdReal(ic->rvdw*ic->rvdw);
/* Distance floor to avoid singularities at r ~ 0. */
299 minRsq_S = SimdReal(NBNXN_MIN_RSQ);
301 const real * gmx_restrict q = nbatParams.q.data();
302 const real facel = ic->epsfac;
303 const real * gmx_restrict shiftvec = shift_vec[0];
304 const real * gmx_restrict x = nbat->x().data();
/* FIX_LJ_C scratch fill: broadcasts the type-0 C6 (nbfp[0]) and C12 (nbfp[1])
 * parameters across all i rows and j lanes.  NOTE(review): all four rows use
 * index 0*2 -- presumably intentional for this fixed-parameter path, but
 * worth confirming against the full file. */
308 for (jp = 0; jp < UNROLLJ; jp++)
310 pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
311 pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
312 pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
313 pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
315 pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
316 pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
317 pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
318 pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
320 SimdReal c6_S0 = load<SimdReal>(pvdw_c6 +0*UNROLLJ);
321 SimdReal c6_S1 = load<SimdReal>(pvdw_c6 +1*UNROLLJ);
322 SimdReal c6_S2 = load<SimdReal>(pvdw_c6 +2*UNROLLJ);
323 SimdReal c6_S3 = load<SimdReal>(pvdw_c6 +3*UNROLLJ);
325 SimdReal c12_S0 = load<SimdReal>(pvdw_c12+0*UNROLLJ);
326 SimdReal c12_S1 = load<SimdReal>(pvdw_c12+1*UNROLLJ);
327 SimdReal c12_S2 = load<SimdReal>(pvdw_c12+2*UNROLLJ);
328 SimdReal c12_S3 = load<SimdReal>(pvdw_c12+3*UNROLLJ);
329 #endif /* FIX_LJ_C */
/* Energy-group decoding: neg_2log bits per group index packed in energrp;
 * i-groups use one field, j-groups a doubled shift (pairs of groups). */
332 egps_ishift = nbatParams.neg_2log;
333 egps_imask = (1<<egps_ishift) - 1;
334 egps_jshift = 2*nbatParams.neg_2log;
335 egps_jmask = (1<<egps_jshift) - 1;
336 egps_jstride = (UNROLLJ>>1)*UNROLLJ;
337 /* Major division is over i-particle energy groups, determine the stride */
338 Vstride_i = nbatParams.nenergrp*(1 << nbatParams.neg_2log)*egps_jstride;
341 l_cj = nbl->cj.data();
/* ===== Outer loop: one iteration per i-cluster entry of the pair list ===== */
344 for (const nbnxn_ci_t &ciEntry : nbl->ci)
/* Shift index and j-cluster index range are packed in the ci entry. */
346 ish = (ciEntry.shift & NBNXN_CI_SHIFT);
348 cjind0 = ciEntry.cj_ind_start;
349 cjind1 = ciEntry.cj_ind_end;
/* ci_sh is only meaningful (== ci) for the central periodic image; used
 * below to detect the self-interacting cluster pair. */
351 ci_sh = (ish == CENTRAL ? ci : -1);
/* Broadcast the periodic shift vector for this entry. */
353 shX_S = SimdReal(shiftvec[ish3]);
354 shY_S = SimdReal(shiftvec[ish3+1]);
355 shZ_S = SimdReal(shiftvec[ish3+2]);
360 #if defined LJ_COMB_LB || defined LJ_COMB_GEOM || defined LJ_EWALD_GEOM
/* Index arithmetic mapping the i-cluster to the x/q/ljc arrays; the 2xNN
 * kernel addresses half i-clusters via (ci & 1)*(STRIDE>>1). */
364 int sci = (ci>>1)*STRIDE;
365 int scix = sci*DIM + (ci & 1)*(STRIDE>>1);
366 #if defined LJ_COMB_LB || defined LJ_COMB_GEOM || defined LJ_EWALD_GEOM
367 int sci2 = sci*2 + (ci & 1)*(STRIDE>>1);
369 sci += (ci & 1)*(STRIDE>>1);
372 /* We have 5 LJ/C combinations, but use only three inner loops,
373 * as the other combinations are unlikely and/or not much faster:
374 * inner half-LJ + C for half-LJ + C / no-LJ + C
375 * inner LJ + C for full-LJ + C
376 * inner LJ for full-LJ + no-C / half-LJ + no-C
378 do_LJ = ((ciEntry.shift & NBNXN_CI_DO_LJ(0)) != 0);
379 do_coul = ((ciEntry.shift & NBNXN_CI_DO_COUL(0)) != 0);
380 half_LJ = (((ciEntry.shift & NBNXN_CI_HALF_LJ(0)) != 0) || !do_LJ) && do_coul;
/* Energy-group kernels: resolve per-i-atom output buffer pointers. */
383 egps_i = nbatParams.energrp[ci];
387 for (ia = 0; ia < UNROLLI; ia++)
389 egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
390 vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
391 vctp[ia] = Vc + egp_ia*Vstride_i;
/* Self-energy correction: subtract the spurious i==j diagonal contribution
 * once, when the first j-cluster is the (shifted) i-cluster itself.  The
 * two `if` variants correspond to the UNROLLI==UNROLLJ vs 2*UNROLLI==UNROLLJ
 * cluster layouts (selecting #if lines elided from this view). */
398 gmx_bool do_self = TRUE;
400 gmx_bool do_self = do_coul;
403 if (do_self && l_cj[ciEntry.cj_ind_start].cj == ci_sh)
406 if (do_self && l_cj[ciEntry.cj_ind_start].cj == (ci_sh>>1))
/* Per-electrostatics-flavor self-term: RF uses c_rf, the table kernels the
 * potential at r=0 (FDV0-packed vs separate V table), Ewald beta*2/sqrt(pi). */
415 Vc_sub_self = 0.5*ic->c_rf;
419 Vc_sub_self = 0.5*tab_coul_F[2];
421 Vc_sub_self = 0.5*tab_coul_V[0];
424 #ifdef CALC_COUL_EWALD
426 Vc_sub_self = 0.5*ic->ewaldcoeff_q*M_2_SQRTPI;
429 for (ia = 0; ia < UNROLLI; ia++)
435 vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
439 -= facel*qi*qi*Vc_sub_self;
/* LJ-PME grid self-correction: add back 0.5*C6_ii*beta_lj^6/6 per i-atom. */
447 for (ia = 0; ia < UNROLLI; ia++)
451 c6_i = nbatParams.nbfp[nbatParams.type[sci+ia]*(nbatParams.numTypes + 1)*2]/6;
453 vvdwtp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
457 += 0.5*c6_i*lj_ewaldcoeff6_6;
460 #endif /* LJ_EWALD */
464 /* Load i atom data */
465 int sciy = scix + STRIDE;
466 int sciz = sciy + STRIDE;
467 ix_S0 = loadU1DualHsimd(x+scix);
468 ix_S2 = loadU1DualHsimd(x+scix+2);
469 iy_S0 = loadU1DualHsimd(x+sciy);
470 iy_S2 = loadU1DualHsimd(x+sciy+2);
471 iz_S0 = loadU1DualHsimd(x+sciz);
472 iz_S2 = loadU1DualHsimd(x+sciz+2);
/* Apply the periodic shift to the i-coordinates once, outside the inner loop. */
473 ix_S0 = ix_S0 + shX_S;
474 ix_S2 = ix_S2 + shX_S;
475 iy_S0 = iy_S0 + shY_S;
476 iy_S2 = iy_S2 + shY_S;
477 iz_S0 = iz_S0 + shZ_S;
478 iz_S2 = iz_S2 + shZ_S;
/* Pre-scale i charges by the Coulomb prefactor epsfac. */
484 facel_S = SimdReal(facel);
486 iq_S0 = loadU1DualHsimd(q+sci);
487 iq_S2 = loadU1DualHsimd(q+sci+2);
488 iq_S0 = facel_S * iq_S0;
489 iq_S2 = facel_S * iq_S2;
/* LB combination rule: load per-i half-sigma and sqrt(eps). */
493 hsig_i_S0 = loadU1DualHsimd(ljc+sci2);
494 hsig_i_S2 = loadU1DualHsimd(ljc+sci2+2);
495 seps_i_S0 = loadU1DualHsimd(ljc+sci2+STRIDE);
496 seps_i_S2 = loadU1DualHsimd(ljc+sci2+STRIDE+2);
/* Geometric combination rule: load per-i sqrt(C6) and sqrt(C12). */
499 SimdReal c6s_S0, c12s_S0;
500 SimdReal c6s_S2, c12s_S2;
502 c6s_S0 = loadU1DualHsimd(ljc+sci2);
506 c6s_S2 = loadU1DualHsimd(ljc+sci2+2);
508 c12s_S0 = loadU1DualHsimd(ljc+sci2+STRIDE);
511 c12s_S2 = loadU1DualHsimd(ljc+sci2+STRIDE+2);
513 #elif !defined LJ_COMB_LB && !defined FIX_LJ_C
/* No combination rule: per-i rows into the aligned pair-parameter table. */
514 const int numTypes = nbatParams.numTypes;
515 const real *nbfp0 = nbfp_ptr + type[sci ]*numTypes*c_simdBestPairAlignment;
516 const real *nbfp1 = nbfp_ptr + type[sci+1]*numTypes*c_simdBestPairAlignment;
517 const real *nbfp2 = nullptr, *nbfp3 = nullptr;
520 nbfp2 = nbfp_ptr + type[sci+2]*numTypes*c_simdBestPairAlignment;
521 nbfp3 = nbfp_ptr + type[sci+3]*numTypes*c_simdBestPairAlignment;
526 /* We need the geometrically combined C6 for the PME grid correction */
527 SimdReal c6s_S0, c6s_S2;
528 c6s_S0 = loadU1DualHsimd(ljc+sci2);
531 c6s_S2 = loadU1DualHsimd(ljc+sci2+2);
535 /* Zero the potential energy for this list */
537 SimdReal Vvdwtot_S = setZero();
538 SimdReal vctot_S = setZero();
541 /* Clear i atom forces */
/* Inner-loop dispatch: three specializations of kernel_inner.h, selected by
 * do_coul/half_LJ (selecting if/else lines elided from this view).  Within
 * each branch, the while loop handles list entries whose exclusion mask is
 * not all-ones (exclusion checking needed); the following for loop then runs
 * the mask-free fast path on the remaining fully-interacting entries --
 * presumably under different CHECK_EXCLS #defines, TODO confirm. */
551 /* Currently all kernels use (at least half) LJ */
555 /* Coulomb: all i-atoms, LJ: first half i-atoms */
559 while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
561 #include "gromacs/nbnxm/kernels_simd_2xmm/kernel_inner.h"
565 for (; (cjind < cjind1); cjind++)
567 #include "gromacs/nbnxm/kernels_simd_2xmm/kernel_inner.h"
574 /* Coulomb: all i-atoms, LJ: all i-atoms */
577 while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
579 #include "gromacs/nbnxm/kernels_simd_2xmm/kernel_inner.h"
583 for (; (cjind < cjind1); cjind++)
585 #include "gromacs/nbnxm/kernels_simd_2xmm/kernel_inner.h"
591 /* Coulomb: none, LJ: all i-atoms */
593 while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
595 #include "gromacs/nbnxm/kernels_simd_2xmm/kernel_inner.h"
599 for (; (cjind < cjind1); cjind++)
601 #include "gromacs/nbnxm/kernels_simd_2xmm/kernel_inner.h"
605 ninner += cjind1 - cjind0;
607 /* Add accumulated i-forces to the force array */
/* The reduction increments f[] in place and returns the per-dimension sum,
 * reused as the shift force below. */
608 real fShiftX = reduceIncr4ReturnSumHsimd(f+scix, fix_S0, fix_S2);
609 real fShiftY = reduceIncr4ReturnSumHsimd(f+sciy, fiy_S0, fiy_S2);
610 real fShiftZ = reduceIncr4ReturnSumHsimd(f+sciz, fiz_S0, fiz_S2);
612 #ifdef CALC_SHIFTFORCES
613 fshift[ish3+0] += fShiftX;
614 fshift[ish3+1] += fShiftY;
615 fshift[ish3+2] += fShiftZ;
/* Reduce the SIMD energy accumulators into the scalar output energies. */
621 *Vc += reduce(vctot_S);
623 *Vvdw += reduce(Vvdwtot_S);
626 /* Outer loop uses 6 flops/iteration */
/* NOTE(review): debug pair counting -- presumably guarded by a pair-count
 * #ifdef elided from this view; confirm before enabling in production. */
630 printf("atom pairs %d\n", npair);