e17c121123bde8e801aa06f94d7469354e68469f
[alexxy/gromacs.git] / src / gromacs / nbnxm / kernels_simd_2xmm / kernel_outer.h
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
36
37 {
38     using namespace gmx;
39
40     /* Unpack pointers for output */
41     real* f      = out->f.data();
42     real* fshift = out->fshift.data();
43 #ifdef CALC_ENERGIES
44 #    ifdef ENERGY_GROUPS
45     real* Vvdw = out->VSvdw.data();
46     real* Vc   = out->VSc.data();
47 #    else
48     real* Vvdw = out->Vvdw.data();
49     real* Vc   = out->Vc.data();
50 #    endif
51 #endif
52
53     const nbnxn_cj_t* l_cj;
54     int               ci, ci_sh;
55     int               ish, ish3;
56     gmx_bool          do_LJ, half_LJ, do_coul;
57     int               cjind0, cjind1, cjind;
58
59 #ifdef ENERGY_GROUPS
60     int   Vstride_i;
61     int   egps_ishift, egps_imask;
62     int   egps_jshift, egps_jmask, egps_jstride;
63     int   egps_i;
64     real* vvdwtp[UNROLLI];
65     real* vctp[UNROLLI];
66 #endif
67
68     SimdReal shX_S;
69     SimdReal shY_S;
70     SimdReal shZ_S;
71     SimdReal ix_S0, iy_S0, iz_S0;
72     SimdReal ix_S2, iy_S2, iz_S2;
73     SimdReal fix_S0, fiy_S0, fiz_S0;
74     SimdReal fix_S2, fiy_S2, fiz_S2;
75
76     SimdReal diagonal_jmi_S;
77 #if UNROLLI == UNROLLJ
78     SimdBool diagonal_mask_S0, diagonal_mask_S2;
79 #else
80     SimdBool                         diagonal_mask0_S0, diagonal_mask0_S2;
81     SimdBool                         diagonal_mask1_S0, diagonal_mask1_S2;
82 #endif
83
84     SimdBitMask filter_S0, filter_S2;
85
86     SimdReal zero_S(0.0);
87
88     SimdReal one_S(1.0);
89     SimdReal iq_S0 = setZero();
90     SimdReal iq_S2 = setZero();
91
92 #ifdef CALC_COUL_RF
93     SimdReal mrc_3_S;
94 #    ifdef CALC_ENERGIES
95     SimdReal hrc_3_S, moh_rc_S;
96 #    endif
97 #endif
98
99 #ifdef CALC_COUL_TAB
100     /* Coulomb table variables */
101     SimdReal    invtsp_S;
102     const real* tab_coul_F;
103 #    if defined CALC_ENERGIES && !defined TAB_FDV0
104     const real*                           tab_coul_V;
105 #    endif
106
107 #    ifdef CALC_ENERGIES
108     SimdReal mhalfsp_S;
109 #    endif
110 #endif
111
112 #ifdef CALC_COUL_EWALD
113     SimdReal beta2_S, beta_S;
114 #endif
115
116 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
117     SimdReal sh_ewald_S;
118 #endif
119
120 #if defined LJ_CUT && defined CALC_ENERGIES
121     SimdReal p6_cpot_S, p12_cpot_S;
122 #endif
123 #ifdef LJ_POT_SWITCH
124     SimdReal rswitch_S;
125     SimdReal swV3_S, swV4_S, swV5_S;
126     SimdReal swF2_S, swF3_S, swF4_S;
127 #endif
128 #ifdef LJ_FORCE_SWITCH
129     SimdReal rswitch_S;
130     SimdReal p6_fc2_S, p6_fc3_S;
131     SimdReal p12_fc2_S, p12_fc3_S;
132 #    ifdef CALC_ENERGIES
133     SimdReal p6_vc3_S, p6_vc4_S;
134     SimdReal p12_vc3_S, p12_vc4_S;
135     SimdReal p6_6cpot_S, p12_12cpot_S;
136 #    endif
137 #endif
138 #ifdef LJ_EWALD_GEOM
139     real     lj_ewaldcoeff2, lj_ewaldcoeff6_6;
140     SimdReal mone_S, half_S, lje_c2_S, lje_c6_6_S;
141 #endif
142
143 #ifdef LJ_COMB_LB
144     SimdReal hsig_i_S0, seps_i_S0;
145     SimdReal hsig_i_S2, seps_i_S2;
146 #else
147 #    ifdef FIX_LJ_C
148     alignas(GMX_SIMD_ALIGNMENT) real pvdw_c6[2 * UNROLLI * UNROLLJ];
149     real*                            pvdw_c12          = pvdw_c6 + UNROLLI * UNROLLJ;
150 #    endif
151 #endif /* LJ_COMB_LB */
152
153     SimdReal minRsq_S;
154     SimdReal rc2_S;
155 #ifdef VDW_CUTOFF_CHECK
156     SimdReal rcvdw2_S;
157 #endif
158
159     int ninner;
160
161 #ifdef COUNT_PAIRS
162     int npair = 0;
163 #endif
164
165     const nbnxn_atomdata_t::Params& nbatParams = nbat->params();
166
167 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
168     const real* gmx_restrict ljc = nbatParams.lj_comb.data();
169 #endif
170 #if !(defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined FIX_LJ_C)
171     /* No combination rule used */
172     const real* gmx_restrict nbfp_ptr = nbatParams.nbfp_aligned.data();
173     const int* gmx_restrict type      = nbatParams.type.data();
174 #endif
175
176     /* Load j-i for the first i */
177     diagonal_jmi_S = load<SimdReal>(nbat->simdMasks.diagonal_2xnn_j_minus_i.data());
178     /* Generate all the diagonal masks as comparison results */
179 #if UNROLLI == UNROLLJ
180     diagonal_mask_S0 = (zero_S < diagonal_jmi_S);
181     diagonal_jmi_S   = diagonal_jmi_S - one_S;
182     diagonal_jmi_S   = diagonal_jmi_S - one_S;
183     diagonal_mask_S2 = (zero_S < diagonal_jmi_S);
184 #else
185 #    if 2 * UNROLLI == UNROLLJ
186     diagonal_mask0_S0                                  = (zero_S < diagonal_jmi_S);
187     diagonal_jmi_S                                     = diagonal_jmi_S - one_S;
188     diagonal_jmi_S                                     = diagonal_jmi_S - one_S;
189     diagonal_mask0_S2                                  = (zero_S < diagonal_jmi_S);
190     diagonal_jmi_S                                     = diagonal_jmi_S - one_S;
191     diagonal_jmi_S                                     = diagonal_jmi_S - one_S;
192     diagonal_mask1_S0                                  = (zero_S < diagonal_jmi_S);
193     diagonal_jmi_S                                     = diagonal_jmi_S - one_S;
194     diagonal_jmi_S                                     = diagonal_jmi_S - one_S;
195     diagonal_mask1_S2                                  = (zero_S < diagonal_jmi_S);
196 #    endif
197 #endif
198
199     /* Load masks for topology exclusion masking. filter_stride is
200        static const, so the conditional will be optimized away. */
201 #if GMX_DOUBLE && !GMX_SIMD_HAVE_INT32_LOGICAL
202     const std::uint64_t* gmx_restrict exclusion_filter = nbat->simdMasks.exclusion_filter64.data();
203 #else
204     const std::uint32_t* gmx_restrict exclusion_filter = nbat->simdMasks.exclusion_filter.data();
205 #endif
206
207     /* Here we cast the exclusion filters from unsigned * to int * or real *.
208      * Since we only check bits, the actual value they represent does not
209      * matter, as long as both filter and mask data are treated the same way.
210      */
211 #if GMX_SIMD_HAVE_INT32_LOGICAL
212     filter_S0 = load<SimdBitMask>(reinterpret_cast<const int*>(exclusion_filter + 0 * UNROLLJ));
213     filter_S2 = load<SimdBitMask>(reinterpret_cast<const int*>(exclusion_filter + 2 * UNROLLJ));
214 #else
215     filter_S0 = load<SimdBitMask>(reinterpret_cast<const real*>(exclusion_filter + 0 * UNROLLJ));
216     filter_S2 = load<SimdBitMask>(reinterpret_cast<const real*>(exclusion_filter + 2 * UNROLLJ));
217 #endif
218
219 #ifdef CALC_COUL_RF
220     /* Reaction-field constants */
221     mrc_3_S = SimdReal(-2 * ic->k_rf);
222 #    ifdef CALC_ENERGIES
223     hrc_3_S  = SimdReal(ic->k_rf);
224     moh_rc_S = SimdReal(-ic->c_rf);
225 #    endif
226 #endif
227
228 #ifdef CALC_COUL_TAB
229
230     invtsp_S = SimdReal(ic->coulombEwaldTables->scale);
231 #    ifdef CALC_ENERGIES
232     mhalfsp_S = SimdReal(-0.5_real / ic->coulombEwaldTables->scale);
233 #    endif
234
235 #    ifdef TAB_FDV0
236     tab_coul_F = ic->coulombEwaldTables->tableFDV0.data();
237 #    else
238     tab_coul_F = ic->coulombEwaldTables->tableF.data();
239 #        ifdef CALC_ENERGIES
240     tab_coul_V = ic->coulombEwaldTables->tableV.data();
241 #        endif
242 #    endif
243 #endif /* CALC_COUL_TAB */
244
245 #ifdef CALC_COUL_EWALD
246     beta2_S = SimdReal(ic->ewaldcoeff_q * ic->ewaldcoeff_q);
247     beta_S  = SimdReal(ic->ewaldcoeff_q);
248 #endif
249
250 #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
251     sh_ewald_S = SimdReal(ic->sh_ewald);
252 #endif
253
254     /* LJ function constants */
255 #if defined CALC_ENERGIES || defined LJ_POT_SWITCH
256     SimdReal sixth_S    = SimdReal(1.0 / 6.0);
257     SimdReal twelveth_S = SimdReal(1.0 / 12.0);
258 #endif
259
260 #if defined LJ_CUT && defined CALC_ENERGIES
261     /* We shift the potential by cpot, which can be zero */
262     p6_cpot_S  = SimdReal(ic->dispersion_shift.cpot);
263     p12_cpot_S = SimdReal(ic->repulsion_shift.cpot);
264 #endif
265 #ifdef LJ_POT_SWITCH
266     rswitch_S = SimdReal(ic->rvdw_switch);
267     swV3_S    = SimdReal(ic->vdw_switch.c3);
268     swV4_S    = SimdReal(ic->vdw_switch.c4);
269     swV5_S    = SimdReal(ic->vdw_switch.c5);
270     swF2_S    = SimdReal(3 * ic->vdw_switch.c3);
271     swF3_S    = SimdReal(4 * ic->vdw_switch.c4);
272     swF4_S    = SimdReal(5 * ic->vdw_switch.c5);
273 #endif
274 #ifdef LJ_FORCE_SWITCH
275     rswitch_S = SimdReal(ic->rvdw_switch);
276     p6_fc2_S  = SimdReal(ic->dispersion_shift.c2);
277     p6_fc3_S  = SimdReal(ic->dispersion_shift.c3);
278     p12_fc2_S = SimdReal(ic->repulsion_shift.c2);
279     p12_fc3_S = SimdReal(ic->repulsion_shift.c3);
280 #    ifdef CALC_ENERGIES
281     {
282         SimdReal mthird_S  = SimdReal(-1.0 / 3.0);
283         SimdReal mfourth_S = SimdReal(-1.0 / 4.0);
284
285         p6_vc3_S     = mthird_S * p6_fc2_S;
286         p6_vc4_S     = mfourth_S * p6_fc3_S;
287         p6_6cpot_S   = SimdReal(ic->dispersion_shift.cpot / 6);
288         p12_vc3_S    = mthird_S * p12_fc2_S;
289         p12_vc4_S    = mfourth_S * p12_fc3_S;
290         p12_12cpot_S = SimdReal(ic->repulsion_shift.cpot / 12);
291     }
292 #    endif
293 #endif
294 #ifdef LJ_EWALD_GEOM
295     mone_S           = SimdReal(-1.0);
296     half_S           = SimdReal(0.5);
297     lj_ewaldcoeff2   = ic->ewaldcoeff_lj * ic->ewaldcoeff_lj;
298     lj_ewaldcoeff6_6 = lj_ewaldcoeff2 * lj_ewaldcoeff2 * lj_ewaldcoeff2 / 6;
299     lje_c2_S         = SimdReal(lj_ewaldcoeff2);
300     lje_c6_6_S       = SimdReal(lj_ewaldcoeff6_6);
301 #    ifdef CALC_ENERGIES
302     /* Determine the grid potential at the cut-off */
303     SimdReal lje_vc_S = SimdReal(ic->sh_lj_ewald);
304 #    endif
305 #endif
306
307     /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
308     rc2_S = SimdReal(ic->rcoulomb * ic->rcoulomb);
309 #ifdef VDW_CUTOFF_CHECK
310     rcvdw2_S = SimdReal(ic->rvdw * ic->rvdw);
311 #endif
312
313     minRsq_S = SimdReal(NBNXN_MIN_RSQ);
314
315     const real* gmx_restrict q        = nbatParams.q.data();
316     const real               facel    = ic->epsfac;
317     const real* gmx_restrict shiftvec = shift_vec[0];
318     const real* gmx_restrict x        = nbat->x().data();
319
320 #ifdef FIX_LJ_C
321
322     for (jp = 0; jp < UNROLLJ; jp++)
323     {
324         pvdw_c6[0 * UNROLLJ + jp] = nbat->nbfp[0 * 2];
325         pvdw_c6[1 * UNROLLJ + jp] = nbat->nbfp[0 * 2];
326         pvdw_c6[2 * UNROLLJ + jp] = nbat->nbfp[0 * 2];
327         pvdw_c6[3 * UNROLLJ + jp] = nbat->nbfp[0 * 2];
328
329         pvdw_c12[0 * UNROLLJ + jp] = nbat->nbfp[0 * 2 + 1];
330         pvdw_c12[1 * UNROLLJ + jp] = nbat->nbfp[0 * 2 + 1];
331         pvdw_c12[2 * UNROLLJ + jp] = nbat->nbfp[0 * 2 + 1];
332         pvdw_c12[3 * UNROLLJ + jp] = nbat->nbfp[0 * 2 + 1];
333     }
334     SimdReal c6_S0 = load<SimdReal>(pvdw_c6 + 0 * UNROLLJ);
335     SimdReal c6_S1 = load<SimdReal>(pvdw_c6 + 1 * UNROLLJ);
336     SimdReal c6_S2 = load<SimdReal>(pvdw_c6 + 2 * UNROLLJ);
337     SimdReal c6_S3 = load<SimdReal>(pvdw_c6 + 3 * UNROLLJ);
338
339     SimdReal c12_S0 = load<SimdReal>(pvdw_c12 + 0 * UNROLLJ);
340     SimdReal c12_S1 = load<SimdReal>(pvdw_c12 + 1 * UNROLLJ);
341     SimdReal c12_S2 = load<SimdReal>(pvdw_c12 + 2 * UNROLLJ);
342     SimdReal c12_S3 = load<SimdReal>(pvdw_c12 + 3 * UNROLLJ);
343 #endif /* FIX_LJ_C */
344
345 #ifdef ENERGY_GROUPS
346     egps_ishift  = nbatParams.neg_2log;
347     egps_imask   = (1 << egps_ishift) - 1;
348     egps_jshift  = 2 * nbatParams.neg_2log;
349     egps_jmask   = (1 << egps_jshift) - 1;
350     egps_jstride = (UNROLLJ >> 1) * UNROLLJ;
351     /* Major division is over i-particle energy groups, determine the stride */
352     Vstride_i = nbatParams.nenergrp * (1 << nbatParams.neg_2log) * egps_jstride;
353 #endif
354
355     l_cj = nbl->cj.data();
356
357     ninner = 0;
358     for (const nbnxn_ci_t& ciEntry : nbl->ci)
359     {
360         ish    = (ciEntry.shift & NBNXN_CI_SHIFT);
361         ish3   = ish * 3;
362         cjind0 = ciEntry.cj_ind_start;
363         cjind1 = ciEntry.cj_ind_end;
364         ci     = ciEntry.ci;
365         ci_sh  = (ish == CENTRAL ? ci : -1);
366
367         shX_S = SimdReal(shiftvec[ish3]);
368         shY_S = SimdReal(shiftvec[ish3 + 1]);
369         shZ_S = SimdReal(shiftvec[ish3 + 2]);
370
371 #if UNROLLJ <= 4
372         int sci  = ci * STRIDE;
373         int scix = sci * DIM;
374 #    if defined LJ_COMB_LB || defined LJ_COMB_GEOM || defined LJ_EWALD_GEOM
375         int sci2 = sci * 2;
376 #    endif
377 #else
378         int sci  = (ci >> 1) * STRIDE;
379         int scix = sci * DIM + (ci & 1) * (STRIDE >> 1);
380 #    if defined LJ_COMB_LB || defined LJ_COMB_GEOM || defined LJ_EWALD_GEOM
381         int sci2 = sci * 2 + (ci & 1) * (STRIDE >> 1);
382 #    endif
383         sci += (ci & 1) * (STRIDE >> 1);
384 #endif
385
386         /* We have 5 LJ/C combinations, but use only three inner loops,
387          * as the other combinations are unlikely and/or not much faster:
388          * inner half-LJ + C for half-LJ + C / no-LJ + C
389          * inner LJ + C      for full-LJ + C
390          * inner LJ          for full-LJ + no-C / half-LJ + no-C
391          */
392         do_LJ   = ((ciEntry.shift & NBNXN_CI_DO_LJ(0)) != 0);
393         do_coul = ((ciEntry.shift & NBNXN_CI_DO_COUL(0)) != 0);
394         half_LJ = (((ciEntry.shift & NBNXN_CI_HALF_LJ(0)) != 0) || !do_LJ) && do_coul;
395
396 #ifdef ENERGY_GROUPS
397         egps_i = nbatParams.energrp[ci];
398         {
399             int ia, egp_ia;
400
401             for (ia = 0; ia < UNROLLI; ia++)
402             {
403                 egp_ia     = (egps_i >> (ia * egps_ishift)) & egps_imask;
404                 vvdwtp[ia] = Vvdw + egp_ia * Vstride_i;
405                 vctp[ia]   = Vc + egp_ia * Vstride_i;
406             }
407         }
408 #endif
409
410 #ifdef CALC_ENERGIES
411 #    ifdef LJ_EWALD_GEOM
412         gmx_bool do_self = TRUE;
413 #    else
414         gmx_bool do_self = do_coul;
415 #    endif
416 #    if UNROLLJ == 4
417         if (do_self && l_cj[ciEntry.cj_ind_start].cj == ci_sh)
418 #    endif
419 #    if UNROLLJ == 8
420             if (do_self && l_cj[ciEntry.cj_ind_start].cj == (ci_sh >> 1))
421 #    endif
422             {
423                 if (do_coul)
424                 {
425                     real Vc_sub_self;
426                     int  ia;
427
428 #    ifdef CALC_COUL_RF
429                     Vc_sub_self = 0.5 * ic->c_rf;
430 #    endif
431 #    ifdef CALC_COUL_TAB
432 #        ifdef TAB_FDV0
433                     Vc_sub_self = 0.5 * tab_coul_F[2];
434 #        else
435                     Vc_sub_self = 0.5 * tab_coul_V[0];
436 #        endif
437 #    endif
438 #    ifdef CALC_COUL_EWALD
439                     /* beta/sqrt(pi) */
440                     Vc_sub_self = 0.5 * ic->ewaldcoeff_q * M_2_SQRTPI;
441 #    endif
442
443                     for (ia = 0; ia < UNROLLI; ia++)
444                     {
445                         real qi;
446
447                         qi = q[sci + ia];
448 #    ifdef ENERGY_GROUPS
449                         vctp[ia][((egps_i >> (ia * egps_ishift)) & egps_imask) * egps_jstride]
450 #    else
451                     Vc[0]
452 #    endif
453                                 -= facel * qi * qi * Vc_sub_self;
454                     }
455                 }
456
457 #    ifdef LJ_EWALD_GEOM
458                 {
459                     int ia;
460
461                     for (ia = 0; ia < UNROLLI; ia++)
462                     {
463                         real c6_i;
464
465                         c6_i = nbatParams.nbfp[nbatParams.type[sci + ia] * (nbatParams.numTypes + 1) * 2]
466                                / 6;
467 #        ifdef ENERGY_GROUPS
468                         vvdwtp[ia][((egps_i >> (ia * egps_ishift)) & egps_imask) * egps_jstride]
469 #        else
470                         Vvdw[0]
471 #        endif
472                                 += 0.5 * c6_i * lj_ewaldcoeff6_6;
473                     }
474                 }
475 #    endif /* LJ_EWALD */
476             }
477 #endif
478
479         /* Load i atom data */
480         int sciy = scix + STRIDE;
481         int sciz = sciy + STRIDE;
482         ix_S0    = loadU1DualHsimd(x + scix);
483         ix_S2    = loadU1DualHsimd(x + scix + 2);
484         iy_S0    = loadU1DualHsimd(x + sciy);
485         iy_S2    = loadU1DualHsimd(x + sciy + 2);
486         iz_S0    = loadU1DualHsimd(x + sciz);
487         iz_S2    = loadU1DualHsimd(x + sciz + 2);
488         ix_S0    = ix_S0 + shX_S;
489         ix_S2    = ix_S2 + shX_S;
490         iy_S0    = iy_S0 + shY_S;
491         iy_S2    = iy_S2 + shY_S;
492         iz_S0    = iz_S0 + shZ_S;
493         iz_S2    = iz_S2 + shZ_S;
494
495         if (do_coul)
496         {
497             SimdReal facel_S;
498
499             facel_S = SimdReal(facel);
500
501             iq_S0 = loadU1DualHsimd(q + sci);
502             iq_S2 = loadU1DualHsimd(q + sci + 2);
503             iq_S0 = facel_S * iq_S0;
504             iq_S2 = facel_S * iq_S2;
505         }
506
507 #ifdef LJ_COMB_LB
508         hsig_i_S0 = loadU1DualHsimd(ljc + sci2);
509         hsig_i_S2 = loadU1DualHsimd(ljc + sci2 + 2);
510         seps_i_S0 = loadU1DualHsimd(ljc + sci2 + STRIDE);
511         seps_i_S2 = loadU1DualHsimd(ljc + sci2 + STRIDE + 2);
512 #else
513 #    ifdef LJ_COMB_GEOM
514         SimdReal c6s_S0, c12s_S0;
515         SimdReal c6s_S2, c12s_S2;
516
517         c6s_S0 = loadU1DualHsimd(ljc + sci2);
518
519         if (!half_LJ)
520         {
521             c6s_S2 = loadU1DualHsimd(ljc + sci2 + 2);
522         }
523         c12s_S0 = loadU1DualHsimd(ljc + sci2 + STRIDE);
524         if (!half_LJ)
525         {
526             c12s_S2 = loadU1DualHsimd(ljc + sci2 + STRIDE + 2);
527         }
528 #    elif !defined LJ_COMB_LB && !defined FIX_LJ_C
529         const int   numTypes = nbatParams.numTypes;
530         const real* nbfp0    = nbfp_ptr + type[sci] * numTypes * c_simdBestPairAlignment;
531         const real* nbfp1    = nbfp_ptr + type[sci + 1] * numTypes * c_simdBestPairAlignment;
532         const real *nbfp2 = nullptr, *nbfp3 = nullptr;
533         if (!half_LJ)
534         {
535             nbfp2 = nbfp_ptr + type[sci + 2] * numTypes * c_simdBestPairAlignment;
536             nbfp3 = nbfp_ptr + type[sci + 3] * numTypes * c_simdBestPairAlignment;
537         }
538 #    endif
539 #endif
540 #ifdef LJ_EWALD_GEOM
541         /* We need the geometrically combined C6 for the PME grid correction */
542         SimdReal c6s_S0, c6s_S2;
543         c6s_S0 = loadU1DualHsimd(ljc + sci2);
544         if (!half_LJ)
545         {
546             c6s_S2 = loadU1DualHsimd(ljc + sci2 + 2);
547         }
548 #endif
549
550         /* Zero the potential energy for this list */
551 #ifdef CALC_ENERGIES
552         SimdReal Vvdwtot_S = setZero();
553         SimdReal vctot_S   = setZero();
554 #endif
555
556         /* Clear i atom forces */
557         fix_S0 = setZero();
558         fix_S2 = setZero();
559         fiy_S0 = setZero();
560         fiy_S2 = setZero();
561         fiz_S0 = setZero();
562         fiz_S2 = setZero();
563
564         cjind = cjind0;
565
566         /* Currently all kernels use (at least half) LJ */
567 #define CALC_LJ
568         if (half_LJ)
569         {
570             /* Coulomb: all i-atoms, LJ: first half i-atoms */
571 #define CALC_COULOMB
572 #define HALF_LJ
573 #define CHECK_EXCLS
574             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
575             {
576 #include "kernel_inner.h"
577                 cjind++;
578             }
579 #undef CHECK_EXCLS
580             for (; (cjind < cjind1); cjind++)
581             {
582 #include "kernel_inner.h"
583             }
584 #undef HALF_LJ
585 #undef CALC_COULOMB
586         }
587         else if (do_coul)
588         {
589             /* Coulomb: all i-atoms, LJ: all i-atoms */
590 #define CALC_COULOMB
591 #define CHECK_EXCLS
592             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
593             {
594 #include "kernel_inner.h"
595                 cjind++;
596             }
597 #undef CHECK_EXCLS
598             for (; (cjind < cjind1); cjind++)
599             {
600 #include "kernel_inner.h"
601             }
602 #undef CALC_COULOMB
603         }
604         else
605         {
606             /* Coulomb: none, LJ: all i-atoms */
607 #define CHECK_EXCLS
608             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
609             {
610 #include "kernel_inner.h"
611                 cjind++;
612             }
613 #undef CHECK_EXCLS
614             for (; (cjind < cjind1); cjind++)
615             {
616 #include "kernel_inner.h"
617             }
618         }
619 #undef CALC_LJ
620         ninner += cjind1 - cjind0;
621
622         /* Add accumulated i-forces to the force array */
623         real fShiftX = reduceIncr4ReturnSumHsimd(f + scix, fix_S0, fix_S2);
624         real fShiftY = reduceIncr4ReturnSumHsimd(f + sciy, fiy_S0, fiy_S2);
625         real fShiftZ = reduceIncr4ReturnSumHsimd(f + sciz, fiz_S0, fiz_S2);
626
627 #ifdef CALC_SHIFTFORCES
628         fshift[ish3 + 0] += fShiftX;
629         fshift[ish3 + 1] += fShiftY;
630         fshift[ish3 + 2] += fShiftZ;
631 #endif
632
633 #ifdef CALC_ENERGIES
634         if (do_coul)
635         {
636             *Vc += reduce(vctot_S);
637         }
638         *Vvdw += reduce(Vvdwtot_S);
639 #endif
640
641         /* Outer loop uses 6 flops/iteration */
642     }
643
644 #ifdef COUNT_PAIRS
645     printf("atom pairs %d\n", npair);
646 #endif
647 }