590c09547df299ca20279ee02be9f82a5f4863a5
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_kernels / simd_2xnn / nbnxn_kernel_simd_2xnn_outer.h
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
36
37 /* Half-width SIMD operations are required here.
38  * As the 4xn kernels are the "standard" kernels and some special operations
39  * are required only here, we define those in nbnxn_kernel_simd_utils_...
40  *
41  * Half-width SIMD real type:
42  * gmx_mm_hpr
43  *
44  * Half-width SIMD operations
45  * Load reals at half-width aligned pointer b into half-width SIMD register a:
46  * gmx_load_hpr(a, b)
47  * Set all entries in half-width SIMD register *a to b:
48  * gmx_set1_hpr(a, b)
49  * Load one real at b and one real at b+1 into halves of a, respectively:
50  * gmx_load1p1_pr(a, b)
51  * Load reals at half-width aligned pointer b into two halves of a:
52  * gmx_loaddh_pr(a, b)
53  * Store half-width SIMD register b into half width aligned memory a:
54  * gmx_store_hpr(a, b)
55  * gmx_add_hpr(a, b)
56  * gmx_sub_hpr(a, b)
57  * Sum over 4 half SIMD registers:
58  * gmx_sum4_hpr(a, b)
59  * Sum the elements of halfs of each input register and store sums in out:
60  * gmx_mm_transpose_sum4h_pr(a, b)
61  * Extract two half-width registers *b, *c from a full width register a:
62  * gmx_pr_to_2hpr(a, b, c)
63  */
64
65
66 {
67     const nbnxn_ci_t   *nbln;
68     const nbnxn_cj_t   *l_cj;
69     const int          *type;
70     const real         *q;
71     const real         *shiftvec;
72     const real         *x;
73     const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
74     real                facel;
75     real               *nbfp_ptr;
76     int                 n, ci, ci_sh;
77     int                 ish, ish3;
78     gmx_bool            do_LJ, half_LJ, do_coul, do_self;
79     int                 sci, scix, sciy, sciz, sci2;
80     int                 cjind0, cjind1, cjind;
81     int                 ip, jp;
82
83 #ifdef ENERGY_GROUPS
84     int         Vstride_i;
85     int         egps_ishift, egps_imask;
86     int         egps_jshift, egps_jmask, egps_jstride;
87     int         egps_i;
88     real       *vvdwtp[UNROLLI];
89     real       *vctp[UNROLLI];
90 #endif
91
92     gmx_simd_real_t  shX_S;
93     gmx_simd_real_t  shY_S;
94     gmx_simd_real_t  shZ_S;
95     gmx_simd_real_t  ix_S0, iy_S0, iz_S0;
96     gmx_simd_real_t  ix_S2, iy_S2, iz_S2;
97     gmx_simd_real_t  fix_S0, fiy_S0, fiz_S0;
98     gmx_simd_real_t  fix_S2, fiy_S2, fiz_S2;
99     /* We use an i-force SIMD register width of 4 */
100     /* The simd4 stuff might be defined in nbnxn_kernel_simd_utils.h */
101     gmx_simd4_real_t fix_S, fiy_S, fiz_S;
102
103     gmx_simd_real_t  diagonal_jmi_S;
104 #if UNROLLI == UNROLLJ
105     gmx_simd_bool_t  diagonal_mask_S0, diagonal_mask_S2;
106 #else
107     gmx_simd_bool_t  diagonal_mask0_S0, diagonal_mask0_S2;
108     gmx_simd_bool_t  diagonal_mask1_S0, diagonal_mask1_S2;
109 #endif
110
111     unsigned            *exclusion_filter;
112     gmx_exclfilter       filter_S0, filter_S2;
113
114     gmx_simd_real_t      zero_S = gmx_simd_set1_r(0.0);
115
116     gmx_simd_real_t      one_S = gmx_simd_set1_r(1.0);
117     gmx_simd_real_t      iq_S0 = gmx_simd_setzero_r();
118     gmx_simd_real_t      iq_S2 = gmx_simd_setzero_r();
119
120 #ifdef CALC_COUL_RF
121     gmx_simd_real_t      mrc_3_S;
122 #ifdef CALC_ENERGIES
123     gmx_simd_real_t      hrc_3_S, moh_rc_S;
124 #endif
125 #endif
126
127 #ifdef CALC_COUL_TAB
128     /* Coulomb table variables */
129     gmx_simd_real_t   invtsp_S;
130     const real       *tab_coul_F;
131 #ifndef TAB_FDV0
132     const real       *tab_coul_V;
133 #endif
134     /* Thread-local working buffers for force and potential lookups */
135     int               ti0_array[2*GMX_SIMD_REAL_WIDTH], *ti0 = NULL;
136     int               ti2_array[2*GMX_SIMD_REAL_WIDTH], *ti2 = NULL;
137 #ifdef CALC_ENERGIES
138     gmx_simd_real_t   mhalfsp_S;
139 #endif
140 #endif
141
142 #ifdef CALC_COUL_EWALD
143     gmx_simd_real_t beta2_S, beta_S;
144 #endif
145
146 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
147     gmx_simd_real_t  sh_ewald_S;
148 #endif
149
150 #if defined LJ_CUT && defined CALC_ENERGIES
151     gmx_simd_real_t   p6_cpot_S, p12_cpot_S;
152 #endif
153 #ifdef LJ_POT_SWITCH
154     gmx_simd_real_t   rswitch_S;
155     gmx_simd_real_t   swV3_S, swV4_S, swV5_S;
156     gmx_simd_real_t   swF2_S, swF3_S, swF4_S;
157 #endif
158 #ifdef LJ_FORCE_SWITCH
159     gmx_simd_real_t   rswitch_S;
160     gmx_simd_real_t   p6_fc2_S, p6_fc3_S;
161     gmx_simd_real_t   p12_fc2_S, p12_fc3_S;
162 #ifdef CALC_ENERGIES
163     gmx_simd_real_t   p6_vc3_S, p6_vc4_S;
164     gmx_simd_real_t   p12_vc3_S, p12_vc4_S;
165     gmx_simd_real_t   p6_6cpot_S, p12_12cpot_S;
166 #endif
167 #endif
168 #ifdef LJ_EWALD_GEOM
169     real              lj_ewaldcoeff2, lj_ewaldcoeff6_6;
170     gmx_simd_real_t   mone_S, half_S, lje_c2_S, lje_c6_6_S, lje_vc_S;
171 #endif
172
173 #ifdef LJ_COMB_LB
174     const real       *ljc;
175
176     gmx_simd_real_t   hsig_i_S0, seps_i_S0;
177     gmx_simd_real_t   hsig_i_S2, seps_i_S2;
178 #else
179 #ifdef FIX_LJ_C
180     real              pvdw_array[2*UNROLLI*UNROLLJ+GMX_SIMD_REAL_WIDTH];
181     real             *pvdw_c6, *pvdw_c12;
182     gmx_simd_real_t   c6_S0, c12_S0;
183     gmx_simd_real_t   c6_S2, c12_S2;
184 #endif
185
186 #if defined LJ_COMB_GEOM || defined LJ_EWALD_GEOM
187     const real       *ljc;
188
189     gmx_simd_real_t   c6s_S0, c12s_S0;
190     gmx_simd_real_t   c6s_S2  = gmx_simd_setzero_r();
191     gmx_simd_real_t   c12s_S2 = gmx_simd_setzero_r();
192 #endif
193 #endif /* LJ_COMB_LB */
194
195     gmx_simd_real_t  vctot_S, Vvdwtot_S;
196     gmx_simd_real_t  sixth_S, twelveth_S;
197
198     gmx_simd_real_t  avoid_sing_S;
199     gmx_simd_real_t  rc2_S;
200 #ifdef VDW_CUTOFF_CHECK
201     gmx_simd_real_t  rcvdw2_S;
202 #endif
203
204     int ninner;
205
206 #ifdef COUNT_PAIRS
207     int npair = 0;
208 #endif
209
210 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
211     ljc = nbat->lj_comb;
212 #endif
213 #if !(defined LJ_COMB_GEOM || defined LJ_COMB_LB)
214     /* No combination rule used */
215     nbfp_ptr    = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp;
216 #endif
217
218     /* Load j-i for the first i */
219     diagonal_jmi_S    = gmx_simd_load_r(nbat->simd_2xnn_diagonal_j_minus_i);
220     /* Generate all the diagonal masks as comparison results */
221 #if UNROLLI == UNROLLJ
222     diagonal_mask_S0  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
223     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
224     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
225     diagonal_mask_S2  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
226 #else
227 #if 2*UNROLLI == UNROLLJ
228     diagonal_mask0_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
229     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
230     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
231     diagonal_mask0_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
232     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
233     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
234     diagonal_mask1_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
235     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
236     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
237     diagonal_mask1_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
238 #endif
239 #endif
240
241     /* Load masks for topology exclusion masking. filter_stride is
242        static const, so the conditional will be optimized away. */
243     if (1 == filter_stride)
244     {
245         exclusion_filter = nbat->simd_exclusion_filter1;
246     }
247     else /* (2 == filter_stride) */
248     {
249         exclusion_filter = nbat->simd_exclusion_filter2;
250     }
251
252     /* Here we cast the exclusion filters from unsigned * to int * or real *.
253      * Since we only check bits, the actual value they represent does not
254      * matter, as long as both filter and mask data are treated the same way.
255      */
256     filter_S0 = gmx_load_exclusion_filter(exclusion_filter + 0*2*UNROLLJ*filter_stride);
257     filter_S2 = gmx_load_exclusion_filter(exclusion_filter + 1*2*UNROLLJ*filter_stride);
258
259 #ifdef CALC_COUL_RF
260     /* Reaction-field constants */
261     mrc_3_S  = gmx_simd_set1_r(-2*ic->k_rf);
262 #ifdef CALC_ENERGIES
263     hrc_3_S  = gmx_simd_set1_r(ic->k_rf);
264     moh_rc_S = gmx_simd_set1_r(-ic->c_rf);
265 #endif
266 #endif
267
268 #ifdef CALC_COUL_TAB
269     /* Generate aligned table index pointers */
270     ti0 = prepare_table_load_buffer(ti0_array);
271     ti2 = prepare_table_load_buffer(ti2_array);
272
273     invtsp_S  = gmx_simd_set1_r(ic->tabq_scale);
274 #ifdef CALC_ENERGIES
275     mhalfsp_S = gmx_simd_set1_r(-0.5/ic->tabq_scale);
276 #endif
277
278 #ifdef TAB_FDV0
279     tab_coul_F = ic->tabq_coul_FDV0;
280 #else
281     tab_coul_F = ic->tabq_coul_F;
282     tab_coul_V = ic->tabq_coul_V;
283 #endif
284 #endif /* CALC_COUL_TAB */
285
286 #ifdef CALC_COUL_EWALD
287     beta2_S = gmx_simd_set1_r(ic->ewaldcoeff_q*ic->ewaldcoeff_q);
288     beta_S  = gmx_simd_set1_r(ic->ewaldcoeff_q);
289 #endif
290
291 #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
292     sh_ewald_S = gmx_simd_set1_r(ic->sh_ewald);
293 #endif
294
295     /* LJ function constants */
296 #if defined CALC_ENERGIES || defined LJ_POT_SWITCH
297     sixth_S      = gmx_simd_set1_r(1.0/6.0);
298     twelveth_S   = gmx_simd_set1_r(1.0/12.0);
299 #endif
300
301 #if defined LJ_CUT && defined CALC_ENERGIES
302     /* We shift the potential by cpot, which can be zero */
303     p6_cpot_S    = gmx_simd_set1_r(ic->dispersion_shift.cpot);
304     p12_cpot_S   = gmx_simd_set1_r(ic->repulsion_shift.cpot);
305 #endif
306 #ifdef LJ_POT_SWITCH
307     rswitch_S = gmx_simd_set1_r(ic->rvdw_switch);
308     swV3_S    = gmx_simd_set1_r(ic->vdw_switch.c3);
309     swV4_S    = gmx_simd_set1_r(ic->vdw_switch.c4);
310     swV5_S    = gmx_simd_set1_r(ic->vdw_switch.c5);
311     swF2_S    = gmx_simd_set1_r(3*ic->vdw_switch.c3);
312     swF3_S    = gmx_simd_set1_r(4*ic->vdw_switch.c4);
313     swF4_S    = gmx_simd_set1_r(5*ic->vdw_switch.c5);
314 #endif
315 #ifdef LJ_FORCE_SWITCH
316     rswitch_S = gmx_simd_set1_r(ic->rvdw_switch);
317     p6_fc2_S  = gmx_simd_set1_r(ic->dispersion_shift.c2);
318     p6_fc3_S  = gmx_simd_set1_r(ic->dispersion_shift.c3);
319     p12_fc2_S = gmx_simd_set1_r(ic->repulsion_shift.c2);
320     p12_fc3_S = gmx_simd_set1_r(ic->repulsion_shift.c3);
321 #ifdef CALC_ENERGIES
322     {
323         gmx_simd_real_t mthird_S  = gmx_simd_set1_r(-1.0/3.0);
324         gmx_simd_real_t mfourth_S = gmx_simd_set1_r(-1.0/4.0);
325
326         p6_vc3_S     = gmx_simd_mul_r(mthird_S,  p6_fc2_S);
327         p6_vc4_S     = gmx_simd_mul_r(mfourth_S, p6_fc3_S);
328         p6_6cpot_S   = gmx_simd_set1_r(ic->dispersion_shift.cpot/6);
329         p12_vc3_S    = gmx_simd_mul_r(mthird_S,  p12_fc2_S);
330         p12_vc4_S    = gmx_simd_mul_r(mfourth_S, p12_fc3_S);
331         p12_12cpot_S = gmx_simd_set1_r(ic->repulsion_shift.cpot/12);
332     }
333 #endif
334 #endif
335 #ifdef LJ_EWALD_GEOM
336     mone_S           = gmx_simd_set1_r(-1.0);
337     half_S           = gmx_simd_set1_r(0.5);
338     lj_ewaldcoeff2   = ic->ewaldcoeff_lj*ic->ewaldcoeff_lj;
339     lj_ewaldcoeff6_6 = lj_ewaldcoeff2*lj_ewaldcoeff2*lj_ewaldcoeff2/6;
340     lje_c2_S         = gmx_simd_set1_r(lj_ewaldcoeff2);
341     lje_c6_6_S       = gmx_simd_set1_r(lj_ewaldcoeff6_6);
342     /* Determine the grid potential at the cut-off */
343     lje_vc_S         = gmx_simd_set1_r(ic->sh_lj_ewald);
344 #endif
345
346     /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
347     rc2_S    = gmx_simd_set1_r(ic->rcoulomb*ic->rcoulomb);
348 #ifdef VDW_CUTOFF_CHECK
349     rcvdw2_S = gmx_simd_set1_r(ic->rvdw*ic->rvdw);
350 #endif
351
352     avoid_sing_S = gmx_simd_set1_r(NBNXN_AVOID_SING_R2_INC);
353
354     q                   = nbat->q;
355     type                = nbat->type;
356     facel               = ic->epsfac;
357     shiftvec            = shift_vec[0];
358     x                   = nbat->x;
359
360 #ifdef FIX_LJ_C
361     pvdw_c6  = gmx_simd_align_r(pvdw_array);
362     pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
363
364     for (jp = 0; jp < UNROLLJ; jp++)
365     {
366         pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
367         pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
368         pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
369         pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
370
371         pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
372         pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
373         pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
374         pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
375     }
376     c6_S0            = gmx_simd_load_r(pvdw_c6 +0*UNROLLJ);
377     c6_S1            = gmx_simd_load_r(pvdw_c6 +1*UNROLLJ);
378     c6_S2            = gmx_simd_load_r(pvdw_c6 +2*UNROLLJ);
379     c6_S3            = gmx_simd_load_r(pvdw_c6 +3*UNROLLJ);
380
381     c12_S0           = gmx_simd_load_r(pvdw_c12+0*UNROLLJ);
382     c12_S1           = gmx_simd_load_r(pvdw_c12+1*UNROLLJ);
383     c12_S2           = gmx_simd_load_r(pvdw_c12+2*UNROLLJ);
384     c12_S3           = gmx_simd_load_r(pvdw_c12+3*UNROLLJ);
385 #endif /* FIX_LJ_C */
386
387 #ifdef ENERGY_GROUPS
388     egps_ishift  = nbat->neg_2log;
389     egps_imask   = (1<<egps_ishift) - 1;
390     egps_jshift  = 2*nbat->neg_2log;
391     egps_jmask   = (1<<egps_jshift) - 1;
392     egps_jstride = (UNROLLJ>>1)*UNROLLJ;
393     /* Major division is over i-particle energy groups, determine the stride */
394     Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
395 #endif
396
397     l_cj = nbl->cj;
398
399     ninner = 0;
400     for (n = 0; n < nbl->nci; n++)
401     {
402         nbln = &nbl->ci[n];
403
404         ish              = (nbln->shift & NBNXN_CI_SHIFT);
405         ish3             = ish*3;
406         cjind0           = nbln->cj_ind_start;
407         cjind1           = nbln->cj_ind_end;
408         ci               = nbln->ci;
409         ci_sh            = (ish == CENTRAL ? ci : -1);
410
411         shX_S = gmx_simd_load1_r(shiftvec+ish3);
412         shY_S = gmx_simd_load1_r(shiftvec+ish3+1);
413         shZ_S = gmx_simd_load1_r(shiftvec+ish3+2);
414
415 #if UNROLLJ <= 4
416         sci              = ci*STRIDE;
417         scix             = sci*DIM;
418         sci2             = sci*2;
419 #else
420         sci              = (ci>>1)*STRIDE;
421         scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
422         sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
423         sci             += (ci & 1)*(STRIDE>>1);
424 #endif
425
426         /* We have 5 LJ/C combinations, but use only three inner loops,
427          * as the other combinations are unlikely and/or not much faster:
428          * inner half-LJ + C for half-LJ + C / no-LJ + C
429          * inner LJ + C      for full-LJ + C
430          * inner LJ          for full-LJ + no-C / half-LJ + no-C
431          */
432         do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
433         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
434         half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
435 #ifdef LJ_EWALD_GEOM
436         do_self = TRUE;
437 #else
438         do_self = do_coul;
439 #endif
440
441 #ifdef ENERGY_GROUPS
442         egps_i = nbat->energrp[ci];
443         {
444             int ia, egp_ia;
445
446             for (ia = 0; ia < UNROLLI; ia++)
447             {
448                 egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
449                 vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
450                 vctp[ia]   = Vc   + egp_ia*Vstride_i;
451             }
452         }
453 #endif
454
455 #ifdef CALC_ENERGIES
456 #if UNROLLJ == 4
457         if (do_self && l_cj[nbln->cj_ind_start].cj == ci_sh)
458 #endif
459 #if UNROLLJ == 8
460         if (do_self && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
461 #endif
462         {
463             if (do_coul)
464             {
465                 real Vc_sub_self;
466                 int  ia;
467
468 #ifdef CALC_COUL_RF
469                 Vc_sub_self = 0.5*ic->c_rf;
470 #endif
471 #ifdef CALC_COUL_TAB
472 #ifdef TAB_FDV0
473                 Vc_sub_self = 0.5*tab_coul_F[2];
474 #else
475                 Vc_sub_self = 0.5*tab_coul_V[0];
476 #endif
477 #endif
478 #ifdef CALC_COUL_EWALD
479                 /* beta/sqrt(pi) */
480                 Vc_sub_self = 0.5*ic->ewaldcoeff_q*M_2_SQRTPI;
481 #endif
482
483                 for (ia = 0; ia < UNROLLI; ia++)
484                 {
485                     real qi;
486
487                     qi = q[sci+ia];
488 #ifdef ENERGY_GROUPS
489                     vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
490 #else
491                     Vc[0]
492 #endif
493                         -= facel*qi*qi*Vc_sub_self;
494                 }
495             }
496
497 #ifdef LJ_EWALD_GEOM
498             {
499                 int  ia;
500
501                 for (ia = 0; ia < UNROLLI; ia++)
502                 {
503                     real c6_i;
504
505                     c6_i = nbat->nbfp[nbat->type[sci+ia]*(nbat->ntype + 1)*2]/6;
506 #ifdef ENERGY_GROUPS
507                     vvdwtp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
508 #else
509                     Vvdw[0]
510 #endif
511                         += 0.5*c6_i*lj_ewaldcoeff6_6;
512                 }
513             }
514 #endif      /* LJ_EWALD */
515         }
516 #endif
517
518         /* Load i atom data */
519         sciy             = scix + STRIDE;
520         sciz             = sciy + STRIDE;
521         gmx_load1p1_pr(&ix_S0, x+scix);
522         gmx_load1p1_pr(&ix_S2, x+scix+2);
523         gmx_load1p1_pr(&iy_S0, x+sciy);
524         gmx_load1p1_pr(&iy_S2, x+sciy+2);
525         gmx_load1p1_pr(&iz_S0, x+sciz);
526         gmx_load1p1_pr(&iz_S2, x+sciz+2);
527         ix_S0          = gmx_simd_add_r(ix_S0, shX_S);
528         ix_S2          = gmx_simd_add_r(ix_S2, shX_S);
529         iy_S0          = gmx_simd_add_r(iy_S0, shY_S);
530         iy_S2          = gmx_simd_add_r(iy_S2, shY_S);
531         iz_S0          = gmx_simd_add_r(iz_S0, shZ_S);
532         iz_S2          = gmx_simd_add_r(iz_S2, shZ_S);
533
534         if (do_coul)
535         {
536             gmx_simd_real_t facel_S;
537
538             facel_S    = gmx_simd_set1_r(facel);
539
540             gmx_load1p1_pr(&iq_S0, q+sci);
541             gmx_load1p1_pr(&iq_S2, q+sci+2);
542             iq_S0      = gmx_simd_mul_r(facel_S, iq_S0);
543             iq_S2      = gmx_simd_mul_r(facel_S, iq_S2);
544         }
545
546 #ifdef LJ_COMB_LB
547         gmx_load1p1_pr(&hsig_i_S0, ljc+sci2+0);
548         gmx_load1p1_pr(&hsig_i_S2, ljc+sci2+2);
549         gmx_load1p1_pr(&seps_i_S0, ljc+sci2+STRIDE+0);
550         gmx_load1p1_pr(&seps_i_S2, ljc+sci2+STRIDE+2);
551 #else
552 #ifdef LJ_COMB_GEOM
553         gmx_load1p1_pr(&c6s_S0, ljc+sci2+0);
554         if (!half_LJ)
555         {
556             gmx_load1p1_pr(&c6s_S2, ljc+sci2+2);
557         }
558         gmx_load1p1_pr(&c12s_S0, ljc+sci2+STRIDE+0);
559         if (!half_LJ)
560         {
561             gmx_load1p1_pr(&c12s_S2, ljc+sci2+STRIDE+2);
562         }
563 #else
564         nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
565         nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
566         if (!half_LJ)
567         {
568             nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
569             nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
570         }
571 #endif
572 #endif
573 #ifdef LJ_EWALD_GEOM
574         /* We need the geometrically combined C6 for the PME grid correction */
575         gmx_load1p1_pr(&c6s_S0, ljc+sci2+0);
576         if (!half_LJ)
577         {
578             gmx_load1p1_pr(&c6s_S2, ljc+sci2+2);
579         }
580 #endif
581
582         /* Zero the potential energy for this list */
583         Vvdwtot_S        = gmx_simd_setzero_r();
584         vctot_S          = gmx_simd_setzero_r();
585
586         /* Clear i atom forces */
587         fix_S0           = gmx_simd_setzero_r();
588         fix_S2           = gmx_simd_setzero_r();
589         fiy_S0           = gmx_simd_setzero_r();
590         fiy_S2           = gmx_simd_setzero_r();
591         fiz_S0           = gmx_simd_setzero_r();
592         fiz_S2           = gmx_simd_setzero_r();
593
594         cjind = cjind0;
595
596         /* Currently all kernels use (at least half) LJ */
597 #define CALC_LJ
598         if (half_LJ)
599         {
600             /* Coulomb: all i-atoms, LJ: first half i-atoms */
601 #define CALC_COULOMB
602 #define HALF_LJ
603 #define CHECK_EXCLS
604             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
605             {
606 #include "nbnxn_kernel_simd_2xnn_inner.h"
607                 cjind++;
608             }
609 #undef CHECK_EXCLS
610             for (; (cjind < cjind1); cjind++)
611             {
612 #include "nbnxn_kernel_simd_2xnn_inner.h"
613             }
614 #undef HALF_LJ
615 #undef CALC_COULOMB
616         }
617         else if (do_coul)
618         {
619             /* Coulomb: all i-atoms, LJ: all i-atoms */
620 #define CALC_COULOMB
621 #define CHECK_EXCLS
622             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
623             {
624 #include "nbnxn_kernel_simd_2xnn_inner.h"
625                 cjind++;
626             }
627 #undef CHECK_EXCLS
628             for (; (cjind < cjind1); cjind++)
629             {
630 #include "nbnxn_kernel_simd_2xnn_inner.h"
631             }
632 #undef CALC_COULOMB
633         }
634         else
635         {
636             /* Coulomb: none, LJ: all i-atoms */
637 #define CHECK_EXCLS
638             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
639             {
640 #include "nbnxn_kernel_simd_2xnn_inner.h"
641                 cjind++;
642             }
643 #undef CHECK_EXCLS
644             for (; (cjind < cjind1); cjind++)
645             {
646 #include "nbnxn_kernel_simd_2xnn_inner.h"
647             }
648         }
649 #undef CALC_LJ
650         ninner += cjind1 - cjind0;
651
652         /* Add accumulated i-forces to the force array */
653         fix_S = gmx_mm_transpose_sum4h_pr(fix_S0, fix_S2);
654         gmx_simd4_store_r(f+scix, gmx_simd4_add_r(fix_S, gmx_simd4_load_r(f+scix)));
655
656         fiy_S = gmx_mm_transpose_sum4h_pr(fiy_S0, fiy_S2);
657         gmx_simd4_store_r(f+sciy, gmx_simd4_add_r(fiy_S, gmx_simd4_load_r(f+sciy)));
658
659         fiz_S = gmx_mm_transpose_sum4h_pr(fiz_S0, fiz_S2);
660         gmx_simd4_store_r(f+sciz, gmx_simd4_add_r(fiz_S, gmx_simd4_load_r(f+sciz)));
661
662 #ifdef CALC_SHIFTFORCES
663         fshift[ish3+0] += gmx_simd4_reduce_r(fix_S);
664         fshift[ish3+1] += gmx_simd4_reduce_r(fiy_S);
665         fshift[ish3+2] += gmx_simd4_reduce_r(fiz_S);
666 #endif
667
668 #ifdef CALC_ENERGIES
669         if (do_coul)
670         {
671             *Vc += gmx_simd_reduce_r(vctot_S);
672         }
673         *Vvdw += gmx_simd_reduce_r(Vvdwtot_S);
674 #endif
675
676         /* Outer loop uses 6 flops/iteration */
677     }
678
679 #ifdef COUNT_PAIRS
680     printf("atom pairs %d\n", npair);
681 #endif
682 }