/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

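/* This is the outer-loop part of the nbnxn 4xN SIMD non-bonded kernel.
 * It is not a stand-alone translation unit: it is #include'd into the
 * kernel source files as the body of a kernel function, and the flavor
 * (Coulomb type, LJ cut-off/force-switch/potential-switch, energy and
 * energy-group accumulation) is selected through the preprocessor defines
 * tested below (CALC_COUL_RF/TAB/EWALD, CALC_ENERGIES, ENERGY_GROUPS,
 * LJ_*, ...). The per-pair work is in "nbnxn_kernel_simd_4xn_inner.h",
 * which is included in the loops below.
 */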
{
    const nbnxn_ci_t   *nbln;
    const nbnxn_cj_t   *l_cj;
    const int *         type;
    const real *        q;
    const real         *shiftvec;
    const real         *x;
    const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
    real                facel;
    real               *nbfp_ptr;
    int                 n, ci, ci_sh;
    int                 ish, ish3;
    gmx_bool            do_LJ, half_LJ, do_coul, do_self;
    int                 sci, scix, sciy, sciz, sci2;
    int                 cjind0, cjind1, cjind;
    int                 ip, jp;

#ifdef ENERGY_GROUPS
    int         Vstride_i;
    int         egps_ishift, egps_imask;
    int         egps_jshift, egps_jmask, egps_jstride;
    int         egps_i;
    real       *vvdwtp[UNROLLI];
    real       *vctp[UNROLLI];
#endif

    gmx_simd_real_t  shX_S;
    gmx_simd_real_t  shY_S;
    gmx_simd_real_t  shZ_S;
    gmx_simd_real_t  ix_S0, iy_S0, iz_S0;
    gmx_simd_real_t  ix_S1, iy_S1, iz_S1;
    gmx_simd_real_t  ix_S2, iy_S2, iz_S2;
    gmx_simd_real_t  ix_S3, iy_S3, iz_S3;
    gmx_simd_real_t  fix_S0, fiy_S0, fiz_S0;
    gmx_simd_real_t  fix_S1, fiy_S1, fiz_S1;
    gmx_simd_real_t  fix_S2, fiy_S2, fiz_S2;
    gmx_simd_real_t  fix_S3, fiy_S3, fiz_S3;
#if UNROLLJ >= 4
    /* We use an i-force SIMD register width of 4 */
    gmx_simd4_real_t fix_S, fiy_S, fiz_S;
#else
    /* We use an i-force SIMD register width of 2 */
    gmx_simd_real_t  fix0_S, fiy0_S, fiz0_S;
    gmx_simd_real_t  fix2_S, fiy2_S, fiz2_S;
#endif

    gmx_simd_real_t  diagonal_jmi_S;
#if UNROLLI == UNROLLJ
    gmx_simd_bool_t  diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3;
#else
    gmx_simd_bool_t  diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3;
    gmx_simd_bool_t  diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
#endif

    unsigned            *exclusion_filter;
    gmx_exclfilter       filter_S0, filter_S1, filter_S2, filter_S3;

    gmx_simd_real_t      zero_S = gmx_simd_set1_r(0.0);

    gmx_simd_real_t      one_S  = gmx_simd_set1_r(1.0);
    gmx_simd_real_t      iq_S0  = gmx_simd_setzero_r();
    gmx_simd_real_t      iq_S1  = gmx_simd_setzero_r();
    gmx_simd_real_t      iq_S2  = gmx_simd_setzero_r();
    gmx_simd_real_t      iq_S3  = gmx_simd_setzero_r();

#ifdef CALC_COUL_RF
    gmx_simd_real_t      mrc_3_S;
#ifdef CALC_ENERGIES
    gmx_simd_real_t      hrc_3_S, moh_rc_S;
#endif
#endif

#ifdef CALC_COUL_TAB
    /* Coulomb table variables */
    gmx_simd_real_t   invtsp_S;
    const real *      tab_coul_F;
#ifndef TAB_FDV0
    const real *      tab_coul_V;
#endif
    /* Thread-local working buffers for force and potential lookups */
    int               ti0_array[2*GMX_SIMD_REAL_WIDTH], *ti0 = NULL;
    int               ti1_array[2*GMX_SIMD_REAL_WIDTH], *ti1 = NULL;
    int               ti2_array[2*GMX_SIMD_REAL_WIDTH], *ti2 = NULL;
    int               ti3_array[2*GMX_SIMD_REAL_WIDTH], *ti3 = NULL;
#ifdef CALC_ENERGIES
    gmx_simd_real_t   mhalfsp_S;
#endif
#endif

#ifdef CALC_COUL_EWALD
    gmx_simd_real_t beta2_S, beta_S;
#endif

#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
    gmx_simd_real_t  sh_ewald_S;
#endif

#if defined LJ_CUT && defined CALC_ENERGIES
    gmx_simd_real_t   p6_cpot_S, p12_cpot_S;
#endif
#ifdef LJ_POT_SWITCH
    gmx_simd_real_t   rswitch_S;
    gmx_simd_real_t   swV3_S, swV4_S, swV5_S;
    gmx_simd_real_t   swF2_S, swF3_S, swF4_S;
#endif
#ifdef LJ_FORCE_SWITCH
    gmx_simd_real_t   rswitch_S;
    gmx_simd_real_t   p6_fc2_S, p6_fc3_S;
    gmx_simd_real_t   p12_fc2_S, p12_fc3_S;
#ifdef CALC_ENERGIES
    gmx_simd_real_t   p6_vc3_S, p6_vc4_S;
    gmx_simd_real_t   p12_vc3_S, p12_vc4_S;
    gmx_simd_real_t   p6_6cpot_S, p12_12cpot_S;
#endif
#endif
#ifdef LJ_EWALD_GEOM
    real              lj_ewaldcoeff2, lj_ewaldcoeff6_6;
    gmx_simd_real_t   mone_S, half_S, lje_c2_S, lje_c6_6_S, lje_vc_S;
#endif

#ifdef LJ_COMB_LB
    const real       *ljc;

    gmx_simd_real_t   hsig_i_S0, seps_i_S0;
    gmx_simd_real_t   hsig_i_S1, seps_i_S1;
    gmx_simd_real_t   hsig_i_S2, seps_i_S2;
    gmx_simd_real_t   hsig_i_S3, seps_i_S3;
#else
#ifdef FIX_LJ_C
    real              pvdw_array[2*UNROLLI*UNROLLJ+3];
    real             *pvdw_c6, *pvdw_c12;
    gmx_simd_real_t   c6_S0, c12_S0;
    gmx_simd_real_t   c6_S1, c12_S1;
    gmx_simd_real_t   c6_S2, c12_S2;
    gmx_simd_real_t   c6_S3, c12_S3;
#endif

#if defined LJ_COMB_GEOM || defined LJ_EWALD_GEOM
    const real       *ljc;

    gmx_simd_real_t   c6s_S0, c12s_S0;
    gmx_simd_real_t   c6s_S1, c12s_S1;
    gmx_simd_real_t   c6s_S2  = gmx_simd_setzero_r();
    gmx_simd_real_t   c12s_S2 = gmx_simd_setzero_r();
    gmx_simd_real_t   c6s_S3  = gmx_simd_setzero_r();
    gmx_simd_real_t   c12s_S3 = gmx_simd_setzero_r();
#endif
#endif /* LJ_COMB_LB */

    gmx_simd_real_t  vctot_S, Vvdwtot_S;
    gmx_simd_real_t  sixth_S, twelveth_S;

    gmx_simd_real_t  avoid_sing_S;
    gmx_simd_real_t  rc2_S;
#ifdef VDW_CUTOFF_CHECK
    gmx_simd_real_t  rcvdw2_S;
#endif

    int ninner;

#ifdef COUNT_PAIRS
    int npair = 0;
#endif

#if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
    ljc = nbat->lj_comb;
#endif
#if !(defined LJ_COMB_GEOM || defined LJ_COMB_LB)
    /* No combination rule used */
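    /* Per-pair C6/C12 then come from the full type-pair matrix nbfp;
     * nbfp_s4 is assumed to be the same data padded to a stride of
     * 4 reals per type pair for aligned SIMD loads.
     */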
    nbfp_ptr    = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp;
#endif

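    /* The diagonal masks below handle the cluster pair in which the
     * j-cluster is the i-cluster itself: lanes with j-index <= i-index
     * are masked out so that self pairs and doubly counted pairs are
     * skipped. diagonal_jmi_S starts as j - i for i-atom 0 and is
     * decremented by one for each following i-atom.
     */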
    /* Load j-i for the first i */
    diagonal_jmi_S    = gmx_simd_load_r(nbat->simd_4xn_diagonal_j_minus_i);
    /* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
    diagonal_mask_S0  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask_S1  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask_S2  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask_S3  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
#else
#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
    diagonal_mask0_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask0_S1 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask0_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask0_S3 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);

#if UNROLLI == 2*UNROLLJ
    /* Load j-i for the second half of the j-cluster */
    diagonal_jmi_S    = gmx_simd_load_r(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ);
#endif

    diagonal_mask1_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask1_S1 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask1_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask1_S3 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
#endif
#endif

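    /* filter_S0..filter_S3 below are per-i-atom bit patterns. In the
     * inner loop they are AND-ed against the interaction/exclusion mask
     * stored with each j-cluster to turn the topology exclusions into
     * SIMD booleans.
     */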
    /* Load masks for topology exclusion masking. filter_stride is
       static const, so the conditional will be optimized away. */
    if (1 == filter_stride)
    {
        exclusion_filter = nbat->simd_exclusion_filter1;
    }
    else /* (2 == filter_stride) */
    {
        exclusion_filter = nbat->simd_exclusion_filter2;
    }

    /* Here we cast the exclusion filters from unsigned * to int * or real *.
     * Since we only check bits, the actual value they represent does not
     * matter, as long as both filter and mask data are treated the same way.
     */
    filter_S0 = gmx_load_exclusion_filter(exclusion_filter + 0*UNROLLJ*filter_stride);
    filter_S1 = gmx_load_exclusion_filter(exclusion_filter + 1*UNROLLJ*filter_stride);
    filter_S2 = gmx_load_exclusion_filter(exclusion_filter + 2*UNROLLJ*filter_stride);
    filter_S3 = gmx_load_exclusion_filter(exclusion_filter + 3*UNROLLJ*filter_stride);

#ifdef CALC_COUL_RF
    /* Reaction-field constants */
    mrc_3_S  = gmx_simd_set1_r(-2*ic->k_rf);
#ifdef CALC_ENERGIES
    hrc_3_S  = gmx_simd_set1_r(ic->k_rf);
    moh_rc_S = gmx_simd_set1_r(-ic->c_rf);
#endif
#endif

#ifdef CALC_COUL_TAB
    /* Generate aligned table index pointers */
    ti0 = prepare_table_load_buffer(ti0_array);
    ti1 = prepare_table_load_buffer(ti1_array);
    ti2 = prepare_table_load_buffer(ti2_array);
    ti3 = prepare_table_load_buffer(ti3_array);

    invtsp_S  = gmx_simd_set1_r(ic->tabq_scale);
#ifdef CALC_ENERGIES
    mhalfsp_S = gmx_simd_set1_r(-0.5/ic->tabq_scale);
#endif

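    /* invtsp_S converts a distance to a (fractional) table index. With
     * TAB_FDV0 the table is assumed to store quadruplets (F, F-difference,
     * V, 0), so force and potential come from a single aligned load;
     * otherwise separate force and potential tables are used.
     */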
#ifdef TAB_FDV0
    tab_coul_F = ic->tabq_coul_FDV0;
#else
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;
#endif
#endif /* CALC_COUL_TAB */

#ifdef CALC_COUL_EWALD
    beta2_S = gmx_simd_set1_r(ic->ewaldcoeff_q*ic->ewaldcoeff_q);
    beta_S  = gmx_simd_set1_r(ic->ewaldcoeff_q);
#endif

#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
    sh_ewald_S = gmx_simd_set1_r(ic->sh_ewald);
#endif

    /* LJ function constants */
#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
    sixth_S      = gmx_simd_set1_r(1.0/6.0);
    twelveth_S   = gmx_simd_set1_r(1.0/12.0);
#endif

#if defined LJ_CUT && defined CALC_ENERGIES
    /* We shift the potential by cpot, which can be zero */
    p6_cpot_S    = gmx_simd_set1_r(ic->dispersion_shift.cpot);
    p12_cpot_S   = gmx_simd_set1_r(ic->repulsion_shift.cpot);
#endif
#ifdef LJ_POT_SWITCH
    rswitch_S = gmx_simd_set1_r(ic->rvdw_switch);
    swV3_S    = gmx_simd_set1_r(ic->vdw_switch.c3);
    swV4_S    = gmx_simd_set1_r(ic->vdw_switch.c4);
    swV5_S    = gmx_simd_set1_r(ic->vdw_switch.c5);
    swF2_S    = gmx_simd_set1_r(3*ic->vdw_switch.c3);
    swF3_S    = gmx_simd_set1_r(4*ic->vdw_switch.c4);
    swF4_S    = gmx_simd_set1_r(5*ic->vdw_switch.c5);
#endif
#ifdef LJ_FORCE_SWITCH
    rswitch_S = gmx_simd_set1_r(ic->rvdw_switch);
    p6_fc2_S  = gmx_simd_set1_r(ic->dispersion_shift.c2);
    p6_fc3_S  = gmx_simd_set1_r(ic->dispersion_shift.c3);
    p12_fc2_S = gmx_simd_set1_r(ic->repulsion_shift.c2);
    p12_fc3_S = gmx_simd_set1_r(ic->repulsion_shift.c3);
#ifdef CALC_ENERGIES
    {
        gmx_simd_real_t mthird_S  = gmx_simd_set1_r(-1.0/3.0);
        gmx_simd_real_t mfourth_S = gmx_simd_set1_r(-1.0/4.0);

        p6_vc3_S     = gmx_simd_mul_r(mthird_S,  p6_fc2_S);
        p6_vc4_S     = gmx_simd_mul_r(mfourth_S, p6_fc3_S);
        p6_6cpot_S   = gmx_simd_set1_r(ic->dispersion_shift.cpot/6);
        p12_vc3_S    = gmx_simd_mul_r(mthird_S,  p12_fc2_S);
        p12_vc4_S    = gmx_simd_mul_r(mfourth_S, p12_fc3_S);
        p12_12cpot_S = gmx_simd_set1_r(ic->repulsion_shift.cpot/12);
    }
#endif
#endif
#ifdef LJ_EWALD_GEOM
    mone_S           = gmx_simd_set1_r(-1.0);
    half_S           = gmx_simd_set1_r(0.5);
    lj_ewaldcoeff2   = ic->ewaldcoeff_lj*ic->ewaldcoeff_lj;
    lj_ewaldcoeff6_6 = lj_ewaldcoeff2*lj_ewaldcoeff2*lj_ewaldcoeff2/6;
    lje_c2_S         = gmx_simd_set1_r(lj_ewaldcoeff2);
    lje_c6_6_S       = gmx_simd_set1_r(lj_ewaldcoeff6_6);
    /* Determine the grid potential at the cut-off */
    lje_vc_S         = gmx_simd_set1_r(ic->sh_lj_ewald);
#endif

    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
    rc2_S    = gmx_simd_set1_r(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
    rcvdw2_S = gmx_simd_set1_r(ic->rvdw*ic->rvdw);
#endif

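    /* A small constant is added to r^2 in the inner loop so that excluded
     * pairs at zero distance do not produce NaN/Inf from 1/r; the real
     * interaction is masked out anyway.
     */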
    avoid_sing_S = gmx_simd_set1_r(NBNXN_AVOID_SING_R2_INC);

    q                   = nbat->q;
    type                = nbat->type;
    facel               = ic->epsfac;
    shiftvec            = shift_vec[0];
    x                   = nbat->x;

#ifdef FIX_LJ_C
    pvdw_c6  = gmx_simd_align_real(pvdw_array);
    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;

    for (jp = 0; jp < UNROLLJ; jp++)
    {
        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];

        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }
    c6_S0            = gmx_simd_load_r(pvdw_c6 +0*UNROLLJ);
    c6_S1            = gmx_simd_load_r(pvdw_c6 +1*UNROLLJ);
    c6_S2            = gmx_simd_load_r(pvdw_c6 +2*UNROLLJ);
    c6_S3            = gmx_simd_load_r(pvdw_c6 +3*UNROLLJ);

    c12_S0           = gmx_simd_load_r(pvdw_c12+0*UNROLLJ);
    c12_S1           = gmx_simd_load_r(pvdw_c12+1*UNROLLJ);
    c12_S2           = gmx_simd_load_r(pvdw_c12+2*UNROLLJ);
    c12_S3           = gmx_simd_load_r(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */

#ifdef ENERGY_GROUPS
    egps_ishift  = nbat->neg_2log;
    egps_imask   = (1<<egps_ishift) - 1;
    egps_jshift  = 2*nbat->neg_2log;
    egps_jmask   = (1<<egps_jshift) - 1;
    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
    /* Major division is over i-particle energy groups, determine the stride */
    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif

    l_cj = nbl->cj;

    ninner = 0;
    for (n = 0; n < nbl->nci; n++)
    {
        nbln = &nbl->ci[n];

        ish              = (nbln->shift & NBNXN_CI_SHIFT);
        ish3             = ish*3;
        cjind0           = nbln->cj_ind_start;
        cjind1           = nbln->cj_ind_end;
        ci               = nbln->ci;
        ci_sh            = (ish == CENTRAL ? ci : -1);

        shX_S = gmx_simd_load1_r(shiftvec+ish3);
        shY_S = gmx_simd_load1_r(shiftvec+ish3+1);
        shZ_S = gmx_simd_load1_r(shiftvec+ish3+2);

#if UNROLLJ <= 4
        sci              = ci*STRIDE;
        scix             = sci*DIM;
        sci2             = sci*2;
#else
        sci              = (ci>>1)*STRIDE;
        scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
        sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
        sci             += (ci & 1)*(STRIDE>>1);
#endif
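        /* sci indexes the cluster-packed x/q/type arrays for this
         * i-cluster. With UNROLLJ <= 4 each 4-atom i-cluster has its own
         * stride block; with larger UNROLLJ two i-clusters share one
         * block and the low bit of ci selects its lower or upper half.
         */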

        /* We have 5 LJ/C combinations, but use only three inner loops,
         * as the other combinations are unlikely and/or not much faster:
         * inner half-LJ + C for half-LJ + C / no-LJ + C
         * inner LJ + C      for full-LJ + C
         * inner LJ          for full-LJ + no-C / half-LJ + no-C
         */
        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef LJ_EWALD_GEOM
        do_self = TRUE;
#else
        do_self = do_coul;
#endif
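        /* half_LJ means only the first half of the i-atoms in this cluster
         * have LJ interactions; do_self marks whether self-energy
         * corrections are needed below (always for LJ-PME, otherwise only
         * when Coulomb is computed).
         */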

#ifdef ENERGY_GROUPS
        egps_i = nbat->energrp[ci];
        {
            int ia, egp_ia;

            for (ia = 0; ia < UNROLLI; ia++)
            {
                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
                vctp[ia]   = Vc   + egp_ia*Vstride_i;
            }
        }
#endif

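        /* When energies are requested and the first j-cluster in the list
         * is the i-cluster itself, apply per-atom self corrections:
         * subtract the Coulomb self-interaction term (reaction-field,
         * table or Ewald) and add the LJ-PME self grid contribution.
         */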
#ifdef CALC_ENERGIES
#if UNROLLJ == 4
        if (do_self && l_cj[nbln->cj_ind_start].cj == ci_sh)
#endif
#if UNROLLJ == 2
        if (do_self && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
#endif
#if UNROLLJ == 8
        if (do_self && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
#endif
        {
            if (do_coul)
            {
                real Vc_sub_self;
                int  ia;

#ifdef CALC_COUL_RF
                Vc_sub_self = 0.5*ic->c_rf;
#endif
#ifdef CALC_COUL_TAB
#ifdef TAB_FDV0
                Vc_sub_self = 0.5*tab_coul_F[2];
#else
                Vc_sub_self = 0.5*tab_coul_V[0];
#endif
#endif
#ifdef CALC_COUL_EWALD
                /* beta/sqrt(pi) */
                Vc_sub_self = 0.5*ic->ewaldcoeff_q*M_2_SQRTPI;
#endif

                for (ia = 0; ia < UNROLLI; ia++)
                {
                    real qi;

                    qi = q[sci+ia];
#ifdef ENERGY_GROUPS
                    vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
#else
                    Vc[0]
#endif
                        -= facel*qi*qi*Vc_sub_self;
                }
            }

#ifdef LJ_EWALD_GEOM
            {
                int  ia;

                for (ia = 0; ia < UNROLLI; ia++)
                {
                    real c6_i;

                    c6_i = nbat->nbfp[nbat->type[sci+ia]*(nbat->ntype + 1)*2]/6;
#ifdef ENERGY_GROUPS
                    vvdwtp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
#else
                    Vvdw[0]
#endif
                        += 0.5*c6_i*lj_ewaldcoeff6_6;
                }
            }
#endif      /* LJ_EWALD_GEOM */
        }
#endif

        /* Load i atom data */
        sciy             = scix + STRIDE;
        sciz             = sciy + STRIDE;
        ix_S0            = gmx_simd_add_r(gmx_simd_load1_r(x+scix), shX_S);
        ix_S1            = gmx_simd_add_r(gmx_simd_load1_r(x+scix+1), shX_S);
        ix_S2            = gmx_simd_add_r(gmx_simd_load1_r(x+scix+2), shX_S);
        ix_S3            = gmx_simd_add_r(gmx_simd_load1_r(x+scix+3), shX_S);
        iy_S0            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy), shY_S);
        iy_S1            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+1), shY_S);
        iy_S2            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+2), shY_S);
        iy_S3            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+3), shY_S);
        iz_S0            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz), shZ_S);
        iz_S1            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+1), shZ_S);
        iz_S2            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+2), shZ_S);
        iz_S3            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+3), shZ_S);

        if (do_coul)
        {
            iq_S0      = gmx_simd_set1_r(facel*q[sci]);
            iq_S1      = gmx_simd_set1_r(facel*q[sci+1]);
            iq_S2      = gmx_simd_set1_r(facel*q[sci+2]);
            iq_S3      = gmx_simd_set1_r(facel*q[sci+3]);
        }

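        /* Load the i-atom LJ parameters in the form required by the
         * combination rule in use: half-sigma and sqrt(epsilon) for
         * Lorentz-Berthelot, per-atom geometric c6/c12 factors for the
         * geometric rule, or row pointers into the full type-pair table
         * otherwise.
         */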
#ifdef LJ_COMB_LB
        hsig_i_S0      = gmx_simd_load1_r(ljc+sci2+0);
        hsig_i_S1      = gmx_simd_load1_r(ljc+sci2+1);
        hsig_i_S2      = gmx_simd_load1_r(ljc+sci2+2);
        hsig_i_S3      = gmx_simd_load1_r(ljc+sci2+3);
        seps_i_S0      = gmx_simd_load1_r(ljc+sci2+STRIDE+0);
        seps_i_S1      = gmx_simd_load1_r(ljc+sci2+STRIDE+1);
        seps_i_S2      = gmx_simd_load1_r(ljc+sci2+STRIDE+2);
        seps_i_S3      = gmx_simd_load1_r(ljc+sci2+STRIDE+3);
#else
#ifdef LJ_COMB_GEOM
        c6s_S0         = gmx_simd_load1_r(ljc+sci2+0);
        c6s_S1         = gmx_simd_load1_r(ljc+sci2+1);
        if (!half_LJ)
        {
            c6s_S2     = gmx_simd_load1_r(ljc+sci2+2);
            c6s_S3     = gmx_simd_load1_r(ljc+sci2+3);
        }
        c12s_S0        = gmx_simd_load1_r(ljc+sci2+STRIDE+0);
        c12s_S1        = gmx_simd_load1_r(ljc+sci2+STRIDE+1);
        if (!half_LJ)
        {
            c12s_S2    = gmx_simd_load1_r(ljc+sci2+STRIDE+2);
            c12s_S3    = gmx_simd_load1_r(ljc+sci2+STRIDE+3);
        }
#else
        nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
        nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
        if (!half_LJ)
        {
            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
        }
#endif
#endif
#ifdef LJ_EWALD_GEOM
        /* We need the geometrically combined C6 for the PME grid correction */
        c6s_S0 = gmx_simd_load1_r(ljc+sci2+0);
        c6s_S1 = gmx_simd_load1_r(ljc+sci2+1);
        if (!half_LJ)
        {
            c6s_S2 = gmx_simd_load1_r(ljc+sci2+2);
            c6s_S3 = gmx_simd_load1_r(ljc+sci2+3);
        }
#endif

        /* Zero the potential energy for this list */
        Vvdwtot_S        = gmx_simd_setzero_r();
        vctot_S          = gmx_simd_setzero_r();

        /* Clear i atom forces */
        fix_S0           = gmx_simd_setzero_r();
        fix_S1           = gmx_simd_setzero_r();
        fix_S2           = gmx_simd_setzero_r();
        fix_S3           = gmx_simd_setzero_r();
        fiy_S0           = gmx_simd_setzero_r();
        fiy_S1           = gmx_simd_setzero_r();
        fiy_S2           = gmx_simd_setzero_r();
        fiy_S3           = gmx_simd_setzero_r();
        fiz_S0           = gmx_simd_setzero_r();
        fiz_S1           = gmx_simd_setzero_r();
        fiz_S2           = gmx_simd_setzero_r();
        fiz_S3           = gmx_simd_setzero_r();

        cjind = cjind0;

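        /* Each of the three branches below first runs the inner loop with
         * CHECK_EXCLS defined, for the leading j-clusters that carry
         * exclusion bits, and then a cheaper loop without exclusion
         * checking for the remaining, fully interacting j-clusters.
         */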
        /* Currently all kernels use (at least half) LJ */
#define CALC_LJ
        if (half_LJ)
        {
            /* Coulomb: all i-atoms, LJ: first half i-atoms */
#define CALC_COULOMB
#define HALF_LJ
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
            }
#undef HALF_LJ
#undef CALC_COULOMB
        }
        else if (do_coul)
        {
            /* Coulomb: all i-atoms, LJ: all i-atoms */
#define CALC_COULOMB
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
            }
#undef CALC_COULOMB
        }
        else
        {
            /* Coulomb: none, LJ: all i-atoms */
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
            }
        }
#undef CALC_LJ
        ninner += cjind1 - cjind0;

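        /* Each fi[xyz]_S accumulator holds the force on one i-atom summed
         * over the j-lanes. Transpose-and-sum reduces them to one scalar
         * per i-atom, which is added to the force array and, when
         * requested, to the shift force used for the virial.
         */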
        /* Add accumulated i-forces to the force array */
#if UNROLLJ >= 4
        fix_S = gmx_mm_transpose_sum4_pr(fix_S0, fix_S1, fix_S2, fix_S3);
        gmx_simd4_store_r(f+scix, gmx_simd4_add_r(fix_S, gmx_simd4_load_r(f+scix)));

        fiy_S = gmx_mm_transpose_sum4_pr(fiy_S0, fiy_S1, fiy_S2, fiy_S3);
        gmx_simd4_store_r(f+sciy, gmx_simd4_add_r(fiy_S, gmx_simd4_load_r(f+sciy)));

        fiz_S = gmx_mm_transpose_sum4_pr(fiz_S0, fiz_S1, fiz_S2, fiz_S3);
        gmx_simd4_store_r(f+sciz, gmx_simd4_add_r(fiz_S, gmx_simd4_load_r(f+sciz)));

#ifdef CALC_SHIFTFORCES
        fshift[ish3+0] += gmx_simd4_reduce_r(fix_S);
        fshift[ish3+1] += gmx_simd4_reduce_r(fiy_S);
        fshift[ish3+2] += gmx_simd4_reduce_r(fiz_S);
#endif
#else
        fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
        gmx_simd_store_r(f+scix, gmx_simd_add_r(fix0_S, gmx_simd_load_r(f+scix)));
        fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3);
        gmx_simd_store_r(f+scix+2, gmx_simd_add_r(fix2_S, gmx_simd_load_r(f+scix+2)));

        fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1);
        gmx_simd_store_r(f+sciy, gmx_simd_add_r(fiy0_S, gmx_simd_load_r(f+sciy)));
        fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3);
        gmx_simd_store_r(f+sciy+2, gmx_simd_add_r(fiy2_S, gmx_simd_load_r(f+sciy+2)));

        fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1);
        gmx_simd_store_r(f+sciz, gmx_simd_add_r(fiz0_S, gmx_simd_load_r(f+sciz)));
        fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3);
        gmx_simd_store_r(f+sciz+2, gmx_simd_add_r(fiz2_S, gmx_simd_load_r(f+sciz+2)));

#ifdef CALC_SHIFTFORCES
        fshift[ish3+0] += gmx_simd_reduce_r(gmx_simd_add_r(fix0_S, fix2_S));
        fshift[ish3+1] += gmx_simd_reduce_r(gmx_simd_add_r(fiy0_S, fiy2_S));
        fshift[ish3+2] += gmx_simd_reduce_r(gmx_simd_add_r(fiz0_S, fiz2_S));
#endif
#endif

#ifdef CALC_ENERGIES
        if (do_coul)
        {
            *Vc += gmx_simd_reduce_r(vctot_S);
        }

        *Vvdw += gmx_simd_reduce_r(Vvdwtot_S);
#endif

        /* Outer loop uses 6 flops/iteration */
    }

#ifdef COUNT_PAIRS
    printf("atom pairs %d\n", npair);
#endif
}