Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_kernels / simd_4xn / nbnxn_kernel_simd_4xn_outer.h
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
36
37 {
38     const nbnxn_ci_t   *nbln;
39     const nbnxn_cj_t   *l_cj;
40     const int          *type;
41     const real         *q;
42     const real         *shiftvec;
43     const real         *x;
44     const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
45     real                facel;
46     real               *nbfp_ptr;
47     int                 n, ci, ci_sh;
48     int                 ish, ish3;
49     gmx_bool            do_LJ, half_LJ, do_coul;
50     int                 sci, scix, sciy, sciz, sci2;
51     int                 cjind0, cjind1, cjind;
52     int                 ip, jp;
53
54 #ifdef ENERGY_GROUPS
55     int         Vstride_i;
56     int         egps_ishift, egps_imask;
57     int         egps_jshift, egps_jmask, egps_jstride;
58     int         egps_i;
59     real       *vvdwtp[UNROLLI];
60     real       *vctp[UNROLLI];
61 #endif
62
63     gmx_simd_real_t  shX_S;
64     gmx_simd_real_t  shY_S;
65     gmx_simd_real_t  shZ_S;
66     gmx_simd_real_t  ix_S0, iy_S0, iz_S0;
67     gmx_simd_real_t  ix_S1, iy_S1, iz_S1;
68     gmx_simd_real_t  ix_S2, iy_S2, iz_S2;
69     gmx_simd_real_t  ix_S3, iy_S3, iz_S3;
70     gmx_simd_real_t  fix_S0, fiy_S0, fiz_S0;
71     gmx_simd_real_t  fix_S1, fiy_S1, fiz_S1;
72     gmx_simd_real_t  fix_S2, fiy_S2, fiz_S2;
73     gmx_simd_real_t  fix_S3, fiy_S3, fiz_S3;
74 #if UNROLLJ >= 4
75     /* We use an i-force SIMD register width of 4 */
76     gmx_simd4_real_t fix_S, fiy_S, fiz_S;
77 #else
78     /* We use an i-force SIMD register width of 2 */
79     gmx_simd_real_t  fix0_S, fiy0_S, fiz0_S;
80     gmx_simd_real_t  fix2_S, fiy2_S, fiz2_S;
81 #endif
82
83     gmx_simd_real_t  diagonal_jmi_S;
84 #if UNROLLI == UNROLLJ
85     gmx_simd_bool_t  diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3;
86 #else
87     gmx_simd_bool_t  diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3;
88     gmx_simd_bool_t  diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
89 #endif
90
91     unsigned            *exclusion_filter;
92     gmx_exclfilter       filter_S0, filter_S1, filter_S2, filter_S3;
93
94     gmx_simd_real_t      zero_S = gmx_simd_set1_r(0.0);
95
96     gmx_simd_real_t      one_S  = gmx_simd_set1_r(1.0);
97     gmx_simd_real_t      iq_S0  = gmx_simd_setzero_r();
98     gmx_simd_real_t      iq_S1  = gmx_simd_setzero_r();
99     gmx_simd_real_t      iq_S2  = gmx_simd_setzero_r();
100     gmx_simd_real_t      iq_S3  = gmx_simd_setzero_r();
101
102 #ifdef CALC_COUL_RF
103     gmx_simd_real_t      mrc_3_S;
104 #ifdef CALC_ENERGIES
105     gmx_simd_real_t      hrc_3_S, moh_rc_S;
106 #endif
107 #endif
108
109 #ifdef CALC_COUL_TAB
110     /* Coulomb table variables */
111     gmx_simd_real_t   invtsp_S;
112     const real       *tab_coul_F;
113 #ifndef TAB_FDV0
114     const real       *tab_coul_V;
115 #endif
116     /* Thread-local working buffers for force and potential lookups */
117     int               ti0_array[2*GMX_SIMD_REAL_WIDTH], *ti0 = NULL;
118     int               ti1_array[2*GMX_SIMD_REAL_WIDTH], *ti1 = NULL;
119     int               ti2_array[2*GMX_SIMD_REAL_WIDTH], *ti2 = NULL;
120     int               ti3_array[2*GMX_SIMD_REAL_WIDTH], *ti3 = NULL;
121 #ifdef CALC_ENERGIES
122     gmx_simd_real_t   mhalfsp_S;
123 #endif
124 #endif
125
126 #ifdef CALC_COUL_EWALD
127     gmx_simd_real_t beta2_S, beta_S;
128 #endif
129
130 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
131     gmx_simd_real_t  sh_ewald_S;
132 #endif
133
134 #ifdef LJ_POT_SWITCH
135     gmx_simd_real_t   rswitch_S;
136     gmx_simd_real_t   swV3_S, swV4_S, swV5_S;
137     gmx_simd_real_t   swF2_S, swF3_S, swF4_S;
138 #else
139 #ifdef LJ_FORCE_SWITCH
140     gmx_simd_real_t   rswitch_S;
141     gmx_simd_real_t   p6_fc2_S, p6_fc3_S;
142     gmx_simd_real_t   p12_fc2_S, p12_fc3_S;
143 #ifdef CALC_ENERGIES
144     gmx_simd_real_t   p6_vc3_S, p6_vc4_S;
145     gmx_simd_real_t   p12_vc3_S, p12_vc4_S;
146     gmx_simd_real_t   p6_6cpot_S, p12_12cpot_S;
147 #endif
148 #else
149 #ifdef CALC_ENERGIES
150     gmx_simd_real_t  p6_cpot_S, p12_cpot_S;
151 #endif
152 #endif
153 #endif
154
155 #ifdef LJ_COMB_LB
156     const real       *ljc;
157
158     gmx_simd_real_t   hsig_i_S0, seps_i_S0;
159     gmx_simd_real_t   hsig_i_S1, seps_i_S1;
160     gmx_simd_real_t   hsig_i_S2, seps_i_S2;
161     gmx_simd_real_t   hsig_i_S3, seps_i_S3;
162 #else
163 #ifdef FIX_LJ_C
164     real              pvdw_array[2*UNROLLI*UNROLLJ+3];
165     real             *pvdw_c6, *pvdw_c12;
166     gmx_simd_real_t   c6_S0, c12_S0;
167     gmx_simd_real_t   c6_S1, c12_S1;
168     gmx_simd_real_t   c6_S2, c12_S2;
169     gmx_simd_real_t   c6_S3, c12_S3;
170 #endif
171
172 #ifdef LJ_COMB_GEOM
173     const real       *ljc;
174
175     gmx_simd_real_t   c6s_S0, c12s_S0;
176     gmx_simd_real_t   c6s_S1, c12s_S1;
177     gmx_simd_real_t   c6s_S2  = gmx_simd_setzero_r();
178     gmx_simd_real_t   c12s_S2 = gmx_simd_setzero_r();
179     gmx_simd_real_t   c6s_S3  = gmx_simd_setzero_r();
180     gmx_simd_real_t   c12s_S3 = gmx_simd_setzero_r();
181 #endif
182 #endif /* LJ_COMB_LB */
183
184     gmx_simd_real_t  vctot_S, Vvdwtot_S;
185     gmx_simd_real_t  sixth_S, twelveth_S;
186
187     gmx_simd_real_t  avoid_sing_S;
188     gmx_simd_real_t  rc2_S;
189 #ifdef VDW_CUTOFF_CHECK
190     gmx_simd_real_t  rcvdw2_S;
191 #endif
192
193 #ifdef CALC_ENERGIES
194     /* cppcheck-suppress unassignedVariable */
195     real       tmpsum_array[GMX_SIMD_REAL_WIDTH*2], *tmpsum;
196 #endif
197 #ifdef CALC_SHIFTFORCES
198     /* cppcheck-suppress unassignedVariable */
199     real       shf_array[GMX_SIMD_REAL_WIDTH*2], *shf;
200 #endif
201
202     int ninner;
203
204 #ifdef COUNT_PAIRS
205     int npair = 0;
206 #endif
207
208 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
209     ljc = nbat->lj_comb;
210 #else
211     /* No combination rule used */
212     nbfp_ptr    = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp;
213 #endif
214
215     /* Load j-i for the first i */
216     diagonal_jmi_S    = gmx_simd_load_r(nbat->simd_4xn_diagonal_j_minus_i);
217     /* Generate all the diagonal masks as comparison results */
218 #if UNROLLI == UNROLLJ
219     diagonal_mask_S0  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
220     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
221     diagonal_mask_S1  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
222     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
223     diagonal_mask_S2  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
224     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
225     diagonal_mask_S3  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
226 #else
227 #if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
228     diagonal_mask0_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
229     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
230     diagonal_mask0_S1 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
231     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
232     diagonal_mask0_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
233     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
234     diagonal_mask0_S3 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
235     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
236
237 #if UNROLLI == 2*UNROLLJ
238     /* Load j-i for the second half of the j-cluster */
239     diagonal_jmi_S    = gmx_simd_load_r(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ);
240 #endif
241
242     diagonal_mask1_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
243     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
244     diagonal_mask1_S1 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
245     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
246     diagonal_mask1_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
247     diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
248     diagonal_mask1_S3 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
249 #endif
250 #endif
251
252     /* Load masks for topology exclusion masking. filter_stride is
253        static const, so the conditional will be optimized away. */
254     if (1 == filter_stride)
255     {
256         exclusion_filter = nbat->simd_exclusion_filter1;
257     }
258     else /* (2 == filter_stride) */
259     {
260         exclusion_filter = nbat->simd_exclusion_filter2;
261     }
262
263     /* Here we cast the exclusion filters from unsigned * to int * or real *.
264      * Since we only check bits, the actual value they represent does not
265      * matter, as long as both filter and mask data are treated the same way.
266      */
267     filter_S0 = gmx_load_exclusion_filter(exclusion_filter + 0*UNROLLJ*filter_stride);
268     filter_S1 = gmx_load_exclusion_filter(exclusion_filter + 1*UNROLLJ*filter_stride);
269     filter_S2 = gmx_load_exclusion_filter(exclusion_filter + 2*UNROLLJ*filter_stride);
270     filter_S3 = gmx_load_exclusion_filter(exclusion_filter + 3*UNROLLJ*filter_stride);
271
272 #ifdef CALC_COUL_RF
273     /* Reaction-field constants */
274     mrc_3_S  = gmx_simd_set1_r(-2*ic->k_rf);
275 #ifdef CALC_ENERGIES
276     hrc_3_S  = gmx_simd_set1_r(ic->k_rf);
277     moh_rc_S = gmx_simd_set1_r(-ic->c_rf);
278 #endif
279 #endif
280
281 #ifdef CALC_COUL_TAB
282     /* Generate aligned table index pointers */
283     ti0 = prepare_table_load_buffer(ti0_array);
284     ti1 = prepare_table_load_buffer(ti1_array);
285     ti2 = prepare_table_load_buffer(ti2_array);
286     ti3 = prepare_table_load_buffer(ti3_array);
287
288     invtsp_S  = gmx_simd_set1_r(ic->tabq_scale);
289 #ifdef CALC_ENERGIES
290     mhalfsp_S = gmx_simd_set1_r(-0.5/ic->tabq_scale);
291 #endif
292
293 #ifdef TAB_FDV0
294     tab_coul_F = ic->tabq_coul_FDV0;
295 #else
296     tab_coul_F = ic->tabq_coul_F;
297     tab_coul_V = ic->tabq_coul_V;
298 #endif
299 #endif /* CALC_COUL_TAB */
300
301 #ifdef CALC_COUL_EWALD
302     beta2_S = gmx_simd_set1_r(ic->ewaldcoeff_q*ic->ewaldcoeff_q);
303     beta_S  = gmx_simd_set1_r(ic->ewaldcoeff_q);
304 #endif
305
306 #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
307     sh_ewald_S = gmx_simd_set1_r(ic->sh_ewald);
308 #endif
309
310     /* LJ function constants */
311 #if defined CALC_ENERGIES || defined LJ_POT_SWITCH
312     sixth_S      = gmx_simd_set1_r(1.0/6.0);
313     twelveth_S   = gmx_simd_set1_r(1.0/12.0);
314 #endif
315
316 #ifdef LJ_POT_SWITCH
317     rswitch_S = gmx_simd_set1_r(ic->rvdw_switch);
318     swV3_S    = gmx_simd_set1_r(ic->vdw_switch.c3);
319     swV4_S    = gmx_simd_set1_r(ic->vdw_switch.c4);
320     swV5_S    = gmx_simd_set1_r(ic->vdw_switch.c5);
321     swF2_S    = gmx_simd_set1_r(3*ic->vdw_switch.c3);
322     swF3_S    = gmx_simd_set1_r(4*ic->vdw_switch.c4);
323     swF4_S    = gmx_simd_set1_r(5*ic->vdw_switch.c5);
324 #else
325     sixth_S      = gmx_simd_set1_r(1.0/6.0);
326     twelveth_S   = gmx_simd_set1_r(1.0/12.0);
327 #ifdef LJ_FORCE_SWITCH
328     rswitch_S = gmx_simd_set1_r(ic->rvdw_switch);
329     p6_fc2_S  = gmx_simd_set1_r(ic->dispersion_shift.c2);
330     p6_fc3_S  = gmx_simd_set1_r(ic->dispersion_shift.c3);
331     p12_fc2_S = gmx_simd_set1_r(ic->repulsion_shift.c2);
332     p12_fc3_S = gmx_simd_set1_r(ic->repulsion_shift.c3);
333 #ifdef CALC_ENERGIES
334     {
335         gmx_simd_real_t mthird_S  = gmx_simd_set1_r(-1.0/3.0);
336         gmx_simd_real_t mfourth_S = gmx_simd_set1_r(-1.0/4.0);
337
338         p6_vc3_S     = gmx_simd_mul_r(mthird_S,  p6_fc2_S);
339         p6_vc4_S     = gmx_simd_mul_r(mfourth_S, p6_fc3_S);
340         p6_6cpot_S   = gmx_simd_set1_r(ic->dispersion_shift.cpot/6);
341         p12_vc3_S    = gmx_simd_mul_r(mthird_S,  p12_fc2_S);
342         p12_vc4_S    = gmx_simd_mul_r(mfourth_S, p12_fc3_S);
343         p12_12cpot_S = gmx_simd_set1_r(ic->repulsion_shift.cpot/12);
344     }
345 #endif
346 #else
347     /* Plain LJ cut-off, with potential shift cpot, which can be 0 */
348 #ifdef CALC_ENERGIES
349     p6_cpot_S    = gmx_simd_set1_r(ic->dispersion_shift.cpot);
350     p12_cpot_S   = gmx_simd_set1_r(ic->repulsion_shift.cpot);
351 #endif
352 #endif
353 #endif /* LJ_POT_SWITCH */
354
355     /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
356     rc2_S    = gmx_simd_set1_r(ic->rcoulomb*ic->rcoulomb);
357 #ifdef VDW_CUTOFF_CHECK
358     rcvdw2_S = gmx_simd_set1_r(ic->rvdw*ic->rvdw);
359 #endif
360
361     avoid_sing_S = gmx_simd_set1_r(NBNXN_AVOID_SING_R2_INC);
362
363     q                   = nbat->q;
364     type                = nbat->type;
365     facel               = ic->epsfac;
366     shiftvec            = shift_vec[0];
367     x                   = nbat->x;
368
369 #ifdef CALC_ENERGIES
370     tmpsum   = gmx_simd_align_r(tmpsum_array);
371 #endif
372 #ifdef CALC_SHIFTFORCES
373     shf      = gmx_simd_align_r(shf_array);
374 #endif
375
376 #ifdef FIX_LJ_C
377     pvdw_c6  = gmx_simd_align_real(pvdw_array);
378     pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
379
380     for (jp = 0; jp < UNROLLJ; jp++)
381     {
382         pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
383         pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
384         pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
385         pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
386
387         pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
388         pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
389         pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
390         pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
391     }
392     c6_S0            = gmx_simd_load_r(pvdw_c6 +0*UNROLLJ);
393     c6_S1            = gmx_simd_load_r(pvdw_c6 +1*UNROLLJ);
394     c6_S2            = gmx_simd_load_r(pvdw_c6 +2*UNROLLJ);
395     c6_S3            = gmx_simd_load_r(pvdw_c6 +3*UNROLLJ);
396
397     c12_S0           = gmx_simd_load_r(pvdw_c12+0*UNROLLJ);
398     c12_S1           = gmx_simd_load_r(pvdw_c12+1*UNROLLJ);
399     c12_S2           = gmx_simd_load_r(pvdw_c12+2*UNROLLJ);
400     c12_S3           = gmx_simd_load_r(pvdw_c12+3*UNROLLJ);
401 #endif /* FIX_LJ_C */
402
403 #ifdef ENERGY_GROUPS
404     egps_ishift  = nbat->neg_2log;
405     egps_imask   = (1<<egps_ishift) - 1;
406     egps_jshift  = 2*nbat->neg_2log;
407     egps_jmask   = (1<<egps_jshift) - 1;
408     egps_jstride = (UNROLLJ>>1)*UNROLLJ;
409     /* Major division is over i-particle energy groups, determine the stride */
410     Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
411 #endif
412
413     l_cj = nbl->cj;
414
415     ninner = 0;
416     for (n = 0; n < nbl->nci; n++)
417     {
418         nbln = &nbl->ci[n];
419
420         ish              = (nbln->shift & NBNXN_CI_SHIFT);
421         ish3             = ish*3;
422         cjind0           = nbln->cj_ind_start;
423         cjind1           = nbln->cj_ind_end;
424         ci               = nbln->ci;
425         ci_sh            = (ish == CENTRAL ? ci : -1);
426
427         shX_S = gmx_simd_load1_r(shiftvec+ish3);
428         shY_S = gmx_simd_load1_r(shiftvec+ish3+1);
429         shZ_S = gmx_simd_load1_r(shiftvec+ish3+2);
430
431 #if UNROLLJ <= 4
432         sci              = ci*STRIDE;
433         scix             = sci*DIM;
434         sci2             = sci*2;
435 #else
436         sci              = (ci>>1)*STRIDE;
437         scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
438         sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
439         sci             += (ci & 1)*(STRIDE>>1);
440 #endif
441
442         /* We have 5 LJ/C combinations, but use only three inner loops,
443          * as the other combinations are unlikely and/or not much faster:
444          * inner half-LJ + C for half-LJ + C / no-LJ + C
445          * inner LJ + C      for full-LJ + C
446          * inner LJ          for full-LJ + no-C / half-LJ + no-C
447          */
448         do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
449         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
450         half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
451
452 #ifdef ENERGY_GROUPS
453         egps_i = nbat->energrp[ci];
454         {
455             int ia, egp_ia;
456
457             for (ia = 0; ia < UNROLLI; ia++)
458             {
459                 egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
460                 vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
461                 vctp[ia]   = Vc   + egp_ia*Vstride_i;
462             }
463         }
464 #endif
465 #if defined CALC_ENERGIES
466 #if UNROLLJ == 4
467         if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
468 #endif
469 #if UNROLLJ == 2
470         if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
471 #endif
472 #if UNROLLJ == 8
473         if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
474 #endif
475         {
476             int  ia;
477             real Vc_sub_self;
478
479 #ifdef CALC_COUL_RF
480             Vc_sub_self = 0.5*ic->c_rf;
481 #endif
482 #ifdef CALC_COUL_TAB
483 #ifdef TAB_FDV0
484             Vc_sub_self = 0.5*tab_coul_F[2];
485 #else
486             Vc_sub_self = 0.5*tab_coul_V[0];
487 #endif
488 #endif
489 #ifdef CALC_COUL_EWALD
490             /* beta/sqrt(pi) */
491             Vc_sub_self = 0.5*ic->ewaldcoeff_q*M_2_SQRTPI;
492 #endif
493
494             for (ia = 0; ia < UNROLLI; ia++)
495             {
496                 real qi;
497
498                 qi = q[sci+ia];
499 #ifdef ENERGY_GROUPS
500                 vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
501 #else
502                 Vc[0]
503 #endif
504                     -= facel*qi*qi*Vc_sub_self;
505             }
506         }
507 #endif
508
509         /* Load i atom data */
510         sciy             = scix + STRIDE;
511         sciz             = sciy + STRIDE;
512         ix_S0            = gmx_simd_add_r(gmx_simd_load1_r(x+scix), shX_S);
513         ix_S1            = gmx_simd_add_r(gmx_simd_load1_r(x+scix+1), shX_S);
514         ix_S2            = gmx_simd_add_r(gmx_simd_load1_r(x+scix+2), shX_S);
515         ix_S3            = gmx_simd_add_r(gmx_simd_load1_r(x+scix+3), shX_S);
516         iy_S0            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy), shY_S);
517         iy_S1            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+1), shY_S);
518         iy_S2            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+2), shY_S);
519         iy_S3            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+3), shY_S);
520         iz_S0            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz), shZ_S);
521         iz_S1            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+1), shZ_S);
522         iz_S2            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+2), shZ_S);
523         iz_S3            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+3), shZ_S);
524
525         if (do_coul)
526         {
527             iq_S0      = gmx_simd_set1_r(facel*q[sci]);
528             iq_S1      = gmx_simd_set1_r(facel*q[sci+1]);
529             iq_S2      = gmx_simd_set1_r(facel*q[sci+2]);
530             iq_S3      = gmx_simd_set1_r(facel*q[sci+3]);
531         }
532
533 #ifdef LJ_COMB_LB
534         hsig_i_S0      = gmx_simd_load1_r(ljc+sci2+0);
535         hsig_i_S1      = gmx_simd_load1_r(ljc+sci2+1);
536         hsig_i_S2      = gmx_simd_load1_r(ljc+sci2+2);
537         hsig_i_S3      = gmx_simd_load1_r(ljc+sci2+3);
538         seps_i_S0      = gmx_simd_load1_r(ljc+sci2+STRIDE+0);
539         seps_i_S1      = gmx_simd_load1_r(ljc+sci2+STRIDE+1);
540         seps_i_S2      = gmx_simd_load1_r(ljc+sci2+STRIDE+2);
541         seps_i_S3      = gmx_simd_load1_r(ljc+sci2+STRIDE+3);
542 #else
543 #ifdef LJ_COMB_GEOM
544         c6s_S0         = gmx_simd_load1_r(ljc+sci2+0);
545         c6s_S1         = gmx_simd_load1_r(ljc+sci2+1);
546         if (!half_LJ)
547         {
548             c6s_S2     = gmx_simd_load1_r(ljc+sci2+2);
549             c6s_S3     = gmx_simd_load1_r(ljc+sci2+3);
550         }
551         c12s_S0        = gmx_simd_load1_r(ljc+sci2+STRIDE+0);
552         c12s_S1        = gmx_simd_load1_r(ljc+sci2+STRIDE+1);
553         if (!half_LJ)
554         {
555             c12s_S2    = gmx_simd_load1_r(ljc+sci2+STRIDE+2);
556             c12s_S3    = gmx_simd_load1_r(ljc+sci2+STRIDE+3);
557         }
558 #else
559         nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
560         nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
561         if (!half_LJ)
562         {
563             nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
564             nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
565         }
566 #endif
567 #endif
568
569         /* Zero the potential energy for this list */
570         Vvdwtot_S        = gmx_simd_setzero_r();
571         vctot_S          = gmx_simd_setzero_r();
572
573         /* Clear i atom forces */
574         fix_S0           = gmx_simd_setzero_r();
575         fix_S1           = gmx_simd_setzero_r();
576         fix_S2           = gmx_simd_setzero_r();
577         fix_S3           = gmx_simd_setzero_r();
578         fiy_S0           = gmx_simd_setzero_r();
579         fiy_S1           = gmx_simd_setzero_r();
580         fiy_S2           = gmx_simd_setzero_r();
581         fiy_S3           = gmx_simd_setzero_r();
582         fiz_S0           = gmx_simd_setzero_r();
583         fiz_S1           = gmx_simd_setzero_r();
584         fiz_S2           = gmx_simd_setzero_r();
585         fiz_S3           = gmx_simd_setzero_r();
586
587         cjind = cjind0;
588
589         /* Currently all kernels use (at least half) LJ */
590 #define CALC_LJ
591         if (half_LJ)
592         {
593             /* Coulomb: all i-atoms, LJ: first half i-atoms */
594 #define CALC_COULOMB
595 #define HALF_LJ
596 #define CHECK_EXCLS
597             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
598             {
599 #include "nbnxn_kernel_simd_4xn_inner.h"
600                 cjind++;
601             }
602 #undef CHECK_EXCLS
603             for (; (cjind < cjind1); cjind++)
604             {
605 #include "nbnxn_kernel_simd_4xn_inner.h"
606             }
607 #undef HALF_LJ
608 #undef CALC_COULOMB
609         }
610         else if (do_coul)
611         {
612             /* Coulomb: all i-atoms, LJ: all i-atoms */
613 #define CALC_COULOMB
614 #define CHECK_EXCLS
615             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
616             {
617 #include "nbnxn_kernel_simd_4xn_inner.h"
618                 cjind++;
619             }
620 #undef CHECK_EXCLS
621             for (; (cjind < cjind1); cjind++)
622             {
623 #include "nbnxn_kernel_simd_4xn_inner.h"
624             }
625 #undef CALC_COULOMB
626         }
627         else
628         {
629             /* Coulomb: none, LJ: all i-atoms */
630 #define CHECK_EXCLS
631             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
632             {
633 #include "nbnxn_kernel_simd_4xn_inner.h"
634                 cjind++;
635             }
636 #undef CHECK_EXCLS
637             for (; (cjind < cjind1); cjind++)
638             {
639 #include "nbnxn_kernel_simd_4xn_inner.h"
640             }
641         }
642 #undef CALC_LJ
643         ninner += cjind1 - cjind0;
644
645         /* Add accumulated i-forces to the force array */
646 #if UNROLLJ >= 4
647         fix_S = gmx_mm_transpose_sum4_pr(fix_S0, fix_S1, fix_S2, fix_S3);
648         gmx_simd4_store_r(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
649
650         fiy_S = gmx_mm_transpose_sum4_pr(fiy_S0, fiy_S1, fiy_S2, fiy_S3);
651         gmx_simd4_store_r(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
652
653         fiz_S = gmx_mm_transpose_sum4_pr(fiz_S0, fiz_S1, fiz_S2, fiz_S3);
654         gmx_simd4_store_r(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
655
656 #ifdef CALC_SHIFTFORCES
657         fshift[ish3+0] += gmx_sum_simd4(fix_S, shf);
658         fshift[ish3+1] += gmx_sum_simd4(fiy_S, shf);
659         fshift[ish3+2] += gmx_sum_simd4(fiz_S, shf);
660 #endif
661 #else
662         fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
663         gmx_simd_store_r(f+scix, gmx_simd_add_r(fix0_S, gmx_simd_load_r(f+scix)));
664         fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3);
665         gmx_simd_store_r(f+scix+2, gmx_simd_add_r(fix2_S, gmx_simd_load_r(f+scix+2)));
666
667         fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1);
668         gmx_simd_store_r(f+sciy, gmx_simd_add_r(fiy0_S, gmx_simd_load_r(f+sciy)));
669         fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3);
670         gmx_simd_store_r(f+sciy+2, gmx_simd_add_r(fiy2_S, gmx_simd_load_r(f+sciy+2)));
671
672         fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1);
673         gmx_simd_store_r(f+sciz, gmx_simd_add_r(fiz0_S, gmx_simd_load_r(f+sciz)));
674         fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3);
675         gmx_simd_store_r(f+sciz+2, gmx_simd_add_r(fiz2_S, gmx_simd_load_r(f+sciz+2)));
676
677 #ifdef CALC_SHIFTFORCES
678         fshift[ish3+0] += gmx_sum_simd2(gmx_simd_add_r(fix0_S, fix2_S), shf);
679         fshift[ish3+1] += gmx_sum_simd2(gmx_simd_add_r(fiy0_S, fiy2_S), shf);
680         fshift[ish3+2] += gmx_sum_simd2(gmx_simd_add_r(fiz0_S, fiz2_S), shf);
681 #endif
682 #endif
683
684 #ifdef CALC_ENERGIES
685         if (do_coul)
686         {
687             *Vc += gmx_sum_simd(vctot_S, tmpsum);
688         }
689
690         *Vvdw += gmx_sum_simd(Vvdwtot_S, tmpsum);
691 #endif
692
693         /* Outer loop uses 6 flops/iteration */
694     }
695
696 #ifdef COUNT_PAIRS
697     printf("atom pairs %d\n", npair);
698 #endif
699 }