Implemented LJ-PME nbnxn kernels
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_kernels / simd_4xn / nbnxn_kernel_simd_4xn_outer.h
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */


{
    const nbnxn_ci_t   *nbln;
    const nbnxn_cj_t   *l_cj;
    const int          *type;
    const real         *q;
    const real         *shiftvec;
    const real         *x;
    const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
    real                facel;
    real               *nbfp_ptr;
    int                 n, ci, ci_sh;
    int                 ish, ish3;
    gmx_bool            do_LJ, half_LJ, do_coul, do_self;
    int                 sci, scix, sciy, sciz, sci2;
    int                 cjind0, cjind1, cjind;
    int                 ip, jp;

#ifdef ENERGY_GROUPS
    int         Vstride_i;
    int         egps_ishift, egps_imask;
    int         egps_jshift, egps_jmask, egps_jstride;
    int         egps_i;
    real       *vvdwtp[UNROLLI];
    real       *vctp[UNROLLI];
#endif

    gmx_simd_real_t  shX_S;
    gmx_simd_real_t  shY_S;
    gmx_simd_real_t  shZ_S;
    gmx_simd_real_t  ix_S0, iy_S0, iz_S0;
    gmx_simd_real_t  ix_S1, iy_S1, iz_S1;
    gmx_simd_real_t  ix_S2, iy_S2, iz_S2;
    gmx_simd_real_t  ix_S3, iy_S3, iz_S3;
    gmx_simd_real_t  fix_S0, fiy_S0, fiz_S0;
    gmx_simd_real_t  fix_S1, fiy_S1, fiz_S1;
    gmx_simd_real_t  fix_S2, fiy_S2, fiz_S2;
    gmx_simd_real_t  fix_S3, fiy_S3, fiz_S3;
#if UNROLLJ >= 4
    /* We use an i-force SIMD register width of 4 */
    gmx_simd4_real_t fix_S, fiy_S, fiz_S;
#else
    /* We use an i-force SIMD register width of 2 */
    gmx_simd_real_t  fix0_S, fiy0_S, fiz0_S;
    gmx_simd_real_t  fix2_S, fiy2_S, fiz2_S;
#endif

    gmx_simd_real_t  diagonal_jmi_S;
#if UNROLLI == UNROLLJ
    gmx_simd_bool_t  diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3;
#else
    gmx_simd_bool_t  diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3;
    gmx_simd_bool_t  diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
#endif

    unsigned            *exclusion_filter;
    gmx_exclfilter       filter_S0, filter_S1, filter_S2, filter_S3;

    gmx_simd_real_t      zero_S = gmx_simd_set1_r(0.0);

    gmx_simd_real_t      one_S  = gmx_simd_set1_r(1.0);
    gmx_simd_real_t      iq_S0  = gmx_simd_setzero_r();
    gmx_simd_real_t      iq_S1  = gmx_simd_setzero_r();
    gmx_simd_real_t      iq_S2  = gmx_simd_setzero_r();
    gmx_simd_real_t      iq_S3  = gmx_simd_setzero_r();

#ifdef CALC_COUL_RF
    gmx_simd_real_t      mrc_3_S;
#ifdef CALC_ENERGIES
    gmx_simd_real_t      hrc_3_S, moh_rc_S;
#endif
#endif

#ifdef CALC_COUL_TAB
    /* Coulomb table variables */
    gmx_simd_real_t   invtsp_S;
    const real       *tab_coul_F;
#ifndef TAB_FDV0
    const real       *tab_coul_V;
#endif
    /* Thread-local working buffers for force and potential lookups */
    int               ti0_array[2*GMX_SIMD_REAL_WIDTH], *ti0 = NULL;
    int               ti1_array[2*GMX_SIMD_REAL_WIDTH], *ti1 = NULL;
    int               ti2_array[2*GMX_SIMD_REAL_WIDTH], *ti2 = NULL;
    int               ti3_array[2*GMX_SIMD_REAL_WIDTH], *ti3 = NULL;
#ifdef CALC_ENERGIES
    gmx_simd_real_t   mhalfsp_S;
#endif
#endif

#ifdef CALC_COUL_EWALD
    gmx_simd_real_t beta2_S, beta_S;
#endif

#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
    gmx_simd_real_t  sh_ewald_S;
#endif

#if defined LJ_CUT && defined CALC_ENERGIES
    gmx_simd_real_t   p6_cpot_S, p12_cpot_S;
#endif
#ifdef LJ_POT_SWITCH
    gmx_simd_real_t   rswitch_S;
    gmx_simd_real_t   swV3_S, swV4_S, swV5_S;
    gmx_simd_real_t   swF2_S, swF3_S, swF4_S;
#endif
#ifdef LJ_FORCE_SWITCH
    gmx_simd_real_t   rswitch_S;
    gmx_simd_real_t   p6_fc2_S, p6_fc3_S;
    gmx_simd_real_t   p12_fc2_S, p12_fc3_S;
#ifdef CALC_ENERGIES
    gmx_simd_real_t   p6_vc3_S, p6_vc4_S;
    gmx_simd_real_t   p12_vc3_S, p12_vc4_S;
    gmx_simd_real_t   p6_6cpot_S, p12_12cpot_S;
#endif
#endif
#ifdef LJ_EWALD_GEOM
    real              lj_ewaldcoeff2, lj_ewaldcoeff6_6;
    gmx_simd_real_t   mone_S, half_S, lje_c2_S, lje_c6_6_S, lje_vc_S;
#endif

#ifdef LJ_COMB_LB
    const real       *ljc;

    gmx_simd_real_t   hsig_i_S0, seps_i_S0;
    gmx_simd_real_t   hsig_i_S1, seps_i_S1;
    gmx_simd_real_t   hsig_i_S2, seps_i_S2;
    gmx_simd_real_t   hsig_i_S3, seps_i_S3;
#else
#ifdef FIX_LJ_C
    real              pvdw_array[2*UNROLLI*UNROLLJ+3];
    real             *pvdw_c6, *pvdw_c12;
    gmx_simd_real_t   c6_S0, c12_S0;
    gmx_simd_real_t   c6_S1, c12_S1;
    gmx_simd_real_t   c6_S2, c12_S2;
    gmx_simd_real_t   c6_S3, c12_S3;
#endif

#if defined LJ_COMB_GEOM || defined LJ_EWALD_GEOM
    const real       *ljc;

    gmx_simd_real_t   c6s_S0, c12s_S0;
    gmx_simd_real_t   c6s_S1, c12s_S1;
    gmx_simd_real_t   c6s_S2  = gmx_simd_setzero_r();
    gmx_simd_real_t   c12s_S2 = gmx_simd_setzero_r();
    gmx_simd_real_t   c6s_S3  = gmx_simd_setzero_r();
    gmx_simd_real_t   c12s_S3 = gmx_simd_setzero_r();
#endif
#endif /* LJ_COMB_LB */

    gmx_simd_real_t  vctot_S, Vvdwtot_S;
    gmx_simd_real_t  sixth_S, twelveth_S;

    gmx_simd_real_t  avoid_sing_S;
    gmx_simd_real_t  rc2_S;
#ifdef VDW_CUTOFF_CHECK
    gmx_simd_real_t  rcvdw2_S;
#endif

#ifdef CALC_ENERGIES
    /* cppcheck-suppress unassignedVariable */
    real       tmpsum_array[GMX_SIMD_REAL_WIDTH*2], *tmpsum;
#endif
#ifdef CALC_SHIFTFORCES
    /* cppcheck-suppress unassignedVariable */
    real       shf_array[GMX_SIMD_REAL_WIDTH*2], *shf;
#endif

    int ninner;

#ifdef COUNT_PAIRS
    int npair = 0;
#endif

#if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
    ljc = nbat->lj_comb;
#endif
#if !(defined LJ_COMB_GEOM || defined LJ_COMB_LB)
    /* No combination rule used */
    nbfp_ptr    = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp;
#endif

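    /* Diagonal masking: for the cluster pair on the list diagonal
     * (j-cluster equal to the i-cluster), pairs with j-atom index <= i-atom
     * index would be self or double-counted interactions. The masks below
     * are built from the precomputed j-i index differences so that only
     * pairs with j > i contribute.
     */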
    /* Load j-i for the first i */
    diagonal_jmi_S    = gmx_simd_load_r(nbat->simd_4xn_diagonal_j_minus_i);
    /* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
    diagonal_mask_S0  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask_S1  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask_S2  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask_S3  = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
#else
#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
    diagonal_mask0_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask0_S1 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask0_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask0_S3 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);

#if UNROLLI == 2*UNROLLJ
    /* Load j-i for the second half of the j-cluster */
    diagonal_jmi_S    = gmx_simd_load_r(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ);
#endif

    diagonal_mask1_S0 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask1_S1 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask1_S2 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_simd_sub_r(diagonal_jmi_S, one_S);
    diagonal_mask1_S3 = gmx_simd_cmplt_r(zero_S, diagonal_jmi_S);
#endif
#endif

    /* Load masks for topology exclusion masking. filter_stride is
       static const, so the conditional will be optimized away. */
    if (1 == filter_stride)
    {
        exclusion_filter = nbat->simd_exclusion_filter1;
    }
    else /* (2 == filter_stride) */
    {
        exclusion_filter = nbat->simd_exclusion_filter2;
    }

    /* Here we cast the exclusion filters from unsigned * to int * or real *.
     * Since we only check bits, the actual value they represent does not
     * matter, as long as both filter and mask data are treated the same way.
     */
    filter_S0 = gmx_load_exclusion_filter(exclusion_filter + 0*UNROLLJ*filter_stride);
    filter_S1 = gmx_load_exclusion_filter(exclusion_filter + 1*UNROLLJ*filter_stride);
    filter_S2 = gmx_load_exclusion_filter(exclusion_filter + 2*UNROLLJ*filter_stride);
    filter_S3 = gmx_load_exclusion_filter(exclusion_filter + 3*UNROLLJ*filter_stride);

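    /* For reaction-field electrostatics the force term only needs -2*k_rf;
     * the energy additionally uses k_rf (for the k_rf*r^2 term) and the
     * potential shift -c_rf.
     */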
#ifdef CALC_COUL_RF
    /* Reaction-field constants */
    mrc_3_S  = gmx_simd_set1_r(-2*ic->k_rf);
#ifdef CALC_ENERGIES
    hrc_3_S  = gmx_simd_set1_r(ic->k_rf);
    moh_rc_S = gmx_simd_set1_r(-ic->c_rf);
#endif
#endif

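    /* Tabulated Ewald electrostatics: forces (and energies) are obtained by
     * linear interpolation in a table with spacing 1/tabq_scale. With
     * TAB_FDV0 each table point packs F, F(i+1)-F(i), V and 0, so a single
     * aligned 4-wide load provides all data for an entry.
     */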
#ifdef CALC_COUL_TAB
    /* Generate aligned table index pointers */
    ti0 = prepare_table_load_buffer(ti0_array);
    ti1 = prepare_table_load_buffer(ti1_array);
    ti2 = prepare_table_load_buffer(ti2_array);
    ti3 = prepare_table_load_buffer(ti3_array);

    invtsp_S  = gmx_simd_set1_r(ic->tabq_scale);
#ifdef CALC_ENERGIES
    mhalfsp_S = gmx_simd_set1_r(-0.5/ic->tabq_scale);
#endif

#ifdef TAB_FDV0
    tab_coul_F = ic->tabq_coul_FDV0;
#else
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;
#endif
#endif /* CALC_COUL_TAB */

#ifdef CALC_COUL_EWALD
    beta2_S = gmx_simd_set1_r(ic->ewaldcoeff_q*ic->ewaldcoeff_q);
    beta_S  = gmx_simd_set1_r(ic->ewaldcoeff_q);
#endif

#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
    sh_ewald_S = gmx_simd_set1_r(ic->sh_ewald);
#endif

    /* LJ function constants */
#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
    sixth_S      = gmx_simd_set1_r(1.0/6.0);
    twelveth_S   = gmx_simd_set1_r(1.0/12.0);
#endif

#if defined LJ_CUT && defined CALC_ENERGIES
    /* We shift the potential by cpot, which can be zero */
    p6_cpot_S    = gmx_simd_set1_r(ic->dispersion_shift.cpot);
    p12_cpot_S   = gmx_simd_set1_r(ic->repulsion_shift.cpot);
#endif
#ifdef LJ_POT_SWITCH
    rswitch_S = gmx_simd_set1_r(ic->rvdw_switch);
    swV3_S    = gmx_simd_set1_r(ic->vdw_switch.c3);
    swV4_S    = gmx_simd_set1_r(ic->vdw_switch.c4);
    swV5_S    = gmx_simd_set1_r(ic->vdw_switch.c5);
    swF2_S    = gmx_simd_set1_r(3*ic->vdw_switch.c3);
    swF3_S    = gmx_simd_set1_r(4*ic->vdw_switch.c4);
    swF4_S    = gmx_simd_set1_r(5*ic->vdw_switch.c5);
#endif
#ifdef LJ_FORCE_SWITCH
    rswitch_S = gmx_simd_set1_r(ic->rvdw_switch);
    p6_fc2_S  = gmx_simd_set1_r(ic->dispersion_shift.c2);
    p6_fc3_S  = gmx_simd_set1_r(ic->dispersion_shift.c3);
    p12_fc2_S = gmx_simd_set1_r(ic->repulsion_shift.c2);
    p12_fc3_S = gmx_simd_set1_r(ic->repulsion_shift.c3);
#ifdef CALC_ENERGIES
    {
        gmx_simd_real_t mthird_S  = gmx_simd_set1_r(-1.0/3.0);
        gmx_simd_real_t mfourth_S = gmx_simd_set1_r(-1.0/4.0);

        p6_vc3_S     = gmx_simd_mul_r(mthird_S,  p6_fc2_S);
        p6_vc4_S     = gmx_simd_mul_r(mfourth_S, p6_fc3_S);
        p6_6cpot_S   = gmx_simd_set1_r(ic->dispersion_shift.cpot/6);
        p12_vc3_S    = gmx_simd_mul_r(mthird_S,  p12_fc2_S);
        p12_vc4_S    = gmx_simd_mul_r(mfourth_S, p12_fc3_S);
        p12_12cpot_S = gmx_simd_set1_r(ic->repulsion_shift.cpot/12);
    }
#endif
#endif
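    /* LJ-PME with geometric combination rules: the real-space kernel
     * corrects for the long-range grid contribution, which requires
     * beta^2, beta^6/6 and the grid potential at the cut-off.
     */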
#ifdef LJ_EWALD_GEOM
    mone_S           = gmx_simd_set1_r(-1.0);
    half_S           = gmx_simd_set1_r(0.5);
    lj_ewaldcoeff2   = ic->ewaldcoeff_lj*ic->ewaldcoeff_lj;
    lj_ewaldcoeff6_6 = lj_ewaldcoeff2*lj_ewaldcoeff2*lj_ewaldcoeff2/6;
    lje_c2_S         = gmx_simd_set1_r(lj_ewaldcoeff2);
    lje_c6_6_S       = gmx_simd_set1_r(lj_ewaldcoeff6_6);
    /* Determine the grid potential at the cut-off */
    lje_vc_S         = gmx_simd_set1_r(ic->sh_lj_ewald);
#endif

    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
    rc2_S    = gmx_simd_set1_r(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
    rcvdw2_S = gmx_simd_set1_r(ic->rvdw*ic->rvdw);
#endif

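    /* Add a small constant to r^2 so that excluded or overlapping pairs at
     * zero distance do not produce NaN; their contribution is zeroed by the
     * exclusion and cut-off masks anyway.
     */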
    avoid_sing_S = gmx_simd_set1_r(NBNXN_AVOID_SING_R2_INC);

    q                   = nbat->q;
    type                = nbat->type;
    facel               = ic->epsfac;
    shiftvec            = shift_vec[0];
    x                   = nbat->x;

#ifdef CALC_ENERGIES
    tmpsum   = gmx_simd_align_r(tmpsum_array);
#endif
#ifdef CALC_SHIFTFORCES
    shf      = gmx_simd_align_r(shf_array);
#endif

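    /* With FIX_LJ_C all UNROLLI*UNROLLJ pairs are given the C6 and C12
     * parameters of atom-type pair 0-0.
     */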
#ifdef FIX_LJ_C
    pvdw_c6  = gmx_simd_align_real(pvdw_array);
    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;

    for (jp = 0; jp < UNROLLJ; jp++)
    {
        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];

        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }
    c6_S0            = gmx_simd_load_r(pvdw_c6 +0*UNROLLJ);
    c6_S1            = gmx_simd_load_r(pvdw_c6 +1*UNROLLJ);
    c6_S2            = gmx_simd_load_r(pvdw_c6 +2*UNROLLJ);
    c6_S3            = gmx_simd_load_r(pvdw_c6 +3*UNROLLJ);

    c12_S0           = gmx_simd_load_r(pvdw_c12+0*UNROLLJ);
    c12_S1           = gmx_simd_load_r(pvdw_c12+1*UNROLLJ);
    c12_S2           = gmx_simd_load_r(pvdw_c12+2*UNROLLJ);
    c12_S3           = gmx_simd_load_r(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */

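    /* With energy groups, the group indices of the UNROLLI i-atoms are
     * packed into energrp[ci] using neg_2log bits per atom; the shifts and
     * masks below extract them. Vstride_i is the stride between the energy
     * buffer blocks of consecutive i-atom energy groups.
     */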
#ifdef ENERGY_GROUPS
    egps_ishift  = nbat->neg_2log;
    egps_imask   = (1<<egps_ishift) - 1;
    egps_jshift  = 2*nbat->neg_2log;
    egps_jmask   = (1<<egps_jshift) - 1;
    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
    /* Major division is over i-particle energy groups, determine the stride */
    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif

    l_cj = nbl->cj;

    ninner = 0;
    for (n = 0; n < nbl->nci; n++)
    {
        nbln = &nbl->ci[n];

        ish              = (nbln->shift & NBNXN_CI_SHIFT);
        ish3             = ish*3;
        cjind0           = nbln->cj_ind_start;
        cjind1           = nbln->cj_ind_end;
        ci               = nbln->ci;
        ci_sh            = (ish == CENTRAL ? ci : -1);

        shX_S = gmx_simd_load1_r(shiftvec+ish3);
        shY_S = gmx_simd_load1_r(shiftvec+ish3+1);
        shZ_S = gmx_simd_load1_r(shiftvec+ish3+2);

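        /* Set up the i-cluster indices into the coordinate, charge and
         * LJ-parameter arrays. For UNROLLJ > 4 two 4-atom i-clusters are
         * packed per x/f stride, so the index is built from ci/2 plus half
         * a stride for odd ci.
         */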
#if UNROLLJ <= 4
        sci              = ci*STRIDE;
        scix             = sci*DIM;
        sci2             = sci*2;
#else
        sci              = (ci>>1)*STRIDE;
        scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
        sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
        sci             += (ci & 1)*(STRIDE>>1);
#endif

        /* We have 5 LJ/C combinations, but use only three inner loops,
         * as the other combinations are unlikely and/or not much faster:
         * inner half-LJ + C for half-LJ + C / no-LJ + C
         * inner LJ + C      for full-LJ + C
         * inner LJ          for full-LJ + no-C / half-LJ + no-C
         */
        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
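        /* With LJ-PME a grid self-energy correction is required even for
         * i-clusters without Coulomb interactions, so do_self is then
         * always TRUE.
         */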
#ifdef LJ_EWALD_GEOM
        do_self = TRUE;
#else
        do_self = do_coul;
#endif

#ifdef ENERGY_GROUPS
        egps_i = nbat->energrp[ci];
        {
            int ia, egp_ia;

            for (ia = 0; ia < UNROLLI; ia++)
            {
                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
                vctp[ia]   = Vc   + egp_ia*Vstride_i;
            }
        }
#endif

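        /* Self-energy corrections, applied once per i-cluster and only when
         * the first j-cluster in the list is the i-cluster itself (the
         * UNROLLJ-dependent comparisons map the i-cluster index to the
         * matching j-cluster index). The Coulomb self term
         * facel*qi*qi*Vc_sub_self is subtracted; with LJ-PME the grid self
         * term of the diagonal C6 of each i-atom type is added.
         */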
#ifdef CALC_ENERGIES
#if UNROLLJ == 4
        if (do_self && l_cj[nbln->cj_ind_start].cj == ci_sh)
#endif
#if UNROLLJ == 2
        if (do_self && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
#endif
#if UNROLLJ == 8
        if (do_self && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
#endif
        {
            if (do_coul)
            {
                real Vc_sub_self;
                int  ia;

#ifdef CALC_COUL_RF
                Vc_sub_self = 0.5*ic->c_rf;
#endif
#ifdef CALC_COUL_TAB
#ifdef TAB_FDV0
                Vc_sub_self = 0.5*tab_coul_F[2];
#else
                Vc_sub_self = 0.5*tab_coul_V[0];
#endif
#endif
#ifdef CALC_COUL_EWALD
                /* beta/sqrt(pi) */
                Vc_sub_self = 0.5*ic->ewaldcoeff_q*M_2_SQRTPI;
#endif

                for (ia = 0; ia < UNROLLI; ia++)
                {
                    real qi;

                    qi = q[sci+ia];
#ifdef ENERGY_GROUPS
                    vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
#else
                    Vc[0]
#endif
                        -= facel*qi*qi*Vc_sub_self;
                }
            }

#ifdef LJ_EWALD_GEOM
            {
                int  ia;

                for (ia = 0; ia < UNROLLI; ia++)
                {
                    real c6_i;

                    c6_i = nbat->nbfp[nbat->type[sci+ia]*(nbat->ntype + 1)*2]/6;
#ifdef ENERGY_GROUPS
                    vvdwtp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
#else
                    Vvdw[0]
#endif
                        += 0.5*c6_i*lj_ewaldcoeff6_6;
                }
            }
#endif      /* LJ_EWALD_GEOM */
        }
#endif

        /* Load i atom data */
        sciy             = scix + STRIDE;
        sciz             = sciy + STRIDE;
        ix_S0            = gmx_simd_add_r(gmx_simd_load1_r(x+scix), shX_S);
        ix_S1            = gmx_simd_add_r(gmx_simd_load1_r(x+scix+1), shX_S);
        ix_S2            = gmx_simd_add_r(gmx_simd_load1_r(x+scix+2), shX_S);
        ix_S3            = gmx_simd_add_r(gmx_simd_load1_r(x+scix+3), shX_S);
        iy_S0            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy), shY_S);
        iy_S1            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+1), shY_S);
        iy_S2            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+2), shY_S);
        iy_S3            = gmx_simd_add_r(gmx_simd_load1_r(x+sciy+3), shY_S);
        iz_S0            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz), shZ_S);
        iz_S1            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+1), shZ_S);
        iz_S2            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+2), shZ_S);
        iz_S3            = gmx_simd_add_r(gmx_simd_load1_r(x+sciz+3), shZ_S);

        if (do_coul)
        {
            iq_S0      = gmx_simd_set1_r(facel*q[sci]);
            iq_S1      = gmx_simd_set1_r(facel*q[sci+1]);
            iq_S2      = gmx_simd_set1_r(facel*q[sci+2]);
            iq_S3      = gmx_simd_set1_r(facel*q[sci+3]);
        }

#ifdef LJ_COMB_LB
        hsig_i_S0      = gmx_simd_load1_r(ljc+sci2+0);
        hsig_i_S1      = gmx_simd_load1_r(ljc+sci2+1);
        hsig_i_S2      = gmx_simd_load1_r(ljc+sci2+2);
        hsig_i_S3      = gmx_simd_load1_r(ljc+sci2+3);
        seps_i_S0      = gmx_simd_load1_r(ljc+sci2+STRIDE+0);
        seps_i_S1      = gmx_simd_load1_r(ljc+sci2+STRIDE+1);
        seps_i_S2      = gmx_simd_load1_r(ljc+sci2+STRIDE+2);
        seps_i_S3      = gmx_simd_load1_r(ljc+sci2+STRIDE+3);
#else
#ifdef LJ_COMB_GEOM
        c6s_S0         = gmx_simd_load1_r(ljc+sci2+0);
        c6s_S1         = gmx_simd_load1_r(ljc+sci2+1);
        if (!half_LJ)
        {
            c6s_S2     = gmx_simd_load1_r(ljc+sci2+2);
            c6s_S3     = gmx_simd_load1_r(ljc+sci2+3);
        }
        c12s_S0        = gmx_simd_load1_r(ljc+sci2+STRIDE+0);
        c12s_S1        = gmx_simd_load1_r(ljc+sci2+STRIDE+1);
        if (!half_LJ)
        {
            c12s_S2    = gmx_simd_load1_r(ljc+sci2+STRIDE+2);
            c12s_S3    = gmx_simd_load1_r(ljc+sci2+STRIDE+3);
        }
#else
        nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
        nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
        if (!half_LJ)
        {
            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
        }
#endif
#endif
#ifdef LJ_EWALD_GEOM
        /* We need the geometrically combined C6 for the PME grid correction */
        c6s_S0 = gmx_simd_load1_r(ljc+sci2+0);
        c6s_S1 = gmx_simd_load1_r(ljc+sci2+1);
        if (!half_LJ)
        {
            c6s_S2 = gmx_simd_load1_r(ljc+sci2+2);
            c6s_S3 = gmx_simd_load1_r(ljc+sci2+3);
        }
#endif

        /* Zero the potential energy for this list */
        Vvdwtot_S        = gmx_simd_setzero_r();
        vctot_S          = gmx_simd_setzero_r();

        /* Clear i atom forces */
        fix_S0           = gmx_simd_setzero_r();
        fix_S1           = gmx_simd_setzero_r();
        fix_S2           = gmx_simd_setzero_r();
        fix_S3           = gmx_simd_setzero_r();
        fiy_S0           = gmx_simd_setzero_r();
        fiy_S1           = gmx_simd_setzero_r();
        fiy_S2           = gmx_simd_setzero_r();
        fiy_S3           = gmx_simd_setzero_r();
        fiz_S0           = gmx_simd_setzero_r();
        fiz_S1           = gmx_simd_setzero_r();
        fiz_S2           = gmx_simd_setzero_r();
        fiz_S3           = gmx_simd_setzero_r();

        cjind = cjind0;

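        /* j-clusters that need exclusion masking are stored at the start of
         * each i-cluster's range in the pair list. The inner loop therefore
         * runs in two stages: first with CHECK_EXCLS until a fully
         * interacting j-cluster is reached, then without the exclusion-mask
         * overhead for the remaining j-clusters.
         */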
        /* Currently all kernels use (at least half) LJ */
#define CALC_LJ
        if (half_LJ)
        {
            /* Coulomb: all i-atoms, LJ: first half i-atoms */
#define CALC_COULOMB
#define HALF_LJ
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
            }
#undef HALF_LJ
#undef CALC_COULOMB
        }
        else if (do_coul)
        {
            /* Coulomb: all i-atoms, LJ: all i-atoms */
#define CALC_COULOMB
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
            }
#undef CALC_COULOMB
        }
        else
        {
            /* Coulomb: none, LJ: all i-atoms */
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
            }
        }
#undef CALC_LJ
        ninner += cjind1 - cjind0;

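        /* Each fix/fiy/fiz accumulator holds the per-SIMD-lane partial
         * forces of one i-atom; the transpose-sums below reduce over the
         * lanes, giving one force component per i-atom, which is added to
         * the force array and, when requested, to the shift forces.
         */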
        /* Add accumulated i-forces to the force array */
#if UNROLLJ >= 4
        fix_S = gmx_mm_transpose_sum4_pr(fix_S0, fix_S1, fix_S2, fix_S3);
        gmx_simd4_store_r(f+scix, gmx_simd4_add_r(fix_S, gmx_simd4_load_r(f+scix)));

        fiy_S = gmx_mm_transpose_sum4_pr(fiy_S0, fiy_S1, fiy_S2, fiy_S3);
        gmx_simd4_store_r(f+sciy, gmx_simd4_add_r(fiy_S, gmx_simd4_load_r(f+sciy)));

        fiz_S = gmx_mm_transpose_sum4_pr(fiz_S0, fiz_S1, fiz_S2, fiz_S3);
        gmx_simd4_store_r(f+sciz, gmx_simd4_add_r(fiz_S, gmx_simd4_load_r(f+sciz)));

#ifdef CALC_SHIFTFORCES
        fshift[ish3+0] += gmx_sum_simd4(fix_S, shf);
        fshift[ish3+1] += gmx_sum_simd4(fiy_S, shf);
        fshift[ish3+2] += gmx_sum_simd4(fiz_S, shf);
#endif
#else
        fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
        gmx_simd_store_r(f+scix, gmx_simd_add_r(fix0_S, gmx_simd_load_r(f+scix)));
        fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3);
        gmx_simd_store_r(f+scix+2, gmx_simd_add_r(fix2_S, gmx_simd_load_r(f+scix+2)));

        fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1);
        gmx_simd_store_r(f+sciy, gmx_simd_add_r(fiy0_S, gmx_simd_load_r(f+sciy)));
        fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3);
        gmx_simd_store_r(f+sciy+2, gmx_simd_add_r(fiy2_S, gmx_simd_load_r(f+sciy+2)));

        fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1);
        gmx_simd_store_r(f+sciz, gmx_simd_add_r(fiz0_S, gmx_simd_load_r(f+sciz)));
        fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3);
        gmx_simd_store_r(f+sciz+2, gmx_simd_add_r(fiz2_S, gmx_simd_load_r(f+sciz+2)));

#ifdef CALC_SHIFTFORCES
        fshift[ish3+0] += gmx_sum_simd2(gmx_simd_add_r(fix0_S, fix2_S), shf);
        fshift[ish3+1] += gmx_sum_simd2(gmx_simd_add_r(fiy0_S, fiy2_S), shf);
        fshift[ish3+2] += gmx_sum_simd2(gmx_simd_add_r(fiz0_S, fiz2_S), shf);
#endif
#endif

#ifdef CALC_ENERGIES
        if (do_coul)
        {
            *Vc += gmx_sum_simd(vctot_S, tmpsum);
        }

        *Vvdw += gmx_sum_simd(Vvdwtot_S, tmpsum);
#endif

        /* Outer loop uses 6 flops/iteration */
    }

#ifdef COUNT_PAIRS
    printf("atom pairs %d\n", npair);
#endif
}