added Verlet scheme and NxN non-bonded functionality
[alexxy/gromacs.git] / src / mdlib / nbnxn_kernels / nbnxn_kernel_x86_simd_outer.h
/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 * As a special exception, you may use this file as part of a free software
 * library without restriction.  Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */

/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
#include "gmx_x86_simd_macros.h"

#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])

#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ    GMX_X86_SIMD_WIDTH_HERE

#if defined GMX_MM128_HERE || defined GMX_DOUBLE
#define STRIDE     4
#endif
#if defined GMX_MM256_HERE && !defined GMX_DOUBLE
#define STRIDE     8
#endif
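
/* Each kernel inner iteration processes one cluster pair: UNROLLI i-atoms
 * against UNROLLJ j-atoms (the SIMD width). STRIDE is the packing stride of
 * the coordinates in nbat->x: the x, y and z components of a cluster are each
 * stored in contiguous blocks of STRIDE reals.
 */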

#ifdef GMX_MM128_HERE
#ifndef GMX_DOUBLE
/* SSE single precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
#define TAB_FDV0
#else
/* SSE double precision 4x2 kernel */
#define SUM_SIMD(x) (x[0]+x[1])
#endif
#endif

#ifdef GMX_MM256_HERE
#ifndef GMX_DOUBLE
/* AVX single precision 4x8 kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
#define TAB_FDV0
#else
/* AVX double precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
#endif
#endif

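/* An exclusion mask equal to SIMD_MASK_ALL (all bits set) means that every
 * pair in the cluster pair interacts; the inner loops below use this value to
 * switch from the exclusion-checking loop to the faster loop without
 * exclusion checking.
 */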
#define SIMD_MASK_ALL   0xffffffff

#include "nbnxn_kernel_x86_simd_utils.h"

/* All functionality defines are set here, except for:
 * CALC_ENERGIES and ENERGY_GROUPS, which are defined before this point, and
 * CHECK_EXCLS, which is set just before including the inner-loop contents.
 * The combination-rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
 * set before the kernel function is called. We might want to move that
 * inside the n-loop and use a different combination rule for different
 * ci's, since using no combination rule gives a 50% performance hit for LJ.
 */

/* We always calculate shift forces, because it's cheap anyhow */
#define CALC_SHIFTFORCES

/* Assumes all LJ parameters are identical */
/* #define FIX_LJ_C */

#define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e

#if defined LJ_COMB_GEOM
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e)
#else
#if defined LJ_COMB_LB
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e)
#else
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e)
#endif
#endif

#ifdef CALC_COUL_RF
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e)
#endif
#ifdef CALC_COUL_TAB
#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e)
#else
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e)
#endif
#endif

#ifdef GMX_MM128_HERE
#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e)
#endif
#ifdef GMX_MM256_HERE
#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e)
#endif

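/* The macros above expand to kernel names of the form
 * nbnxn_kernel_x86_simd{128,256}_{rf,tab,tab_twin}_comb_{geom,lb,none}_{ener,noener,energrp},
 * one variant per inclusion of this file.
 */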
static void
#ifndef CALC_ENERGIES
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener)
#else
#ifndef ENERGY_GROUPS
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener)
#else
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
#endif
#endif
#undef NBK_FUNC_NAME
#undef NBK_FUNC_NAME_C
#undef NBK_FUNC_NAME_C_LJC
                            (const nbnxn_pairlist_t     *nbl,
                             const nbnxn_atomdata_t     *nbat,
                             const interaction_const_t  *ic,
                             rvec                       *shift_vec,
                             real                       *f
#ifdef CALC_SHIFTFORCES
                             ,
                             real                       *fshift
#endif
#ifdef CALC_ENERGIES
                             ,
                             real                       *Vvdw,
                             real                       *Vc
#endif
                            )
{
    const nbnxn_ci_t   *nbln;
    const nbnxn_cj_t   *l_cj;
    const int          *type;
    const real         *q;
    const real         *shiftvec;
    const real         *x;
    const real         *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL;
    real       facel;
    real       *nbfp_ptr;
    int        nbfp_stride;
    int        n,ci,ci_sh;
    int        ish,ish3;
    gmx_bool   half_LJ,do_coul;
    int        sci,scix,sciy,sciz,sci2;
    int        cjind0,cjind1,cjind;
    int        ip,jp;

#ifdef ENERGY_GROUPS
    int        Vstride_i;
    int        egps_ishift,egps_imask;
    int        egps_jshift,egps_jmask,egps_jstride;
    int        egps_i;
    real       *vvdwtp[UNROLLI];
    real       *vctp[UNROLLI];
#endif

    gmx_mm_pr  shX_SSE;
    gmx_mm_pr  shY_SSE;
    gmx_mm_pr  shZ_SSE;
    gmx_mm_pr  ix_SSE0,iy_SSE0,iz_SSE0;
    gmx_mm_pr  ix_SSE1,iy_SSE1,iz_SSE1;
    gmx_mm_pr  ix_SSE2,iy_SSE2,iz_SSE2;
    gmx_mm_pr  ix_SSE3,iy_SSE3,iz_SSE3;
    gmx_mm_pr  fix_SSE0,fiy_SSE0,fiz_SSE0;
    gmx_mm_pr  fix_SSE1,fiy_SSE1,fiz_SSE1;
    gmx_mm_pr  fix_SSE2,fiy_SSE2,fiz_SSE2;
    gmx_mm_pr  fix_SSE3,fiy_SSE3,fiz_SSE3;
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
    __m128     fix_SSE,fiy_SSE,fiz_SSE;
#else
    __m256d    fix_SSE,fiy_SSE,fiz_SSE;
#endif
#else
    __m128d    fix0_SSE,fiy0_SSE,fiz0_SSE;
    __m128d    fix2_SSE,fiy2_SSE,fiz2_SSE;
#endif

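/* Interaction masks used in the inner loop to turn the per-pair exclusion
 * bits of a cj entry into SIMD masks. In the 128-bit kernels bit i*UNROLLJ+j
 * of the exclusion field corresponds to the pair (i,j), and mask0..mask3
 * select the bits for i-atoms 0..3.
 */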
#ifndef GMX_MM256_HERE
#ifndef GMX_DOUBLE
    __m128i    mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
    __m128i    mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
    __m128i    mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 );
    __m128i    mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 );
#else
    /* For double precision we need to set two 32-bit ints for one double */
    __m128i    mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 );
    __m128i    mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 );
    __m128i    mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
    __m128i    mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
#endif
#else
    /* AVX: use floating point masks, as AVX has no 256-bit integer instructions */
#ifndef GMX_DOUBLE
    gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
    gmx_mm_pr  mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
#else
    /* There is no 256-bit int to double conversion, so we use float here */
    __m256     mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 ));
    __m256     mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 ));
    __m256     mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 ));
    __m256     mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 ));
#endif
#endif

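/* Diagonal masks: when the j-cluster equals the i-cluster (ci == cj), these
 * mask out the j <= i pairs so that self-interactions and double-counted
 * pairs are skipped.
 */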
#ifndef GMX_MM256_HERE
#ifndef GMX_DOUBLE
    __m128     diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
    __m128     diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128     diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
    __m128     diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
#else
    __m128d    diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128d    diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d    diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d    diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d    diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
    __m128d    diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
    __m128d    diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128d    diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
#endif
#else /* GMX_MM256_HERE */
#ifndef GMX_DOUBLE
    gmx_mm_pr  diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
    gmx_mm_pr  diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
#else
    gmx_mm_pr  diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr  diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
#endif
#endif

#ifndef GMX_MM256_HERE
    __m128i    zeroi_SSE = _mm_setzero_si128();
#endif
#ifdef GMX_X86_SSE4_1
    gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
#endif

    gmx_mm_pr  one_SSE=gmx_set1_pr(1.0);
    gmx_mm_pr  iq_SSE0=gmx_setzero_pr();
    gmx_mm_pr  iq_SSE1=gmx_setzero_pr();
    gmx_mm_pr  iq_SSE2=gmx_setzero_pr();
    gmx_mm_pr  iq_SSE3=gmx_setzero_pr();
    gmx_mm_pr  mrc_3_SSE;
#ifdef CALC_ENERGIES
    gmx_mm_pr  hrc_3_SSE,moh_rc_SSE;
#endif
#ifdef CALC_COUL_TAB
    /* Coulomb table variables */
    gmx_mm_pr  invtsp_SSE;
    const real *tab_coul_F;
#ifndef TAB_FDV0
    const real *tab_coul_V;
#endif
#ifdef GMX_MM256_HERE
    int        ti0_array[2*UNROLLJ-1],*ti0;
    int        ti1_array[2*UNROLLJ-1],*ti1;
    int        ti2_array[2*UNROLLJ-1],*ti2;
    int        ti3_array[2*UNROLLJ-1],*ti3;
#endif
#ifdef CALC_ENERGIES
    gmx_mm_pr  mhalfsp_SSE;
    gmx_mm_pr  sh_ewald_SSE;
#endif
#endif

#ifdef LJ_COMB_LB
    const real *ljc;

    gmx_mm_pr  hsig_i_SSE0,seps_i_SSE0;
    gmx_mm_pr  hsig_i_SSE1,seps_i_SSE1;
    gmx_mm_pr  hsig_i_SSE2,seps_i_SSE2;
    gmx_mm_pr  hsig_i_SSE3,seps_i_SSE3;
#else
#ifdef FIX_LJ_C
    real       pvdw_array[2*UNROLLI*UNROLLJ+3];
    real       *pvdw_c6,*pvdw_c12;
    gmx_mm_pr  c6_SSE0,c12_SSE0;
    gmx_mm_pr  c6_SSE1,c12_SSE1;
    gmx_mm_pr  c6_SSE2,c12_SSE2;
    gmx_mm_pr  c6_SSE3,c12_SSE3;
#endif

#ifdef LJ_COMB_GEOM
    const real *ljc;

    gmx_mm_pr  c6s_SSE0,c12s_SSE0;
    gmx_mm_pr  c6s_SSE1,c12s_SSE1;
    gmx_mm_pr  c6s_SSE2=gmx_setzero_pr(),c12s_SSE2=gmx_setzero_pr();
    gmx_mm_pr  c6s_SSE3=gmx_setzero_pr(),c12s_SSE3=gmx_setzero_pr();
#endif
#endif /* LJ_COMB_LB */

    gmx_mm_pr  vctotSSE,VvdwtotSSE;
    gmx_mm_pr  sixthSSE,twelvethSSE;

    gmx_mm_pr  avoid_sing_SSE;
    gmx_mm_pr  rc2_SSE;
#ifdef VDW_CUTOFF_CHECK
    gmx_mm_pr  rcvdw2_SSE;
#endif

#ifdef CALC_ENERGIES
    gmx_mm_pr  sh_invrc6_SSE,sh_invrc12_SSE;

    /* cppcheck-suppress unassignedVariable */
    real       tmpsum_array[15],*tmpsum;
#endif
#ifdef CALC_SHIFTFORCES
    /* cppcheck-suppress unassignedVariable */
    real       shf_array[15],*shf;
#endif

    int ninner;

#ifdef COUNT_PAIRS
    int npair=0;
#endif

#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
    ljc = nbat->lj_comb;
#else
/* No combination rule used */
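/* Without a combination rule the full per-type-pair c6/c12 table is used:
 * in single precision the nbfp_s4 copy stores each pair padded to a stride
 * of 4 reals, presumably to allow aligned SIMD loads, while in double
 * precision the plain stride-2 nbfp table is used.
 */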
#ifndef GMX_DOUBLE
    nbfp_ptr    = nbat->nbfp_s4;
#define NBFP_STRIDE  4
#else
    nbfp_ptr    = nbat->nbfp;
#define NBFP_STRIDE  2
#endif
    nbfp_stride = NBFP_STRIDE;
#endif

#ifdef CALC_COUL_TAB
#ifdef GMX_MM256_HERE
    /* Generate aligned table pointers */
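    /* The ti arrays are over-allocated by UNROLLJ-1 ints so that the pointers
     * below can be rounded up to an UNROLLJ*sizeof(real) boundary; the AVX
     * table lookups in the inner loop presumably require this alignment for
     * the index storage.
     */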
    ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
#endif

    invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
#ifdef CALC_ENERGIES
    mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);

    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
#endif

#ifdef TAB_FDV0
    tab_coul_F = ic->tabq_coul_FDV0;
#else
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;
#endif
#endif

    q                   = nbat->q;
    type                = nbat->type;
    facel               = ic->epsfac;
    shiftvec            = shift_vec[0];
    x                   = nbat->x;

    avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);

    /* The kernel supports either rcoulomb = rvdw or, with VDW_CUTOFF_CHECK, rcoulomb >= rvdw */
    rc2_SSE    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
    rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
#endif

#ifdef CALC_ENERGIES
    sixthSSE    = gmx_set1_pr(1.0/6.0);
    twelvethSSE = gmx_set1_pr(1.0/12.0);

    sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
    sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
#endif

    mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);

#ifdef CALC_ENERGIES
    hrc_3_SSE = gmx_set1_pr(ic->k_rf);

    moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
#endif

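    /* The scratch arrays used for the energy and shift-force reductions are
     * over-sized and the pointers rounded up to a 32-byte boundary, so the
     * SIMD stores into them (which presumably require alignment) are safe.
     */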
#ifdef CALC_ENERGIES
    tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
#endif
#ifdef CALC_SHIFTFORCES
    shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
#endif

#ifdef FIX_LJ_C
    pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;

    for(jp=0; jp<UNROLLJ; jp++)
    {
        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];

        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }
    c6_SSE0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
    c6_SSE1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
    c6_SSE2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
    c6_SSE3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);

    c12_SSE0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
    c12_SSE1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
    c12_SSE2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
    c12_SSE3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */

#ifdef ENERGY_GROUPS
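    /* nbat->energrp[ci] packs the energy-group index of each of the UNROLLI
     * i-atoms in neg_2log bits; the shifts and masks set up here are used to
     * extract them and to index the per-group energy buffers.
     */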
    egps_ishift  = nbat->neg_2log;
    egps_imask   = (1<<egps_ishift) - 1;
    egps_jshift  = 2*nbat->neg_2log;
    egps_jmask   = (1<<egps_jshift) - 1;
    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
    /* Major division is over i-particles: divide nVS by 4 for i-stride */
    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif

    l_cj = nbl->cj;

    ninner = 0;
    for(n=0; n<nbl->nci; n++)
    {
        nbln = &nbl->ci[n];

        ish              = (nbln->shift & NBNXN_CI_SHIFT);
        ish3             = ish*3;
        cjind0           = nbln->cj_ind_start;
        cjind1           = nbln->cj_ind_end;
        /* Currently this only works for super-cells equal to sub-cells */
        ci               = nbln->ci;
        ci_sh            = (ish == CENTRAL ? ci : -1);
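        /* ci_sh is compared against the first j-cluster below to detect the
         * self-pair (diagonal) cluster; for non-central shifts it is set to -1
         * so that no self-interaction correction is applied.
         */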

        shX_SSE = gmx_load1_pr(shiftvec+ish3);
        shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
        shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);

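        /* For UNROLLJ > 4 (the 256-bit single-precision kernel) two 4-atom
         * i-clusters share one STRIDE-wide coordinate block, so the cluster
         * index is halved and the odd cluster is offset by half the stride.
         */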
#if UNROLLJ <= 4
        sci              = ci*STRIDE;
        scix             = sci*DIM;
        sci2             = sci*2;
#else
        sci              = (ci>>1)*STRIDE;
        scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
        sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
        sci             += (ci & 1)*(STRIDE>>1);
#endif

        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));

#ifdef ENERGY_GROUPS
        egps_i = nbat->energrp[ci];
        {
            int ia,egp_ia;

            for(ia=0; ia<4; ia++)
            {
                egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
                vctp[ia]   = Vc   + egp_ia*Vstride_i;
            }
        }
#endif
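        /* When energies are calculated and the first j-cluster in the list is
         * the i-cluster itself, subtract the constant self-interaction part of
         * the Coulomb energy (0.5*c_rf per squared charge for RF, or the r=0
         * table value for Ewald) for each i-atom; this appears to correct for
         * the i==j terms that the pair loop does not add. The cj comparison is
         * scaled by the ratio of j- to i-cluster size for UNROLLJ 2 and 8.
         */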
#if defined CALC_ENERGIES
#if UNROLLJ == 4
        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
#endif
#if UNROLLJ == 2
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
#endif
#if UNROLLJ == 8
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
#endif
        {
            int  ia;
            real Vc_sub_self;

#ifdef CALC_COUL_RF
            Vc_sub_self = 0.5*ic->c_rf;
#endif
#ifdef CALC_COUL_TAB
#ifdef TAB_FDV0
            Vc_sub_self = 0.5*tab_coul_F[2];
#else
            Vc_sub_self = 0.5*tab_coul_V[0];
#endif
#endif

            for(ia=0; ia<UNROLLI; ia++)
            {
                real qi;

                qi = q[sci+ia];
#ifdef ENERGY_GROUPS
                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
#else
                Vc[0]
#endif
                    -= facel*qi*qi*Vc_sub_self;
            }
        }
#endif

        /* Load i atom data */
        sciy             = scix + STRIDE;
        sciz             = sciy + STRIDE;
        ix_SSE0          = gmx_add_pr(gmx_load1_pr(x+scix)  ,shX_SSE);
        ix_SSE1          = gmx_add_pr(gmx_load1_pr(x+scix+1),shX_SSE);
        ix_SSE2          = gmx_add_pr(gmx_load1_pr(x+scix+2),shX_SSE);
        ix_SSE3          = gmx_add_pr(gmx_load1_pr(x+scix+3),shX_SSE);
        iy_SSE0          = gmx_add_pr(gmx_load1_pr(x+sciy)  ,shY_SSE);
        iy_SSE1          = gmx_add_pr(gmx_load1_pr(x+sciy+1),shY_SSE);
        iy_SSE2          = gmx_add_pr(gmx_load1_pr(x+sciy+2),shY_SSE);
        iy_SSE3          = gmx_add_pr(gmx_load1_pr(x+sciy+3),shY_SSE);
        iz_SSE0          = gmx_add_pr(gmx_load1_pr(x+sciz)  ,shZ_SSE);
        iz_SSE1          = gmx_add_pr(gmx_load1_pr(x+sciz+1),shZ_SSE);
        iz_SSE2          = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
        iz_SSE3          = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);

        /* With half_LJ we currently always calculate Coulomb interactions */
        if (do_coul || half_LJ)
        {
            iq_SSE0      = gmx_set1_pr(facel*q[sci]);
            iq_SSE1      = gmx_set1_pr(facel*q[sci+1]);
            iq_SSE2      = gmx_set1_pr(facel*q[sci+2]);
            iq_SSE3      = gmx_set1_pr(facel*q[sci+3]);
        }

#ifdef LJ_COMB_LB
        hsig_i_SSE0      = gmx_load1_pr(ljc+sci2+0);
        hsig_i_SSE1      = gmx_load1_pr(ljc+sci2+1);
        hsig_i_SSE2      = gmx_load1_pr(ljc+sci2+2);
        hsig_i_SSE3      = gmx_load1_pr(ljc+sci2+3);
        seps_i_SSE0      = gmx_load1_pr(ljc+sci2+STRIDE+0);
        seps_i_SSE1      = gmx_load1_pr(ljc+sci2+STRIDE+1);
        seps_i_SSE2      = gmx_load1_pr(ljc+sci2+STRIDE+2);
        seps_i_SSE3      = gmx_load1_pr(ljc+sci2+STRIDE+3);
#else
#ifdef LJ_COMB_GEOM
        c6s_SSE0         = gmx_load1_pr(ljc+sci2+0);
        c6s_SSE1         = gmx_load1_pr(ljc+sci2+1);
        if (!half_LJ)
        {
            c6s_SSE2     = gmx_load1_pr(ljc+sci2+2);
            c6s_SSE3     = gmx_load1_pr(ljc+sci2+3);
        }
        c12s_SSE0        = gmx_load1_pr(ljc+sci2+STRIDE+0);
        c12s_SSE1        = gmx_load1_pr(ljc+sci2+STRIDE+1);
        if (!half_LJ)
        {
            c12s_SSE2    = gmx_load1_pr(ljc+sci2+STRIDE+2);
            c12s_SSE3    = gmx_load1_pr(ljc+sci2+STRIDE+3);
        }
#else
        nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
        nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
        if (!half_LJ)
        {
            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
        }
#endif
#endif

        /* Zero the potential energy for this list */
        VvdwtotSSE       = gmx_setzero_pr();
        vctotSSE         = gmx_setzero_pr();

        /* Clear i atom forces */
        fix_SSE0           = gmx_setzero_pr();
        fix_SSE1           = gmx_setzero_pr();
        fix_SSE2           = gmx_setzero_pr();
        fix_SSE3           = gmx_setzero_pr();
        fiy_SSE0           = gmx_setzero_pr();
        fiy_SSE1           = gmx_setzero_pr();
        fiy_SSE2           = gmx_setzero_pr();
        fiy_SSE3           = gmx_setzero_pr();
        fiz_SSE0           = gmx_setzero_pr();
        fiz_SSE1           = gmx_setzero_pr();
        fiz_SSE2           = gmx_setzero_pr();
        fiz_SSE3           = gmx_setzero_pr();

        cjind = cjind0;

        /* Currently all kernels use (at least half) LJ */
#define CALC_LJ
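        /* Each branch below runs the inner loop in two phases: first over the
         * leading cj entries that carry exclusions (excl != SIMD_MASK_ALL)
         * with CHECK_EXCLS defined, then over the remaining entries without
         * the exclusion-mask overhead.
         */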
        if (half_LJ)
        {
#define CALC_COULOMB
#define HALF_LJ
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for(; (cjind<cjind1); cjind++)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
            }
#undef HALF_LJ
#undef CALC_COULOMB
        }
        else if (do_coul)
        {
#define CALC_COULOMB
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for(; (cjind<cjind1); cjind++)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
            }
#undef CALC_COULOMB
        }
        else
        {
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for(; (cjind<cjind1); cjind++)
            {
#include "nbnxn_kernel_x86_simd_inner.h"
            }
        }
#undef CALC_LJ
        ninner += cjind1 - cjind0;

        /* Add accumulated i-forces to the force array */
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
#define gmx_load_ps4  _mm_load_ps
#define gmx_store_ps4 _mm_store_ps
#define gmx_add_ps4   _mm_add_ps
#else
#define gmx_load_ps4  _mm256_load_pd
#define gmx_store_ps4 _mm256_store_pd
#define gmx_add_ps4   _mm256_add_pd
#endif
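        /* GMX_MM_TRANSPOSE_SUM4_PR reduces each of the four per-i-atom force
         * accumulators over the j lanes and packs the four sums into one
         * 4-wide register, which is then added to the force array.
         */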
        GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3,fix_SSE);
        gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3,fiy_SSE);
        gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3,fiz_SSE);
        gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));

#ifdef CALC_SHIFTFORCES
        gmx_store_ps4(shf,fix_SSE);
        fshift[ish3+0] += SUM_SIMD4(shf);
        gmx_store_ps4(shf,fiy_SSE);
        fshift[ish3+1] += SUM_SIMD4(shf);
        gmx_store_ps4(shf,fiz_SSE);
        fshift[ish3+2] += SUM_SIMD4(shf);
#endif
#else
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
        _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
        _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
        _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
        _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
        _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
        _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));

#ifdef CALC_SHIFTFORCES
        _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
        fshift[ish3+0] += shf[0] + shf[1];
        _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
        fshift[ish3+1] += shf[0] + shf[1];
        _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
        fshift[ish3+2] += shf[0] + shf[1];
#endif
#endif

#ifdef CALC_ENERGIES
        if (do_coul)
        {
            gmx_store_pr(tmpsum,vctotSSE);
            *Vc += SUM_SIMD(tmpsum);
        }

        gmx_store_pr(tmpsum,VvdwtotSSE);
        *Vvdw += SUM_SIMD(tmpsum);
#endif

        /* Outer loop uses 6 flops/iteration */
    }

#ifdef COUNT_PAIRS
    printf("atom pairs %d\n",npair);
#endif
}

#undef gmx_load_ps4
#undef gmx_store_ps4
#undef gmx_add_ps4

#undef CALC_SHIFTFORCES

#undef UNROLLI
#undef UNROLLJ
#undef STRIDE
#undef TAB_FDV0
#undef NBFP_STRIDE