1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
9 * Copyright (c) 2001-2009, The GROMACS Development Team
11 * Gromacs is a library for molecular simulation and trajectory analysis,
12 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
13 * a full list of developers and information, check out http://www.gromacs.org
15 * This program is free software; you can redistribute it and/or modify it under
16 * the terms of the GNU Lesser General Public License as published by the Free
17 * Software Foundation; either version 2 of the License, or (at your option) any
19 * As a special exception, you may use this file as part of a free software
20 * library without restriction. Specifically, if other files instantiate
21 * templates or use macros or inline functions from this file, or you compile
22 * this file and link it with other files to produce an executable, this
23 * file does not by itself cause the resulting executable to be covered by
24 * the GNU Lesser General Public License.
26 * In plain-speak: do not worry about classes/macros/templates either - only
27 * changes to the library have to be LGPL, not an application linking with it.
29 * To help fund GROMACS development, we humbly ask that you cite
30 * the papers people have written on it - you can find them on the website!
33 /* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
34 #include "gmx_x86_simd_macros.h"
/* Horizontal sum of a 4-wide SIMD register spilled to memory. */
36 #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
/* UNROLLI: i-cluster size; UNROLLJ: j-width = SIMD register width.
 * NOTE(review): this listing is elided; some #else/#endif lines of the
 * width/precision selection below are not visible here.
 */
38 #define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
39 #define UNROLLJ GMX_X86_SIMD_WIDTH_HERE
41 #if defined GMX_MM128_HERE || defined GMX_DOUBLE
44 #if defined GMX_MM256_HERE && !defined GMX_DOUBLE
50 /* SSE single precision 4x4 kernel */
51 #define SUM_SIMD(x) SUM_SIMD4(x)
54 /* SSE double precision 4x2 kernel */
55 #define SUM_SIMD(x) (x[0]+x[1])
61 /* AVX single precision 4x8 kernel */
62 #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
65 /* AVX double precision 4x4 kernel */
66 #define SUM_SIMD(x) SUM_SIMD4(x)
/* Pair-list exclusion field value meaning "no exclusions in this j-cluster". */
70 #define SIMD_MASK_ALL 0xffffffff
72 #include "nbnxn_kernel_x86_simd_utils.h"
74 /* All functionality defines are set here, except for:
75 * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
76 * CHECK_EXCLS, which is set just before including the inner loop contents.
77 * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB are currently
78 * set before calling the kernel function. We might want to move that
79 * to inside the n-loop and have a different combination rule for different
80 * ci's, as no combination rule gives a 50% performance hit for LJ.
 */
83 /* We always calculate shift forces, because it's cheap anyhow */
84 #define CALC_SHIFTFORCES
86 /* Assumes all LJ parameters are identical */
87 /* #define FIX_LJ_C */
/* Kernel-name mangling: builds identifiers of the form
 *   <base>_<simd>_<coulomb>_comb_<ljcomb>_<energy-suffix>
 * from the feature defines active at include time.
 * NOTE(review): the #else/#endif lines between the alternative
 * definitions are elided in this listing.
 */
89 #define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e
/* Select the LJ combination-rule tag: geom / lb / none. */
91 #if defined LJ_COMB_GEOM
92 #define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e)
94 #if defined LJ_COMB_LB
95 #define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e)
97 #define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e)
/* Select the Coulomb treatment tag: rf / tab / tab_twin (twin-range cutoff). */
102 #define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e)
105 #ifndef VDW_CUTOFF_CHECK
106 #define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e)
108 #define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e)
/* Select the SIMD-width tag: x86_simd128 or x86_simd256. */
112 #ifdef GMX_MM128_HERE
113 #define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e)
115 #ifdef GMX_MM256_HERE
116 #define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e)
/* Outer loop of the nbnxn x86 SIMD non-bonded kernel.
 * The actual function name is generated by NBK_FUNC_NAME_S128_OR_S256 below,
 * with an energy suffix chosen by CALC_ENERGIES / ENERGY_GROUPS.
 * NOTE(review): this listing is a sampled excerpt — many lines, including
 * most #else/#endif pairs, parts of the parameter list, several local
 * declarations, and the function's closing brace, are not visible here.
 * Comments below only describe what the visible lines establish.
 */
120 #ifndef CALC_ENERGIES
121 NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener)
123 #ifndef ENERGY_GROUPS
124 NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener)
126 NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
130 #undef NBK_FUNC_NAME_C
131 #undef NBK_FUNC_NAME_C_LJC
/* Parameter list (elided in this view after 'ic'): pair list, atom data,
 * interaction constants; presumably also coordinate/force/output arrays
 * further down — TODO confirm against the full file. */
132 (const nbnxn_pairlist_t *nbl,
133 const nbnxn_atomdata_t *nbat,
134 const interaction_const_t *ic,
137 #ifdef CALC_SHIFTFORCES
/* ---- Local declarations ---- */
148 const nbnxn_ci_t *nbln;
149 const nbnxn_cj_t *l_cj;
152 const real *shiftvec;
/* Per-i-atom rows of the non-bonded force-parameter table; rows 2/3 are
 * only set when not in the half-LJ case, hence the NULL defaults. */
154 const real *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL;
160 gmx_bool half_LJ,do_coul;
161 int sci,scix,sciy,sciz,sci2;
162 int cjind0,cjind1,cjind;
/* Energy-group bookkeeping (shift/mask/stride into per-group buffers). */
167 int egps_ishift,egps_imask;
168 int egps_jshift,egps_jmask,egps_jstride;
170 real *vvdwtp[UNROLLI];
/* i-atom coordinates (one register per i-atom, broadcast over j). */
177 gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
178 gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
179 gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
180 gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
/* i-atom force accumulators, reduced after the inner loop. */
181 gmx_mm_pr fix_SSE0,fiy_SSE0,fiz_SSE0;
182 gmx_mm_pr fix_SSE1,fiy_SSE1,fiz_SSE1;
183 gmx_mm_pr fix_SSE2,fiy_SSE2,fiz_SSE2;
184 gmx_mm_pr fix_SSE3,fiy_SSE3,fiz_SSE3;
187 __m128 fix_SSE,fiy_SSE,fiz_SSE;
189 __m256d fix_SSE,fiy_SSE,fiz_SSE;
192 __m128d fix0_SSE,fiy0_SSE,fiz0_SSE;
193 __m128d fix2_SSE,fiy2_SSE,fiz2_SSE;
/* ---- Exclusion-bit extraction masks ----
 * One bit per (i,j) pair in the pair-list exclusion field; the lane
 * layout differs per width/precision variant below. */
196 #ifndef GMX_MM256_HERE
198 __m128i mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
199 __m128i mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
200 __m128i mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 );
201 __m128i mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 );
203 /* For double precision we need to set two 32bit ints for one double */
204 __m128i mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 );
205 __m128i mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 );
206 __m128i mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
207 __m128i mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
210 /* AVX: use floating point masks, as there are no integer instructions */
212 gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
213 gmx_mm_pr mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
215 /* There is no 256-bit int to double conversion, so we use float here */
216 __m256 mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 ));
217 __m256 mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 ));
218 __m256 mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 ));
219 __m256 mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 ));
/* ---- Diagonal masks ----
 * Staircase bit patterns; presumably used to mask out j<=i self/duplicate
 * pairs when a cluster interacts with itself — the consuming code is in
 * the (elided) inner loop, so confirm there. */
223 #ifndef GMX_MM256_HERE
225 __m128 diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
226 __m128 diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
227 __m128 diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
228 __m128 diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
230 __m128d diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
231 __m128d diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
232 __m128d diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
233 __m128d diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
234 __m128d diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
235 __m128d diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
236 __m128d diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
237 __m128d diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
239 #else /* GMX_MM256_HERE */
241 gmx_mm_pr diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
242 gmx_mm_pr diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
243 gmx_mm_pr diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
244 gmx_mm_pr diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
245 gmx_mm_pr diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
246 gmx_mm_pr diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
247 gmx_mm_pr diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
248 gmx_mm_pr diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
250 gmx_mm_pr diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
251 gmx_mm_pr diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
252 gmx_mm_pr diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
253 gmx_mm_pr diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
/* ---- Miscellaneous SIMD constants ---- */
257 #ifndef GMX_MM256_HERE
258 __m128i zeroi_SSE = _mm_setzero_si128();
260 #ifdef GMX_X86_SSE4_1
261 gmx_mm_pr zero_SSE = gmx_set1_pr(0);
264 gmx_mm_pr one_SSE=gmx_set1_pr(1.0);
/* i-atom charges scaled by facel; zero-initialized for the no-Coulomb case. */
265 gmx_mm_pr iq_SSE0=gmx_setzero_pr();
266 gmx_mm_pr iq_SSE1=gmx_setzero_pr();
267 gmx_mm_pr iq_SSE2=gmx_setzero_pr();
268 gmx_mm_pr iq_SSE3=gmx_setzero_pr();
/* Reaction-field constants (set from ic->k_rf / ic->c_rf below). */
271 gmx_mm_pr hrc_3_SSE,moh_rc_SSE;
274 /* Coulomb table variables */
275 gmx_mm_pr invtsp_SSE;
276 const real *tab_coul_F;
278 const real *tab_coul_V;
/* Over-sized index arrays so an aligned pointer can be carved out below. */
280 #ifdef GMX_MM256_HERE
281 int ti0_array[2*UNROLLJ-1],*ti0;
282 int ti1_array[2*UNROLLJ-1],*ti1;
283 int ti2_array[2*UNROLLJ-1],*ti2;
284 int ti3_array[2*UNROLLJ-1],*ti3;
287 gmx_mm_pr mhalfsp_SSE;
288 gmx_mm_pr sh_ewald_SSE;
/* LB combination rule: half-sigma and sqrt(eps) per i-atom. */
295 gmx_mm_pr hsig_i_SSE0,seps_i_SSE0;
296 gmx_mm_pr hsig_i_SSE1,seps_i_SSE1;
297 gmx_mm_pr hsig_i_SSE2,seps_i_SSE2;
298 gmx_mm_pr hsig_i_SSE3,seps_i_SSE3;
/* FIX_LJ_C scratch: aligned c6/c12 tables (over-sized for alignment). */
301 real pvdw_array[2*UNROLLI*UNROLLJ+3];
302 real *pvdw_c6,*pvdw_c12;
303 gmx_mm_pr c6_SSE0,c12_SSE0;
304 gmx_mm_pr c6_SSE1,c12_SSE1;
305 gmx_mm_pr c6_SSE2,c12_SSE2;
306 gmx_mm_pr c6_SSE3,c12_SSE3;
/* Combination-rule per-i parameters; 2/3 zeroed for the half-LJ case. */
312 gmx_mm_pr c6s_SSE0,c12s_SSE0;
313 gmx_mm_pr c6s_SSE1,c12s_SSE1;
314 gmx_mm_pr c6s_SSE2=gmx_setzero_pr(),c12s_SSE2=gmx_setzero_pr();
315 gmx_mm_pr c6s_SSE3=gmx_setzero_pr(),c12s_SSE3=gmx_setzero_pr();
317 #endif /* LJ_COMB_LB */
/* Per-list energy accumulators and cutoff/shift constants. */
319 gmx_mm_pr vctotSSE,VvdwtotSSE;
320 gmx_mm_pr sixthSSE,twelvethSSE;
322 gmx_mm_pr avoid_sing_SSE;
324 #ifdef VDW_CUTOFF_CHECK
325 gmx_mm_pr rcvdw2_SSE;
329 gmx_mm_pr sh_invrc6_SSE,sh_invrc12_SSE;
331 /* cppcheck-suppress unassignedVariable */
332 real tmpsum_array[15],*tmpsum;
334 #ifdef CALC_SHIFTFORCES
335 /* cppcheck-suppress unassignedVariable */
336 real shf_array[15],*shf;
/* ---- One-time setup ---- */
/* Select the LJ parameter table and its per-pair stride. */
345 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
348 /* No combination rule used */
350 nbfp_ptr = nbat->nbfp_s4;
351 #define NBFP_STRIDE 4
353 nbfp_ptr = nbat->nbfp;
354 #define NBFP_STRIDE 2
356 nbfp_stride = NBFP_STRIDE;
360 #ifdef GMX_MM256_HERE
361 /* Generate aligned table pointers */
362 ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
363 ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
364 ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
365 ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
/* Coulomb table spacing and (tabulated-Ewald) shift constants. */
368 invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
370 mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
372 sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
376 tab_coul_F = ic->tabq_coul_FDV0;
378 tab_coul_F = ic->tabq_coul_F;
379 tab_coul_V = ic->tabq_coul_V;
386 shiftvec = shift_vec[0];
389 avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
391 /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
392 rc2_SSE = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
393 #ifdef VDW_CUTOFF_CHECK
394 rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
398 sixthSSE = gmx_set1_pr(1.0/6.0);
399 twelvethSSE = gmx_set1_pr(1.0/12.0);
/* Potential-shift constants for cutoff LJ. */
401 sh_invrc6_SSE = gmx_set1_pr(ic->sh_invrc6);
402 sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
/* Reaction-field constants from k_rf / c_rf. */
405 mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
408 hrc_3_SSE = gmx_set1_pr(ic->k_rf);
410 moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
/* Carve 32-byte-aligned pointers out of the over-sized scratch arrays. */
414 tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
416 #ifdef CALC_SHIFTFORCES
417 shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
/* FIX_LJ_C: replicate the type-0 c6/c12 for all i x j slots (all LJ
 * parameters assumed identical, see the define above). */
421 pvdw_c6 = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
422 pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
424 for(jp=0; jp<UNROLLJ; jp++)
426 pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
427 pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
428 pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
429 pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
431 pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
432 pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
433 pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
434 pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
436 c6_SSE0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
437 c6_SSE1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
438 c6_SSE2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
439 c6_SSE3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
441 c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
442 c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
443 c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
444 c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
445 #endif /* FIX_LJ_C */
/* Energy-group indexing: shifts/masks to unpack packed group ids. */
448 egps_ishift = nbat->neg_2log;
449 egps_imask = (1<<egps_ishift) - 1;
450 egps_jshift = 2*nbat->neg_2log;
451 egps_jmask = (1<<egps_jshift) - 1;
452 egps_jstride = (UNROLLJ>>1)*UNROLLJ;
453 /* Major division is over i-particles: divide nVS by 4 for i-stride */
454 Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
/* ---- Outer loop over i-cluster pair-list entries ---- */
460 for(n=0; n<nbl->nci; n++)
464 ish = (nbln->shift & NBNXN_CI_SHIFT);
466 cjind0 = nbln->cj_ind_start;
467 cjind1 = nbln->cj_ind_end;
468 /* Currently only works super-cells equal to sub-cells */
/* ci_sh: the i-cluster index for self-exclusion, valid only for the
 * central (zero) periodic shift. */
470 ci_sh = (ish == CENTRAL ? ci : -1);
472 shX_SSE = gmx_load1_pr(shiftvec+ish3);
473 shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
474 shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
/* Coordinate/charge array offsets for this i-cluster; the (ci & 1)
 * half-stride terms select the half-cluster within a packed pair. */
481 sci = (ci>>1)*STRIDE;
482 scix = sci*DIM + (ci & 1)*(STRIDE>>1);
483 sci2 = sci*2 + (ci & 1)*(STRIDE>>1);
484 sci += (ci & 1)*(STRIDE>>1);
/* Per-entry flags packed into nbln->shift. */
487 half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
488 do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
/* Energy groups: set up per-i-atom output pointers into Vvdw/Vc. */
491 egps_i = nbat->energrp[ci];
495 for(ia=0; ia<4; ia++)
497 egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
498 vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
499 vctp[ia] = Vc + egp_ia*Vstride_i;
/* Self-energy correction: when the first j-cluster is the i-cluster
 * itself, subtract the Coulomb self term (form depends on RF vs table;
 * the three comparisons below belong to different elided branches). */
503 #if defined CALC_ENERGIES
505 if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
508 if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
511 if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
518 Vc_sub_self = 0.5*ic->c_rf;
522 Vc_sub_self = 0.5*tab_coul_F[2];
524 Vc_sub_self = 0.5*tab_coul_V[0];
528 for(ia=0; ia<UNROLLI; ia++)
534 vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
538 -= facel*qi*qi*Vc_sub_self;
543 /* Load i atom data */
544 sciy = scix + STRIDE;
545 sciz = sciy + STRIDE;
/* Broadcast each i-atom coordinate and add the periodic shift. */
546 ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix) ,shX_SSE);
547 ix_SSE1 = gmx_add_pr(gmx_load1_pr(x+scix+1),shX_SSE);
548 ix_SSE2 = gmx_add_pr(gmx_load1_pr(x+scix+2),shX_SSE);
549 ix_SSE3 = gmx_add_pr(gmx_load1_pr(x+scix+3),shX_SSE);
550 iy_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciy) ,shY_SSE);
551 iy_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciy+1),shY_SSE);
552 iy_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciy+2),shY_SSE);
553 iy_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciy+3),shY_SSE);
554 iz_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciz) ,shZ_SSE);
555 iz_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciz+1),shZ_SSE);
556 iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
557 iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);
559 /* With half_LJ we currently always calculate Coulomb interactions */
560 if (do_coul || half_LJ)
562 iq_SSE0 = gmx_set1_pr(facel*q[sci]);
563 iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);
564 iq_SSE2 = gmx_set1_pr(facel*q[sci+2]);
565 iq_SSE3 = gmx_set1_pr(facel*q[sci+3]);
/* Per-i LJ parameters: LB rule uses (half-sigma, sqrt-eps) pairs ... */
569 hsig_i_SSE0 = gmx_load1_pr(ljc+sci2+0);
570 hsig_i_SSE1 = gmx_load1_pr(ljc+sci2+1);
571 hsig_i_SSE2 = gmx_load1_pr(ljc+sci2+2);
572 hsig_i_SSE3 = gmx_load1_pr(ljc+sci2+3);
573 seps_i_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
574 seps_i_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
575 seps_i_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
576 seps_i_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
/* ... geometric rule uses (sqrt-c6, sqrt-c12); atoms 2/3 only when
 * not half-LJ (their zero defaults cover the half-LJ case). */
579 c6s_SSE0 = gmx_load1_pr(ljc+sci2+0);
580 c6s_SSE1 = gmx_load1_pr(ljc+sci2+1);
583 c6s_SSE2 = gmx_load1_pr(ljc+sci2+2);
584 c6s_SSE3 = gmx_load1_pr(ljc+sci2+3);
586 c12s_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
587 c12s_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
590 c12s_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
591 c12s_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
/* No combination rule: point at each i-atom's row of the full table. */
594 nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride;
595 nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
598 nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
599 nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
604 /* Zero the potential energy for this list */
605 VvdwtotSSE = gmx_setzero_pr();
606 vctotSSE = gmx_setzero_pr();
608 /* Clear i atom forces */
609 fix_SSE0 = gmx_setzero_pr();
610 fix_SSE1 = gmx_setzero_pr();
611 fix_SSE2 = gmx_setzero_pr();
612 fix_SSE3 = gmx_setzero_pr();
613 fiy_SSE0 = gmx_setzero_pr();
614 fiy_SSE1 = gmx_setzero_pr();
615 fiy_SSE2 = gmx_setzero_pr();
616 fiy_SSE3 = gmx_setzero_pr();
617 fiz_SSE0 = gmx_setzero_pr();
618 fiz_SSE1 = gmx_setzero_pr();
619 fiz_SSE2 = gmx_setzero_pr();
620 fiz_SSE3 = gmx_setzero_pr();
624 /* Currently all kernels use (at least half) LJ */
/* Inner loops: the actual pair interaction is textually included from
 * nbnxn_kernel_x86_simd_inner.h. Each variant runs the exclusion-
 * checking loop while entries have exclusions, then a cheaper loop
 * (no exclusion check) for the rest; the three pairs below belong to
 * different (elided) feature branches. */
631 while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
633 #include "nbnxn_kernel_x86_simd_inner.h"
637 for(; (cjind<cjind1); cjind++)
639 #include "nbnxn_kernel_x86_simd_inner.h"
648 while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
650 #include "nbnxn_kernel_x86_simd_inner.h"
654 for(; (cjind<cjind1); cjind++)
656 #include "nbnxn_kernel_x86_simd_inner.h"
663 while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
665 #include "nbnxn_kernel_x86_simd_inner.h"
669 for(; (cjind<cjind1); cjind++)
671 #include "nbnxn_kernel_x86_simd_inner.h"
675 ninner += cjind1 - cjind0;
677 /* Add accumulated i-forces to the force array */
/* Width-specific 4-wide load/store/add aliases for the reduction. */
680 #define gmx_load_ps4 _mm_load_ps
681 #define gmx_store_ps4 _mm_store_ps
682 #define gmx_add_ps4 _mm_add_ps
684 #define gmx_load_ps4 _mm256_load_pd
685 #define gmx_store_ps4 _mm256_store_pd
686 #define gmx_add_ps4 _mm256_add_pd
/* Transpose-and-sum the four per-i-atom accumulators, then add into f. */
688 GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3,fix_SSE);
689 gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
691 GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3,fiy_SSE);
692 gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
694 GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3,fiz_SSE);
695 gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
697 #ifdef CALC_SHIFTFORCES
698 gmx_store_ps4(shf,fix_SSE);
699 fshift[ish3+0] += SUM_SIMD4(shf);
700 gmx_store_ps4(shf,fiy_SSE);
701 fshift[ish3+1] += SUM_SIMD4(shf);
702 gmx_store_ps4(shf,fiz_SSE);
703 fshift[ish3+2] += SUM_SIMD4(shf);
/* SSE-double variant: reduce in two 2-wide halves per dimension. */
706 GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
707 _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
708 GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
709 _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
711 GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
712 _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
713 GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
714 _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
716 GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
717 _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
718 GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
719 _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
721 #ifdef CALC_SHIFTFORCES
722 _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
723 fshift[ish3+0] += shf[0] + shf[1];
724 _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
725 fshift[ish3+1] += shf[0] + shf[1];
726 _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
727 fshift[ish3+2] += shf[0] + shf[1];
/* Reduce the per-list energy accumulators into the scalar outputs. */
734 gmx_store_pr(tmpsum,vctotSSE);
735 *Vc += SUM_SIMD(tmpsum);
738 gmx_store_pr(tmpsum,VvdwtotSSE);
739 *Vvdw += SUM_SIMD(tmpsum);
742 /* Outer loop uses 6 flops/iteration */
746 printf("atom pairs %d\n",npair);
/* Clean up the file-local feature define so re-inclusion starts fresh. */
754 #undef CALC_SHIFTFORCES