/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * As a special exception, you may use this file as part of a free software
 * library without restriction. Specifically, if other files instantiate
 * templates or use macros or inline functions from this file, or you compile
 * this file and link it with other files to produce an executable, this
 * file does not by itself cause the resulting executable to be covered by
 * the GNU Lesser General Public License.
 *
 * In plain-speak: do not worry about classes/macros/templates either - only
 * changes to the library have to be LGPL, not an application linking with it.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website!
 */
/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
#include "gmx_x86_simd_macros.h"

#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])

#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ GMX_X86_SIMD_WIDTH_HERE
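
/* UNROLLI is the i-cluster size (NBNXN_CPU_CLUSTER_I_SIZE, i.e. 4 atoms);
 * UNROLLJ is the SIMD width, i.e. the number of j-atoms processed per SIMD
 * operation. Together they give the 4xN kernel geometries listed below.
 */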
#if defined GMX_MM128_HERE || defined GMX_DOUBLE
#if defined GMX_MM256_HERE && !defined GMX_DOUBLE

/* SSE single precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)

/* SSE double precision 4x2 kernel */
#define SUM_SIMD(x) (x[0]+x[1])

/* AVX single precision 4x8 kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])

/* AVX double precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)

#define SIMD_MASK_ALL 0xffffffff

#include "nbnxn_kernel_x86_simd_utils.h"
/* All functionality defines are set here, except for:
 * CALC_ENERGIES and ENERGY_GROUPS, which are defined before.
 * CHECK_EXCLS, which is set just before including the inner-loop contents.
 * The combination-rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
 * set before calling the kernel function. We might want to move that
 * to inside the n-loop and use a different combination rule for different
 * ci's, since using no combination rule gives a 50% performance hit for LJ.
 */
/* We always calculate shift forces, because it's cheap anyhow */
#define CALC_SHIFTFORCES

/* Assumes all LJ parameters are identical */
/* #define FIX_LJ_C */
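/* FIX_LJ_C is presumably a debugging/benchmarking option: when enabled, the
 * C6/C12 of atom-type pair (0,0) are broadcast to every pair (see the
 * pvdw_c6/pvdw_c12 setup below), which is only correct if all LJ parameters
 * really are identical.
 */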
#define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e
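
/* The macros below paste the kernel name together from its flavor parts:
 * base, SIMD width, electrostatics type, combination rule and energy mode.
 * For example, the reaction-field flavor with geometric combination rule,
 * 128-bit SIMD and energies expands to
 *   nbnxn_kernel_x86_simd128_rf_comb_geom_ener
 */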
#if defined LJ_COMB_GEOM
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e)
#if defined LJ_COMB_LB
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e)
#define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e)

#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e)

#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e)
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e)

#ifdef CALC_COUL_EWALD
#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald,e)
#define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald_twin,e)

#ifdef GMX_MM128_HERE
#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e)
#ifdef GMX_MM256_HERE
#define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e)

#ifndef CALC_ENERGIES
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener)
#ifndef ENERGY_GROUPS
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener)
NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
#undef NBK_FUNC_NAME_C
#undef NBK_FUNC_NAME_C_LJC
(const nbnxn_pairlist_t    *nbl,
 const nbnxn_atomdata_t    *nbat,
 const interaction_const_t *ic,
#ifdef CALC_SHIFTFORCES
    const nbnxn_ci_t *nbln;
    const nbnxn_cj_t *l_cj;
    const real       *shiftvec;
    const real       *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL;
    gmx_bool          half_LJ,do_coul;
    int               sci,scix,sciy,sciz,sci2;
    int               cjind0,cjind1,cjind;
    int               egps_ishift,egps_imask;
    int               egps_jshift,egps_jmask,egps_jstride;
    real             *vvdwtp[UNROLLI];

    gmx_mm_pr  ix_SSE0,iy_SSE0,iz_SSE0;
    gmx_mm_pr  ix_SSE1,iy_SSE1,iz_SSE1;
    gmx_mm_pr  ix_SSE2,iy_SSE2,iz_SSE2;
    gmx_mm_pr  ix_SSE3,iy_SSE3,iz_SSE3;
    gmx_mm_pr  fix_SSE0,fiy_SSE0,fiz_SSE0;
    gmx_mm_pr  fix_SSE1,fiy_SSE1,fiz_SSE1;
    gmx_mm_pr  fix_SSE2,fiy_SSE2,fiz_SSE2;
    gmx_mm_pr  fix_SSE3,fiy_SSE3,fiz_SSE3;

    __m128     fix_SSE,fiy_SSE,fiz_SSE;
    __m256d    fix_SSE,fiy_SSE,fiz_SSE;
    __m128d    fix0_SSE,fiy0_SSE,fiz0_SSE;
    __m128d    fix2_SSE,fiy2_SSE,fiz2_SSE;

#ifndef GMX_MM256_HERE
    __m128i mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
    __m128i mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
    __m128i mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 );
    __m128i mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 );

    /* For double precision we need to set two 32-bit ints for one double */
    __m128i mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 );
    __m128i mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 );
    __m128i mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
    __m128i mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );

    /* AVX: use floating-point masks, as there are no 256-bit integer instructions */
    gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
    gmx_mm_pr mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));

    /* There is no 256-bit int-to-double conversion, so we use float masks here as well */
    __m256 mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 ));
    __m256 mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 ));
    __m256 mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 ));
    __m256 mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 ));
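
    /* mask0..mask3 hold, per SIMD lane, the bit of the pair-list exclusion
     * field (nbl->cj[].excl) that belongs to i-atom 0..3 and the j-atom in
     * that lane; the inner loop tests these bits to mask out excluded pairs.
     * In double precision each 32-bit value is duplicated so that one
     * exclusion bit covers both halves of a 64-bit lane.
     */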
#ifndef GMX_MM256_HERE

    __m128  diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
    __m128  diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128  diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
    __m128  diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));

    __m128d diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128d diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    __m128d diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
    __m128d diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
    __m128d diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    __m128d diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));

#else /* GMX_MM256_HERE */

    gmx_mm_pr diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
    gmx_mm_pr diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    gmx_mm_pr diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));

    gmx_mm_pr diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
    gmx_mm_pr diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
    gmx_mm_pr diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
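
    /* The diag* masks above handle a j-cluster that coincides (fully or in
     * half) with the i-cluster: for i-atom N only lanes with j-atom index
     * greater than N are kept, which skips self-interactions and counts each
     * intra-cluster pair only once. Where two sets exist (diag0_*, diag1_*)
     * they cover the two halves/alignments of the SIMD j-lanes relative to
     * the 4-atom i-cluster.
     */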
#ifndef GMX_MM256_HERE
    __m128i    zeroi_SSE = _mm_setzero_si128();
#ifdef GMX_X86_SSE4_1
    gmx_mm_pr  zero_SSE = gmx_set1_pr(0);

    gmx_mm_pr  one_SSE = gmx_set1_pr(1.0);
    gmx_mm_pr  iq_SSE0 = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE1 = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE2 = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE3 = gmx_setzero_pr();
    gmx_mm_pr  hrc_3_SSE,moh_rc_SSE;

    /* Coulomb table variables */
    gmx_mm_pr  invtsp_SSE;
    const real *tab_coul_F;
    const real *tab_coul_V;
#ifdef GMX_MM256_HERE
    int        ti0_array[2*UNROLLJ-1],*ti0;
    int        ti1_array[2*UNROLLJ-1],*ti1;
    int        ti2_array[2*UNROLLJ-1],*ti2;
    int        ti3_array[2*UNROLLJ-1],*ti3;
    gmx_mm_pr  mhalfsp_SSE;

#ifdef CALC_COUL_EWALD
    gmx_mm_pr  beta2_SSE,beta_SSE;

#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
    gmx_mm_pr  sh_ewald_SSE;

    gmx_mm_pr  hsig_i_SSE0,seps_i_SSE0;
    gmx_mm_pr  hsig_i_SSE1,seps_i_SSE1;
    gmx_mm_pr  hsig_i_SSE2,seps_i_SSE2;
    gmx_mm_pr  hsig_i_SSE3,seps_i_SSE3;

    real       pvdw_array[2*UNROLLI*UNROLLJ+3];
    real      *pvdw_c6,*pvdw_c12;
    gmx_mm_pr  c6_SSE0,c12_SSE0;
    gmx_mm_pr  c6_SSE1,c12_SSE1;
    gmx_mm_pr  c6_SSE2,c12_SSE2;
    gmx_mm_pr  c6_SSE3,c12_SSE3;

    gmx_mm_pr  c6s_SSE0,c12s_SSE0;
    gmx_mm_pr  c6s_SSE1,c12s_SSE1;
    gmx_mm_pr  c6s_SSE2 = gmx_setzero_pr(),c12s_SSE2 = gmx_setzero_pr();
    gmx_mm_pr  c6s_SSE3 = gmx_setzero_pr(),c12s_SSE3 = gmx_setzero_pr();
#endif /* LJ_COMB_LB */

    gmx_mm_pr  vctotSSE,VvdwtotSSE;
    gmx_mm_pr  sixthSSE,twelvethSSE;

    gmx_mm_pr  avoid_sing_SSE;

#ifdef VDW_CUTOFF_CHECK
    gmx_mm_pr  rcvdw2_SSE;

    gmx_mm_pr  sh_invrc6_SSE,sh_invrc12_SSE;

    /* cppcheck-suppress unassignedVariable */
    real       tmpsum_array[15],*tmpsum;
#ifdef CALC_SHIFTFORCES
    /* cppcheck-suppress unassignedVariable */
    real       shf_array[15],*shf;
#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
    /* No combination rule used */
    nbfp_ptr    = nbat->nbfp_s4;
#define NBFP_STRIDE 4
    nbfp_ptr    = nbat->nbfp;
#define NBFP_STRIDE 2
    nbfp_stride = NBFP_STRIDE;

#ifdef GMX_MM256_HERE
    /* Generate aligned table pointers */
    ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
    ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
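
    /* The ti*_array scratch arrays are over-sized so that the pointer
     * arithmetic above can round each start address up to a multiple of
     * UNROLLJ*sizeof(real) bytes, giving suitably aligned storage for the
     * SIMD table-index loads/stores in the inner loop.
     */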
    invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
    mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);

    tab_coul_F = ic->tabq_coul_FDV0;
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;
#endif /* CALC_COUL_TAB */

#ifdef CALC_COUL_EWALD
    beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
    beta_SSE  = gmx_set1_pr(ic->ewaldcoeff);

#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);

    shiftvec = shift_vec[0];

    avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);

    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
    rc2_SSE = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
    rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);

    sixthSSE    = gmx_set1_pr(1.0/6.0);
    twelvethSSE = gmx_set1_pr(1.0/12.0);

    sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
    sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);

    mrc_3_SSE  = gmx_set1_pr(-2*ic->k_rf);
    hrc_3_SSE  = gmx_set1_pr(ic->k_rf);
    moh_rc_SSE = gmx_set1_pr(-ic->c_rf);

    tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
#ifdef CALC_SHIFTFORCES
    shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
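
    /* tmpsum (and shf) are rounded up to the next 32-byte boundary; the
     * arrays are over-sized by 7 reals to leave room for this shift. 32-byte
     * alignment is sufficient for both the 128-bit and 256-bit aligned SIMD
     * stores used in the reductions below.
     */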
    pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;

    for(jp=0; jp<UNROLLJ; jp++)
    {
        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];

        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }

    c6_SSE0  = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
    c6_SSE1  = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
    c6_SSE2  = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
    c6_SSE3  = gmx_load_pr(pvdw_c6 +3*UNROLLJ);

    c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
    c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
    c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
    c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */
    egps_ishift  = nbat->neg_2log;
    egps_imask   = (1<<egps_ishift) - 1;
    egps_jshift  = 2*nbat->neg_2log;
    egps_jmask   = (1<<egps_jshift) - 1;
    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
    /* Major division is over i-particles: divide nVS by 4 for i-stride */
    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
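
    /* With energy groups, nbat->energrp[ci] packs the energy-group index of
     * each of the UNROLLI i-atoms in neg_2log-bit fields; egps_ishift and
     * egps_imask extract them, and vvdwtp[ia]/vctp[ia] (set in the n-loop
     * below) then point into the per-group-pair energy buffers, which are
     * spaced Vstride_i apart per i-group.
     */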
    for(n=0; n<nbl->nci; n++)
        ish    = (nbln->shift & NBNXN_CI_SHIFT);
        cjind0 = nbln->cj_ind_start;
        cjind1 = nbln->cj_ind_end;
        /* Currently this only works when super-cells are equal to sub-cells */
        ci_sh  = (ish == CENTRAL ? ci : -1);

        shX_SSE = gmx_load1_pr(shiftvec+ish3);
        shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
        shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);

        sci   = (ci>>1)*STRIDE;
        scix  = sci*DIM + (ci & 1)*(STRIDE>>1);
        sci2  = sci*2 + (ci & 1)*(STRIDE>>1);
        sci  += (ci & 1)*(STRIDE>>1);
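
        /* When the coordinate stride is wider than the 4-atom i-cluster (the
         * 4x8 AVX case), two i-clusters share one STRIDE-wide packed block:
         * ci is halved and the (ci & 1)*(STRIDE>>1) term selects the lower or
         * upper half of that block for x/q/LJ-parameter indexing.
         */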
        half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));

        egps_i = nbat->energrp[ci];

        for(ia=0; ia<UNROLLI; ia++)
        {
            egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
            vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
            vctp[ia]   = Vc + egp_ia*Vstride_i;
        }

#if defined CALC_ENERGIES
        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))

            Vc_sub_self = 0.5*ic->c_rf;
            Vc_sub_self = 0.5*tab_coul_F[2];
            Vc_sub_self = 0.5*tab_coul_V[0];
#ifdef CALC_COUL_EWALD
            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;

            for(ia=0; ia<UNROLLI; ia++)
                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
                    -= facel*qi*qi*Vc_sub_self;

        /* Load i atom data */
        sciy = scix + STRIDE;
        sciz = sciy + STRIDE;
        ix_SSE0 = gmx_add_pr(gmx_load1_pr(x+scix)  ,shX_SSE);
        ix_SSE1 = gmx_add_pr(gmx_load1_pr(x+scix+1),shX_SSE);
        ix_SSE2 = gmx_add_pr(gmx_load1_pr(x+scix+2),shX_SSE);
        ix_SSE3 = gmx_add_pr(gmx_load1_pr(x+scix+3),shX_SSE);
        iy_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciy)  ,shY_SSE);
        iy_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciy+1),shY_SSE);
        iy_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciy+2),shY_SSE);
        iy_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciy+3),shY_SSE);
        iz_SSE0 = gmx_add_pr(gmx_load1_pr(x+sciz)  ,shZ_SSE);
        iz_SSE1 = gmx_add_pr(gmx_load1_pr(x+sciz+1),shZ_SSE);
        iz_SSE2 = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
        iz_SSE3 = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);

        /* With half_LJ we currently always calculate Coulomb interactions */
        if (do_coul || half_LJ)
        {
            iq_SSE0 = gmx_set1_pr(facel*q[sci]);
            iq_SSE1 = gmx_set1_pr(facel*q[sci+1]);
            iq_SSE2 = gmx_set1_pr(facel*q[sci+2]);
            iq_SSE3 = gmx_set1_pr(facel*q[sci+3]);
        }

        hsig_i_SSE0 = gmx_load1_pr(ljc+sci2+0);
        hsig_i_SSE1 = gmx_load1_pr(ljc+sci2+1);
        hsig_i_SSE2 = gmx_load1_pr(ljc+sci2+2);
        hsig_i_SSE3 = gmx_load1_pr(ljc+sci2+3);
        seps_i_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
        seps_i_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
        seps_i_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
        seps_i_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);

        c6s_SSE0 = gmx_load1_pr(ljc+sci2+0);
        c6s_SSE1 = gmx_load1_pr(ljc+sci2+1);
        c6s_SSE2 = gmx_load1_pr(ljc+sci2+2);
        c6s_SSE3 = gmx_load1_pr(ljc+sci2+3);
        c12s_SSE0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
        c12s_SSE1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
        c12s_SSE2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
        c12s_SSE3 = gmx_load1_pr(ljc+sci2+STRIDE+3);

        nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride;
        nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
        nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
        nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
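
        /* Without a combination rule, nbfp0..nbfp3 point at the row of the LJ
         * parameter table for each i-atom's type (nbfp_stride entries per
         * atom-type pair); the inner loop indexes these rows by j-atom type
         * to pick up C6 and C12.
         */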
        /* Zero the potential energy for this list */
        VvdwtotSSE = gmx_setzero_pr();
        vctotSSE   = gmx_setzero_pr();

        /* Clear i atom forces */
        fix_SSE0 = gmx_setzero_pr();
        fix_SSE1 = gmx_setzero_pr();
        fix_SSE2 = gmx_setzero_pr();
        fix_SSE3 = gmx_setzero_pr();
        fiy_SSE0 = gmx_setzero_pr();
        fiy_SSE1 = gmx_setzero_pr();
        fiy_SSE2 = gmx_setzero_pr();
        fiy_SSE3 = gmx_setzero_pr();
        fiz_SSE0 = gmx_setzero_pr();
        fiz_SSE1 = gmx_setzero_pr();
        fiz_SSE2 = gmx_setzero_pr();
        fiz_SSE3 = gmx_setzero_pr();

        /* Currently all kernels use (at least half) LJ */
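
        /* The pair list stores the j-clusters that need exclusion handling at
         * the start of each list entry. Each variant below therefore runs the
         * inner loop twice: first a while loop with exclusion checking
         * (CHECK_EXCLS) as long as cj[cjind].excl != SIMD_MASK_ALL, then a
         * cheaper loop without the exclusion test for the remaining
         * j-clusters.
         */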
        while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
#include "nbnxn_kernel_x86_simd_inner.h"
        for(; (cjind<cjind1); cjind++)
#include "nbnxn_kernel_x86_simd_inner.h"

        while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
#include "nbnxn_kernel_x86_simd_inner.h"
        for(; (cjind<cjind1); cjind++)
#include "nbnxn_kernel_x86_simd_inner.h"

        while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
#include "nbnxn_kernel_x86_simd_inner.h"
        for(; (cjind<cjind1); cjind++)
#include "nbnxn_kernel_x86_simd_inner.h"

        ninner += cjind1 - cjind0;

        /* Add accumulated i-forces to the force array */
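
        /* GMX_MM_TRANSPOSE_SUM4_PR transposes the four per-i-atom force
         * accumulators and sums over the SIMD lanes, leaving one total per
         * i-atom in a single 4-wide register, which is added to f[] with one
         * aligned load/add/store per dimension. The 4x2 double-precision
         * kernel does the same with two 2-wide reductions
         * (GMX_MM_TRANSPOSE_SUM2_PD).
         */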
#define gmx_load_ps4  _mm_load_ps
#define gmx_store_ps4 _mm_store_ps
#define gmx_add_ps4   _mm_add_ps
#define gmx_load_ps4  _mm256_load_pd
#define gmx_store_ps4 _mm256_store_pd
#define gmx_add_ps4   _mm256_add_pd

        GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3,fix_SSE);
        gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3,fiy_SSE);
        gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3,fiz_SSE);
        gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));

#ifdef CALC_SHIFTFORCES
        gmx_store_ps4(shf,fix_SSE);
        fshift[ish3+0] += SUM_SIMD4(shf);
        gmx_store_ps4(shf,fiy_SSE);
        fshift[ish3+1] += SUM_SIMD4(shf);
        gmx_store_ps4(shf,fiz_SSE);
        fshift[ish3+2] += SUM_SIMD4(shf);

        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
        _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
        _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
        _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
        _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
        _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
        _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));

#ifdef CALC_SHIFTFORCES
        _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
        fshift[ish3+0] += shf[0] + shf[1];
        _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
        fshift[ish3+1] += shf[0] + shf[1];
        _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
        fshift[ish3+2] += shf[0] + shf[1];

        gmx_store_pr(tmpsum,vctotSSE);
        *Vc += SUM_SIMD(tmpsum);

        gmx_store_pr(tmpsum,VvdwtotSSE);
        *Vvdw += SUM_SIMD(tmpsum);
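
        /* The SIMD energy accumulators are reduced by storing them to the
         * 32-byte-aligned tmpsum buffer and summing the lanes with SUM_SIMD.
         */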
        /* Outer loop uses 6 flops/iteration */

    printf("atom pairs %d\n",npair);

#undef CALC_SHIFTFORCES