/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
#include "gmx_simd_macros.h"

#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])

#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ    GMX_SIMD_WIDTH_HERE
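
/* The kernel works on clusters of UNROLLI=4 i-atoms; each inner-loop
 * iteration processes UNROLLJ j-atoms, one per SIMD lane (the 4xN layout).
 */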
#if defined GMX_MM128_HERE || defined GMX_DOUBLE
#if defined GMX_MM256_HERE && !defined GMX_DOUBLE

/* single precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)
/* double precision 4x2 kernel */
#define SUM_SIMD(x) (x[0]+x[1])
/* single precision 4x8 kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
/* double precision 4x4 kernel */
#define SUM_SIMD(x) SUM_SIMD4(x)

#define SIMD_MASK_ALL   0xffffffff
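/* A cj entry whose exclusion field equals SIMD_MASK_ALL has no excluded
 * pairs, so the inner loop without exclusion checking can be used for it.
 */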
#include "nbnxn_kernel_simd_utils.h"

/* All functionality defines are set here, except for:
 * CALC_ENERGIES and ENERGY_GROUPS, which are defined before.
 * CHECK_EXCLS, which is set just before including the inner loop contents.
 * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
 * set before calling the kernel function. We might want to move that
 * to inside the n-loop and have a different combination rule for different
 * ci's, as using no combination rule gives a 50% performance hit for LJ.
 */
/* We always calculate shift forces, because it's cheap anyhow */
#define CALC_SHIFTFORCES

/* Assumes all LJ parameters are identical */
/* #define FIX_LJ_C */

/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
 * with all combinations of electrostatics (coul), LJ combination rules (ljc)
 * and energy calculations (ene), depending on the defines set.
 */
#define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene
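/* For example, NBK_FUNC_NAME_C_LJC(nbnxn_kernel_simd_4xn, rf, geom, ener)
 * expands to nbnxn_kernel_simd_4xn_rf_comb_geom_ener.
 */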
#if defined LJ_COMB_GEOM
#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
#if defined LJ_COMB_LB
#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)

#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)

#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)

#ifdef CALC_COUL_EWALD
#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
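/* The _twin variants are generated when VDW_CUTOFF_CHECK is set, i.e. when
 * the VdW cut-off is shorter than the Coulomb cut-off and the VdW
 * interactions need their own cut-off check.
 */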
#ifndef CALC_ENERGIES
NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, noener)
#ifndef ENERGY_GROUPS
NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, ener)
NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, energrp)

#undef NBK_FUNC_NAME_C
#undef NBK_FUNC_NAME_C_LJC
(const nbnxn_pairlist_t     *nbl,
 const nbnxn_atomdata_t     *nbat,
 const interaction_const_t  *ic,
#ifdef CALC_SHIFTFORCES

    const nbnxn_ci_t   *nbln;
    const nbnxn_cj_t   *l_cj;
    const real         *shiftvec;
    const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
    gmx_bool            do_LJ, half_LJ, do_coul;
    int                 sci, scix, sciy, sciz, sci2;
    int                 cjind0, cjind1, cjind;
    int                 egps_ishift, egps_imask;
    int                 egps_jshift, egps_jmask, egps_jstride;
    real               *vvdwtp[UNROLLI];
    gmx_mm_pr  ix_SSE0, iy_SSE0, iz_SSE0;
    gmx_mm_pr  ix_SSE1, iy_SSE1, iz_SSE1;
    gmx_mm_pr  ix_SSE2, iy_SSE2, iz_SSE2;
    gmx_mm_pr  ix_SSE3, iy_SSE3, iz_SSE3;
    gmx_mm_pr  fix_SSE0, fiy_SSE0, fiz_SSE0;
    gmx_mm_pr  fix_SSE1, fiy_SSE1, fiz_SSE1;
    gmx_mm_pr  fix_SSE2, fiy_SSE2, fiz_SSE2;
    gmx_mm_pr  fix_SSE3, fiy_SSE3, fiz_SSE3;
    __m128     fix_SSE, fiy_SSE, fiz_SSE;
    __m256d    fix_SSE, fiy_SSE, fiz_SSE;
    __m128d    fix0_SSE, fiy0_SSE, fiz0_SSE;
    __m128d    fix2_SSE, fiy2_SSE, fiz2_SSE;

#ifdef GMX_MM128_HERE
    __m128i    mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
    __m128i    mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
    __m128i    mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 );
    __m128i    mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 );
    /* For double precision we need to set two 32bit ints for one double */
    __m128i    mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 );
    __m128i    mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 );
    __m128i    mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
    __m128i    mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );

#ifdef GMX_MM256_HERE
    /* AVX: use floating point masks, as there are no integer instructions */
    gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
    gmx_mm_pr  mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
    /* There is no 256-bit int to double conversion, so we use float here */
    __m256     mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 ));
    __m256     mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 ));
    __m256     mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 ));
    __m256     mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 ));
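
    /* Bit i*UNROLLJ+j of a cj entry's interaction mask says whether the pair
     * (i,j) interacts; maskN selects those bits for i atom N across the
     * UNROLLJ j lanes (in double precision each bit is duplicated over the
     * two 32-bit halves of a lane).
     */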
    gmx_mm_pr  diag_jmi_SSE;
#if UNROLLI == UNROLLJ
    gmx_mm_pr  diag_SSE0, diag_SSE1, diag_SSE2, diag_SSE3;
    gmx_mm_pr  diag0_SSE0, diag0_SSE1, diag0_SSE2, diag0_SSE3;
    gmx_mm_pr  diag1_SSE0, diag1_SSE1, diag1_SSE2, diag1_SSE3;

#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
    __m128i    zeroi_SSE = _mm_setzero_si128();
    gmx_mm_pr  zero_SSE  = gmx_set1_pr(0);

    gmx_mm_pr  one_SSE   = gmx_set1_pr(1.0);
    gmx_mm_pr  iq_SSE0   = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE1   = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE2   = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE3   = gmx_setzero_pr();
    gmx_mm_pr  hrc_3_SSE, moh_rc_SSE;

    /* Coulomb table variables */
    gmx_mm_pr   invtsp_SSE;
    const real *tab_coul_F;
    const real *tab_coul_V;
#ifdef GMX_MM256_HERE
    int        ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
    int        ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1;
    int        ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
    int        ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3;
    gmx_mm_pr  mhalfsp_SSE;

#ifdef CALC_COUL_EWALD
    gmx_mm_pr  beta2_SSE, beta_SSE;

#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
    gmx_mm_pr  sh_ewald_SSE;

    gmx_mm_pr  hsig_i_SSE0, seps_i_SSE0;
    gmx_mm_pr  hsig_i_SSE1, seps_i_SSE1;
    gmx_mm_pr  hsig_i_SSE2, seps_i_SSE2;
    gmx_mm_pr  hsig_i_SSE3, seps_i_SSE3;
    real       pvdw_array[2*UNROLLI*UNROLLJ+3];
    real      *pvdw_c6, *pvdw_c12;
    gmx_mm_pr  c6_SSE0, c12_SSE0;
    gmx_mm_pr  c6_SSE1, c12_SSE1;
    gmx_mm_pr  c6_SSE2, c12_SSE2;
    gmx_mm_pr  c6_SSE3, c12_SSE3;

    gmx_mm_pr  c6s_SSE0, c12s_SSE0;
    gmx_mm_pr  c6s_SSE1, c12s_SSE1;
    gmx_mm_pr  c6s_SSE2 = gmx_setzero_pr(), c12s_SSE2 = gmx_setzero_pr();
    gmx_mm_pr  c6s_SSE3 = gmx_setzero_pr(), c12s_SSE3 = gmx_setzero_pr();
#endif /* LJ_COMB_LB */

    gmx_mm_pr  vctotSSE, VvdwtotSSE;
    gmx_mm_pr  sixthSSE, twelvethSSE;
    gmx_mm_pr  avoid_sing_SSE;
#ifdef VDW_CUTOFF_CHECK
    gmx_mm_pr  rcvdw2_SSE;
    gmx_mm_pr  sh_invrc6_SSE, sh_invrc12_SSE;

    /* cppcheck-suppress unassignedVariable */
    real       tmpsum_array[15], *tmpsum;
#ifdef CALC_SHIFTFORCES
    /* cppcheck-suppress unassignedVariable */
    real       shf_array[15], *shf;

#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
    /* No combination rule used */
    nbfp_ptr    = nbat->nbfp_s4;
#define NBFP_STRIDE  4
    nbfp_ptr    = nbat->nbfp;
#define NBFP_STRIDE  2
    nbfp_stride = NBFP_STRIDE;
    /* Load j-i for the first i */
    diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag);
    /* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
    diag_SSE0    = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag_SSE1    = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag_SSE2    = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag_SSE3    = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);

#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
    diag0_SSE0   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag0_SSE1   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag0_SSE2   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag0_SSE3   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);

#if UNROLLI == 2*UNROLLJ
    /* Load j-i for the second half of the j-cluster */
    diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);

    diag1_SSE0   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag1_SSE1   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag1_SSE2   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag1_SSE3   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
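
    /* diag_SSEk is TRUE in the lanes where j > k, so ANDing it with the
     * interaction mask drops the j <= i half of a cluster paired with
     * itself, avoiding self pairs and double counting.
     */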
#ifdef GMX_MM256_HERE
    /* Generate aligned table index pointers */
    ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
    ti1 = (int *)(((size_t)(ti1_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
    ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
    ti3 = (int *)(((size_t)(ti3_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
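
    /* The tiN_array buffers are over-allocated: the address of
     * tiN_array+GMX_SIMD_WIDTH_HERE-1, rounded down to a multiple of
     * GMX_SIMD_WIDTH_HERE*sizeof(int), still lies inside the buffer and
     * leaves room for GMX_SIMD_WIDTH_HERE table indices.
     */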
    invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
    mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);

    tab_coul_F = ic->tabq_coul_FDV0;
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;
#endif /* CALC_COUL_TAB */

#ifdef CALC_COUL_EWALD
    beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
    beta_SSE  = gmx_set1_pr(ic->ewaldcoeff);

#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);

    shiftvec = shift_vec[0];

    avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);

    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
    rc2_SSE    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
    rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);

    sixthSSE    = gmx_set1_pr(1.0/6.0);
    twelvethSSE = gmx_set1_pr(1.0/12.0);

    sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
    sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);

    mrc_3_SSE  = gmx_set1_pr(-2*ic->k_rf);
    hrc_3_SSE  = gmx_set1_pr(ic->k_rf);
    moh_rc_SSE = gmx_set1_pr(-ic->c_rf);

    tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
#ifdef CALC_SHIFTFORCES
    shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
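
    /* tmpsum and shf point into 32-byte aligned storage within the
     * 15-element arrays: adding 7 reals and masking the address with ~31
     * rounds down to a 32-byte boundary while leaving room for one full
     * SIMD-width store used by the reductions below.
     */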
    pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;

    for (jp = 0; jp < UNROLLJ; jp++)
        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];

        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];

    c6_SSE0  = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
    c6_SSE1  = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
    c6_SSE2  = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
    c6_SSE3  = gmx_load_pr(pvdw_c6 +3*UNROLLJ);

    c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
    c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
    c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
    c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
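
    /* With FIX_LJ_C every i row uses the C6/C12 of atom type 0
     * (nbat->nbfp[0] and nbat->nbfp[1]), matching the assumption above
     * that all LJ parameters are identical.
     */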
#endif /* FIX_LJ_C */

    egps_ishift  = nbat->neg_2log;
    egps_imask   = (1<<egps_ishift) - 1;
    egps_jshift  = 2*nbat->neg_2log;
    egps_jmask   = (1<<egps_jshift) - 1;
    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
    /* Major division is over i-particle energy groups, determine the stride */
    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
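
    /* Energy groups are packed neg_2log bits per atom in nbat->energrp;
     * egps_ishift/egps_imask extract the group of one i atom, and Vstride_i
     * is the stride between consecutive i-group blocks in the group-pair
     * energy buffers Vvdw and Vc.
     */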
    for (n = 0; n < nbl->nci; n++)

        ish              = (nbln->shift & NBNXN_CI_SHIFT);

        cjind0           = nbln->cj_ind_start;
        cjind1           = nbln->cj_ind_end;

        ci_sh            = (ish == CENTRAL ? ci : -1);

        shX_SSE = gmx_load1_pr(shiftvec+ish3);
        shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
        shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);

        sci              = (ci>>1)*STRIDE;
        scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
        sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
        sci             += (ci & 1)*(STRIDE>>1);
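
        /* Coordinates and parameters are stored as x, y, z sub-blocks of
         * STRIDE atoms each; here STRIDE is twice the i-cluster size, so two
         * i-clusters share one block and (ci & 1) selects its first or
         * second half.
         */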
        /* We have 5 LJ/C combinations, but use only three inner loops,
         * as the other combinations are unlikely and/or not much faster:
         * inner half-LJ + C for half-LJ + C / no-LJ + C
         * inner LJ + C      for full-LJ + C
         * inner LJ          for full-LJ + no-C / half-LJ + no-C
         */
        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
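
        /* These flags are packed into nbln->shift alongside the shift index;
         * half_LJ selects the inner loop in which only the first half of the
         * i atoms get LJ (also used for the no-LJ + Coulomb case above).
         */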
        egps_i = nbat->energrp[ci];

        for (ia = 0; ia < UNROLLI; ia++)
            egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
            vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
            vctp[ia]   = Vc   + egp_ia*Vstride_i;

#if defined CALC_ENERGIES
        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))

            Vc_sub_self = 0.5*ic->c_rf;
            Vc_sub_self = 0.5*tab_coul_F[2];
            Vc_sub_self = 0.5*tab_coul_V[0];
#ifdef CALC_COUL_EWALD
            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;

            for (ia = 0; ia < UNROLLI; ia++)
                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
                    -= facel*qi*qi*Vc_sub_self;
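
        /* When the first cj in this ci's list is the i-cluster itself
         * (unshifted), the inner loop also evaluates each charged i atom with
         * itself at r = 0; the qi*qi*Vc_sub_self term removes that constant
         * self-interaction from the Coulomb energy, with Vc_sub_self set
         * above for the electrostatics type in use (RF, tabulated or Ewald).
         */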
        /* Load i atom data */
        sciy             = scix + STRIDE;
        sciz             = sciy + STRIDE;
        ix_SSE0          = gmx_add_pr(gmx_load1_pr(x+scix), shX_SSE);
        ix_SSE1          = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_SSE);
        ix_SSE2          = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_SSE);
        ix_SSE3          = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_SSE);
        iy_SSE0          = gmx_add_pr(gmx_load1_pr(x+sciy), shY_SSE);
        iy_SSE1          = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_SSE);
        iy_SSE2          = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_SSE);
        iy_SSE3          = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_SSE);
        iz_SSE0          = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_SSE);
        iz_SSE1          = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_SSE);
        iz_SSE2          = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_SSE);
        iz_SSE3          = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_SSE);

        iq_SSE0          = gmx_set1_pr(facel*q[sci]);
        iq_SSE1          = gmx_set1_pr(facel*q[sci+1]);
        iq_SSE2          = gmx_set1_pr(facel*q[sci+2]);
        iq_SSE3          = gmx_set1_pr(facel*q[sci+3]);
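
        /* Each i coordinate is broadcast to all SIMD lanes with the periodic
         * shift already added; the i charges are pre-scaled by facel, so the
         * inner loop can form the charge product from iq_i and q_j directly.
         */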
        hsig_i_SSE0      = gmx_load1_pr(ljc+sci2+0);
        hsig_i_SSE1      = gmx_load1_pr(ljc+sci2+1);
        hsig_i_SSE2      = gmx_load1_pr(ljc+sci2+2);
        hsig_i_SSE3      = gmx_load1_pr(ljc+sci2+3);
        seps_i_SSE0      = gmx_load1_pr(ljc+sci2+STRIDE+0);
        seps_i_SSE1      = gmx_load1_pr(ljc+sci2+STRIDE+1);
        seps_i_SSE2      = gmx_load1_pr(ljc+sci2+STRIDE+2);
        seps_i_SSE3      = gmx_load1_pr(ljc+sci2+STRIDE+3);

        c6s_SSE0         = gmx_load1_pr(ljc+sci2+0);
        c6s_SSE1         = gmx_load1_pr(ljc+sci2+1);

        c6s_SSE2         = gmx_load1_pr(ljc+sci2+2);
        c6s_SSE3         = gmx_load1_pr(ljc+sci2+3);

        c12s_SSE0        = gmx_load1_pr(ljc+sci2+STRIDE+0);
        c12s_SSE1        = gmx_load1_pr(ljc+sci2+STRIDE+1);

        c12s_SSE2        = gmx_load1_pr(ljc+sci2+STRIDE+2);
        c12s_SSE3        = gmx_load1_pr(ljc+sci2+STRIDE+3);

        nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
        nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;

        nbfp2     = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
        nbfp3     = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;

        /* Zero the potential energy for this list */
        VvdwtotSSE       = gmx_setzero_pr();
        vctotSSE         = gmx_setzero_pr();

        /* Clear i atom forces */
        fix_SSE0         = gmx_setzero_pr();
        fix_SSE1         = gmx_setzero_pr();
        fix_SSE2         = gmx_setzero_pr();
        fix_SSE3         = gmx_setzero_pr();
        fiy_SSE0         = gmx_setzero_pr();
        fiy_SSE1         = gmx_setzero_pr();
        fiy_SSE2         = gmx_setzero_pr();
        fiy_SSE3         = gmx_setzero_pr();
        fiz_SSE0         = gmx_setzero_pr();
        fiz_SSE1         = gmx_setzero_pr();
        fiz_SSE2         = gmx_setzero_pr();
        fiz_SSE3         = gmx_setzero_pr();

        /* Currently all kernels use (at least half) LJ */
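        /* cj entries that carry exclusions are expected to come first in each
         * ci's range: the while loops below process them with CHECK_EXCLS
         * set, and the remaining entries run through the cheaper loop that
         * skips the exclusion-mask handling entirely.
         */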
        while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)

#include "nbnxn_kernel_simd_4xn_inner.h"

        for (; (cjind < cjind1); cjind++)

#include "nbnxn_kernel_simd_4xn_inner.h"

        while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)

#include "nbnxn_kernel_simd_4xn_inner.h"

        for (; (cjind < cjind1); cjind++)

#include "nbnxn_kernel_simd_4xn_inner.h"

        while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)

#include "nbnxn_kernel_simd_4xn_inner.h"

        for (; (cjind < cjind1); cjind++)

#include "nbnxn_kernel_simd_4xn_inner.h"

        ninner += cjind1 - cjind0;

        /* Add accumulated i-forces to the force array */
#define gmx_load_ps4  _mm_load_ps
#define gmx_store_ps4 _mm_store_ps
#define gmx_add_ps4   _mm_add_ps

#define gmx_load_ps4  _mm256_load_pd
#define gmx_store_ps4 _mm256_store_pd
#define gmx_add_ps4   _mm256_add_pd
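
        /* GMX_MM_TRANSPOSE_SUM4_PR transposes the four per-i accumulators and
         * sums over the j lanes, leaving one register that holds the force
         * component totals for i atoms 0..3; that register is then added to
         * the force array with one aligned load/add/store.
         */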
        GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3, fix_SSE);
        gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3, fiy_SSE);
        gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3, fiz_SSE);
        gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));

#ifdef CALC_SHIFTFORCES
        gmx_store_ps4(shf, fix_SSE);
        fshift[ish3+0] += SUM_SIMD4(shf);
        gmx_store_ps4(shf, fiy_SSE);
        fshift[ish3+1] += SUM_SIMD4(shf);
        gmx_store_ps4(shf, fiz_SSE);
        fshift[ish3+2] += SUM_SIMD4(shf);

        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0, fix_SSE1, fix0_SSE);
        _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2, fix_SSE3, fix2_SSE);
        _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0, fiy_SSE1, fiy0_SSE);
        _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2, fiy_SSE3, fiy2_SSE);
        _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0, fiz_SSE1, fiz0_SSE);
        _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2, fiz_SSE3, fiz2_SSE);
        _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));

#ifdef CALC_SHIFTFORCES
        _mm_store_pd(shf, _mm_add_pd(fix0_SSE, fix2_SSE));
        fshift[ish3+0] += shf[0] + shf[1];
        _mm_store_pd(shf, _mm_add_pd(fiy0_SSE, fiy2_SSE));
        fshift[ish3+1] += shf[0] + shf[1];
        _mm_store_pd(shf, _mm_add_pd(fiz0_SSE, fiz2_SSE));
        fshift[ish3+2] += shf[0] + shf[1];

        gmx_store_pr(tmpsum, vctotSSE);
        *Vc += SUM_SIMD(tmpsum);

        gmx_store_pr(tmpsum, VvdwtotSSE);
        *Vvdw += SUM_SIMD(tmpsum);
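
        /* SUM_SIMD reduces the SIMD accumulator, stored to the aligned
         * tmpsum buffer, to a scalar that is added to the output energies.
         */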
        /* Outer loop uses 6 flops/iteration */

    printf("atom pairs %d\n", npair);

#undef CALC_SHIFTFORCES