/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
#error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
#endif

#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
#define GMX_USE_HALF_WIDTH_SIMD_HERE
#endif

#include "gmx_simd_macros.h"
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])

#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ    GMX_SIMD_WIDTH_HERE
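
/* In this 4xN kernel layout the i-cluster size is fixed at UNROLLI = 4
 * atoms, while a j-cluster holds UNROLLJ = GMX_SIMD_WIDTH_HERE atoms:
 * each SIMD register pairs one i atom with a full j-cluster.
 */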
/* The stride of all the atom data arrays is max(UNROLLI,UNROLLJ) */
#if GMX_SIMD_WIDTH_HERE >= UNROLLI
#define STRIDE     GMX_SIMD_WIDTH_HERE
#else
#define STRIDE     UNROLLI
#endif
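
/* For example, 256-bit single precision gives GMX_SIMD_WIDTH_HERE = 8 and
 * STRIDE = 8, while 128-bit single precision gives width 4 and
 * STRIDE = UNROLLI = 4.
 */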
#if GMX_SIMD_WIDTH_HERE == 2
#define SUM_SIMD(x) (x[0]+x[1])
#else
#if GMX_SIMD_WIDTH_HERE == 4
#define SUM_SIMD(x) SUM_SIMD4(x)
#else
#if GMX_SIMD_WIDTH_HERE == 8
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
#else
#error "unsupported kernel configuration"
#endif
#endif
#endif
/* Decide if we should use the FDV0 table layout */
#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
#if GMX_SIMD_WIDTH_HERE/2 == 4
#define TAB_FDV0
#endif
#else
/* We use the FDV0 table layout when we can use aligned table loads */
#if GMX_SIMD_WIDTH_HERE == 4
#define TAB_FDV0
#endif
#endif
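
/* In the FDV0 layout each table point stores the quadruplet
 * (F, D = F[i+1]-F[i], V, 0), so the force and energy table values come
 * from a single aligned 4-wide load; the tabq_coul_FDV0 table selected
 * further below uses this layout.
 */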
#define SIMD_MASK_ALL   0xffffffff

#include "nbnxn_kernel_simd_utils.h"
/* All functionality defines are set here, except for:
 * CALC_ENERGIES and ENERGY_GROUPS, which are defined before.
 * CHECK_EXCLS, which is set just before including the inner loop contents.
 * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
 * set before calling the kernel function. We might want to move that
 * to inside the n-loop and have a different combination rule for different
 * ci's, as using no combination rule gives a 50% performance hit for LJ.
 */
/* We always calculate shift forces, because it's cheap anyhow */
#define CALC_SHIFTFORCES

/* Assumes all LJ parameters are identical */
/* #define FIX_LJ_C */
/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
 * with all combinations of electrostatics (coul), LJ combination rules (ljc)
 * and energy calculations (ene), depending on the defines set.
 */

#define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene
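
/* For illustration: NBK_FUNC_NAME_C_LJC(nbnxn_kernel_simd_4xn, rf, geom, ener)
 * token-pastes its arguments to nbnxn_kernel_simd_4xn_rf_comb_geom_ener.
 */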
#if defined LJ_COMB_GEOM
#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
#else
#if defined LJ_COMB_LB
#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
#else
#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)
#endif
#endif
#ifdef CALC_COUL_RF
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)
#endif
#ifdef CALC_COUL_TAB
#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
#else
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)
#endif
#endif
#ifdef CALC_COUL_EWALD
#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
#else
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
#endif
#endif
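
/* With CALC_COUL_TAB, VDW_CUTOFF_CHECK and LJ_COMB_LB set, for example,
 * NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, ener) below expands to
 * nbnxn_kernel_simd_4xn_tab_twin_comb_lb_ener.
 */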
static void
#ifndef CALC_ENERGIES
NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, noener)
#else
#ifndef ENERGY_GROUPS
NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, ener)
#else
NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, energrp)
#endif
#endif
#undef NBK_FUNC_NAME
#undef NBK_FUNC_NAME_C
#undef NBK_FUNC_NAME_C_LJC
(const nbnxn_pairlist_t     *nbl,
 const nbnxn_atomdata_t     *nbat,
 const interaction_const_t  *ic,
 rvec                       *shift_vec,
 real                       *f
#ifdef CALC_SHIFTFORCES
 ,
 real                       *fshift
#endif
#ifdef CALC_ENERGIES
 ,
 real                       *Vvdw,
 real                       *Vc
#endif
)
{
    const nbnxn_ci_t   *nbln;
    const nbnxn_cj_t   *l_cj;
    const int          *type;
    const real         *q;
    const real         *shiftvec;
    const real         *x;
    const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
    real                facel;
    real               *nbfp_ptr;
    int                 nbfp_stride;
    int                 n, ci, ci_sh;
    int                 ish, ish3;
    gmx_bool            do_LJ, half_LJ, do_coul;
    int                 sci, scix, sciy, sciz, sci2;
    int                 cjind0, cjind1, cjind;
    int                 ninner;

#ifdef ENERGY_GROUPS
    int                 Vstride_i;
    int                 egps_ishift, egps_imask;
    int                 egps_jshift, egps_jmask, egps_jstride;
    int                 egps_i;
    real               *vvdwtp[UNROLLI];
    real               *vctp[UNROLLI];
#endif

    gmx_mm_pr           shX_S;
    gmx_mm_pr           shY_S;
    gmx_mm_pr           shZ_S;
    gmx_mm_pr           ix_S0, iy_S0, iz_S0;
    gmx_mm_pr           ix_S1, iy_S1, iz_S1;
    gmx_mm_pr           ix_S2, iy_S2, iz_S2;
    gmx_mm_pr           ix_S3, iy_S3, iz_S3;
    gmx_mm_pr           fix_S0, fiy_S0, fiz_S0;
    gmx_mm_pr           fix_S1, fiy_S1, fiz_S1;
    gmx_mm_pr           fix_S2, fiy_S2, fiz_S2;
    gmx_mm_pr           fix_S3, fiy_S3, fiz_S3;
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
    __m128              fix_S, fiy_S, fiz_S;
#else
    __m256d             fix_S, fiy_S, fiz_S;
#endif
#else
    __m128d             fix0_S, fiy0_S, fiz0_S;
    __m128d             fix2_S, fiy2_S, fiz2_S;
#endif

    gmx_mm_pr           diag_jmi_S;
#if UNROLLI == UNROLLJ
    gmx_mm_pr           diag_S0, diag_S1, diag_S2, diag_S3;
#else
    gmx_mm_pr           diag0_S0, diag0_S1, diag0_S2, diag0_S3;
    gmx_mm_pr           diag1_S0, diag1_S1, diag1_S2, diag1_S3;
#endif

#ifdef gmx_checkbitmask_epi32
    gmx_epi32           mask_S0, mask_S1, mask_S2, mask_S3;
#else
    gmx_mm_pr           mask_S0, mask_S1, mask_S2, mask_S3;
#endif
    gmx_mm_pr           zero_S = gmx_set1_pr(0);

    gmx_mm_pr           one_S  = gmx_set1_pr(1.0);
    gmx_mm_pr           iq_S0  = gmx_setzero_pr();
    gmx_mm_pr           iq_S1  = gmx_setzero_pr();
    gmx_mm_pr           iq_S2  = gmx_setzero_pr();
    gmx_mm_pr           iq_S3  = gmx_setzero_pr();
#ifdef CALC_COUL_RF
    gmx_mm_pr           mrc_3_S;
    gmx_mm_pr           hrc_3_S, moh_rc_S;
#endif
#ifdef CALC_COUL_TAB
    /* Coulomb table variables */
    gmx_mm_pr           invtsp_S;
    const real         *tab_coul_F;
#ifndef TAB_FDV0
    const real         *tab_coul_V;
#endif
#if GMX_NBNXN_SIMD_BITWIDTH == 256
    int                 ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
    int                 ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1;
    int                 ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
    int                 ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3;
#endif
#ifdef CALC_ENERGIES
    gmx_mm_pr           mhalfsp_S;
#endif
#endif
#ifdef CALC_COUL_EWALD
    gmx_mm_pr           beta2_S, beta_S;
#endif

#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
    gmx_mm_pr           sh_ewald_S;
#endif
#ifdef LJ_COMB_LB
    const real         *ljc;

    gmx_mm_pr           hsig_i_S0, seps_i_S0;
    gmx_mm_pr           hsig_i_S1, seps_i_S1;
    gmx_mm_pr           hsig_i_S2, seps_i_S2;
    gmx_mm_pr           hsig_i_S3, seps_i_S3;
#else
#ifdef FIX_LJ_C
    real                pvdw_array[2*UNROLLI*UNROLLJ+3];
    real               *pvdw_c6, *pvdw_c12;
    int                 jp;
    gmx_mm_pr           c6_S0, c12_S0;
    gmx_mm_pr           c6_S1, c12_S1;
    gmx_mm_pr           c6_S2, c12_S2;
    gmx_mm_pr           c6_S3, c12_S3;
#endif

#ifdef LJ_COMB_GEOM
    const real         *ljc;

    gmx_mm_pr           c6s_S0, c12s_S0;
    gmx_mm_pr           c6s_S1, c12s_S1;
    gmx_mm_pr           c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
    gmx_mm_pr           c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
#endif
#endif /* LJ_COMB_LB */

    gmx_mm_pr           vctot_S, Vvdwtot_S;
    gmx_mm_pr           sixth_S, twelveth_S;

    gmx_mm_pr           avoid_sing_S;
    gmx_mm_pr           rc2_S;
#ifdef VDW_CUTOFF_CHECK
    gmx_mm_pr           rcvdw2_S;
#endif

#ifdef CALC_ENERGIES
    gmx_mm_pr           sh_invrc6_S, sh_invrc12_S;

    /* cppcheck-suppress unassignedVariable */
    real                tmpsum_array[15], *tmpsum;
#endif
#ifdef CALC_SHIFTFORCES
    /* cppcheck-suppress unassignedVariable */
    real                shf_array[15], *shf;
#endif
#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
    ljc = nbat->lj_comb;
#else
    /* No combination rule used */
#ifndef GMX_DOUBLE
    nbfp_ptr    = nbat->nbfp_s4;
#define NBFP_STRIDE  4
#else
    nbfp_ptr    = nbat->nbfp;
#define NBFP_STRIDE  2
#endif
    nbfp_stride = NBFP_STRIDE;
#endif
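
    /* The diagonal masks generated below handle the cluster pair on the
     * diagonal (ci == cj): keeping only interactions with j-i > 0 skips
     * the i == j self pairs and counts each pair within the cluster once
     * instead of twice.
     */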
    /* Load j-i for the first i */
    diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag);
    /* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
    diag_S0    = gmx_cmplt_pr(zero_S, diag_jmi_S);
    diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
    diag_S1    = gmx_cmplt_pr(zero_S, diag_jmi_S);
    diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
    diag_S2    = gmx_cmplt_pr(zero_S, diag_jmi_S);
    diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
    diag_S3    = gmx_cmplt_pr(zero_S, diag_jmi_S);
#else
#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
    diag0_S0   = gmx_cmplt_pr(zero_S, diag_jmi_S);
    diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
    diag0_S1   = gmx_cmplt_pr(zero_S, diag_jmi_S);
    diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
    diag0_S2   = gmx_cmplt_pr(zero_S, diag_jmi_S);
    diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
    diag0_S3   = gmx_cmplt_pr(zero_S, diag_jmi_S);
    diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);

#if UNROLLI == 2*UNROLLJ
    /* Load j-i for the second half of the j-cluster */
    diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
#endif

    diag1_S0   = gmx_cmplt_pr(zero_S, diag_jmi_S);
    diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
    diag1_S1   = gmx_cmplt_pr(zero_S, diag_jmi_S);
    diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
    diag1_S2   = gmx_cmplt_pr(zero_S, diag_jmi_S);
    diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
    diag1_S3   = gmx_cmplt_pr(zero_S, diag_jmi_S);
#endif
#endif
    /* Load masks for topology exclusion masking */
#ifdef gmx_checkbitmask_epi32
    mask_S0 = gmx_load_si(nbat->simd_excl_mask + 0*GMX_NBNXN_SIMD_BITWIDTH/32);
    mask_S1 = gmx_load_si(nbat->simd_excl_mask + 1*GMX_NBNXN_SIMD_BITWIDTH/32);
    mask_S2 = gmx_load_si(nbat->simd_excl_mask + 2*GMX_NBNXN_SIMD_BITWIDTH/32);
    mask_S3 = gmx_load_si(nbat->simd_excl_mask + 3*GMX_NBNXN_SIMD_BITWIDTH/32);
#else
    mask_S0 = gmx_load_pr((real *)nbat->simd_excl_mask + 0*UNROLLJ);
    mask_S1 = gmx_load_pr((real *)nbat->simd_excl_mask + 1*UNROLLJ);
    mask_S2 = gmx_load_pr((real *)nbat->simd_excl_mask + 2*UNROLLJ);
    mask_S3 = gmx_load_pr((real *)nbat->simd_excl_mask + 3*UNROLLJ);
#endif
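
    /* Each pair-list entry (nbnxn_cj_t) carries a bitfield of topology
     * exclusions; the constant masks above extract the bits for each i row
     * in the inner loop. Entries with all bits set (SIMD_MASK_ALL) need no
     * exclusion masking and take the faster inner-loop variant below.
     */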
#ifdef CALC_COUL_TAB
#if GMX_NBNXN_SIMD_BITWIDTH == 256
    /* Generate aligned table index pointers */
    ti0 = gmx_simd_align_int(ti0_array);
    ti1 = gmx_simd_align_int(ti1_array);
    ti2 = gmx_simd_align_int(ti2_array);
    ti3 = gmx_simd_align_int(ti3_array);
#endif

    invtsp_S  = gmx_set1_pr(ic->tabq_scale);
#ifdef CALC_ENERGIES
    mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
#endif

#ifdef TAB_FDV0
    tab_coul_F = ic->tabq_coul_FDV0;
#else
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;
#endif
#endif /* CALC_COUL_TAB */
#ifdef CALC_COUL_EWALD
    beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
    beta_S  = gmx_set1_pr(ic->ewaldcoeff);
#endif

#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
    sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
#endif
    q        = nbat->q;
    type     = nbat->type;
    facel    = ic->epsfac;
    shiftvec = shift_vec[0];
    x        = nbat->x;

    avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);

    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
    rc2_S = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
    rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
#endif

#ifdef CALC_ENERGIES
    sixth_S      = gmx_set1_pr(1.0/6.0);
    twelveth_S   = gmx_set1_pr(1.0/12.0);

    sh_invrc6_S  = gmx_set1_pr(ic->sh_invrc6);
    sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
#endif

#ifdef CALC_COUL_RF
    mrc_3_S = gmx_set1_pr(-2*ic->k_rf);
#ifdef CALC_ENERGIES
    hrc_3_S  = gmx_set1_pr(ic->k_rf);
    moh_rc_S = gmx_set1_pr(-ic->c_rf);
#endif
#endif
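
    /* Reaction-field constants set above: the potential has the form
     * V = q_i*q_j*(1/r + k_rf*r^2 - c_rf), so mrc_3_S (-2*k_rf) enters the
     * force while hrc_3_S (k_rf) and moh_rc_S (-c_rf) enter the energy.
     */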
#ifdef CALC_ENERGIES
    tmpsum = gmx_simd_align_real(tmpsum_array);
#endif
#ifdef CALC_SHIFTFORCES
    shf = gmx_simd_align_real(shf_array);
#endif

#ifdef FIX_LJ_C
    pvdw_c6  = gmx_simd_align_real(pvdw_array+3);
    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
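
    /* With FIX_LJ_C, every i-j pair gets the C6/C12 of atom type 0
     * (nbat->nbfp[0*2] and nbat->nbfp[0*2+1]): the loop below replicates
     * those two values across all UNROLLI*UNROLLJ pair slots, so the SIMD
     * loads that follow are uniform.
     */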
    for (jp = 0; jp < UNROLLJ; jp++)
    {
        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];

        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }

    c6_S0  = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
    c6_S1  = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
    c6_S2  = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
    c6_S3  = gmx_load_pr(pvdw_c6 +3*UNROLLJ);

    c12_S0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
    c12_S1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
    c12_S2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
    c12_S3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */
#ifdef ENERGY_GROUPS
    egps_ishift  = nbat->neg_2log;
    egps_imask   = (1<<egps_ishift) - 1;
    egps_jshift  = 2*nbat->neg_2log;
    egps_jmask   = (1<<egps_jshift) - 1;
    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
    /* Major division is over i-particle energy groups, determine the stride */
    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif
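
    /* For example, with nbat->neg_2log = 2 each atom's energy-group index
     * occupies 2 bits and egps_imask = 0x3, so the UNROLLI group indices of
     * an i-cluster fit in the single int read per cluster in the loop below.
     */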
    l_cj = nbl->cj;

    ninner = 0;
    for (n = 0; n < nbl->nci; n++)
    {
        nbln = &nbl->ci[n];

        ish    = (nbln->shift & NBNXN_CI_SHIFT);
        ish3   = ish*3;
        cjind0 = nbln->cj_ind_start;
        cjind1 = nbln->cj_ind_end;
        ci     = nbln->ci;
        ci_sh  = (ish == CENTRAL ? ci : -1);

        shX_S = gmx_load1_pr(shiftvec+ish3);
        shY_S = gmx_load1_pr(shiftvec+ish3+1);
        shZ_S = gmx_load1_pr(shiftvec+ish3+2);

#if UNROLLJ <= 4
        sci  = ci*STRIDE;
        scix = sci*DIM;
        sci2 = sci*2;
#else
        sci   = (ci>>1)*STRIDE;
        scix  = sci*DIM + (ci & 1)*(STRIDE>>1);
        sci2  = sci*2 + (ci & 1)*(STRIDE>>1);
        sci  += (ci & 1)*(STRIDE>>1);
#endif
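
        /* In the second branch UNROLLJ is twice UNROLLI: two 4-atom
         * i-clusters share one STRIDE-wide data block, and the
         * (ci & 1)*(STRIDE>>1) terms pick the half of the block that
         * belongs to the current i-cluster.
         */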
        /* We have 5 LJ/C combinations, but use only three inner loops,
         * as the other combinations are unlikely and/or not much faster:
         * inner half-LJ + C for half-LJ + C / no-LJ + C
         * inner LJ + C      for full-LJ + C
         * inner LJ          for full-LJ + no-C / half-LJ + no-C
         */
        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef ENERGY_GROUPS
        egps_i = nbat->energrp[ci];
        {
            int ia, egp_ia;

            for (ia = 0; ia < UNROLLI; ia++)
            {
                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
                vctp[ia]   = Vc   + egp_ia*Vstride_i;
            }
        }
#endif
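
        /* When the list starts with the diagonal cluster pair, the inner
         * loop also adds the constant part of the Coulomb potential
         * (RF constant, table value at r = 0 or Ewald shift) for the
         * i == j self pairs; the block below subtracts half of that self
         * term again for each i atom, -facel*q_i*q_i*Vc_sub_self.
         */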
#if defined CALC_ENERGIES
#if UNROLLJ == 4
        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
#endif
#if UNROLLJ == 2
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
#endif
#if UNROLLJ == 8
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
#endif
        {
            int  ia;
            real Vc_sub_self;

#ifdef CALC_COUL_RF
            Vc_sub_self = 0.5*ic->c_rf;
#endif
#ifdef CALC_COUL_TAB
#ifdef TAB_FDV0
            Vc_sub_self = 0.5*tab_coul_F[2];
#else
            Vc_sub_self = 0.5*tab_coul_V[0];
#endif
#endif
#ifdef CALC_COUL_EWALD
            /* beta/sqrt(pi) */
            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
#endif

            for (ia = 0; ia < UNROLLI; ia++)
            {
                real qi;

                qi = q[sci+ia];
#ifdef ENERGY_GROUPS
                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
#else
                Vc[0]
#endif
                    -= facel*qi*qi*Vc_sub_self;
            }
        }
#endif
        /* Load i atom data */
        sciy  = scix + STRIDE;
        sciz  = sciy + STRIDE;
        ix_S0 = gmx_add_pr(gmx_load1_pr(x+scix), shX_S);
        ix_S1 = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_S);
        ix_S2 = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_S);
        ix_S3 = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_S);
        iy_S0 = gmx_add_pr(gmx_load1_pr(x+sciy), shY_S);
        iy_S1 = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_S);
        iy_S2 = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_S);
        iy_S3 = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_S);
        iz_S0 = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_S);
        iz_S1 = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_S);
        iz_S2 = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_S);
        iz_S3 = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_S);
        if (do_coul)
        {
            iq_S0 = gmx_set1_pr(facel*q[sci]);
            iq_S1 = gmx_set1_pr(facel*q[sci+1]);
            iq_S2 = gmx_set1_pr(facel*q[sci+2]);
            iq_S3 = gmx_set1_pr(facel*q[sci+3]);
        }
#ifdef LJ_COMB_LB
        hsig_i_S0 = gmx_load1_pr(ljc+sci2+0);
        hsig_i_S1 = gmx_load1_pr(ljc+sci2+1);
        hsig_i_S2 = gmx_load1_pr(ljc+sci2+2);
        hsig_i_S3 = gmx_load1_pr(ljc+sci2+3);
        seps_i_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
        seps_i_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
        seps_i_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
        seps_i_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
#else
#ifdef LJ_COMB_GEOM
        c6s_S0 = gmx_load1_pr(ljc+sci2+0);
        c6s_S1 = gmx_load1_pr(ljc+sci2+1);
        if (!half_LJ)
        {
            c6s_S2 = gmx_load1_pr(ljc+sci2+2);
            c6s_S3 = gmx_load1_pr(ljc+sci2+3);
        }
        c12s_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
        c12s_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
        if (!half_LJ)
        {
            c12s_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
            c12s_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
        }
#else
        nbfp0 = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
        nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
        if (!half_LJ)
        {
            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
        }
#endif
#endif
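
        /* Per-i-atom LJ parameters loaded above: with the Lorentz-Berthelot
         * rule the kernel broadcasts half-sigma and sqrt(epsilon) per atom,
         * with the geometric rule sqrt(C6) and sqrt(C12), so a pair
         * parameter is the product of two per-atom factors; with no rule
         * nbfp0..nbfp3 point at the per-type parameter rows instead.
         */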
        /* Zero the potential energy for this list */
        Vvdwtot_S = gmx_setzero_pr();
        vctot_S   = gmx_setzero_pr();

        /* Clear i atom forces */
        fix_S0 = gmx_setzero_pr();
        fix_S1 = gmx_setzero_pr();
        fix_S2 = gmx_setzero_pr();
        fix_S3 = gmx_setzero_pr();
        fiy_S0 = gmx_setzero_pr();
        fiy_S1 = gmx_setzero_pr();
        fiy_S2 = gmx_setzero_pr();
        fiy_S3 = gmx_setzero_pr();
        fiz_S0 = gmx_setzero_pr();
        fiz_S1 = gmx_setzero_pr();
        fiz_S2 = gmx_setzero_pr();
        fiz_S3 = gmx_setzero_pr();
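
        /* Three inner-loop specializations follow: half-LJ + Coulomb,
         * LJ + Coulomb, and LJ only. For each one, list entries that carry
         * exclusions (excl != SIMD_MASK_ALL, sorted to the front of the
         * list) run a variant compiled with CHECK_EXCLS; the remaining
         * entries run the cheaper variant without exclusion masking.
         */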
        cjind = cjind0;

        /* Currently all kernels use (at least half) LJ */
#define CALC_LJ
        if (half_LJ)
        {
#define CALC_COULOMB
#define HALF_LJ
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
            }
#undef HALF_LJ
#undef CALC_COULOMB
        }
        else if (do_coul)
        {
#define CALC_COULOMB
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
            }
#undef CALC_COULOMB
        }
        else
        {
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
            }
        }
#undef CALC_LJ
        ninner += cjind1 - cjind0;
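
        /* The fix/fiy/fiz accumulators hold j-wide partial sums per i atom;
         * the transpose-sum below reduces them to one total force per i atom,
         * accumulates into f[], and, with CALC_SHIFTFORCES, also into the
         * shift force of this periodic image.
         */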
        /* Add accumulated i-forces to the force array */
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
#define gmx_load_pr4  _mm_load_ps
#define gmx_store_pr4 _mm_store_ps
#define gmx_add_pr4   _mm_add_ps
#else
#define gmx_load_pr4  _mm256_load_pd
#define gmx_store_pr4 _mm256_store_pd
#define gmx_add_pr4   _mm256_add_pd
#endif

        GMX_MM_TRANSPOSE_SUM4_PR(fix_S0, fix_S1, fix_S2, fix_S3, fix_S);
        gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiy_S0, fiy_S1, fiy_S2, fiy_S3, fiy_S);
        gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));

        GMX_MM_TRANSPOSE_SUM4_PR(fiz_S0, fiz_S1, fiz_S2, fiz_S3, fiz_S);
        gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));

#ifdef CALC_SHIFTFORCES
        gmx_store_pr4(shf, fix_S);
        fshift[ish3+0] += SUM_SIMD4(shf);
        gmx_store_pr4(shf, fiy_S);
        fshift[ish3+1] += SUM_SIMD4(shf);
        gmx_store_pr4(shf, fiz_S);
        fshift[ish3+2] += SUM_SIMD4(shf);
#endif
#else
        GMX_MM_TRANSPOSE_SUM2_PD(fix_S0, fix_S1, fix0_S);
        _mm_store_pd(f+scix, _mm_add_pd(fix0_S, _mm_load_pd(f+scix)));
        GMX_MM_TRANSPOSE_SUM2_PD(fix_S2, fix_S3, fix2_S);
        _mm_store_pd(f+scix+2, _mm_add_pd(fix2_S, _mm_load_pd(f+scix+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiy_S0, fiy_S1, fiy0_S);
        _mm_store_pd(f+sciy, _mm_add_pd(fiy0_S, _mm_load_pd(f+sciy)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiy_S2, fiy_S3, fiy2_S);
        _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_S, _mm_load_pd(f+sciy+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiz_S0, fiz_S1, fiz0_S);
        _mm_store_pd(f+sciz, _mm_add_pd(fiz0_S, _mm_load_pd(f+sciz)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiz_S2, fiz_S3, fiz2_S);
        _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_S, _mm_load_pd(f+sciz+2)));

#ifdef CALC_SHIFTFORCES
        _mm_store_pd(shf, _mm_add_pd(fix0_S, fix2_S));
        fshift[ish3+0] += shf[0] + shf[1];
        _mm_store_pd(shf, _mm_add_pd(fiy0_S, fiy2_S));
        fshift[ish3+1] += shf[0] + shf[1];
        _mm_store_pd(shf, _mm_add_pd(fiz0_S, fiz2_S));
        fshift[ish3+2] += shf[0] + shf[1];
#endif
#endif
#ifdef CALC_ENERGIES
        gmx_store_pr(tmpsum, vctot_S);
        *Vc += SUM_SIMD(tmpsum);

        gmx_store_pr(tmpsum, Vvdwtot_S);
        *Vvdw += SUM_SIMD(tmpsum);
#endif

        /* Outer loop uses 6 flops/iteration */
    }

#ifdef COUNT_PAIRS
    printf("atom pairs %d\n", npair);
#endif
}

#undef CALC_SHIFTFORCES

#undef GMX_USE_HALF_WIDTH_SIMD_HERE