/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 * Copyright (c) 2012, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/* GMX_MM256_HERE should be set before including this file */
#include "gmx_simd_macros.h"

#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])

#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ    (GMX_SIMD_WIDTH_HERE/2)
#if defined GMX_MM256_HERE
/* single precision 2x(4+4) kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
#else
#error "unsupported kernel configuration"
#endif

#define SIMD_MASK_ALL 0xffffffff
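/* Illustration, not from the original source: with 8-wide AVX-256 single
 * precision, GMX_SIMD_WIDTH_HERE is 8, so UNROLLJ = 8/2 = 4 and one 256-bit
 * register holds two i-atoms times one 4-atom j-cluster, which is the
 * 2x(4+4) layout named above; SUM_SIMD then reduces all 8 lanes and
 * SUM_SIMD4 reduces a 4-wide quarter.
 */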
#include "nbnxn_kernel_simd_utils.h"
/* All functionality defines are set here, except for:
 * CALC_ENERGIES and ENERGY_GROUPS, which are defined before this point.
 * CHECK_EXCLS, which is set just before including the inner-loop contents.
 * The combination-rule defines, LJ_COMB_GEOM and LJ_COMB_LB, are currently
 * set before calling the kernel function. We might want to move that
 * to inside the n-loop and use a different combination rule for different
 * ci's, since using no combination rule gives a 50% performance hit for LJ.
 */
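/* Illustrative sketch, not part of the original source: a driver file
 * typically selects one kernel flavor by setting the defines above and then
 * including this file, e.g.
 *
 *   #define CALC_COUL_RF
 *   #define LJ_COMB_GEOM
 *   #define CALC_ENERGIES
 *   #include "nbnxn_kernel_simd_2xnn_outer.h"
 *
 * The included file name is hypothetical here; the define names are the
 * ones documented above.
 */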
/* We always calculate shift forces, because it's cheap anyhow */
#define CALC_SHIFTFORCES

/* Assumes all LJ parameters are identical */
/* #define FIX_LJ_C */
/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
 * with all combinations of electrostatics (coul), LJ combination rules (ljc)
 * and energy calculations (ene), depending on the defines set.
 */
#define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene

#if defined LJ_COMB_GEOM
#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
#else
#if defined LJ_COMB_LB
#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
#else
#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)
#endif
#endif

#ifdef CALC_COUL_RF
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)
#endif
#ifdef CALC_COUL_TAB
#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
#else
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)
#endif
#endif
#ifdef CALC_COUL_EWALD
#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
#else
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
#endif
#endif
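/* For example, with CALC_COUL_RF, LJ_COMB_GEOM and CALC_ENERGIES defined,
 * NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, ener) expands to the kernel name
 * nbnxn_kernel_simd_2xnn_rf_comb_geom_ener.
 */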
static void
#ifndef CALC_ENERGIES
NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, noener)
#else
#ifndef ENERGY_GROUPS
NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, ener)
#else
NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, energrp)
#endif
#endif
#undef NBK_FUNC_NAME
#undef NBK_FUNC_NAME_C
#undef NBK_FUNC_NAME_C_LJC
(const nbnxn_pairlist_t     *nbl,
 const nbnxn_atomdata_t     *nbat,
 const interaction_const_t  *ic,
 rvec                       *shift_vec,
 real                       *f
#ifdef CALC_SHIFTFORCES
 ,
 real                       *fshift
#endif
#ifdef CALC_ENERGIES
 ,
 real                       *Vvdw,
 real                       *Vc
#endif
)
{
    const nbnxn_ci_t   *nbln;
    const nbnxn_cj_t   *l_cj;
    const int          *type;
    const real         *q;
    const real         *shiftvec;
    const real         *x;
    const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
    real                facel;
    real               *nbfp_ptr;
    int                 nbfp_stride;
    int                 n, ci, ci_sh;
    int                 ish, ish3;
    gmx_bool            do_LJ, half_LJ, do_coul;
    int                 sci, scix, sciy, sciz, sci2;
    int                 cjind0, cjind1, cjind;
    int                 ninner;

#ifdef ENERGY_GROUPS
    int                 Vstride_i;
    int                 egps_ishift, egps_imask;
    int                 egps_jshift, egps_jmask, egps_jstride;
    int                 egps_i;
    real               *vvdwtp[UNROLLI];
    real               *vctp[UNROLLI];
#endif
    gmx_mm_pr  shX_SSE;
    gmx_mm_pr  shY_SSE;
    gmx_mm_pr  shZ_SSE;
    gmx_mm_pr  ix_SSE0, iy_SSE0, iz_SSE0;
    gmx_mm_pr  ix_SSE2, iy_SSE2, iz_SSE2;
    gmx_mm_pr  fix_SSE0, fiy_SSE0, fiz_SSE0;
    gmx_mm_pr  fix_SSE2, fiy_SSE2, fiz_SSE2;
#ifndef GMX_DOUBLE
    __m128     fix_SSE, fiy_SSE, fiz_SSE;
#else
    __m256d    fix_SSE, fiy_SSE, fiz_SSE;
#endif
    __m128d    fix0_SSE, fiy0_SSE, fiz0_SSE;
    __m128d    fix2_SSE, fiy2_SSE, fiz2_SSE;
    /* AVX: use floating point masks, as there are no integer instructions */
    gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32(0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001));
    gmx_mm_pr  mask2 = _mm256_castsi256_ps(_mm256_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100));
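    /* Explanatory note, inferred from the constants above: a 4x4 cluster
     * pair has 16 interaction bits in its exclusion mask. mask0 picks out
     * bits 0-7 (i-atoms 0 and 1, each against j-atoms 0-3), mask2 picks
     * out bits 8-15 (i-atoms 2 and 3). Note that _mm256_set_epi32 lists
     * its arguments from the highest lane down to the lowest.
     */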
    gmx_mm_pr  diag_jmi_SSE;
#if UNROLLI == UNROLLJ
    gmx_mm_pr  diag_SSE0, diag_SSE2;
#else
    gmx_mm_pr  diag0_SSE0, diag0_SSE2;
    gmx_mm_pr  diag1_SSE0, diag1_SSE2;
#endif

    gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
    gmx_mm_pr  one_SSE  = gmx_set1_pr(1.0);
    gmx_mm_pr  iq_SSE0  = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE2  = gmx_setzero_pr();
    gmx_mm_pr  facel_SSE;
#ifdef CALC_COUL_RF
    gmx_mm_pr  mrc_3_SSE;
#ifdef CALC_ENERGIES
    gmx_mm_pr  hrc_3_SSE, moh_rc_SSE;
#endif
#endif

#ifdef CALC_COUL_TAB
    /* Coulomb table variables */
    gmx_mm_pr   invtsp_SSE;
    const real *tab_coul_F;
    const real *tab_coul_V;
#ifdef GMX_MM256_HERE
    int         ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
    int         ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
#endif
    gmx_mm_pr   mhalfsp_SSE;
#endif
#ifdef CALC_COUL_EWALD
    gmx_mm_pr  beta2_SSE, beta_SSE;
#endif

#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
    gmx_mm_pr  sh_ewald_SSE;
#endif

#ifdef LJ_COMB_LB
    gmx_mm_pr  hsig_i_SSE0, seps_i_SSE0;
    gmx_mm_pr  hsig_i_SSE2, seps_i_SSE2;
#else
#ifdef FIX_LJ_C
    real       pvdw_array[2*UNROLLI*UNROLLJ+3];
    real      *pvdw_c6, *pvdw_c12;
    gmx_mm_pr  c6_SSE0, c12_SSE0;
    gmx_mm_pr  c6_SSE2, c12_SSE2;
    gmx_mm_pr  c6_SSE1, c12_SSE1;
    gmx_mm_pr  c6_SSE3, c12_SSE3;
#endif

#ifdef LJ_COMB_GEOM
    gmx_mm_pr  c6s_SSE0, c12s_SSE0;
    gmx_mm_pr  c6s_SSE1, c12s_SSE1;
    gmx_mm_pr  c6s_SSE2 = gmx_setzero_pr(), c12s_SSE2 = gmx_setzero_pr();
    gmx_mm_pr  c6s_SSE3 = gmx_setzero_pr(), c12s_SSE3 = gmx_setzero_pr();
#endif
#endif /* LJ_COMB_LB */
    gmx_mm_pr  vctotSSE, VvdwtotSSE;
    gmx_mm_pr  sixthSSE, twelvethSSE;

    gmx_mm_pr  avoid_sing_SSE;
    gmx_mm_pr  rc2_SSE;
#ifdef VDW_CUTOFF_CHECK
    gmx_mm_pr  rcvdw2_SSE;
#endif

    gmx_mm_pr  sh_invrc6_SSE, sh_invrc12_SSE;

    /* cppcheck-suppress unassignedVariable */
    real       tmpsum_array[15], *tmpsum;
#ifdef CALC_SHIFTFORCES
    /* cppcheck-suppress unassignedVariable */
    real       shf_array[15], *shf;
#endif
#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
    ljc = nbat->lj_comb;
#else
    /* No combination rule used */
#ifndef GMX_DOUBLE
    nbfp_ptr    = nbat->nbfp_s4;
#define NBFP_STRIDE 4
#else
    nbfp_ptr    = nbat->nbfp;
#define NBFP_STRIDE 2
#endif
    nbfp_stride = NBFP_STRIDE;
#endif
    /* Load j-i for the first i */
    diag_jmi_SSE = gmx_load_pr(nbat->simd_2xnn_diag);
    /* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
    diag_SSE0    = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag_SSE2    = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
#else
#if 2*UNROLLI == UNROLLJ
    diag0_SSE0 = gmx_cmplt_pr(diag_i_SSE, diag_j_SSE);
    diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
    diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
    diag0_SSE2 = gmx_cmplt_pr(diag_i_SSE, diag_j_SSE);
    diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
    diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
    diag1_SSE0 = gmx_cmplt_pr(diag_i_SSE, diag_j_SSE);
    diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
    diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
    diag1_SSE2 = gmx_cmplt_pr(diag_i_SSE, diag_j_SSE);
#endif
#endif
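    /* Explanatory note, inferred from the code above: the diagonal masks
     * handle the cluster pair on the list diagonal, where each i-j pair
     * must only be counted once. diag_SSE0 is true where j-i > 0 for
     * i-atoms 0 (low half) and 1 (high half); subtracting 1 twice turns
     * j-i into j-(i+2), giving the mask for i-atoms 2 and 3 in diag_SSE2.
     */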
#ifdef CALC_COUL_TAB
#ifdef GMX_MM256_HERE
    /* Generate aligned table index pointers */
    ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
    ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
#endif
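    /* Explanatory note on the alignment idiom above: the arrays are
     * over-allocated by GMX_SIMD_WIDTH_HERE-1 ints, so adding
     * GMX_SIMD_WIDTH_HERE-1 elements and masking the low address bits
     * rounds down to a GMX_SIMD_WIDTH_HERE*sizeof(int) boundary while
     * leaving GMX_SIMD_WIDTH_HERE in-bounds elements. With an 8-wide SIMD
     * width and 4-byte ints this yields 32-byte alignment.
     */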
    invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
#ifdef CALC_ENERGIES
    mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
#endif

#ifdef TAB_FDV0
    tab_coul_F = ic->tabq_coul_FDV0;
#else
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;
#endif
#endif /* CALC_COUL_TAB */
#ifdef CALC_COUL_EWALD
    beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
    beta_SSE  = gmx_set1_pr(ic->ewaldcoeff);
#endif

#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
#endif
    q        = nbat->q;
    type     = nbat->type;
    facel    = ic->epsfac;
    shiftvec = shift_vec[0];
    x        = nbat->x;

    avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);

    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
    rc2_SSE = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
    rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
#endif

#ifdef CALC_ENERGIES
    sixthSSE    = gmx_set1_pr(1.0/6.0);
    twelvethSSE = gmx_set1_pr(1.0/12.0);

    sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
    sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
#endif

#ifdef CALC_COUL_RF
    /* Reaction-field constants */
    mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
#ifdef CALC_ENERGIES
    hrc_3_SSE  = gmx_set1_pr(ic->k_rf);
    moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
#endif
#endif
#ifdef CALC_ENERGIES
    tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
#endif
#ifdef CALC_SHIFTFORCES
    shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
#endif

#ifdef FIX_LJ_C
    pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
    for (jp = 0; jp < UNROLLJ; jp++)
    {
        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];

        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }

    c6_SSE0  = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
    c6_SSE1  = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
    c6_SSE2  = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
    c6_SSE3  = gmx_load_pr(pvdw_c6 +3*UNROLLJ);

    c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
    c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
    c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
    c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */
#ifdef ENERGY_GROUPS
    egps_ishift  = nbat->neg_2log;
    egps_imask   = (1<<egps_ishift) - 1;
    egps_jshift  = 2*nbat->neg_2log;
    egps_jmask   = (1<<egps_jshift) - 1;
    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
    /* Major division is over i-particle energy groups, determine the stride */
    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif
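    /* Worked example, not from the original source: with neg_2log = 2,
     * egps_imask = (1<<2)-1 = 0x3, so each i-atom's energy-group index
     * occupies 2 bits of nbat->energrp[ci] and is extracted further down
     * as (egps_i >> (ia*egps_ishift)) & egps_imask.
     */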
    l_cj = nbl->cj;

    ninner = 0;
    for (n = 0; n < nbl->nci; n++)
    {
        nbln = &nbl->ci[n];

        ish    = (nbln->shift & NBNXN_CI_SHIFT);
        ish3   = ish*3;
        cjind0 = nbln->cj_ind_start;
        cjind1 = nbln->cj_ind_end;
        ci     = nbln->ci;
        ci_sh  = (ish == CENTRAL ? ci : -1);

        shX_SSE = gmx_load1_pr(shiftvec+ish3);
        shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
        shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);

#if UNROLLJ <= 4
        sci  = ci*STRIDE;
        scix = sci*DIM;
        sci2 = sci*2;
#else
        sci  = (ci>>1)*STRIDE;
        scix = sci*DIM + (ci & 1)*(STRIDE>>1);
        sci2 = sci*2   + (ci & 1)*(STRIDE>>1);
        sci += (ci & 1)*(STRIDE>>1);
#endif
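        /* Explanatory note, inferred from the index math above: when two
         * 4-atom i-clusters share one memory cluster of STRIDE atoms,
         * ci>>1 selects the memory cluster and (ci & 1)*(STRIDE>>1) its
         * lower or upper half. scix indexes the packed x,y,z coordinates
         * and sci2 the LJ combination-rule parameters, stored two per atom.
         */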
        /* We have 5 LJ/C combinations, but use only three inner loops,
         * as the other combinations are unlikely and/or not much faster:
         * inner half-LJ + C for half-LJ + C / no-LJ + C
         * inner LJ + C      for full-LJ + C
         * inner LJ          for full-LJ + no-C / half-LJ + no-C
         */
        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef ENERGY_GROUPS
        egps_i = nbat->energrp[ci];
        {
            int ia, egp_ia;

            for (ia = 0; ia < UNROLLI; ia++)
            {
                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
                vctp[ia]   = Vc   + egp_ia*Vstride_i;
            }
        }
#endif
#if defined CALC_ENERGIES
#if UNROLLJ == 4
        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
#endif
#if UNROLLJ == 2
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
#endif
#if UNROLLJ == 8
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
#endif
        {
            int  ia;
            real Vc_sub_self;

#ifdef CALC_COUL_RF
            Vc_sub_self = 0.5*ic->c_rf;
#endif
#ifdef CALC_COUL_TAB
#ifdef TAB_FDV0
            Vc_sub_self = 0.5*tab_coul_F[2];
#else
            Vc_sub_self = 0.5*tab_coul_V[0];
#endif
#endif
#ifdef CALC_COUL_EWALD
            /* beta/sqrt(pi) */
            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
#endif

            for (ia = 0; ia < UNROLLI; ia++)
            {
                real qi;

                qi = q[sci+ia];
#ifdef ENERGY_GROUPS
                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
#else
                Vc[0]
#endif
                    -= facel*qi*qi*Vc_sub_self;
            }
        }
#endif
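        /* Explanatory note, inferred from the block above: the correction
         * is only applied to diagonal list entries, i.e. when the first
         * j-cluster is the i-cluster itself. Each Coulomb scheme supplies
         * its r = 0 kernel value (c_rf for reaction-field, the r = 0 table
         * entry for tables, ewaldcoeff*M_2_SQRTPI for Ewald), halved into
         * Vc_sub_self, and the self-interaction energy
         * facel*qi*qi*Vc_sub_self is subtracted per i-atom.
         */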
#define gmx_load2_hpr(x) _mm256_insertf128_ps(gmx_load1_pr(x), gmx_load1_hpr(x+1), 1)
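/* Explanatory note, inferred from the intrinsics above: gmx_load2_hpr first
 * broadcasts x[0] across a full 256-bit register, then overwrites the upper
 * 128-bit lane with a broadcast of x[1]. The result is
 * {x0,x0,x0,x0, x1,x1,x1,x1}: each of the two i-atoms fills one 4-wide
 * half, ready to interact with a 4-atom j-cluster.
 */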
        /* Load i atom data */
        sciy = scix + STRIDE;
        sciz = sciy + STRIDE;
        ix_SSE0 = gmx_add_pr(gmx_load2_hpr(x+scix),   shX_SSE);
        ix_SSE2 = gmx_add_pr(gmx_load2_hpr(x+scix+2), shX_SSE);
        iy_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciy),   shY_SSE);
        iy_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciy+2), shY_SSE);
        iz_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciz),   shZ_SSE);
        iz_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciz+2), shZ_SSE);
        facel_SSE = gmx_set1_pr(facel);

        iq_SSE0 = gmx_mul_pr(facel_SSE, gmx_load2_hpr(q+sci));
        iq_SSE2 = gmx_mul_pr(facel_SSE, gmx_load2_hpr(q+sci+2));
#ifdef LJ_COMB_LB
        hsig_i_SSE0 = gmx_load2_hpr(ljc+sci2+0);
        hsig_i_SSE2 = gmx_load2_hpr(ljc+sci2+2);
        seps_i_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0);
        seps_i_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2);
#else
#ifdef LJ_COMB_GEOM
        c6s_SSE0 = gmx_load2_hpr(ljc+sci2+0);
        if (!half_LJ)
        {
            c6s_SSE2 = gmx_load2_hpr(ljc+sci2+2);
        }
        c12s_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0);
        if (!half_LJ)
        {
            c12s_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2);
        }
#else
        nbfp0 = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
        nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
        if (!half_LJ)
        {
            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
        }
#endif
#endif
        /* Zero the potential energy for this list */
        VvdwtotSSE = gmx_setzero_pr();
        vctotSSE   = gmx_setzero_pr();

        /* Clear i atom forces */
        fix_SSE0 = gmx_setzero_pr();
        fix_SSE2 = gmx_setzero_pr();
        fiy_SSE0 = gmx_setzero_pr();
        fiy_SSE2 = gmx_setzero_pr();
        fiz_SSE0 = gmx_setzero_pr();
        fiz_SSE2 = gmx_setzero_pr();
        cjind = cjind0;

        /* Currently all kernels use (at least half) LJ */
#define CALC_LJ
        if (half_LJ)
        {
#define CALC_COULOMB
#define HALF_LJ
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_simd_2xnn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_2xnn_inner.h"
            }
#undef HALF_LJ
#undef CALC_COULOMB
        }
        else if (do_coul)
        {
#define CALC_COULOMB
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_simd_2xnn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_2xnn_inner.h"
            }
#undef CALC_COULOMB
        }
        else
        {
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_simd_2xnn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_2xnn_inner.h"
            }
        }
#undef CALC_LJ
        ninner += cjind1 - cjind0;
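        /* Explanatory note, inferred from the loop split above: the pair
         * list evidently stores j-clusters that carry exclusion bits at
         * the front of each entry, so the first loop runs the inner kernel
         * compiled with CHECK_EXCLS until it sees a fully-interacting mask
         * (SIMD_MASK_ALL), after which the second loop runs the cheaper
         * variant without exclusion handling.
         */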
        /* Add accumulated i-forces to the force array */
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
#define gmx_load_ps4  _mm_load_ps
#define gmx_store_ps4 _mm_store_ps
#define gmx_add_ps4   _mm_add_ps
#else
#define gmx_load_ps4  _mm256_load_pd
#define gmx_store_ps4 _mm256_store_pd
#define gmx_add_ps4   _mm256_add_pd
#endif
        GMX_MM_TRANSPOSE_SUM4H_PR(fix_SSE0, fix_SSE2, fix_SSE);
        gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));

        GMX_MM_TRANSPOSE_SUM4H_PR(fiy_SSE0, fiy_SSE2, fiy_SSE);
        gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));

        GMX_MM_TRANSPOSE_SUM4H_PR(fiz_SSE0, fiz_SSE2, fiz_SSE);
        gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));

#ifdef CALC_SHIFTFORCES
        gmx_store_ps4(shf, fix_SSE);
        fshift[ish3+0] += SUM_SIMD4(shf);
        gmx_store_ps4(shf, fiy_SSE);
        fshift[ish3+1] += SUM_SIMD4(shf);
        gmx_store_ps4(shf, fiz_SSE);
        fshift[ish3+2] += SUM_SIMD4(shf);
#endif
#else /* UNROLLJ == 2 */
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0, fix_SSE1, fix0_SSE);
        _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2, fix_SSE3, fix2_SSE);
        _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0, fiy_SSE1, fiy0_SSE);
        _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2, fiy_SSE3, fiy2_SSE);
        _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0, fiz_SSE1, fiz0_SSE);
        _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2, fiz_SSE3, fiz2_SSE);
        _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));

#ifdef CALC_SHIFTFORCES
        _mm_store_pd(shf, _mm_add_pd(fix0_SSE, fix2_SSE));
        fshift[ish3+0] += shf[0] + shf[1];
        _mm_store_pd(shf, _mm_add_pd(fiy0_SSE, fiy2_SSE));
        fshift[ish3+1] += shf[0] + shf[1];
        _mm_store_pd(shf, _mm_add_pd(fiz0_SSE, fiz2_SSE));
        fshift[ish3+2] += shf[0] + shf[1];
#endif
#endif
#ifdef CALC_ENERGIES
        if (do_coul)
        {
            gmx_store_pr(tmpsum, vctotSSE);
            *Vc += SUM_SIMD(tmpsum);
        }

        gmx_store_pr(tmpsum, VvdwtotSSE);
        *Vvdw += SUM_SIMD(tmpsum);
#endif
        /* Outer loop uses 6 flops/iteration */
    }

#ifdef COUNT_PAIRS
    printf("atom pairs %d\n", npair);
#endif
}

#undef CALC_SHIFTFORCES