2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2009, The GROMACS Development Team
6 * Copyright (c) 2012, by the GROMACS development team, led by
7 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
8 * others, as listed in the AUTHORS file in the top-level source
9 * directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
38 /* GMX_MM256_HERE should be set before including this file */
39 #include "gmx_simd_macros.h"
/* Horizontal sum of the first 4 elements of x.
 * Every use of the argument is parenthesized so that expressions
 * such as SUM_SIMD4(p+4) index correctly instead of expanding to
 * pointer arithmetic like p+4[0].
 */
#define SUM_SIMD4(x) ((x)[0]+(x)[1]+(x)[2]+(x)[3])
43 #define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
44 #define UNROLLJ (GMX_SIMD_WIDTH_HERE/2)
46 #if defined GMX_MM256_HERE
52 /* single precision 2x(4+4) kernel */
/* Horizontal sum over a full 8-wide single-precision SIMD register
 * stored in memory. Argument parenthesized for macro hygiene: without
 * it, SUM_SIMD(p+offset) would expand to p+offset[0]+... (wrong).
 */
#define SUM_SIMD(x) ((x)[0]+(x)[1]+(x)[2]+(x)[3]+(x)[4]+(x)[5]+(x)[6]+(x)[7])
56 #error "unsupported kernel configuration"
60 #define SIMD_MASK_ALL 0xffffffff
62 #include "nbnxn_kernel_simd_utils.h"
64 /* All functionality defines are set here, except for:
65 * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
66 * CHECK_EXCLS, which is set just before including the inner loop contents.
67 * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB are currently
68 * set before calling the kernel function. We might want to move that
69 * to inside the n-loop and have a different combination rule for different
70 * ci's, as no combination rule gives a 50% performance hit for LJ.
73 /* We always calculate shift forces, because it's cheap anyhow */
74 #define CALC_SHIFTFORCES
76 /* Assumes all LJ parameters are identical */
77 /* #define FIX_LJ_C */
79 /* The NBK_FUNC_NAME... macros below generate the whole zoo of kernels names
80 * with all combinations off electrostatics (coul), LJ combination rules (ljc)
81 * and energy calculations (ene), depending on the defines set.
84 #define NBK_FUNC_NAME_C_LJC(base,coul,ljc,ene) base##_##coul##_comb_##ljc##_##ene
86 #if defined LJ_COMB_GEOM
87 #define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,geom,ene)
89 #if defined LJ_COMB_LB
90 #define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,lb,ene)
92 #define NBK_FUNC_NAME_C(base,coul,ene) NBK_FUNC_NAME_C_LJC(base,coul,none,ene)
97 #define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,rf,ene)
100 #ifndef VDW_CUTOFF_CHECK
101 #define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab,ene)
103 #define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,tab_twin,ene)
106 #ifdef CALC_COUL_EWALD
107 #ifndef VDW_CUTOFF_CHECK
108 #define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald,ene)
110 #define NBK_FUNC_NAME(base,ene) NBK_FUNC_NAME_C(base,ewald_twin,ene)
115 #ifndef CALC_ENERGIES
116 NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,noener)
118 #ifndef ENERGY_GROUPS
119 NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,ener)
121 NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn,energrp)
125 #undef NBK_FUNC_NAME_C
126 #undef NBK_FUNC_NAME_C_LJC
127 (const nbnxn_pairlist_t *nbl,
128 const nbnxn_atomdata_t *nbat,
129 const interaction_const_t *ic,
132 #ifdef CALC_SHIFTFORCES
143 const nbnxn_ci_t *nbln;
144 const nbnxn_cj_t *l_cj;
147 const real *shiftvec;
149 const real *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL;
155 gmx_bool do_LJ,half_LJ,do_coul;
156 int sci,scix,sciy,sciz,sci2;
157 int cjind0,cjind1,cjind;
162 int egps_ishift,egps_imask;
163 int egps_jshift,egps_jmask,egps_jstride;
165 real *vvdwtp[UNROLLI];
172 gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
173 gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
174 gmx_mm_pr fix_SSE0,fiy_SSE0,fiz_SSE0;
175 gmx_mm_pr fix_SSE2,fiy_SSE2,fiz_SSE2;
178 __m128 fix_SSE,fiy_SSE,fiz_SSE;
180 __m256d fix_SSE,fiy_SSE,fiz_SSE;
183 __m128d fix0_SSE,fiy0_SSE,fiz0_SSE;
184 __m128d fix2_SSE,fiy2_SSE,fiz2_SSE;
187 /* AVX: use floating point masks, as there are no integer instructions */
188 gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
189 gmx_mm_pr mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
191 gmx_mm_pr diag_jmi_SSE;
192 #if UNROLLI == UNROLLJ
193 gmx_mm_pr diag_SSE0,diag_SSE2;
195 gmx_mm_pr diag0_SSE0,diag0_SSE2;
196 gmx_mm_pr diag1_SSE0,diag1_SSE2;
199 gmx_mm_pr zero_SSE = gmx_set1_pr(0);
201 gmx_mm_pr one_SSE=gmx_set1_pr(1.0);
202 gmx_mm_pr iq_SSE0=gmx_setzero_pr();
203 gmx_mm_pr iq_SSE2=gmx_setzero_pr();
206 gmx_mm_pr hrc_3_SSE,moh_rc_SSE;
210 /* Coulomb table variables */
211 gmx_mm_pr invtsp_SSE;
212 const real *tab_coul_F;
214 const real *tab_coul_V;
216 #ifdef GMX_MM256_HERE
217 int ti0_array[2*GMX_SIMD_WIDTH_HERE-1],*ti0;
218 int ti2_array[2*GMX_SIMD_WIDTH_HERE-1],*ti2;
221 gmx_mm_pr mhalfsp_SSE;
225 #ifdef CALC_COUL_EWALD
226 gmx_mm_pr beta2_SSE,beta_SSE;
229 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
230 gmx_mm_pr sh_ewald_SSE;
236 gmx_mm_pr hsig_i_SSE0,seps_i_SSE0;
237 gmx_mm_pr hsig_i_SSE2,seps_i_SSE2;
240 real pvdw_array[2*UNROLLI*UNROLLJ+3];
241 real *pvdw_c6,*pvdw_c12;
242 gmx_mm_pr c6_SSE0,c12_SSE0;
243 gmx_mm_pr c6_SSE2,c12_SSE2;
249 gmx_mm_pr c6s_SSE0,c12s_SSE0;
250 gmx_mm_pr c6s_SSE1,c12s_SSE1;
251 gmx_mm_pr c6s_SSE2=gmx_setzero_pr(),c12s_SSE2=gmx_setzero_pr();
252 gmx_mm_pr c6s_SSE3=gmx_setzero_pr(),c12s_SSE3=gmx_setzero_pr();
254 #endif /* LJ_COMB_LB */
256 gmx_mm_pr vctotSSE,VvdwtotSSE;
257 gmx_mm_pr sixthSSE,twelvethSSE;
259 gmx_mm_pr avoid_sing_SSE;
261 #ifdef VDW_CUTOFF_CHECK
262 gmx_mm_pr rcvdw2_SSE;
266 gmx_mm_pr sh_invrc6_SSE,sh_invrc12_SSE;
268 /* cppcheck-suppress unassignedVariable */
269 real tmpsum_array[15],*tmpsum;
271 #ifdef CALC_SHIFTFORCES
272 /* cppcheck-suppress unassignedVariable */
273 real shf_array[15],*shf;
282 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
285 /* No combination rule used */
287 nbfp_ptr = nbat->nbfp_s4;
288 #define NBFP_STRIDE 4
290 nbfp_ptr = nbat->nbfp;
291 #define NBFP_STRIDE 2
293 nbfp_stride = NBFP_STRIDE;
296 /* Load j-i for the first i */
297 diag_jmi_SSE = gmx_load_pr(nbat->simd_2xnn_diag);
298 /* Generate all the diagonal masks as comparison results */
299 #if UNROLLI == UNROLLJ
300 diag_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
301 diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
302 diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
303 diag_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
305 #if 2*UNROLLI == UNROLLJ
306 diag0_SSE0 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
307 diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
308 diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
309 diag0_SSE2 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
310 diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
311 diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
312 diag1_SSE0 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
313 diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
314 diag_i_SSE = gmx_add_pr(diag_i_SSE,one_SSE);
315 diag1_SSE2 = gmx_cmplt_pr(diag_i_SSE,diag_j_SSE);
320 #ifdef GMX_MM256_HERE
321 /* Generate aligned table index pointers */
322 ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
323 ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
326 invtsp_SSE = gmx_set1_pr(ic->tabq_scale);
328 mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
332 tab_coul_F = ic->tabq_coul_FDV0;
334 tab_coul_F = ic->tabq_coul_F;
335 tab_coul_V = ic->tabq_coul_V;
337 #endif /* CALC_COUL_TAB */
339 #ifdef CALC_COUL_EWALD
340 beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
341 beta_SSE = gmx_set1_pr(ic->ewaldcoeff);
344 #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
345 sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
351 shiftvec = shift_vec[0];
354 avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
356 /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
357 rc2_SSE = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
358 #ifdef VDW_CUTOFF_CHECK
359 rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
363 sixthSSE = gmx_set1_pr(1.0/6.0);
364 twelvethSSE = gmx_set1_pr(1.0/12.0);
366 sh_invrc6_SSE = gmx_set1_pr(ic->sh_invrc6);
367 sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
370 mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
373 hrc_3_SSE = gmx_set1_pr(ic->k_rf);
375 moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
379 tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
381 #ifdef CALC_SHIFTFORCES
382 shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
386 pvdw_c6 = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
387 pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
389 for(jp=0; jp<UNROLLJ; jp++)
391 pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
392 pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
393 pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
394 pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
396 pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
397 pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
398 pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
399 pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
401 c6_SSE0 = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
402 c6_SSE1 = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
403 c6_SSE2 = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
404 c6_SSE3 = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
406 c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
407 c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
408 c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
409 c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
410 #endif /* FIX_LJ_C */
413 egps_ishift = nbat->neg_2log;
414 egps_imask = (1<<egps_ishift) - 1;
415 egps_jshift = 2*nbat->neg_2log;
416 egps_jmask = (1<<egps_jshift) - 1;
417 egps_jstride = (UNROLLJ>>1)*UNROLLJ;
418 /* Major division is over i-particle energy groups, determine the stride */
419 Vstride_i = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
425 for(n=0; n<nbl->nci; n++)
429 ish = (nbln->shift & NBNXN_CI_SHIFT);
431 cjind0 = nbln->cj_ind_start;
432 cjind1 = nbln->cj_ind_end;
434 ci_sh = (ish == CENTRAL ? ci : -1);
436 shX_SSE = gmx_load1_pr(shiftvec+ish3);
437 shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
438 shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
445 sci = (ci>>1)*STRIDE;
446 scix = sci*DIM + (ci & 1)*(STRIDE>>1);
447 sci2 = sci*2 + (ci & 1)*(STRIDE>>1);
448 sci += (ci & 1)*(STRIDE>>1);
451 /* We have 5 LJ/C combinations, but use only three inner loops,
452 * as the other combinations are unlikely and/or not much faster:
453 * inner half-LJ + C for half-LJ + C / no-LJ + C
454 * inner LJ + C for full-LJ + C
455 * inner LJ for full-LJ + no-C / half-LJ + no-C
457 do_LJ = (nbln->shift & NBNXN_CI_DO_LJ(0));
458 do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
459 half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
462 egps_i = nbat->energrp[ci];
466 for(ia=0; ia<UNROLLI; ia++)
468 egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
469 vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
470 vctp[ia] = Vc + egp_ia*Vstride_i;
474 #if defined CALC_ENERGIES
476 if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
479 if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
482 if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
489 Vc_sub_self = 0.5*ic->c_rf;
493 Vc_sub_self = 0.5*tab_coul_F[2];
495 Vc_sub_self = 0.5*tab_coul_V[0];
498 #ifdef CALC_COUL_EWALD
500 Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
503 for(ia=0; ia<UNROLLI; ia++)
509 vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
513 -= facel*qi*qi*Vc_sub_self;
/* Load two consecutive half-width (128-bit) values from x and x+1,
 * broadcast each into one 128-bit lane of a 256-bit register.
 * The argument is parenthesized so that composite expressions
 * (e.g. a cast or arithmetic passed as x) bind correctly in (x)+1.
 */
#define gmx_load2_hpr(x) _mm256_insertf128_ps(gmx_load1_pr(x),gmx_load1_hpr((x)+1),1)
520 /* Load i atom data */
521 sciy = scix + STRIDE;
522 sciz = sciy + STRIDE;
523 ix_SSE0 = gmx_add_pr(gmx_load2_hpr(x+scix) ,shX_SSE);
524 ix_SSE2 = gmx_add_pr(gmx_load2_hpr(x+scix+2),shX_SSE);
525 iy_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciy) ,shY_SSE);
526 iy_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciy+2),shY_SSE);
527 iz_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciz) ,shZ_SSE);
528 iz_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciz+2),shZ_SSE);
534 facel_SSE = gmx_set1_pr(facel);
536 iq_SSE0 = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci));
537 iq_SSE2 = gmx_mul_pr(facel_SSE,gmx_load2_hpr(q+sci+2));
541 hsig_i_SSE0 = gmx_load2_hpr(ljc+sci2+0);
542 hsig_i_SSE2 = gmx_load2_hpr(ljc+sci2+2);
543 seps_i_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0);
544 seps_i_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2);
547 c6s_SSE0 = gmx_load2_hpr(ljc+sci2+0);
550 c6s_SSE2 = gmx_load2_hpr(ljc+sci2+2);
552 c12s_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0);
555 c12s_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2);
558 nbfp0 = nbfp_ptr + type[sci ]*nbat->ntype*nbfp_stride;
559 nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
562 nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
563 nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
568 /* Zero the potential energy for this list */
569 VvdwtotSSE = gmx_setzero_pr();
570 vctotSSE = gmx_setzero_pr();
572 /* Clear i atom forces */
573 fix_SSE0 = gmx_setzero_pr();
574 fix_SSE2 = gmx_setzero_pr();
575 fiy_SSE0 = gmx_setzero_pr();
576 fiy_SSE2 = gmx_setzero_pr();
577 fiz_SSE0 = gmx_setzero_pr();
578 fiz_SSE2 = gmx_setzero_pr();
582 /* Currently all kernels use (at least half) LJ */
589 while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
591 #include "nbnxn_kernel_simd_2xnn_inner.h"
595 for(; (cjind<cjind1); cjind++)
597 #include "nbnxn_kernel_simd_2xnn_inner.h"
606 while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
608 #include "nbnxn_kernel_simd_2xnn_inner.h"
612 for(; (cjind<cjind1); cjind++)
614 #include "nbnxn_kernel_simd_2xnn_inner.h"
621 while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
623 #include "nbnxn_kernel_simd_2xnn_inner.h"
627 for(; (cjind<cjind1); cjind++)
629 #include "nbnxn_kernel_simd_2xnn_inner.h"
633 ninner += cjind1 - cjind0;
635 /* Add accumulated i-forces to the force array */
638 #define gmx_load_ps4 _mm_load_ps
639 #define gmx_store_ps4 _mm_store_ps
640 #define gmx_add_ps4 _mm_add_ps
642 #define gmx_load_ps4 _mm256_load_pd
643 #define gmx_store_ps4 _mm256_store_pd
644 #define gmx_add_ps4 _mm256_add_pd
646 GMX_MM_TRANSPOSE_SUM4H_PR(fix_SSE0,fix_SSE2,fix_SSE);
647 gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
649 GMX_MM_TRANSPOSE_SUM4H_PR(fiy_SSE0,fiy_SSE2,fiy_SSE);
650 gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
652 GMX_MM_TRANSPOSE_SUM4H_PR(fiz_SSE0,fiz_SSE2,fiz_SSE);
653 gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
655 #ifdef CALC_SHIFTFORCES
656 gmx_store_ps4(shf,fix_SSE);
657 fshift[ish3+0] += SUM_SIMD4(shf);
658 gmx_store_ps4(shf,fiy_SSE);
659 fshift[ish3+1] += SUM_SIMD4(shf);
660 gmx_store_ps4(shf,fiz_SSE);
661 fshift[ish3+2] += SUM_SIMD4(shf);
664 GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
665 _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
666 GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
667 _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
669 GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
670 _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
671 GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
672 _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
674 GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
675 _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
676 GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
677 _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
679 #ifdef CALC_SHIFTFORCES
680 _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
681 fshift[ish3+0] += shf[0] + shf[1];
682 _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
683 fshift[ish3+1] += shf[0] + shf[1];
684 _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
685 fshift[ish3+2] += shf[0] + shf[1];
692 gmx_store_pr(tmpsum,vctotSSE);
693 *Vc += SUM_SIMD(tmpsum);
696 gmx_store_pr(tmpsum,VvdwtotSSE);
697 *Vvdw += SUM_SIMD(tmpsum);
700 /* Outer loop uses 6 flops/iteration */
704 printf("atom pairs %d\n",npair);
714 #undef CALC_SHIFTFORCES