/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

{
    const nbnxn_ci_t *nbln;
    const nbnxn_cj_t *l_cj;
    const real       *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
    gmx_bool          do_LJ, half_LJ, do_coul;
    int               sci, scix, sciy, sciz, sci2;
    int               cjind0, cjind1, cjind;
    int               egps_ishift, egps_imask;
    int               egps_jshift, egps_jmask, egps_jstride;
    real             *vvdwtp[UNROLLI];
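    /* In this 4xN layout each of the four i-atoms gets its own set of SIMD
     * registers: a coordinate component is broadcast across all j-lanes of
     * one register, so one register pairs one i-atom with UNROLLJ j-atoms.
     */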
    gmx_mm_pr  ix_S0, iy_S0, iz_S0;
    gmx_mm_pr  ix_S1, iy_S1, iz_S1;
    gmx_mm_pr  ix_S2, iy_S2, iz_S2;
    gmx_mm_pr  ix_S3, iy_S3, iz_S3;
    gmx_mm_pr  fix_S0, fiy_S0, fiz_S0;
    gmx_mm_pr  fix_S1, fiy_S1, fiz_S1;
    gmx_mm_pr  fix_S2, fiy_S2, fiz_S2;
    gmx_mm_pr  fix_S3, fiy_S3, fiz_S3;
#if UNROLLJ >= 4
/* We use an i-force SIMD register width of 4 */
#if UNROLLJ == 4
#define gmx_mm_pr4    gmx_mm_pr
#define gmx_load_pr4  gmx_load_pr
#define gmx_store_pr4 gmx_store_pr
#define gmx_add_pr4   gmx_add_pr
#else
/* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
#endif
    gmx_mm_pr4 fix_S, fiy_S, fiz_S;
#else
/* We use an i-force SIMD register width of 2 */
    gmx_mm_pr  fix0_S, fiy0_S, fiz0_S;
    gmx_mm_pr  fix2_S, fiy2_S, fiz2_S;
#endif
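    /* Diagonal masks: when a cluster pair interacts with itself, these
     * mask out j <= i so every within-cluster pair is computed only once.
     */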
    gmx_mm_pr  diagonal_jmi_S;
#if UNROLLI == UNROLLJ
    gmx_mm_pb  diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3;
#else
    gmx_mm_pb  diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3;
    gmx_mm_pb  diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
#endif
    unsigned      *exclusion_filter;
    gmx_exclfilter filter_S0, filter_S1, filter_S2, filter_S3;

    gmx_mm_pr      zero_S = gmx_set1_pr(0.0);

    gmx_mm_pr      one_S  = gmx_set1_pr(1.0);
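    /* Broadcast i-atom charges; initialized to zero here and loaded with
     * the prefactor-scaled charges per i-cluster when Coulomb is active.
     */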
    gmx_mm_pr      iq_S0 = gmx_setzero_pr();
    gmx_mm_pr      iq_S1 = gmx_setzero_pr();
    gmx_mm_pr      iq_S2 = gmx_setzero_pr();
    gmx_mm_pr      iq_S3 = gmx_setzero_pr();
    gmx_mm_pr   hrc_3_S, moh_rc_S;

#ifdef CALC_COUL_TAB
    /* Coulomb table variables */
    gmx_mm_pr   invtsp_S;
    const real *tab_coul_F;
#ifndef TAB_FDV0
    const real *tab_coul_V;
#endif
    /* Thread-local working buffers for force and potential lookups.
     * The arrays are over-sized by SIMD-width-1 ints so that an aligned
     * pointer within them can be handed out below.
     */
    int         ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0 = NULL;
    int         ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1 = NULL;
    int         ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2 = NULL;
    int         ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3 = NULL;
#ifdef CALC_ENERGIES
    gmx_mm_pr   mhalfsp_S;
#endif
#endif /* CALC_COUL_TAB */

#ifdef CALC_COUL_EWALD
    gmx_mm_pr  beta2_S, beta_S;
#endif

#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
    gmx_mm_pr  sh_ewald_S;
#endif
#ifdef LJ_COMB_LB
    const real *ljc;

    gmx_mm_pr   hsig_i_S0, seps_i_S0;
    gmx_mm_pr   hsig_i_S1, seps_i_S1;
    gmx_mm_pr   hsig_i_S2, seps_i_S2;
    gmx_mm_pr   hsig_i_S3, seps_i_S3;
#else
#ifdef FIX_LJ_C
    real        pvdw_array[2*UNROLLI*UNROLLJ+3];
    real       *pvdw_c6, *pvdw_c12;
    gmx_mm_pr   c6_S0, c12_S0;
    gmx_mm_pr   c6_S1, c12_S1;
    gmx_mm_pr   c6_S2, c12_S2;
    gmx_mm_pr   c6_S3, c12_S3;
#endif /* FIX_LJ_C */

#ifdef LJ_COMB_GEOM
    const real *ljc;

    gmx_mm_pr   c6s_S0, c12s_S0;
    gmx_mm_pr   c6s_S1, c12s_S1;
    gmx_mm_pr   c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
    gmx_mm_pr   c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
#endif /* LJ_COMB_GEOM */
#endif /* LJ_COMB_LB */
    gmx_mm_pr  vctot_S, Vvdwtot_S;
    gmx_mm_pr  sixth_S, twelveth_S;

    gmx_mm_pr  avoid_sing_S;
    gmx_mm_pr  rc2_S;
#ifdef VDW_CUTOFF_CHECK
    gmx_mm_pr  rcvdw2_S;
#endif

    gmx_mm_pr  sh_invrc6_S, sh_invrc12_S;

    /* cppcheck-suppress unassignedVariable */
    real       tmpsum_array[15], *tmpsum;
#ifdef CALC_SHIFTFORCES
    /* cppcheck-suppress unassignedVariable */
    real       shf_array[15], *shf;
#endif

#ifdef COUNT_PAIRS
    int        npair = 0;
#endif
#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
    ljc = nbat->lj_comb;
#else
    /* No combination rule used */
    nbfp_ptr = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp;
#endif
    /* Load j-i for the first i */
    diagonal_jmi_S = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i);
    /* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
    diagonal_mask_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
    diagonal_jmi_S   = gmx_sub_pr(diagonal_jmi_S, one_S);
    diagonal_mask_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
    diagonal_jmi_S   = gmx_sub_pr(diagonal_jmi_S, one_S);
    diagonal_mask_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
    diagonal_jmi_S   = gmx_sub_pr(diagonal_jmi_S, one_S);
    diagonal_mask_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
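    /* With UNROLLI == UNROLLJ == 4 the masks keep only j > i:
     * i=0 sees j=1,2,3; i=1 sees j=2,3; i=2 sees j=3; i=3 sees none,
     * so each within-cluster pair is computed exactly once.
     */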
#else /* UNROLLI != UNROLLJ */
#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
    diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
    diagonal_mask0_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
    diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
    diagonal_mask0_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
#if UNROLLI == 2*UNROLLJ
    /* Load j-i for the second half of the j-cluster */
    diagonal_jmi_S = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ);
#endif

    diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
    diagonal_mask1_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
    diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
    diagonal_mask1_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
#endif
#endif
    /* Load masks for topology exclusion masking. filter_stride is
       static const, so the conditional will be optimized away. */
    if (1 == filter_stride)
    {
        exclusion_filter = nbat->simd_exclusion_filter1;
    }
    else /* (2 == filter_stride) */
    {
        exclusion_filter = nbat->simd_exclusion_filter2;
    }
    /* Here we cast the exclusion filters from unsigned * to int * or real *.
     * Since we only check bits, the actual value they represent does not
     * matter, as long as both filter and mask data are treated the same way.
     */
    filter_S0 = gmx_load_exclusion_filter(exclusion_filter + 0*UNROLLJ*filter_stride);
    filter_S1 = gmx_load_exclusion_filter(exclusion_filter + 1*UNROLLJ*filter_stride);
    filter_S2 = gmx_load_exclusion_filter(exclusion_filter + 2*UNROLLJ*filter_stride);
    filter_S3 = gmx_load_exclusion_filter(exclusion_filter + 3*UNROLLJ*filter_stride);
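    /* Each filter_Sx holds the exclusion bit pattern for i-row x; the inner
     * loop tests it against the exclusion bits stored with each cj entry to
     * build the per-pair interaction masks.
     */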
#ifdef CALC_COUL_TAB
    /* Generate aligned table index pointers */
    ti0 = prepare_table_load_buffer(ti0_array);
    ti1 = prepare_table_load_buffer(ti1_array);
    ti2 = prepare_table_load_buffer(ti2_array);
    ti3 = prepare_table_load_buffer(ti3_array);

    invtsp_S  = gmx_set1_pr(ic->tabq_scale);
#ifdef CALC_ENERGIES
    mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
#endif
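    /* r*tabq_scale selects the table point; mhalfsp_S, minus half the table
     * spacing, enters the reconstruction of the potential from the force
     * table when energies are computed.
     */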
#ifdef TAB_FDV0
    /* The FDV0 format packs the force value, its difference to the next
     * point, the potential and a zero padding per table point, so one
     * aligned stride-4 load fetches everything needed. */
    tab_coul_F = ic->tabq_coul_FDV0;
#else
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;
#endif
#endif /* CALC_COUL_TAB */
#ifdef CALC_COUL_EWALD
    beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
    beta_S  = gmx_set1_pr(ic->ewaldcoeff);
#endif

#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
    sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
#endif
    shiftvec = shift_vec[0];

    avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);

    /* The kernel supports either rcoulomb = rvdw or rcoulomb >= rvdw */
    rc2_S = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
    rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
#endif
#ifdef CALC_ENERGIES
    sixth_S      = gmx_set1_pr(1.0/6.0);
    twelveth_S   = gmx_set1_pr(1.0/12.0);

    sh_invrc6_S  = gmx_set1_pr(ic->sh_invrc6);
    sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
#endif
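    /* Reaction-field constants: the force uses qq*(r^-3 - 2*k_rf)*r_vec,
     * and the energy, when requested, uses qq*(r^-1 + k_rf*r^2 - c_rf).
     */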
#ifdef CALC_COUL_RF
    mrc_3_S  = gmx_set1_pr(-2*ic->k_rf);
#ifdef CALC_ENERGIES
    hrc_3_S  = gmx_set1_pr(ic->k_rf);
    moh_rc_S = gmx_set1_pr(-ic->c_rf);
#endif
#endif
#ifdef CALC_ENERGIES
    tmpsum = gmx_simd_align_real(tmpsum_array);
#endif
#ifdef CALC_SHIFTFORCES
    shf = gmx_simd_align_real(shf_array);
#endif

#ifdef FIX_LJ_C
    pvdw_c6  = gmx_simd_align_real(pvdw_array+3);
    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
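    /* Fill every i-row/j-lane slot with the C6 and C12 of atom-type pair 0,
     * so the whole kernel runs with one fixed set of LJ parameters.
     */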
    for (jp = 0; jp < UNROLLJ; jp++)
    {
        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];

        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }
    c6_S0  = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
    c6_S1  = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
    c6_S2  = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
    c6_S3  = gmx_load_pr(pvdw_c6 +3*UNROLLJ);

    c12_S0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
    c12_S1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
    c12_S2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
    c12_S3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */
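    /* Each i-cluster stores its UNROLLI energy-group indices packed into one
     * integer, neg_2log bits per index; the shifts and masks below unpack
     * them into offsets for the per-group energy accumulators.
     */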
#ifdef ENERGY_GROUPS
    egps_ishift  = nbat->neg_2log;
    egps_imask   = (1<<egps_ishift) - 1;
    egps_jshift  = 2*nbat->neg_2log;
    egps_jmask   = (1<<egps_jshift) - 1;
    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
    /* Major division is over i-particle energy groups, determine the stride */
    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif
    for (n = 0; n < nbl->nci; n++)
    {
        nbln = &nbl->ci[n];

        ish    = (nbln->shift & NBNXN_CI_SHIFT);
        ish3   = ish*3;
        cjind0 = nbln->cj_ind_start;
        cjind1 = nbln->cj_ind_end;
        ci     = nbln->ci;
        ci_sh  = (ish == CENTRAL ? ci : -1);

        shX_S = gmx_load1_pr(shiftvec+ish3);
        shY_S = gmx_load1_pr(shiftvec+ish3+1);
        shZ_S = gmx_load1_pr(shiftvec+ish3+2);
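        /* i-cluster index to coordinate/parameter array offsets. With
         * UNROLLJ > 4 two 4-atom i-clusters are stored interleaved in one
         * cluster of size STRIDE, and (ci & 1) selects the half. */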
#if UNROLLJ <= 4
        sci  = ci*STRIDE;
        scix = sci*DIM;
        sci2 = sci*2;
#else
        sci  = (ci>>1)*STRIDE;
        scix = sci*DIM + (ci & 1)*(STRIDE>>1);
        sci2 = sci*2 + (ci & 1)*(STRIDE>>1);
        sci += (ci & 1)*(STRIDE>>1);
#endif
        /* We have 5 LJ/C combinations, but use only three inner loops,
         * as the other combinations are unlikely and/or not much faster:
         * inner half-LJ + C for half-LJ + C / no-LJ + C
         * inner LJ + C      for full-LJ + C
         * inner LJ          for full-LJ + no-C / half-LJ + no-C
         */
        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef ENERGY_GROUPS
        egps_i = nbat->energrp[ci];
        {
            int ia, egp_ia;

            for (ia = 0; ia < UNROLLI; ia++)
            {
                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
                vctp[ia]   = Vc   + egp_ia*Vstride_i;
            }
        }
#endif
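        /* When the first j-cluster is the i-cluster itself, the diagonal
         * self-pairs are masked out of the loop, but with reaction-field,
         * tabulated or Ewald electrostatics each i-atom still has a finite
         * Coulomb self-energy, which is subtracted here.
         */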
#if defined CALC_ENERGIES
#if UNROLLJ == 4
        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
#endif
#if UNROLLJ == 2
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
#endif
#if UNROLLJ == 8
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
#endif
        {
            int  ia;
            real Vc_sub_self;

#ifdef CALC_COUL_RF
            Vc_sub_self = 0.5*ic->c_rf;
#endif
#ifdef CALC_COUL_TAB
#ifdef TAB_FDV0
            Vc_sub_self = 0.5*tab_coul_F[2];
#else
            Vc_sub_self = 0.5*tab_coul_V[0];
#endif
#endif
#ifdef CALC_COUL_EWALD
            /* beta/sqrt(pi) */
            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
#endif

            for (ia = 0; ia < UNROLLI; ia++)
            {
                real qi;

                qi = q[sci+ia];
#ifdef ENERGY_GROUPS
                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
#else
                Vc[0]
#endif
                    -= facel*qi*qi*Vc_sub_self;
            }
        }
#endif
        /* Load i atom data */
        sciy  = scix + STRIDE;
        sciz  = sciy + STRIDE;
        ix_S0 = gmx_add_pr(gmx_load1_pr(x+scix), shX_S);
        ix_S1 = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_S);
        ix_S2 = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_S);
        ix_S3 = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_S);
        iy_S0 = gmx_add_pr(gmx_load1_pr(x+sciy), shY_S);
        iy_S1 = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_S);
        iy_S2 = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_S);
        iy_S3 = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_S);
        iz_S0 = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_S);
        iz_S1 = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_S);
        iz_S2 = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_S);
        iz_S3 = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_S);
        /* Load the i-atom charges, pre-multiplied by the Coulomb prefactor */
        if (do_coul)
        {
            iq_S0 = gmx_set1_pr(facel*q[sci]);
            iq_S1 = gmx_set1_pr(facel*q[sci+1]);
            iq_S2 = gmx_set1_pr(facel*q[sci+2]);
            iq_S3 = gmx_set1_pr(facel*q[sci+3]);
        }
#ifdef LJ_COMB_LB
        hsig_i_S0 = gmx_load1_pr(ljc+sci2+0);
        hsig_i_S1 = gmx_load1_pr(ljc+sci2+1);
        hsig_i_S2 = gmx_load1_pr(ljc+sci2+2);
        hsig_i_S3 = gmx_load1_pr(ljc+sci2+3);
        seps_i_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
        seps_i_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
        seps_i_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
        seps_i_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
#else
#ifdef LJ_COMB_GEOM
        /* With half-LJ only i-atoms 0 and 1 have LJ interactions; the
         * parameters for atoms 2 and 3 keep their zero initialization. */
        c6s_S0 = gmx_load1_pr(ljc+sci2+0);
        c6s_S1 = gmx_load1_pr(ljc+sci2+1);
        if (!half_LJ)
        {
            c6s_S2 = gmx_load1_pr(ljc+sci2+2);
            c6s_S3 = gmx_load1_pr(ljc+sci2+3);
        }
        c12s_S0 = gmx_load1_pr(ljc+sci2+STRIDE+0);
        c12s_S1 = gmx_load1_pr(ljc+sci2+STRIDE+1);
        if (!half_LJ)
        {
            c12s_S2 = gmx_load1_pr(ljc+sci2+STRIDE+2);
            c12s_S3 = gmx_load1_pr(ljc+sci2+STRIDE+3);
        }
#else
        nbfp0 = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
        nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
        if (!half_LJ)
        {
            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
        }
#endif /* LJ_COMB_GEOM */
#endif /* LJ_COMB_LB */
        /* Zero the potential energy for this list */
        Vvdwtot_S = gmx_setzero_pr();
        vctot_S   = gmx_setzero_pr();

        /* Clear i atom forces */
        fix_S0 = gmx_setzero_pr();
        fix_S1 = gmx_setzero_pr();
        fix_S2 = gmx_setzero_pr();
        fix_S3 = gmx_setzero_pr();
        fiy_S0 = gmx_setzero_pr();
        fiy_S1 = gmx_setzero_pr();
        fiy_S2 = gmx_setzero_pr();
        fiy_S3 = gmx_setzero_pr();
        fiz_S0 = gmx_setzero_pr();
        fiz_S1 = gmx_setzero_pr();
        fiz_S2 = gmx_setzero_pr();
        fiz_S3 = gmx_setzero_pr();
        cjind = cjind0;

        /* Currently all kernels use (at least half) LJ */
#define CALC_LJ
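        /* Each variant below runs in two phases: a while loop over the
         * leading cj entries that carry exclusion masks (compiled with
         * CHECK_EXCLS defined) and a plain loop over the remaining, fully
         * interacting entries; the pair list stores entries with exclusions
         * first to make this split possible. */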
        if (half_LJ)
        {
            /* Coulomb for all four i-atoms, LJ for the first half only */
#define CALC_COULOMB
#define HALF_LJ
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
            }
#undef HALF_LJ
#undef CALC_COULOMB
        }
        else if (do_coul)
        {
            /* Coulomb and full LJ for all four i-atoms */
#define CALC_COULOMB
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
            }
#undef CALC_COULOMB
        }
        else
        {
            /* Full LJ, no Coulomb */
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_4xn_inner.h"
            }
        }
#undef CALC_LJ
        ninner += cjind1 - cjind0;

        /* Add accumulated i-forces to the force array */
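        /* Each fix_Sx register holds j-lane partial forces for i-atom x;
         * the transposing sums below reduce them to one total per i-atom
         * before accumulation into f[]. */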
#if UNROLLJ >= 4
        fix_S = gmx_mm_transpose_sum4_pr(fix_S0, fix_S1, fix_S2, fix_S3);
        gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));

        fiy_S = gmx_mm_transpose_sum4_pr(fiy_S0, fiy_S1, fiy_S2, fiy_S3);
        gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));

        fiz_S = gmx_mm_transpose_sum4_pr(fiz_S0, fiz_S1, fiz_S2, fiz_S3);
        gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));

#ifdef CALC_SHIFTFORCES
        gmx_store_pr4(shf, fix_S);
        fshift[ish3+0] += SUM_SIMD4(shf);
        gmx_store_pr4(shf, fiy_S);
        fshift[ish3+1] += SUM_SIMD4(shf);
        gmx_store_pr4(shf, fiz_S);
        fshift[ish3+2] += SUM_SIMD4(shf);
#endif
#else
        fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
        gmx_store_pr(f+scix, gmx_add_pr(fix0_S, gmx_load_pr(f+scix)));
        fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3);
        gmx_store_pr(f+scix+2, gmx_add_pr(fix2_S, gmx_load_pr(f+scix+2)));

        fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1);
        gmx_store_pr(f+sciy, gmx_add_pr(fiy0_S, gmx_load_pr(f+sciy)));
        fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3);
        gmx_store_pr(f+sciy+2, gmx_add_pr(fiy2_S, gmx_load_pr(f+sciy+2)));

        fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1);
        gmx_store_pr(f+sciz, gmx_add_pr(fiz0_S, gmx_load_pr(f+sciz)));
        fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3);
        gmx_store_pr(f+sciz+2, gmx_add_pr(fiz2_S, gmx_load_pr(f+sciz+2)));

#ifdef CALC_SHIFTFORCES
        gmx_store_pr(shf, gmx_add_pr(fix0_S, fix2_S));
        fshift[ish3+0] += shf[0] + shf[1];
        gmx_store_pr(shf, gmx_add_pr(fiy0_S, fiy2_S));
        fshift[ish3+1] += shf[0] + shf[1];
        gmx_store_pr(shf, gmx_add_pr(fiz0_S, fiz2_S));
        fshift[ish3+2] += shf[0] + shf[1];
#endif
#endif /* UNROLLJ >= 4 */
#ifdef CALC_ENERGIES
        gmx_store_pr(tmpsum, vctot_S);
        *Vc += SUM_SIMD(tmpsum);

        gmx_store_pr(tmpsum, Vvdwtot_S);
        *Vvdw += SUM_SIMD(tmpsum);
#endif
        /* Outer loop uses 6 flops/iteration */
    }

#ifdef COUNT_PAIRS
    printf("atom pairs %d\n", npair);
#endif
}