2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_128_fma_double
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_128_fma_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the two positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87 int vdwjidx0A,vdwjidx0B;
88 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89 int vdwjidx1A,vdwjidx1B;
90 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91 int vdwjidx2A,vdwjidx2B;
92 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93 int vdwjidx3A,vdwjidx3B;
94 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
99 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
102 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
103 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
104 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
105 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
108 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
112 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
113 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
114 real rswitch_scalar,d_scalar;
115 __m128d dummy_mask,cutoff_mask;
116 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
117 __m128d one = _mm_set1_pd(1.0);
118 __m128d two = _mm_set1_pd(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm_set1_pd(fr->ic->epsfac);
131 charge = mdatoms->chargeA;
132 krf = _mm_set1_pd(fr->ic->k_rf);
133 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
134 crf = _mm_set1_pd(fr->ic->c_rf);
135 nvdwtype = fr->ntype;
137 vdwtype = mdatoms->typeA;
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
142 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
143 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
144 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
146 jq1 = _mm_set1_pd(charge[inr+1]);
147 jq2 = _mm_set1_pd(charge[inr+2]);
148 jq3 = _mm_set1_pd(charge[inr+3]);
149 vdwjidx0A = 2*vdwtype[inr+0];
150 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
151 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
152 qq11 = _mm_mul_pd(iq1,jq1);
153 qq12 = _mm_mul_pd(iq1,jq2);
154 qq13 = _mm_mul_pd(iq1,jq3);
155 qq21 = _mm_mul_pd(iq2,jq1);
156 qq22 = _mm_mul_pd(iq2,jq2);
157 qq23 = _mm_mul_pd(iq2,jq3);
158 qq31 = _mm_mul_pd(iq3,jq1);
159 qq32 = _mm_mul_pd(iq3,jq2);
160 qq33 = _mm_mul_pd(iq3,jq3);
162 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
163 rcutoff_scalar = fr->ic->rcoulomb;
164 rcutoff = _mm_set1_pd(rcutoff_scalar);
165 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
167 rswitch_scalar = fr->ic->rvdw_switch;
168 rswitch = _mm_set1_pd(rswitch_scalar);
169 /* Setup switch parameters */
170 d_scalar = rcutoff_scalar-rswitch_scalar;
171 d = _mm_set1_pd(d_scalar);
172 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
173 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
174 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
175 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
176 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
177 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
179 /* Avoid stupid compiler warnings */
187 /* Start outer loop over neighborlists */
188 for(iidx=0; iidx<nri; iidx++)
190 /* Load shift vector for this list */
191 i_shift_offset = DIM*shiftidx[iidx];
193 /* Load limits for loop over neighbors */
194 j_index_start = jindex[iidx];
195 j_index_end = jindex[iidx+1];
197 /* Get outer coordinate index */
199 i_coord_offset = DIM*inr;
201 /* Load i particle coords and add shift vector */
202 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
203 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
205 fix0 = _mm_setzero_pd();
206 fiy0 = _mm_setzero_pd();
207 fiz0 = _mm_setzero_pd();
208 fix1 = _mm_setzero_pd();
209 fiy1 = _mm_setzero_pd();
210 fiz1 = _mm_setzero_pd();
211 fix2 = _mm_setzero_pd();
212 fiy2 = _mm_setzero_pd();
213 fiz2 = _mm_setzero_pd();
214 fix3 = _mm_setzero_pd();
215 fiy3 = _mm_setzero_pd();
216 fiz3 = _mm_setzero_pd();
218 /* Reset potential sums */
219 velecsum = _mm_setzero_pd();
220 vvdwsum = _mm_setzero_pd();
222 /* Start inner kernel loop */
223 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
226 /* Get j neighbor index, and coordinate index */
229 j_coord_offsetA = DIM*jnrA;
230 j_coord_offsetB = DIM*jnrB;
232 /* load j atom coordinates */
233 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
234 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
235 &jy2,&jz2,&jx3,&jy3,&jz3);
237 /* Calculate displacement vector */
238 dx00 = _mm_sub_pd(ix0,jx0);
239 dy00 = _mm_sub_pd(iy0,jy0);
240 dz00 = _mm_sub_pd(iz0,jz0);
241 dx11 = _mm_sub_pd(ix1,jx1);
242 dy11 = _mm_sub_pd(iy1,jy1);
243 dz11 = _mm_sub_pd(iz1,jz1);
244 dx12 = _mm_sub_pd(ix1,jx2);
245 dy12 = _mm_sub_pd(iy1,jy2);
246 dz12 = _mm_sub_pd(iz1,jz2);
247 dx13 = _mm_sub_pd(ix1,jx3);
248 dy13 = _mm_sub_pd(iy1,jy3);
249 dz13 = _mm_sub_pd(iz1,jz3);
250 dx21 = _mm_sub_pd(ix2,jx1);
251 dy21 = _mm_sub_pd(iy2,jy1);
252 dz21 = _mm_sub_pd(iz2,jz1);
253 dx22 = _mm_sub_pd(ix2,jx2);
254 dy22 = _mm_sub_pd(iy2,jy2);
255 dz22 = _mm_sub_pd(iz2,jz2);
256 dx23 = _mm_sub_pd(ix2,jx3);
257 dy23 = _mm_sub_pd(iy2,jy3);
258 dz23 = _mm_sub_pd(iz2,jz3);
259 dx31 = _mm_sub_pd(ix3,jx1);
260 dy31 = _mm_sub_pd(iy3,jy1);
261 dz31 = _mm_sub_pd(iz3,jz1);
262 dx32 = _mm_sub_pd(ix3,jx2);
263 dy32 = _mm_sub_pd(iy3,jy2);
264 dz32 = _mm_sub_pd(iz3,jz2);
265 dx33 = _mm_sub_pd(ix3,jx3);
266 dy33 = _mm_sub_pd(iy3,jy3);
267 dz33 = _mm_sub_pd(iz3,jz3);
269 /* Calculate squared distance and things based on it */
270 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
271 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
272 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
273 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
274 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
275 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
276 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
277 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
278 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
279 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
281 rinv00 = avx128fma_invsqrt_d(rsq00);
282 rinv11 = avx128fma_invsqrt_d(rsq11);
283 rinv12 = avx128fma_invsqrt_d(rsq12);
284 rinv13 = avx128fma_invsqrt_d(rsq13);
285 rinv21 = avx128fma_invsqrt_d(rsq21);
286 rinv22 = avx128fma_invsqrt_d(rsq22);
287 rinv23 = avx128fma_invsqrt_d(rsq23);
288 rinv31 = avx128fma_invsqrt_d(rsq31);
289 rinv32 = avx128fma_invsqrt_d(rsq32);
290 rinv33 = avx128fma_invsqrt_d(rsq33);
292 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
293 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
294 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
295 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
296 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
297 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
298 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
299 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
300 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
301 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
303 fjx0 = _mm_setzero_pd();
304 fjy0 = _mm_setzero_pd();
305 fjz0 = _mm_setzero_pd();
306 fjx1 = _mm_setzero_pd();
307 fjy1 = _mm_setzero_pd();
308 fjz1 = _mm_setzero_pd();
309 fjx2 = _mm_setzero_pd();
310 fjy2 = _mm_setzero_pd();
311 fjz2 = _mm_setzero_pd();
312 fjx3 = _mm_setzero_pd();
313 fjy3 = _mm_setzero_pd();
314 fjz3 = _mm_setzero_pd();
316 /**************************
317 * CALCULATE INTERACTIONS *
318 **************************/
320 if (gmx_mm_any_lt(rsq00,rcutoff2))
323 r00 = _mm_mul_pd(rsq00,rinv00);
325 /* LENNARD-JONES DISPERSION/REPULSION */
327 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
328 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
329 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
330 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
331 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
333 d = _mm_sub_pd(r00,rswitch);
334 d = _mm_max_pd(d,_mm_setzero_pd());
335 d2 = _mm_mul_pd(d,d);
336 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
338 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
340 /* Evaluate switch function */
341 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
342 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
343 vvdw = _mm_mul_pd(vvdw,sw);
344 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
346 /* Update potential sum for this i atom from the interaction with this j atom. */
347 vvdw = _mm_and_pd(vvdw,cutoff_mask);
348 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
352 fscal = _mm_and_pd(fscal,cutoff_mask);
354 /* Update vectorial force */
355 fix0 = _mm_macc_pd(dx00,fscal,fix0);
356 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
357 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
359 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
360 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
361 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
365 /**************************
366 * CALCULATE INTERACTIONS *
367 **************************/
369 if (gmx_mm_any_lt(rsq11,rcutoff2))
372 /* REACTION-FIELD ELECTROSTATICS */
373 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_macc_pd(krf,rsq11,rinv11),crf));
374 felec = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
376 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
378 /* Update potential sum for this i atom from the interaction with this j atom. */
379 velec = _mm_and_pd(velec,cutoff_mask);
380 velecsum = _mm_add_pd(velecsum,velec);
384 fscal = _mm_and_pd(fscal,cutoff_mask);
386 /* Update vectorial force */
387 fix1 = _mm_macc_pd(dx11,fscal,fix1);
388 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
389 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
391 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
392 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
393 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
397 /**************************
398 * CALCULATE INTERACTIONS *
399 **************************/
401 if (gmx_mm_any_lt(rsq12,rcutoff2))
404 /* REACTION-FIELD ELECTROSTATICS */
405 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_macc_pd(krf,rsq12,rinv12),crf));
406 felec = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
408 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
410 /* Update potential sum for this i atom from the interaction with this j atom. */
411 velec = _mm_and_pd(velec,cutoff_mask);
412 velecsum = _mm_add_pd(velecsum,velec);
416 fscal = _mm_and_pd(fscal,cutoff_mask);
418 /* Update vectorial force */
419 fix1 = _mm_macc_pd(dx12,fscal,fix1);
420 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
421 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
423 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
424 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
425 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
429 /**************************
430 * CALCULATE INTERACTIONS *
431 **************************/
433 if (gmx_mm_any_lt(rsq13,rcutoff2))
436 /* REACTION-FIELD ELECTROSTATICS */
437 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_macc_pd(krf,rsq13,rinv13),crf));
438 felec = _mm_mul_pd(qq13,_mm_msub_pd(rinv13,rinvsq13,krf2));
440 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
442 /* Update potential sum for this i atom from the interaction with this j atom. */
443 velec = _mm_and_pd(velec,cutoff_mask);
444 velecsum = _mm_add_pd(velecsum,velec);
448 fscal = _mm_and_pd(fscal,cutoff_mask);
450 /* Update vectorial force */
451 fix1 = _mm_macc_pd(dx13,fscal,fix1);
452 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
453 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
455 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
456 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
457 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
461 /**************************
462 * CALCULATE INTERACTIONS *
463 **************************/
465 if (gmx_mm_any_lt(rsq21,rcutoff2))
468 /* REACTION-FIELD ELECTROSTATICS */
469 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_macc_pd(krf,rsq21,rinv21),crf));
470 felec = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
472 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
474 /* Update potential sum for this i atom from the interaction with this j atom. */
475 velec = _mm_and_pd(velec,cutoff_mask);
476 velecsum = _mm_add_pd(velecsum,velec);
480 fscal = _mm_and_pd(fscal,cutoff_mask);
482 /* Update vectorial force */
483 fix2 = _mm_macc_pd(dx21,fscal,fix2);
484 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
485 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
487 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
488 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
489 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
493 /**************************
494 * CALCULATE INTERACTIONS *
495 **************************/
497 if (gmx_mm_any_lt(rsq22,rcutoff2))
500 /* REACTION-FIELD ELECTROSTATICS */
501 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_macc_pd(krf,rsq22,rinv22),crf));
502 felec = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
504 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
506 /* Update potential sum for this i atom from the interaction with this j atom. */
507 velec = _mm_and_pd(velec,cutoff_mask);
508 velecsum = _mm_add_pd(velecsum,velec);
512 fscal = _mm_and_pd(fscal,cutoff_mask);
514 /* Update vectorial force */
515 fix2 = _mm_macc_pd(dx22,fscal,fix2);
516 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
517 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
519 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
520 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
521 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
525 /**************************
526 * CALCULATE INTERACTIONS *
527 **************************/
529 if (gmx_mm_any_lt(rsq23,rcutoff2))
532 /* REACTION-FIELD ELECTROSTATICS */
533 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_macc_pd(krf,rsq23,rinv23),crf));
534 felec = _mm_mul_pd(qq23,_mm_msub_pd(rinv23,rinvsq23,krf2));
536 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
538 /* Update potential sum for this i atom from the interaction with this j atom. */
539 velec = _mm_and_pd(velec,cutoff_mask);
540 velecsum = _mm_add_pd(velecsum,velec);
544 fscal = _mm_and_pd(fscal,cutoff_mask);
546 /* Update vectorial force */
547 fix2 = _mm_macc_pd(dx23,fscal,fix2);
548 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
549 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
551 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
552 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
553 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
557 /**************************
558 * CALCULATE INTERACTIONS *
559 **************************/
561 if (gmx_mm_any_lt(rsq31,rcutoff2))
564 /* REACTION-FIELD ELECTROSTATICS */
565 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_macc_pd(krf,rsq31,rinv31),crf));
566 felec = _mm_mul_pd(qq31,_mm_msub_pd(rinv31,rinvsq31,krf2));
568 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
570 /* Update potential sum for this i atom from the interaction with this j atom. */
571 velec = _mm_and_pd(velec,cutoff_mask);
572 velecsum = _mm_add_pd(velecsum,velec);
576 fscal = _mm_and_pd(fscal,cutoff_mask);
578 /* Update vectorial force */
579 fix3 = _mm_macc_pd(dx31,fscal,fix3);
580 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
581 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
583 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
584 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
585 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
589 /**************************
590 * CALCULATE INTERACTIONS *
591 **************************/
593 if (gmx_mm_any_lt(rsq32,rcutoff2))
596 /* REACTION-FIELD ELECTROSTATICS */
597 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_macc_pd(krf,rsq32,rinv32),crf));
598 felec = _mm_mul_pd(qq32,_mm_msub_pd(rinv32,rinvsq32,krf2));
600 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
602 /* Update potential sum for this i atom from the interaction with this j atom. */
603 velec = _mm_and_pd(velec,cutoff_mask);
604 velecsum = _mm_add_pd(velecsum,velec);
608 fscal = _mm_and_pd(fscal,cutoff_mask);
610 /* Update vectorial force */
611 fix3 = _mm_macc_pd(dx32,fscal,fix3);
612 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
613 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
615 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
616 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
617 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
621 /**************************
622 * CALCULATE INTERACTIONS *
623 **************************/
625 if (gmx_mm_any_lt(rsq33,rcutoff2))
628 /* REACTION-FIELD ELECTROSTATICS */
629 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_macc_pd(krf,rsq33,rinv33),crf));
630 felec = _mm_mul_pd(qq33,_mm_msub_pd(rinv33,rinvsq33,krf2));
632 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
634 /* Update potential sum for this i atom from the interaction with this j atom. */
635 velec = _mm_and_pd(velec,cutoff_mask);
636 velecsum = _mm_add_pd(velecsum,velec);
640 fscal = _mm_and_pd(fscal,cutoff_mask);
642 /* Update vectorial force */
643 fix3 = _mm_macc_pd(dx33,fscal,fix3);
644 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
645 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
647 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
648 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
649 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
653 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
655 /* Inner loop uses 416 flops */
662 j_coord_offsetA = DIM*jnrA;
664 /* load j atom coordinates */
665 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
666 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
667 &jy2,&jz2,&jx3,&jy3,&jz3);
669 /* Calculate displacement vector */
670 dx00 = _mm_sub_pd(ix0,jx0);
671 dy00 = _mm_sub_pd(iy0,jy0);
672 dz00 = _mm_sub_pd(iz0,jz0);
673 dx11 = _mm_sub_pd(ix1,jx1);
674 dy11 = _mm_sub_pd(iy1,jy1);
675 dz11 = _mm_sub_pd(iz1,jz1);
676 dx12 = _mm_sub_pd(ix1,jx2);
677 dy12 = _mm_sub_pd(iy1,jy2);
678 dz12 = _mm_sub_pd(iz1,jz2);
679 dx13 = _mm_sub_pd(ix1,jx3);
680 dy13 = _mm_sub_pd(iy1,jy3);
681 dz13 = _mm_sub_pd(iz1,jz3);
682 dx21 = _mm_sub_pd(ix2,jx1);
683 dy21 = _mm_sub_pd(iy2,jy1);
684 dz21 = _mm_sub_pd(iz2,jz1);
685 dx22 = _mm_sub_pd(ix2,jx2);
686 dy22 = _mm_sub_pd(iy2,jy2);
687 dz22 = _mm_sub_pd(iz2,jz2);
688 dx23 = _mm_sub_pd(ix2,jx3);
689 dy23 = _mm_sub_pd(iy2,jy3);
690 dz23 = _mm_sub_pd(iz2,jz3);
691 dx31 = _mm_sub_pd(ix3,jx1);
692 dy31 = _mm_sub_pd(iy3,jy1);
693 dz31 = _mm_sub_pd(iz3,jz1);
694 dx32 = _mm_sub_pd(ix3,jx2);
695 dy32 = _mm_sub_pd(iy3,jy2);
696 dz32 = _mm_sub_pd(iz3,jz2);
697 dx33 = _mm_sub_pd(ix3,jx3);
698 dy33 = _mm_sub_pd(iy3,jy3);
699 dz33 = _mm_sub_pd(iz3,jz3);
701 /* Calculate squared distance and things based on it */
702 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
703 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
704 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
705 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
706 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
707 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
708 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
709 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
710 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
711 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
713 rinv00 = avx128fma_invsqrt_d(rsq00);
714 rinv11 = avx128fma_invsqrt_d(rsq11);
715 rinv12 = avx128fma_invsqrt_d(rsq12);
716 rinv13 = avx128fma_invsqrt_d(rsq13);
717 rinv21 = avx128fma_invsqrt_d(rsq21);
718 rinv22 = avx128fma_invsqrt_d(rsq22);
719 rinv23 = avx128fma_invsqrt_d(rsq23);
720 rinv31 = avx128fma_invsqrt_d(rsq31);
721 rinv32 = avx128fma_invsqrt_d(rsq32);
722 rinv33 = avx128fma_invsqrt_d(rsq33);
724 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
725 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
726 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
727 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
728 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
729 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
730 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
731 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
732 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
733 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
735 fjx0 = _mm_setzero_pd();
736 fjy0 = _mm_setzero_pd();
737 fjz0 = _mm_setzero_pd();
738 fjx1 = _mm_setzero_pd();
739 fjy1 = _mm_setzero_pd();
740 fjz1 = _mm_setzero_pd();
741 fjx2 = _mm_setzero_pd();
742 fjy2 = _mm_setzero_pd();
743 fjz2 = _mm_setzero_pd();
744 fjx3 = _mm_setzero_pd();
745 fjy3 = _mm_setzero_pd();
746 fjz3 = _mm_setzero_pd();
748 /**************************
749 * CALCULATE INTERACTIONS *
750 **************************/
752 if (gmx_mm_any_lt(rsq00,rcutoff2))
755 r00 = _mm_mul_pd(rsq00,rinv00);
757 /* LENNARD-JONES DISPERSION/REPULSION */
759 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
760 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
761 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
762 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
763 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
765 d = _mm_sub_pd(r00,rswitch);
766 d = _mm_max_pd(d,_mm_setzero_pd());
767 d2 = _mm_mul_pd(d,d);
768 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
770 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
772 /* Evaluate switch function */
773 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
774 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
775 vvdw = _mm_mul_pd(vvdw,sw);
776 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
778 /* Update potential sum for this i atom from the interaction with this j atom. */
779 vvdw = _mm_and_pd(vvdw,cutoff_mask);
780 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
781 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
785 fscal = _mm_and_pd(fscal,cutoff_mask);
787 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
789 /* Update vectorial force */
790 fix0 = _mm_macc_pd(dx00,fscal,fix0);
791 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
792 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
794 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
795 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
796 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
800 /**************************
801 * CALCULATE INTERACTIONS *
802 **************************/
804 if (gmx_mm_any_lt(rsq11,rcutoff2))
807 /* REACTION-FIELD ELECTROSTATICS */
808 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_macc_pd(krf,rsq11,rinv11),crf));
809 felec = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
811 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
813 /* Update potential sum for this i atom from the interaction with this j atom. */
814 velec = _mm_and_pd(velec,cutoff_mask);
815 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
816 velecsum = _mm_add_pd(velecsum,velec);
820 fscal = _mm_and_pd(fscal,cutoff_mask);
822 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
824 /* Update vectorial force */
825 fix1 = _mm_macc_pd(dx11,fscal,fix1);
826 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
827 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
829 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
830 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
831 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 if (gmx_mm_any_lt(rsq12,rcutoff2))
842 /* REACTION-FIELD ELECTROSTATICS */
843 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_macc_pd(krf,rsq12,rinv12),crf));
844 felec = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
846 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 velec = _mm_and_pd(velec,cutoff_mask);
850 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
851 velecsum = _mm_add_pd(velecsum,velec);
855 fscal = _mm_and_pd(fscal,cutoff_mask);
857 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
859 /* Update vectorial force */
860 fix1 = _mm_macc_pd(dx12,fscal,fix1);
861 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
862 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
864 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
865 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
866 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
870 /**************************
871 * CALCULATE INTERACTIONS *
872 **************************/
874 if (gmx_mm_any_lt(rsq13,rcutoff2))
877 /* REACTION-FIELD ELECTROSTATICS */
878 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_macc_pd(krf,rsq13,rinv13),crf));
879 felec = _mm_mul_pd(qq13,_mm_msub_pd(rinv13,rinvsq13,krf2));
881 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
883 /* Update potential sum for this i atom from the interaction with this j atom. */
884 velec = _mm_and_pd(velec,cutoff_mask);
885 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
886 velecsum = _mm_add_pd(velecsum,velec);
890 fscal = _mm_and_pd(fscal,cutoff_mask);
892 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
894 /* Update vectorial force */
895 fix1 = _mm_macc_pd(dx13,fscal,fix1);
896 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
897 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
899 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
900 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
901 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
905 /**************************
906 * CALCULATE INTERACTIONS *
907 **************************/
909 if (gmx_mm_any_lt(rsq21,rcutoff2))
912 /* REACTION-FIELD ELECTROSTATICS */
913 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_macc_pd(krf,rsq21,rinv21),crf));
914 felec = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
916 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
918 /* Update potential sum for this i atom from the interaction with this j atom. */
919 velec = _mm_and_pd(velec,cutoff_mask);
920 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
921 velecsum = _mm_add_pd(velecsum,velec);
925 fscal = _mm_and_pd(fscal,cutoff_mask);
927 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
929 /* Update vectorial force */
930 fix2 = _mm_macc_pd(dx21,fscal,fix2);
931 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
932 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
934 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
935 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
936 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
940 /**************************
941 * CALCULATE INTERACTIONS *
942 **************************/
944 if (gmx_mm_any_lt(rsq22,rcutoff2))
947 /* REACTION-FIELD ELECTROSTATICS */
948 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_macc_pd(krf,rsq22,rinv22),crf));
949 felec = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
951 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
953 /* Update potential sum for this i atom from the interaction with this j atom. */
954 velec = _mm_and_pd(velec,cutoff_mask);
955 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
956 velecsum = _mm_add_pd(velecsum,velec);
960 fscal = _mm_and_pd(fscal,cutoff_mask);
962 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
964 /* Update vectorial force */
965 fix2 = _mm_macc_pd(dx22,fscal,fix2);
966 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
967 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
969 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
970 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
971 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
975 /**************************
976 * CALCULATE INTERACTIONS *
977 **************************/
979 if (gmx_mm_any_lt(rsq23,rcutoff2))
982 /* REACTION-FIELD ELECTROSTATICS */
983 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_macc_pd(krf,rsq23,rinv23),crf));
984 felec = _mm_mul_pd(qq23,_mm_msub_pd(rinv23,rinvsq23,krf2));
986 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
988 /* Update potential sum for this i atom from the interaction with this j atom. */
989 velec = _mm_and_pd(velec,cutoff_mask);
990 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
991 velecsum = _mm_add_pd(velecsum,velec);
995 fscal = _mm_and_pd(fscal,cutoff_mask);
997 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
999 /* Update vectorial force */
1000 fix2 = _mm_macc_pd(dx23,fscal,fix2);
1001 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
1002 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
1004 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
1005 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
1006 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
1010 /**************************
1011 * CALCULATE INTERACTIONS *
1012 **************************/
1014 if (gmx_mm_any_lt(rsq31,rcutoff2))
1017 /* REACTION-FIELD ELECTROSTATICS */
1018 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_macc_pd(krf,rsq31,rinv31),crf));
1019 felec = _mm_mul_pd(qq31,_mm_msub_pd(rinv31,rinvsq31,krf2));
1021 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1023 /* Update potential sum for this i atom from the interaction with this j atom. */
1024 velec = _mm_and_pd(velec,cutoff_mask);
1025 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1026 velecsum = _mm_add_pd(velecsum,velec);
1030 fscal = _mm_and_pd(fscal,cutoff_mask);
1032 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1034 /* Update vectorial force */
1035 fix3 = _mm_macc_pd(dx31,fscal,fix3);
1036 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
1037 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
1039 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
1040 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
1041 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
1045 /**************************
1046 * CALCULATE INTERACTIONS *
1047 **************************/
1049 if (gmx_mm_any_lt(rsq32,rcutoff2))
1052 /* REACTION-FIELD ELECTROSTATICS */
1053 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_macc_pd(krf,rsq32,rinv32),crf));
1054 felec = _mm_mul_pd(qq32,_mm_msub_pd(rinv32,rinvsq32,krf2));
1056 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1058 /* Update potential sum for this i atom from the interaction with this j atom. */
1059 velec = _mm_and_pd(velec,cutoff_mask);
1060 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1061 velecsum = _mm_add_pd(velecsum,velec);
1065 fscal = _mm_and_pd(fscal,cutoff_mask);
1067 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1069 /* Update vectorial force */
1070 fix3 = _mm_macc_pd(dx32,fscal,fix3);
1071 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
1072 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
1074 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
1075 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
1076 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
1080 /**************************
1081 * CALCULATE INTERACTIONS *
1082 **************************/
1084 if (gmx_mm_any_lt(rsq33,rcutoff2))
1087 /* REACTION-FIELD ELECTROSTATICS */
1088 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_macc_pd(krf,rsq33,rinv33),crf));
1089 felec = _mm_mul_pd(qq33,_mm_msub_pd(rinv33,rinvsq33,krf2));
1091 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1093 /* Update potential sum for this i atom from the interaction with this j atom. */
1094 velec = _mm_and_pd(velec,cutoff_mask);
1095 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1096 velecsum = _mm_add_pd(velecsum,velec);
1100 fscal = _mm_and_pd(fscal,cutoff_mask);
1102 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1104 /* Update vectorial force */
1105 fix3 = _mm_macc_pd(dx33,fscal,fix3);
1106 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
1107 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
1109 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
1110 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
1111 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
1115 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1117 /* Inner loop uses 416 flops */
1120 /* End of innermost loop */
1122 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1123 f+i_coord_offset,fshift+i_shift_offset);
1126 /* Update potential energies */
1127 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1128 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1130 /* Increment number of inner iterations */
1131 inneriter += j_index_end - j_index_start;
1133 /* Outer loop uses 26 flops */
1136 /* Increment number of outer iterations */
1139 /* Update outer/inner flops */
1141 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*416);
1144 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_128_fma_double
1145 * Electrostatics interaction: ReactionField
1146 * VdW interaction: LennardJones
1147 * Geometry: Water4-Water4
1148 * Calculate force/pot: Force
/*
 * Force-only water-water kernel (four sites per water on both the i and j side)
 * written with 128-bit double-precision SIMD values (__m128d) and fused
 * multiply-add intrinsics.
 *
 * For every i entry of the neighbour list the visible code:
 *   - loads the i coordinates together with the shift vector,
 *   - walks the j entries two at a time (suffixes A and B) and then runs a
 *     single-entry remainder pass when one entry is left over,
 *   - evaluates switched Lennard-Jones for site pair 0-0 and reaction-field
 *     electrostatics for the nine pairs between sites 1..3, each one masked
 *     with rsq < rcutoff2,
 *   - accumulates dx*fscal into the i sums (fix/fiy/fiz) and the j sums
 *     (fjx/fjy/fjz) and passes them to the gmx_mm_decrement_4rvec_* and
 *     gmx_mm_update_iforce_4atom_swizzle_pd helpers.
 *
 * nlist        neighbour list; jindex, shift and iinr[0] are read below
 * xx, ff       presumably the coordinate and force arrays behind x and f
 *              (the assignments are not visible here - TODO confirm)
 * fr           supplies shift_vec, fshift, ntype, nbfp and the ic-> values
 *              epsfac, k_rf, c_rf, rcoulomb and rvdw_switch
 * mdatoms      supplies chargeA and typeA
 * kernel_data  marked gmx_unused; not referenced in the visible body
 * nrnb         flop counter, updated once through inc_nrnb() at the end
 *
 * NOTE(review): the statements that assign fscal, jnrA, x, f, nri and jjnr are
 * not visible in this view; confirm them before relying on the notes below.
 */
1151 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_128_fma_double
1152 (t_nblist * gmx_restrict nlist,
1153 rvec * gmx_restrict xx,
1154 rvec * gmx_restrict ff,
1155 struct t_forcerec * gmx_restrict fr,
1156 t_mdatoms * gmx_restrict mdatoms,
1157 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1158 t_nrnb * gmx_restrict nrnb)
1160 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1161 * just 0 for non-waters.
1162 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1163 * jnr indices corresponding to data put in the two positions in the SIMD register.
1165 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1166 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1168 int j_coord_offsetA,j_coord_offsetB;
1169 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1170 real rcutoff_scalar;
1171 real *shiftvec,*fshift,*x,*f;
1172 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1174 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1176 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1178 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1180 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1181 int vdwjidx0A,vdwjidx0B;
1182 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1183 int vdwjidx1A,vdwjidx1B;
1184 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1185 int vdwjidx2A,vdwjidx2B;
1186 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1187 int vdwjidx3A,vdwjidx3B;
1188 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1189 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1190 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1191 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1192 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1193 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1194 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1195 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1196 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1197 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1198 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1199 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1202 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1205 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1206 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1207 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1208 real rswitch_scalar,d_scalar;
1209 __m128d dummy_mask,cutoff_mask;
1210 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1211 __m128d one = _mm_set1_pd(1.0);
1212 __m128d two = _mm_set1_pd(2.0);
1218 jindex = nlist->jindex;
1220 shiftidx = nlist->shift;
1222 shiftvec = fr->shift_vec[0];
1223 fshift = fr->fshift[0];
1224 facel = _mm_set1_pd(fr->ic->epsfac);
1225 charge = mdatoms->chargeA;
1226 krf = _mm_set1_pd(fr->ic->k_rf);
1227 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1228 crf = _mm_set1_pd(fr->ic->c_rf);
1229 nvdwtype = fr->ntype;
1230 vdwparam = fr->nbfp;
1231 vdwtype = mdatoms->typeA;
1233 /* Setup water-specific parameters */
1234 inr = nlist->iinr[0];
1235 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1236 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1237 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1238 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1240 jq1 = _mm_set1_pd(charge[inr+1]);
1241 jq2 = _mm_set1_pd(charge[inr+2]);
1242 jq3 = _mm_set1_pd(charge[inr+3]);
1243 vdwjidx0A = 2*vdwtype[inr+0];
1244 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1245 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1246 qq11 = _mm_mul_pd(iq1,jq1);
1247 qq12 = _mm_mul_pd(iq1,jq2);
1248 qq13 = _mm_mul_pd(iq1,jq3);
1249 qq21 = _mm_mul_pd(iq2,jq1);
1250 qq22 = _mm_mul_pd(iq2,jq2);
1251 qq23 = _mm_mul_pd(iq2,jq3);
1252 qq31 = _mm_mul_pd(iq3,jq1);
1253 qq32 = _mm_mul_pd(iq3,jq2);
1254 qq33 = _mm_mul_pd(iq3,jq3);
1256 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1257 rcutoff_scalar = fr->ic->rcoulomb;
1258 rcutoff = _mm_set1_pd(rcutoff_scalar);
1259 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
/* The switch acts between rswitch and rcutoff: d_scalar is the width of that
 * range, and swV3..swV5 / swF2..swF4 are the coefficients of the sw and dsw
 * polynomials evaluated in the 0-0 pair blocks further down. */
1261 rswitch_scalar = fr->ic->rvdw_switch;
1262 rswitch = _mm_set1_pd(rswitch_scalar);
1263 /* Setup switch parameters */
1264 d_scalar = rcutoff_scalar-rswitch_scalar;
1265 d = _mm_set1_pd(d_scalar);
1266 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1267 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1268 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1269 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1270 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1271 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1273 /* Avoid stupid compiler warnings */
1275 j_coord_offsetA = 0;
1276 j_coord_offsetB = 0;
1281 /* Start outer loop over neighborlists */
1282 for(iidx=0; iidx<nri; iidx++)
1284 /* Load shift vector for this list */
1285 i_shift_offset = DIM*shiftidx[iidx];
1287 /* Load limits for loop over neighbors */
1288 j_index_start = jindex[iidx];
1289 j_index_end = jindex[iidx+1];
1291 /* Get outer coordinate index */
1293 i_coord_offset = DIM*inr;
1295 /* Load i particle coords and add shift vector */
1296 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1297 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* i-side force sums are reset for every outer-loop entry */
1299 fix0 = _mm_setzero_pd();
1300 fiy0 = _mm_setzero_pd();
1301 fiz0 = _mm_setzero_pd();
1302 fix1 = _mm_setzero_pd();
1303 fiy1 = _mm_setzero_pd();
1304 fiz1 = _mm_setzero_pd();
1305 fix2 = _mm_setzero_pd();
1306 fiy2 = _mm_setzero_pd();
1307 fiz2 = _mm_setzero_pd();
1308 fix3 = _mm_setzero_pd();
1309 fiy3 = _mm_setzero_pd();
1310 fiz3 = _mm_setzero_pd();
1312 /* Start inner kernel loop */
/* Unrolled by two: entries jidx and jidx+1 are handled together (suffixes A
 * and B); a trailing single entry is left for the block after this loop. */
1313 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1316 /* Get j neighbor index, and coordinate index */
1318 jnrB = jjnr[jidx+1];
1319 j_coord_offsetA = DIM*jnrA;
1320 j_coord_offsetB = DIM*jnrB;
1322 /* load j atom coordinates */
1323 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1324 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1325 &jy2,&jz2,&jx3,&jy3,&jz3);
1327 /* Calculate displacement vector */
1328 dx00 = _mm_sub_pd(ix0,jx0);
1329 dy00 = _mm_sub_pd(iy0,jy0);
1330 dz00 = _mm_sub_pd(iz0,jz0);
1331 dx11 = _mm_sub_pd(ix1,jx1);
1332 dy11 = _mm_sub_pd(iy1,jy1);
1333 dz11 = _mm_sub_pd(iz1,jz1);
1334 dx12 = _mm_sub_pd(ix1,jx2);
1335 dy12 = _mm_sub_pd(iy1,jy2);
1336 dz12 = _mm_sub_pd(iz1,jz2);
1337 dx13 = _mm_sub_pd(ix1,jx3);
1338 dy13 = _mm_sub_pd(iy1,jy3);
1339 dz13 = _mm_sub_pd(iz1,jz3);
1340 dx21 = _mm_sub_pd(ix2,jx1);
1341 dy21 = _mm_sub_pd(iy2,jy1);
1342 dz21 = _mm_sub_pd(iz2,jz1);
1343 dx22 = _mm_sub_pd(ix2,jx2);
1344 dy22 = _mm_sub_pd(iy2,jy2);
1345 dz22 = _mm_sub_pd(iz2,jz2);
1346 dx23 = _mm_sub_pd(ix2,jx3);
1347 dy23 = _mm_sub_pd(iy2,jy3);
1348 dz23 = _mm_sub_pd(iz2,jz3);
1349 dx31 = _mm_sub_pd(ix3,jx1);
1350 dy31 = _mm_sub_pd(iy3,jy1);
1351 dz31 = _mm_sub_pd(iz3,jz1);
1352 dx32 = _mm_sub_pd(ix3,jx2);
1353 dy32 = _mm_sub_pd(iy3,jy2);
1354 dz32 = _mm_sub_pd(iz3,jz2);
1355 dx33 = _mm_sub_pd(ix3,jx3);
1356 dy33 = _mm_sub_pd(iy3,jy3);
1357 dz33 = _mm_sub_pd(iz3,jz3);
1359 /* Calculate squared distance and things based on it */
1360 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1361 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1362 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1363 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1364 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1365 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1366 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1367 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1368 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1369 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1371 rinv00 = avx128fma_invsqrt_d(rsq00);
1372 rinv11 = avx128fma_invsqrt_d(rsq11);
1373 rinv12 = avx128fma_invsqrt_d(rsq12);
1374 rinv13 = avx128fma_invsqrt_d(rsq13);
1375 rinv21 = avx128fma_invsqrt_d(rsq21);
1376 rinv22 = avx128fma_invsqrt_d(rsq22);
1377 rinv23 = avx128fma_invsqrt_d(rsq23);
1378 rinv31 = avx128fma_invsqrt_d(rsq31);
1379 rinv32 = avx128fma_invsqrt_d(rsq32);
1380 rinv33 = avx128fma_invsqrt_d(rsq33);
1382 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1383 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1384 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1385 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1386 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1387 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1388 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1389 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1390 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1391 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
/* j-side force sums start from zero for each pair of j entries */
1393 fjx0 = _mm_setzero_pd();
1394 fjy0 = _mm_setzero_pd();
1395 fjz0 = _mm_setzero_pd();
1396 fjx1 = _mm_setzero_pd();
1397 fjy1 = _mm_setzero_pd();
1398 fjz1 = _mm_setzero_pd();
1399 fjx2 = _mm_setzero_pd();
1400 fjy2 = _mm_setzero_pd();
1401 fjz2 = _mm_setzero_pd();
1402 fjx3 = _mm_setzero_pd();
1403 fjy3 = _mm_setzero_pd();
1404 fjz3 = _mm_setzero_pd();
1406 /**************************
1407 * CALCULATE INTERACTIONS *
1408 **************************/
/* Site pair 0-0: Lennard-Jones only; no qq00 product is set up in this kernel.
 * The whole block is skipped when neither lane is inside the cutoff. */
1410 if (gmx_mm_any_lt(rsq00,rcutoff2))
1413 r00 = _mm_mul_pd(rsq00,rinv00);
1415 /* LENNARD-JONES DISPERSION/REPULSION */
1417 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1418 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1419 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1420 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
1421 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
/* Distance into the switch range, clamped at zero so that sw = 1 and dsw = 0
 * for r00 <= rswitch. */
1423 d = _mm_sub_pd(r00,rswitch);
1424 d = _mm_max_pd(d,_mm_setzero_pd());
1425 d2 = _mm_mul_pd(d,d);
1426 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1428 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1430 /* Evaluate switch function */
1431 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1432 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
/* The mask is all-ones in lanes with rsq00 < rcutoff2; ANDing clears fscal in
 * the other lanes. */
1433 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1437 fscal = _mm_and_pd(fscal,cutoff_mask);
1439 /* Update vectorial force */
1440 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1441 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1442 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1444 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1445 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1446 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1450 /**************************
1451 * CALCULATE INTERACTIONS *
1452 **************************/
/* Site pair 1-1: reaction-field term qq*(rinv^3 - 2*k_rf). The eight blocks
 * that follow repeat the same pattern for the remaining pairs of sites 1..3. */
1454 if (gmx_mm_any_lt(rsq11,rcutoff2))
1457 /* REACTION-FIELD ELECTROSTATICS */
1458 felec = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
1460 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1464 fscal = _mm_and_pd(fscal,cutoff_mask);
1466 /* Update vectorial force */
1467 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1468 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1469 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1471 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1472 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1473 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1477 /**************************
1478 * CALCULATE INTERACTIONS *
1479 **************************/
1481 if (gmx_mm_any_lt(rsq12,rcutoff2))
1484 /* REACTION-FIELD ELECTROSTATICS */
1485 felec = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
1487 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1491 fscal = _mm_and_pd(fscal,cutoff_mask);
1493 /* Update vectorial force */
1494 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1495 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1496 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1498 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1499 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1500 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1504 /**************************
1505 * CALCULATE INTERACTIONS *
1506 **************************/
1508 if (gmx_mm_any_lt(rsq13,rcutoff2))
1511 /* REACTION-FIELD ELECTROSTATICS */
1512 felec = _mm_mul_pd(qq13,_mm_msub_pd(rinv13,rinvsq13,krf2));
1514 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1518 fscal = _mm_and_pd(fscal,cutoff_mask);
1520 /* Update vectorial force */
1521 fix1 = _mm_macc_pd(dx13,fscal,fix1);
1522 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
1523 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
1525 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
1526 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
1527 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
1531 /**************************
1532 * CALCULATE INTERACTIONS *
1533 **************************/
1535 if (gmx_mm_any_lt(rsq21,rcutoff2))
1538 /* REACTION-FIELD ELECTROSTATICS */
1539 felec = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
1541 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1545 fscal = _mm_and_pd(fscal,cutoff_mask);
1547 /* Update vectorial force */
1548 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1549 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1550 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1552 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1553 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1554 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1558 /**************************
1559 * CALCULATE INTERACTIONS *
1560 **************************/
1562 if (gmx_mm_any_lt(rsq22,rcutoff2))
1565 /* REACTION-FIELD ELECTROSTATICS */
1566 felec = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
1568 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1572 fscal = _mm_and_pd(fscal,cutoff_mask);
1574 /* Update vectorial force */
1575 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1576 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1577 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1579 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1580 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1581 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1585 /**************************
1586 * CALCULATE INTERACTIONS *
1587 **************************/
1589 if (gmx_mm_any_lt(rsq23,rcutoff2))
1592 /* REACTION-FIELD ELECTROSTATICS */
1593 felec = _mm_mul_pd(qq23,_mm_msub_pd(rinv23,rinvsq23,krf2));
1595 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1599 fscal = _mm_and_pd(fscal,cutoff_mask);
1601 /* Update vectorial force */
1602 fix2 = _mm_macc_pd(dx23,fscal,fix2);
1603 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
1604 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
1606 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
1607 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
1608 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
1612 /**************************
1613 * CALCULATE INTERACTIONS *
1614 **************************/
1616 if (gmx_mm_any_lt(rsq31,rcutoff2))
1619 /* REACTION-FIELD ELECTROSTATICS */
1620 felec = _mm_mul_pd(qq31,_mm_msub_pd(rinv31,rinvsq31,krf2));
1622 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1626 fscal = _mm_and_pd(fscal,cutoff_mask);
1628 /* Update vectorial force */
1629 fix3 = _mm_macc_pd(dx31,fscal,fix3);
1630 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
1631 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
1633 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
1634 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
1635 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
1639 /**************************
1640 * CALCULATE INTERACTIONS *
1641 **************************/
1643 if (gmx_mm_any_lt(rsq32,rcutoff2))
1646 /* REACTION-FIELD ELECTROSTATICS */
1647 felec = _mm_mul_pd(qq32,_mm_msub_pd(rinv32,rinvsq32,krf2));
1649 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1653 fscal = _mm_and_pd(fscal,cutoff_mask);
1655 /* Update vectorial force */
1656 fix3 = _mm_macc_pd(dx32,fscal,fix3);
1657 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
1658 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
1660 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
1661 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
1662 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
1666 /**************************
1667 * CALCULATE INTERACTIONS *
1668 **************************/
1670 if (gmx_mm_any_lt(rsq33,rcutoff2))
1673 /* REACTION-FIELD ELECTROSTATICS */
1674 felec = _mm_mul_pd(qq33,_mm_msub_pd(rinv33,rinvsq33,krf2));
1676 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1680 fscal = _mm_and_pd(fscal,cutoff_mask);
1682 /* Update vectorial force */
1683 fix3 = _mm_macc_pd(dx33,fscal,fix3);
1684 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
1685 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
1687 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
1688 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
1689 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
/* Pass the accumulated j-side sums for both j entries (A and B) to the
 * decrement helper. */
1693 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1695 /* Inner loop uses 359 flops */
/* Remainder pass: exactly one j entry is left when the loop above stopped at
 * jidx == j_index_end-1. Only a single coordinate pointer is loaded here. */
1698 if(jidx<j_index_end)
1702 j_coord_offsetA = DIM*jnrA;
1704 /* load j atom coordinates */
1705 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1706 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1707 &jy2,&jz2,&jx3,&jy3,&jz3);
1709 /* Calculate displacement vector */
1710 dx00 = _mm_sub_pd(ix0,jx0);
1711 dy00 = _mm_sub_pd(iy0,jy0);
1712 dz00 = _mm_sub_pd(iz0,jz0);
1713 dx11 = _mm_sub_pd(ix1,jx1);
1714 dy11 = _mm_sub_pd(iy1,jy1);
1715 dz11 = _mm_sub_pd(iz1,jz1);
1716 dx12 = _mm_sub_pd(ix1,jx2);
1717 dy12 = _mm_sub_pd(iy1,jy2);
1718 dz12 = _mm_sub_pd(iz1,jz2);
1719 dx13 = _mm_sub_pd(ix1,jx3);
1720 dy13 = _mm_sub_pd(iy1,jy3);
1721 dz13 = _mm_sub_pd(iz1,jz3);
1722 dx21 = _mm_sub_pd(ix2,jx1);
1723 dy21 = _mm_sub_pd(iy2,jy1);
1724 dz21 = _mm_sub_pd(iz2,jz1);
1725 dx22 = _mm_sub_pd(ix2,jx2);
1726 dy22 = _mm_sub_pd(iy2,jy2);
1727 dz22 = _mm_sub_pd(iz2,jz2);
1728 dx23 = _mm_sub_pd(ix2,jx3);
1729 dy23 = _mm_sub_pd(iy2,jy3);
1730 dz23 = _mm_sub_pd(iz2,jz3);
1731 dx31 = _mm_sub_pd(ix3,jx1);
1732 dy31 = _mm_sub_pd(iy3,jy1);
1733 dz31 = _mm_sub_pd(iz3,jz1);
1734 dx32 = _mm_sub_pd(ix3,jx2);
1735 dy32 = _mm_sub_pd(iy3,jy2);
1736 dz32 = _mm_sub_pd(iz3,jz2);
1737 dx33 = _mm_sub_pd(ix3,jx3);
1738 dy33 = _mm_sub_pd(iy3,jy3);
1739 dz33 = _mm_sub_pd(iz3,jz3);
1741 /* Calculate squared distance and things based on it */
1742 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1743 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1744 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1745 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1746 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1747 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1748 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1749 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1750 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1751 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1753 rinv00 = avx128fma_invsqrt_d(rsq00);
1754 rinv11 = avx128fma_invsqrt_d(rsq11);
1755 rinv12 = avx128fma_invsqrt_d(rsq12);
1756 rinv13 = avx128fma_invsqrt_d(rsq13);
1757 rinv21 = avx128fma_invsqrt_d(rsq21);
1758 rinv22 = avx128fma_invsqrt_d(rsq22);
1759 rinv23 = avx128fma_invsqrt_d(rsq23);
1760 rinv31 = avx128fma_invsqrt_d(rsq31);
1761 rinv32 = avx128fma_invsqrt_d(rsq32);
1762 rinv33 = avx128fma_invsqrt_d(rsq33);
1764 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1765 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1766 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1767 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1768 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1769 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1770 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1771 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1772 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1773 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1775 fjx0 = _mm_setzero_pd();
1776 fjy0 = _mm_setzero_pd();
1777 fjz0 = _mm_setzero_pd();
1778 fjx1 = _mm_setzero_pd();
1779 fjy1 = _mm_setzero_pd();
1780 fjz1 = _mm_setzero_pd();
1781 fjx2 = _mm_setzero_pd();
1782 fjy2 = _mm_setzero_pd();
1783 fjz2 = _mm_setzero_pd();
1784 fjx3 = _mm_setzero_pd();
1785 fjy3 = _mm_setzero_pd();
1786 fjz3 = _mm_setzero_pd();
1788 /**************************
1789 * CALCULATE INTERACTIONS *
1790 **************************/
1792 if (gmx_mm_any_lt(rsq00,rcutoff2))
1795 r00 = _mm_mul_pd(rsq00,rinv00);
1797 /* LENNARD-JONES DISPERSION/REPULSION */
1799 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1800 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1801 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1802 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
1803 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1805 d = _mm_sub_pd(r00,rswitch);
1806 d = _mm_max_pd(d,_mm_setzero_pd());
1807 d2 = _mm_mul_pd(d,d);
1808 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1810 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1812 /* Evaluate switch function */
1813 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1814 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1815 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1819 fscal = _mm_and_pd(fscal,cutoff_mask);
/* Only one j entry is valid in this pass: keep the low lane of fscal and zero
 * the high lane so it adds nothing to the force sums. The same step follows
 * every mask in the remaining blocks of this pass. */
1821 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1823 /* Update vectorial force */
1824 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1825 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1826 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1828 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1829 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1830 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 if (gmx_mm_any_lt(rsq11,rcutoff2))
1841 /* REACTION-FIELD ELECTROSTATICS */
1842 felec = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
1844 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1848 fscal = _mm_and_pd(fscal,cutoff_mask);
1850 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1852 /* Update vectorial force */
1853 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1854 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1855 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1857 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1858 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1859 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1863 /**************************
1864 * CALCULATE INTERACTIONS *
1865 **************************/
1867 if (gmx_mm_any_lt(rsq12,rcutoff2))
1870 /* REACTION-FIELD ELECTROSTATICS */
1871 felec = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
1873 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1877 fscal = _mm_and_pd(fscal,cutoff_mask);
1879 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1881 /* Update vectorial force */
1882 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1883 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1884 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1886 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1887 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1888 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1892 /**************************
1893 * CALCULATE INTERACTIONS *
1894 **************************/
1896 if (gmx_mm_any_lt(rsq13,rcutoff2))
1899 /* REACTION-FIELD ELECTROSTATICS */
1900 felec = _mm_mul_pd(qq13,_mm_msub_pd(rinv13,rinvsq13,krf2));
1902 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1906 fscal = _mm_and_pd(fscal,cutoff_mask);
1908 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1910 /* Update vectorial force */
1911 fix1 = _mm_macc_pd(dx13,fscal,fix1);
1912 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
1913 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
1915 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
1916 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
1917 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
1921 /**************************
1922 * CALCULATE INTERACTIONS *
1923 **************************/
1925 if (gmx_mm_any_lt(rsq21,rcutoff2))
1928 /* REACTION-FIELD ELECTROSTATICS */
1929 felec = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
1931 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1935 fscal = _mm_and_pd(fscal,cutoff_mask);
1937 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1939 /* Update vectorial force */
1940 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1941 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1942 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1944 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1945 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1946 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1950 /**************************
1951 * CALCULATE INTERACTIONS *
1952 **************************/
1954 if (gmx_mm_any_lt(rsq22,rcutoff2))
1957 /* REACTION-FIELD ELECTROSTATICS */
1958 felec = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
1960 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1964 fscal = _mm_and_pd(fscal,cutoff_mask);
1966 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1968 /* Update vectorial force */
1969 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1970 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1971 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1973 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1974 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1975 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1979 /**************************
1980 * CALCULATE INTERACTIONS *
1981 **************************/
1983 if (gmx_mm_any_lt(rsq23,rcutoff2))
1986 /* REACTION-FIELD ELECTROSTATICS */
1987 felec = _mm_mul_pd(qq23,_mm_msub_pd(rinv23,rinvsq23,krf2));
1989 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1993 fscal = _mm_and_pd(fscal,cutoff_mask);
1995 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1997 /* Update vectorial force */
1998 fix2 = _mm_macc_pd(dx23,fscal,fix2);
1999 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
2000 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
2002 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
2003 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
2004 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
2008 /**************************
2009 * CALCULATE INTERACTIONS *
2010 **************************/
2012 if (gmx_mm_any_lt(rsq31,rcutoff2))
2015 /* REACTION-FIELD ELECTROSTATICS */
2016 felec = _mm_mul_pd(qq31,_mm_msub_pd(rinv31,rinvsq31,krf2));
2018 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2022 fscal = _mm_and_pd(fscal,cutoff_mask);
2024 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2026 /* Update vectorial force */
2027 fix3 = _mm_macc_pd(dx31,fscal,fix3);
2028 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
2029 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
2031 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
2032 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
2033 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
2037 /**************************
2038 * CALCULATE INTERACTIONS *
2039 **************************/
2041 if (gmx_mm_any_lt(rsq32,rcutoff2))
2044 /* REACTION-FIELD ELECTROSTATICS */
2045 felec = _mm_mul_pd(qq32,_mm_msub_pd(rinv32,rinvsq32,krf2));
2047 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2051 fscal = _mm_and_pd(fscal,cutoff_mask);
2053 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2055 /* Update vectorial force */
2056 fix3 = _mm_macc_pd(dx32,fscal,fix3);
2057 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
2058 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
2060 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
2061 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
2062 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
2066 /**************************
2067 * CALCULATE INTERACTIONS *
2068 **************************/
2070 if (gmx_mm_any_lt(rsq33,rcutoff2))
2073 /* REACTION-FIELD ELECTROSTATICS */
2074 felec = _mm_mul_pd(qq33,_mm_msub_pd(rinv33,rinvsq33,krf2));
2076 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2080 fscal = _mm_and_pd(fscal,cutoff_mask);
2082 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2084 /* Update vectorial force */
2085 fix3 = _mm_macc_pd(dx33,fscal,fix3);
2086 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
2087 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
2089 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
2090 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
2091 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
2095 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2097 /* Inner loop uses 359 flops */
2100 /* End of innermost loop */
/* Pass the i-side sums to the helper together with the force and shift-force
 * destinations for this i entry. */
2102 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2103 f+i_coord_offset,fshift+i_shift_offset);
2105 /* Increment number of inner iterations */
/* Counted in j entries, so one two-wide loop iteration contributes 2. */
2106 inneriter += j_index_end - j_index_start;
2108 /* Outer loop uses 24 flops */
2111 /* Increment number of outer iterations */
2114 /* Update outer/inner flops */
2116 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*359);