2 * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_128_fma_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_avx_128_fma_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the two positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B;
77 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B;
79 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B;
81 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
101 __m128i ifour = _mm_set1_epi32(4);
102 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
104 __m128d dummy_mask,cutoff_mask;
105 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
106 __m128d one = _mm_set1_pd(1.0);
107 __m128d two = _mm_set1_pd(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm_set1_pd(fr->epsfac);
120 charge = mdatoms->chargeA;
121 krf = _mm_set1_pd(fr->ic->k_rf);
122 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
123 crf = _mm_set1_pd(fr->ic->c_rf);
124 nvdwtype = fr->ntype;
126 vdwtype = mdatoms->typeA;
128 vftab = kernel_data->table_vdw->data;
129 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
134 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
135 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
136 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
138 jq1 = _mm_set1_pd(charge[inr+1]);
139 jq2 = _mm_set1_pd(charge[inr+2]);
140 jq3 = _mm_set1_pd(charge[inr+3]);
141 vdwjidx0A = 2*vdwtype[inr+0];
142 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
143 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
144 qq11 = _mm_mul_pd(iq1,jq1);
145 qq12 = _mm_mul_pd(iq1,jq2);
146 qq13 = _mm_mul_pd(iq1,jq3);
147 qq21 = _mm_mul_pd(iq2,jq1);
148 qq22 = _mm_mul_pd(iq2,jq2);
149 qq23 = _mm_mul_pd(iq2,jq3);
150 qq31 = _mm_mul_pd(iq3,jq1);
151 qq32 = _mm_mul_pd(iq3,jq2);
152 qq33 = _mm_mul_pd(iq3,jq3);
154 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155 rcutoff_scalar = fr->rcoulomb;
156 rcutoff = _mm_set1_pd(rcutoff_scalar);
157 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
159 /* Avoid stupid compiler warnings */
167 /* Start outer loop over neighborlists */
168 for(iidx=0; iidx<nri; iidx++)
170 /* Load shift vector for this list */
171 i_shift_offset = DIM*shiftidx[iidx];
173 /* Load limits for loop over neighbors */
174 j_index_start = jindex[iidx];
175 j_index_end = jindex[iidx+1];
177 /* Get outer coordinate index */
179 i_coord_offset = DIM*inr;
181 /* Load i particle coords and add shift vector */
182 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
183 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
185 fix0 = _mm_setzero_pd();
186 fiy0 = _mm_setzero_pd();
187 fiz0 = _mm_setzero_pd();
188 fix1 = _mm_setzero_pd();
189 fiy1 = _mm_setzero_pd();
190 fiz1 = _mm_setzero_pd();
191 fix2 = _mm_setzero_pd();
192 fiy2 = _mm_setzero_pd();
193 fiz2 = _mm_setzero_pd();
194 fix3 = _mm_setzero_pd();
195 fiy3 = _mm_setzero_pd();
196 fiz3 = _mm_setzero_pd();
198 /* Reset potential sums */
199 velecsum = _mm_setzero_pd();
200 vvdwsum = _mm_setzero_pd();
202 /* Start inner kernel loop */
203 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
206 /* Get j neighbor index, and coordinate index */
209 j_coord_offsetA = DIM*jnrA;
210 j_coord_offsetB = DIM*jnrB;
212 /* load j atom coordinates */
213 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
215 &jy2,&jz2,&jx3,&jy3,&jz3);
217 /* Calculate displacement vector */
218 dx00 = _mm_sub_pd(ix0,jx0);
219 dy00 = _mm_sub_pd(iy0,jy0);
220 dz00 = _mm_sub_pd(iz0,jz0);
221 dx11 = _mm_sub_pd(ix1,jx1);
222 dy11 = _mm_sub_pd(iy1,jy1);
223 dz11 = _mm_sub_pd(iz1,jz1);
224 dx12 = _mm_sub_pd(ix1,jx2);
225 dy12 = _mm_sub_pd(iy1,jy2);
226 dz12 = _mm_sub_pd(iz1,jz2);
227 dx13 = _mm_sub_pd(ix1,jx3);
228 dy13 = _mm_sub_pd(iy1,jy3);
229 dz13 = _mm_sub_pd(iz1,jz3);
230 dx21 = _mm_sub_pd(ix2,jx1);
231 dy21 = _mm_sub_pd(iy2,jy1);
232 dz21 = _mm_sub_pd(iz2,jz1);
233 dx22 = _mm_sub_pd(ix2,jx2);
234 dy22 = _mm_sub_pd(iy2,jy2);
235 dz22 = _mm_sub_pd(iz2,jz2);
236 dx23 = _mm_sub_pd(ix2,jx3);
237 dy23 = _mm_sub_pd(iy2,jy3);
238 dz23 = _mm_sub_pd(iz2,jz3);
239 dx31 = _mm_sub_pd(ix3,jx1);
240 dy31 = _mm_sub_pd(iy3,jy1);
241 dz31 = _mm_sub_pd(iz3,jz1);
242 dx32 = _mm_sub_pd(ix3,jx2);
243 dy32 = _mm_sub_pd(iy3,jy2);
244 dz32 = _mm_sub_pd(iz3,jz2);
245 dx33 = _mm_sub_pd(ix3,jx3);
246 dy33 = _mm_sub_pd(iy3,jy3);
247 dz33 = _mm_sub_pd(iz3,jz3);
249 /* Calculate squared distance and things based on it */
250 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
251 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
252 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
253 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
254 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
255 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
256 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
257 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
258 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
259 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
261 rinv00 = gmx_mm_invsqrt_pd(rsq00);
262 rinv11 = gmx_mm_invsqrt_pd(rsq11);
263 rinv12 = gmx_mm_invsqrt_pd(rsq12);
264 rinv13 = gmx_mm_invsqrt_pd(rsq13);
265 rinv21 = gmx_mm_invsqrt_pd(rsq21);
266 rinv22 = gmx_mm_invsqrt_pd(rsq22);
267 rinv23 = gmx_mm_invsqrt_pd(rsq23);
268 rinv31 = gmx_mm_invsqrt_pd(rsq31);
269 rinv32 = gmx_mm_invsqrt_pd(rsq32);
270 rinv33 = gmx_mm_invsqrt_pd(rsq33);
272 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
273 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
274 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
275 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
276 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
277 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
278 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
279 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
280 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
282 fjx0 = _mm_setzero_pd();
283 fjy0 = _mm_setzero_pd();
284 fjz0 = _mm_setzero_pd();
285 fjx1 = _mm_setzero_pd();
286 fjy1 = _mm_setzero_pd();
287 fjz1 = _mm_setzero_pd();
288 fjx2 = _mm_setzero_pd();
289 fjy2 = _mm_setzero_pd();
290 fjz2 = _mm_setzero_pd();
291 fjx3 = _mm_setzero_pd();
292 fjy3 = _mm_setzero_pd();
293 fjz3 = _mm_setzero_pd();
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 r00 = _mm_mul_pd(rsq00,rinv00);
301 /* Calculate table index by multiplying r with table scale and truncate to integer */
302 rt = _mm_mul_pd(r00,vftabscale);
303 vfitab = _mm_cvttpd_epi32(rt);
305 vfeps = _mm_frcz_pd(rt);
307 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
309 twovfeps = _mm_add_pd(vfeps,vfeps);
310 vfitab = _mm_slli_epi32(vfitab,3);
312 /* CUBIC SPLINE TABLE DISPERSION */
313 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
314 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
315 GMX_MM_TRANSPOSE2_PD(Y,F);
316 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
317 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
318 GMX_MM_TRANSPOSE2_PD(G,H);
319 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
320 VV = _mm_macc_pd(vfeps,Fp,Y);
321 vvdw6 = _mm_mul_pd(c6_00,VV);
322 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
323 fvdw6 = _mm_mul_pd(c6_00,FF);
325 /* CUBIC SPLINE TABLE REPULSION */
326 vfitab = _mm_add_epi32(vfitab,ifour);
327 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
328 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
329 GMX_MM_TRANSPOSE2_PD(Y,F);
330 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
331 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
332 GMX_MM_TRANSPOSE2_PD(G,H);
333 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
334 VV = _mm_macc_pd(vfeps,Fp,Y);
335 vvdw12 = _mm_mul_pd(c12_00,VV);
336 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
337 fvdw12 = _mm_mul_pd(c12_00,FF);
338 vvdw = _mm_add_pd(vvdw12,vvdw6);
339 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
341 /* Update potential sum for this i atom from the interaction with this j atom. */
342 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
346 /* Update vectorial force */
347 fix0 = _mm_macc_pd(dx00,fscal,fix0);
348 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
349 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
351 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
352 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
353 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
355 /**************************
356 * CALCULATE INTERACTIONS *
357 **************************/
359 if (gmx_mm_any_lt(rsq11,rcutoff2))
362 /* REACTION-FIELD ELECTROSTATICS */
363 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_macc_pd(krf,rsq11,rinv11),crf));
364 felec = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
366 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
368 /* Update potential sum for this i atom from the interaction with this j atom. */
369 velec = _mm_and_pd(velec,cutoff_mask);
370 velecsum = _mm_add_pd(velecsum,velec);
374 fscal = _mm_and_pd(fscal,cutoff_mask);
376 /* Update vectorial force */
377 fix1 = _mm_macc_pd(dx11,fscal,fix1);
378 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
379 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
381 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
382 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
383 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
387 /**************************
388 * CALCULATE INTERACTIONS *
389 **************************/
391 if (gmx_mm_any_lt(rsq12,rcutoff2))
394 /* REACTION-FIELD ELECTROSTATICS */
395 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_macc_pd(krf,rsq12,rinv12),crf));
396 felec = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
398 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velec = _mm_and_pd(velec,cutoff_mask);
402 velecsum = _mm_add_pd(velecsum,velec);
406 fscal = _mm_and_pd(fscal,cutoff_mask);
408 /* Update vectorial force */
409 fix1 = _mm_macc_pd(dx12,fscal,fix1);
410 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
411 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
413 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
414 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
415 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 if (gmx_mm_any_lt(rsq13,rcutoff2))
426 /* REACTION-FIELD ELECTROSTATICS */
427 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_macc_pd(krf,rsq13,rinv13),crf));
428 felec = _mm_mul_pd(qq13,_mm_msub_pd(rinv13,rinvsq13,krf2));
430 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velec = _mm_and_pd(velec,cutoff_mask);
434 velecsum = _mm_add_pd(velecsum,velec);
438 fscal = _mm_and_pd(fscal,cutoff_mask);
440 /* Update vectorial force */
441 fix1 = _mm_macc_pd(dx13,fscal,fix1);
442 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
443 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
445 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
446 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
447 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 if (gmx_mm_any_lt(rsq21,rcutoff2))
458 /* REACTION-FIELD ELECTROSTATICS */
459 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_macc_pd(krf,rsq21,rinv21),crf));
460 felec = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
462 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
464 /* Update potential sum for this i atom from the interaction with this j atom. */
465 velec = _mm_and_pd(velec,cutoff_mask);
466 velecsum = _mm_add_pd(velecsum,velec);
470 fscal = _mm_and_pd(fscal,cutoff_mask);
472 /* Update vectorial force */
473 fix2 = _mm_macc_pd(dx21,fscal,fix2);
474 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
475 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
477 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
478 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
479 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
483 /**************************
484 * CALCULATE INTERACTIONS *
485 **************************/
487 if (gmx_mm_any_lt(rsq22,rcutoff2))
490 /* REACTION-FIELD ELECTROSTATICS */
491 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_macc_pd(krf,rsq22,rinv22),crf));
492 felec = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
494 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
496 /* Update potential sum for this i atom from the interaction with this j atom. */
497 velec = _mm_and_pd(velec,cutoff_mask);
498 velecsum = _mm_add_pd(velecsum,velec);
502 fscal = _mm_and_pd(fscal,cutoff_mask);
504 /* Update vectorial force */
505 fix2 = _mm_macc_pd(dx22,fscal,fix2);
506 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
507 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
509 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
510 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
511 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
515 /**************************
516 * CALCULATE INTERACTIONS *
517 **************************/
519 if (gmx_mm_any_lt(rsq23,rcutoff2))
522 /* REACTION-FIELD ELECTROSTATICS */
523 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_macc_pd(krf,rsq23,rinv23),crf));
524 felec = _mm_mul_pd(qq23,_mm_msub_pd(rinv23,rinvsq23,krf2));
526 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
528 /* Update potential sum for this i atom from the interaction with this j atom. */
529 velec = _mm_and_pd(velec,cutoff_mask);
530 velecsum = _mm_add_pd(velecsum,velec);
534 fscal = _mm_and_pd(fscal,cutoff_mask);
536 /* Update vectorial force */
537 fix2 = _mm_macc_pd(dx23,fscal,fix2);
538 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
539 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
541 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
542 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
543 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
547 /**************************
548 * CALCULATE INTERACTIONS *
549 **************************/
551 if (gmx_mm_any_lt(rsq31,rcutoff2))
554 /* REACTION-FIELD ELECTROSTATICS */
555 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_macc_pd(krf,rsq31,rinv31),crf));
556 felec = _mm_mul_pd(qq31,_mm_msub_pd(rinv31,rinvsq31,krf2));
558 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
560 /* Update potential sum for this i atom from the interaction with this j atom. */
561 velec = _mm_and_pd(velec,cutoff_mask);
562 velecsum = _mm_add_pd(velecsum,velec);
566 fscal = _mm_and_pd(fscal,cutoff_mask);
568 /* Update vectorial force */
569 fix3 = _mm_macc_pd(dx31,fscal,fix3);
570 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
571 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
573 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
574 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
575 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 if (gmx_mm_any_lt(rsq32,rcutoff2))
586 /* REACTION-FIELD ELECTROSTATICS */
587 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_macc_pd(krf,rsq32,rinv32),crf));
588 felec = _mm_mul_pd(qq32,_mm_msub_pd(rinv32,rinvsq32,krf2));
590 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
592 /* Update potential sum for this i atom from the interaction with this j atom. */
593 velec = _mm_and_pd(velec,cutoff_mask);
594 velecsum = _mm_add_pd(velecsum,velec);
598 fscal = _mm_and_pd(fscal,cutoff_mask);
600 /* Update vectorial force */
601 fix3 = _mm_macc_pd(dx32,fscal,fix3);
602 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
603 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
605 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
606 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
607 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
611 /**************************
612 * CALCULATE INTERACTIONS *
613 **************************/
615 if (gmx_mm_any_lt(rsq33,rcutoff2))
618 /* REACTION-FIELD ELECTROSTATICS */
619 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_macc_pd(krf,rsq33,rinv33),crf));
620 felec = _mm_mul_pd(qq33,_mm_msub_pd(rinv33,rinvsq33,krf2));
622 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
624 /* Update potential sum for this i atom from the interaction with this j atom. */
625 velec = _mm_and_pd(velec,cutoff_mask);
626 velecsum = _mm_add_pd(velecsum,velec);
630 fscal = _mm_and_pd(fscal,cutoff_mask);
632 /* Update vectorial force */
633 fix3 = _mm_macc_pd(dx33,fscal,fix3);
634 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
635 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
637 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
638 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
639 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
643 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
645 /* Inner loop uses 413 flops */
652 j_coord_offsetA = DIM*jnrA;
654 /* load j atom coordinates */
655 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
656 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
657 &jy2,&jz2,&jx3,&jy3,&jz3);
659 /* Calculate displacement vector */
660 dx00 = _mm_sub_pd(ix0,jx0);
661 dy00 = _mm_sub_pd(iy0,jy0);
662 dz00 = _mm_sub_pd(iz0,jz0);
663 dx11 = _mm_sub_pd(ix1,jx1);
664 dy11 = _mm_sub_pd(iy1,jy1);
665 dz11 = _mm_sub_pd(iz1,jz1);
666 dx12 = _mm_sub_pd(ix1,jx2);
667 dy12 = _mm_sub_pd(iy1,jy2);
668 dz12 = _mm_sub_pd(iz1,jz2);
669 dx13 = _mm_sub_pd(ix1,jx3);
670 dy13 = _mm_sub_pd(iy1,jy3);
671 dz13 = _mm_sub_pd(iz1,jz3);
672 dx21 = _mm_sub_pd(ix2,jx1);
673 dy21 = _mm_sub_pd(iy2,jy1);
674 dz21 = _mm_sub_pd(iz2,jz1);
675 dx22 = _mm_sub_pd(ix2,jx2);
676 dy22 = _mm_sub_pd(iy2,jy2);
677 dz22 = _mm_sub_pd(iz2,jz2);
678 dx23 = _mm_sub_pd(ix2,jx3);
679 dy23 = _mm_sub_pd(iy2,jy3);
680 dz23 = _mm_sub_pd(iz2,jz3);
681 dx31 = _mm_sub_pd(ix3,jx1);
682 dy31 = _mm_sub_pd(iy3,jy1);
683 dz31 = _mm_sub_pd(iz3,jz1);
684 dx32 = _mm_sub_pd(ix3,jx2);
685 dy32 = _mm_sub_pd(iy3,jy2);
686 dz32 = _mm_sub_pd(iz3,jz2);
687 dx33 = _mm_sub_pd(ix3,jx3);
688 dy33 = _mm_sub_pd(iy3,jy3);
689 dz33 = _mm_sub_pd(iz3,jz3);
691 /* Calculate squared distance and things based on it */
692 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
693 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
694 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
695 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
696 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
697 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
698 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
699 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
700 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
701 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
703 rinv00 = gmx_mm_invsqrt_pd(rsq00);
704 rinv11 = gmx_mm_invsqrt_pd(rsq11);
705 rinv12 = gmx_mm_invsqrt_pd(rsq12);
706 rinv13 = gmx_mm_invsqrt_pd(rsq13);
707 rinv21 = gmx_mm_invsqrt_pd(rsq21);
708 rinv22 = gmx_mm_invsqrt_pd(rsq22);
709 rinv23 = gmx_mm_invsqrt_pd(rsq23);
710 rinv31 = gmx_mm_invsqrt_pd(rsq31);
711 rinv32 = gmx_mm_invsqrt_pd(rsq32);
712 rinv33 = gmx_mm_invsqrt_pd(rsq33);
714 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
715 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
716 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
717 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
718 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
719 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
720 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
721 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
722 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
724 fjx0 = _mm_setzero_pd();
725 fjy0 = _mm_setzero_pd();
726 fjz0 = _mm_setzero_pd();
727 fjx1 = _mm_setzero_pd();
728 fjy1 = _mm_setzero_pd();
729 fjz1 = _mm_setzero_pd();
730 fjx2 = _mm_setzero_pd();
731 fjy2 = _mm_setzero_pd();
732 fjz2 = _mm_setzero_pd();
733 fjx3 = _mm_setzero_pd();
734 fjy3 = _mm_setzero_pd();
735 fjz3 = _mm_setzero_pd();
737 /**************************
738 * CALCULATE INTERACTIONS *
739 **************************/
741 r00 = _mm_mul_pd(rsq00,rinv00);
743 /* Calculate table index by multiplying r with table scale and truncate to integer */
744 rt = _mm_mul_pd(r00,vftabscale);
745 vfitab = _mm_cvttpd_epi32(rt);
747 vfeps = _mm_frcz_pd(rt);
749 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
751 twovfeps = _mm_add_pd(vfeps,vfeps);
752 vfitab = _mm_slli_epi32(vfitab,3);
754 /* CUBIC SPLINE TABLE DISPERSION */
755 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
756 F = _mm_setzero_pd();
757 GMX_MM_TRANSPOSE2_PD(Y,F);
758 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
759 H = _mm_setzero_pd();
760 GMX_MM_TRANSPOSE2_PD(G,H);
761 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
762 VV = _mm_macc_pd(vfeps,Fp,Y);
763 vvdw6 = _mm_mul_pd(c6_00,VV);
764 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
765 fvdw6 = _mm_mul_pd(c6_00,FF);
767 /* CUBIC SPLINE TABLE REPULSION */
768 vfitab = _mm_add_epi32(vfitab,ifour);
769 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
770 F = _mm_setzero_pd();
771 GMX_MM_TRANSPOSE2_PD(Y,F);
772 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
773 H = _mm_setzero_pd();
774 GMX_MM_TRANSPOSE2_PD(G,H);
775 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
776 VV = _mm_macc_pd(vfeps,Fp,Y);
777 vvdw12 = _mm_mul_pd(c12_00,VV);
778 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
779 fvdw12 = _mm_mul_pd(c12_00,FF);
780 vvdw = _mm_add_pd(vvdw12,vvdw6);
781 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
783 /* Update potential sum for this i atom from the interaction with this j atom. */
784 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
785 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
789 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
791 /* Update vectorial force */
792 fix0 = _mm_macc_pd(dx00,fscal,fix0);
793 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
794 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
796 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
797 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
798 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
800 /**************************
801 * CALCULATE INTERACTIONS *
802 **************************/
804 if (gmx_mm_any_lt(rsq11,rcutoff2))
807 /* REACTION-FIELD ELECTROSTATICS */
808 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_macc_pd(krf,rsq11,rinv11),crf));
809 felec = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
811 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
813 /* Update potential sum for this i atom from the interaction with this j atom. */
814 velec = _mm_and_pd(velec,cutoff_mask);
815 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
816 velecsum = _mm_add_pd(velecsum,velec);
820 fscal = _mm_and_pd(fscal,cutoff_mask);
822 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
824 /* Update vectorial force */
825 fix1 = _mm_macc_pd(dx11,fscal,fix1);
826 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
827 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
829 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
830 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
831 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 if (gmx_mm_any_lt(rsq12,rcutoff2))
842 /* REACTION-FIELD ELECTROSTATICS */
843 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_macc_pd(krf,rsq12,rinv12),crf));
844 felec = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
846 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 velec = _mm_and_pd(velec,cutoff_mask);
850 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
851 velecsum = _mm_add_pd(velecsum,velec);
855 fscal = _mm_and_pd(fscal,cutoff_mask);
857 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
859 /* Update vectorial force */
860 fix1 = _mm_macc_pd(dx12,fscal,fix1);
861 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
862 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
864 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
865 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
866 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
870 /**************************
871 * CALCULATE INTERACTIONS *
872 **************************/
874 if (gmx_mm_any_lt(rsq13,rcutoff2))
877 /* REACTION-FIELD ELECTROSTATICS */
878 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_macc_pd(krf,rsq13,rinv13),crf));
879 felec = _mm_mul_pd(qq13,_mm_msub_pd(rinv13,rinvsq13,krf2));
881 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
883 /* Update potential sum for this i atom from the interaction with this j atom. */
884 velec = _mm_and_pd(velec,cutoff_mask);
885 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
886 velecsum = _mm_add_pd(velecsum,velec);
890 fscal = _mm_and_pd(fscal,cutoff_mask);
892 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
894 /* Update vectorial force */
895 fix1 = _mm_macc_pd(dx13,fscal,fix1);
896 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
897 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
899 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
900 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
901 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
905 /**************************
906 * CALCULATE INTERACTIONS *
907 **************************/
909 if (gmx_mm_any_lt(rsq21,rcutoff2))
912 /* REACTION-FIELD ELECTROSTATICS */
913 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_macc_pd(krf,rsq21,rinv21),crf));
914 felec = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
916 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
918 /* Update potential sum for this i atom from the interaction with this j atom. */
919 velec = _mm_and_pd(velec,cutoff_mask);
920 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
921 velecsum = _mm_add_pd(velecsum,velec);
925 fscal = _mm_and_pd(fscal,cutoff_mask);
927 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
929 /* Update vectorial force */
930 fix2 = _mm_macc_pd(dx21,fscal,fix2);
931 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
932 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
934 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
935 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
936 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
940 /**************************
941 * CALCULATE INTERACTIONS *
942 **************************/
944 if (gmx_mm_any_lt(rsq22,rcutoff2))
947 /* REACTION-FIELD ELECTROSTATICS */
948 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_macc_pd(krf,rsq22,rinv22),crf));
949 felec = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
951 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
953 /* Update potential sum for this i atom from the interaction with this j atom. */
954 velec = _mm_and_pd(velec,cutoff_mask);
955 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
956 velecsum = _mm_add_pd(velecsum,velec);
960 fscal = _mm_and_pd(fscal,cutoff_mask);
962 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
964 /* Update vectorial force */
965 fix2 = _mm_macc_pd(dx22,fscal,fix2);
966 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
967 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
969 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
970 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
971 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
975 /**************************
976 * CALCULATE INTERACTIONS *
977 **************************/
979 if (gmx_mm_any_lt(rsq23,rcutoff2))
982 /* REACTION-FIELD ELECTROSTATICS */
983 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_macc_pd(krf,rsq23,rinv23),crf));
984 felec = _mm_mul_pd(qq23,_mm_msub_pd(rinv23,rinvsq23,krf2));
986 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
988 /* Update potential sum for this i atom from the interaction with this j atom. */
989 velec = _mm_and_pd(velec,cutoff_mask);
990 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
991 velecsum = _mm_add_pd(velecsum,velec);
995 fscal = _mm_and_pd(fscal,cutoff_mask);
997 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
999 /* Update vectorial force */
1000 fix2 = _mm_macc_pd(dx23,fscal,fix2);
1001 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
1002 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
1004 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
1005 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
1006 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
1010 /**************************
1011 * CALCULATE INTERACTIONS *
1012 **************************/
1014 if (gmx_mm_any_lt(rsq31,rcutoff2))
1017 /* REACTION-FIELD ELECTROSTATICS */
1018 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_macc_pd(krf,rsq31,rinv31),crf));
1019 felec = _mm_mul_pd(qq31,_mm_msub_pd(rinv31,rinvsq31,krf2));
1021 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1023 /* Update potential sum for this i atom from the interaction with this j atom. */
1024 velec = _mm_and_pd(velec,cutoff_mask);
1025 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1026 velecsum = _mm_add_pd(velecsum,velec);
1030 fscal = _mm_and_pd(fscal,cutoff_mask);
1032 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1034 /* Update vectorial force */
1035 fix3 = _mm_macc_pd(dx31,fscal,fix3);
1036 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
1037 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
1039 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
1040 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
1041 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
1045 /**************************
1046 * CALCULATE INTERACTIONS *
1047 **************************/
1049 if (gmx_mm_any_lt(rsq32,rcutoff2))
1052 /* REACTION-FIELD ELECTROSTATICS */
1053 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_macc_pd(krf,rsq32,rinv32),crf));
1054 felec = _mm_mul_pd(qq32,_mm_msub_pd(rinv32,rinvsq32,krf2));
1056 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1058 /* Update potential sum for this i atom from the interaction with this j atom. */
1059 velec = _mm_and_pd(velec,cutoff_mask);
1060 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1061 velecsum = _mm_add_pd(velecsum,velec);
1065 fscal = _mm_and_pd(fscal,cutoff_mask);
1067 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1069 /* Update vectorial force */
1070 fix3 = _mm_macc_pd(dx32,fscal,fix3);
1071 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
1072 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
1074 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
1075 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
1076 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
1080 /**************************
1081 * CALCULATE INTERACTIONS *
1082 **************************/
1084 if (gmx_mm_any_lt(rsq33,rcutoff2))
1087 /* REACTION-FIELD ELECTROSTATICS */
1088 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_macc_pd(krf,rsq33,rinv33),crf));
1089 felec = _mm_mul_pd(qq33,_mm_msub_pd(rinv33,rinvsq33,krf2));
1091 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1093 /* Update potential sum for this i atom from the interaction with this j atom. */
1094 velec = _mm_and_pd(velec,cutoff_mask);
1095 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1096 velecsum = _mm_add_pd(velecsum,velec);
1100 fscal = _mm_and_pd(fscal,cutoff_mask);
1102 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1104 /* Update vectorial force */
1105 fix3 = _mm_macc_pd(dx33,fscal,fix3);
1106 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
1107 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
1109 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
1110 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
1111 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
1115 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1117 /* Inner loop uses 413 flops */
1120 /* End of innermost loop */
1122 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1123 f+i_coord_offset,fshift+i_shift_offset);
1126 /* Update potential energies */
1127 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1128 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1130 /* Increment number of inner iterations */
1131 inneriter += j_index_end - j_index_start;
1133 /* Outer loop uses 26 flops */
1136 /* Increment number of outer iterations */
1139 /* Update outer/inner flops */
1141 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*413);
1144 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_128_fma_double
1145 * Electrostatics interaction: ReactionField
1146 * VdW interaction: CubicSplineTable
1147 * Geometry: Water4-Water4
1148 * Calculate force/pot: Force
/*
 * Force-only variant of the kernel described above: forces are accumulated,
 * no potential energies are computed or stored in this function.
 *
 * Inputs as they are read below:
 *   nlist       - neighbor list; jindex, shift and iinr[0] are read from it.
 *   xx, ff      - presumably the coordinate and force arrays that are accessed
 *                 below through x and f - TODO confirm.
 *   fr          - supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf,
 *                 ntype, nbfp and rcoulomb.
 *   mdatoms     - supplies chargeA and typeA.
 *   kernel_data - supplies the Van der Waals table (table_vdw->data and
 *                 table_vdw->scale).
 *   nrnb        - flop counters, updated through inc_nrnb() at the end.
 */
1151 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_avx_128_fma_double
1152 (t_nblist * gmx_restrict nlist,
1153 rvec * gmx_restrict xx,
1154 rvec * gmx_restrict ff,
1155 t_forcerec * gmx_restrict fr,
1156 t_mdatoms * gmx_restrict mdatoms,
1157 nb_kernel_data_t * gmx_restrict kernel_data,
1158 t_nrnb * gmx_restrict nrnb)
1160 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1161 * just 0 for non-waters.
1162 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1163 * jnr indices corresponding to data put in the two positions in the SIMD register.
1165 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1166 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1168 int j_coord_offsetA,j_coord_offsetB;
1169 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1170 real rcutoff_scalar;
1171 real *shiftvec,*fshift,*x,*f;
1172 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1174 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1176 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1178 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1180 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1181 int vdwjidx0A,vdwjidx0B;
1182 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1183 int vdwjidx1A,vdwjidx1B;
1184 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1185 int vdwjidx2A,vdwjidx2B;
1186 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1187 int vdwjidx3A,vdwjidx3B;
1188 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1189 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1190 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1191 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1192 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1193 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1194 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1195 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1196 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1197 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1198 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1199 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1202 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1205 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1206 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1208 __m128i ifour = _mm_set1_epi32(4);
1209 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1211 __m128d dummy_mask,cutoff_mask;
1212 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1213 __m128d one = _mm_set1_pd(1.0);
1214 __m128d two = _mm_set1_pd(2.0);
1220 jindex = nlist->jindex;
1222 shiftidx = nlist->shift;
1224 shiftvec = fr->shift_vec[0];
1225 fshift = fr->fshift[0];
1226 facel = _mm_set1_pd(fr->epsfac);
1227 charge = mdatoms->chargeA;
1228 krf = _mm_set1_pd(fr->ic->k_rf);
1229 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1230 crf = _mm_set1_pd(fr->ic->c_rf);
1231 nvdwtype = fr->ntype;
1232 vdwparam = fr->nbfp;
1233 vdwtype = mdatoms->typeA;
1235 vftab = kernel_data->table_vdw->data;
1236 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1238 /* Setup water-specific parameters */
1239 inr = nlist->iinr[0];
1240 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1241 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1242 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1243 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1245 jq1 = _mm_set1_pd(charge[inr+1]);
1246 jq2 = _mm_set1_pd(charge[inr+2]);
1247 jq3 = _mm_set1_pd(charge[inr+3]);
1248 vdwjidx0A = 2*vdwtype[inr+0];
1249 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1250 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1251 qq11 = _mm_mul_pd(iq1,jq1);
1252 qq12 = _mm_mul_pd(iq1,jq2);
1253 qq13 = _mm_mul_pd(iq1,jq3);
1254 qq21 = _mm_mul_pd(iq2,jq1);
1255 qq22 = _mm_mul_pd(iq2,jq2);
1256 qq23 = _mm_mul_pd(iq2,jq3);
1257 qq31 = _mm_mul_pd(iq3,jq1);
1258 qq32 = _mm_mul_pd(iq3,jq2);
1259 qq33 = _mm_mul_pd(iq3,jq3);
1261 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1262 rcutoff_scalar = fr->rcoulomb;
1263 rcutoff = _mm_set1_pd(rcutoff_scalar);
/* The cutoff is compared against squared distances below, so keep its square */
1264 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1266 /* Initialize to avoid compiler warnings */
1268 j_coord_offsetA = 0;
1269 j_coord_offsetB = 0;
1274 /* Start outer loop over neighborlists */
1275 for(iidx=0; iidx<nri; iidx++)
1277 /* Load shift vector for this list */
1278 i_shift_offset = DIM*shiftidx[iidx];
1280 /* Load limits for loop over neighbors */
1281 j_index_start = jindex[iidx];
1282 j_index_end = jindex[iidx+1];
1284 /* Get outer coordinate index */
1286 i_coord_offset = DIM*inr;
1288 /* Load i particle coords and add shift vector */
1289 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1290 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Reset the force accumulators of the four i atoms for this list entry */
1292 fix0 = _mm_setzero_pd();
1293 fiy0 = _mm_setzero_pd();
1294 fiz0 = _mm_setzero_pd();
1295 fix1 = _mm_setzero_pd();
1296 fiy1 = _mm_setzero_pd();
1297 fiz1 = _mm_setzero_pd();
1298 fix2 = _mm_setzero_pd();
1299 fiy2 = _mm_setzero_pd();
1300 fiz2 = _mm_setzero_pd();
1301 fix3 = _mm_setzero_pd();
1302 fiy3 = _mm_setzero_pd();
1303 fiz3 = _mm_setzero_pd();
1305 /* Start inner kernel loop */
/* Two j entries (A and B) are handled per iteration: jidx advances by 2 */
1306 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1309 /* Get j neighbor index, and coordinate index */
1311 jnrB = jjnr[jidx+1];
1312 j_coord_offsetA = DIM*jnrA;
1313 j_coord_offsetB = DIM*jnrB;
1315 /* load j atom coordinates */
1316 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1317 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1318 &jy2,&jz2,&jx3,&jy3,&jz3);
1320 /* Calculate displacement vector */
1321 dx00 = _mm_sub_pd(ix0,jx0);
1322 dy00 = _mm_sub_pd(iy0,jy0);
1323 dz00 = _mm_sub_pd(iz0,jz0);
1324 dx11 = _mm_sub_pd(ix1,jx1);
1325 dy11 = _mm_sub_pd(iy1,jy1);
1326 dz11 = _mm_sub_pd(iz1,jz1);
1327 dx12 = _mm_sub_pd(ix1,jx2);
1328 dy12 = _mm_sub_pd(iy1,jy2);
1329 dz12 = _mm_sub_pd(iz1,jz2);
1330 dx13 = _mm_sub_pd(ix1,jx3);
1331 dy13 = _mm_sub_pd(iy1,jy3);
1332 dz13 = _mm_sub_pd(iz1,jz3);
1333 dx21 = _mm_sub_pd(ix2,jx1);
1334 dy21 = _mm_sub_pd(iy2,jy1);
1335 dz21 = _mm_sub_pd(iz2,jz1);
1336 dx22 = _mm_sub_pd(ix2,jx2);
1337 dy22 = _mm_sub_pd(iy2,jy2);
1338 dz22 = _mm_sub_pd(iz2,jz2);
1339 dx23 = _mm_sub_pd(ix2,jx3);
1340 dy23 = _mm_sub_pd(iy2,jy3);
1341 dz23 = _mm_sub_pd(iz2,jz3);
1342 dx31 = _mm_sub_pd(ix3,jx1);
1343 dy31 = _mm_sub_pd(iy3,jy1);
1344 dz31 = _mm_sub_pd(iz3,jz1);
1345 dx32 = _mm_sub_pd(ix3,jx2);
1346 dy32 = _mm_sub_pd(iy3,jy2);
1347 dz32 = _mm_sub_pd(iz3,jz2);
1348 dx33 = _mm_sub_pd(ix3,jx3);
1349 dy33 = _mm_sub_pd(iy3,jy3);
1350 dz33 = _mm_sub_pd(iz3,jz3);
1352 /* Calculate squared distance and things based on it */
1353 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1354 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1355 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1356 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1357 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1358 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1359 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1360 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1361 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1362 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1364 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1365 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1366 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1367 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1368 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1369 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1370 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1371 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1372 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1373 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1375 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1376 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1377 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1378 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1379 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1380 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1381 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1382 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1383 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
/* Reset the j force accumulators; they are handed to
 * gmx_mm_decrement_4rvec_2ptr_swizzle_pd() once all pair blocks are done */
1385 fjx0 = _mm_setzero_pd();
1386 fjy0 = _mm_setzero_pd();
1387 fjz0 = _mm_setzero_pd();
1388 fjx1 = _mm_setzero_pd();
1389 fjy1 = _mm_setzero_pd();
1390 fjz1 = _mm_setzero_pd();
1391 fjx2 = _mm_setzero_pd();
1392 fjy2 = _mm_setzero_pd();
1393 fjz2 = _mm_setzero_pd();
1394 fjx3 = _mm_setzero_pd();
1395 fjy3 = _mm_setzero_pd();
1396 fjz3 = _mm_setzero_pd();
/* Pair 0-0: tabulated Van der Waals only (c6_00/c12_00, no charge product) */
1398 /**************************
1399 * CALCULATE INTERACTIONS *
1400 **************************/
1402 r00 = _mm_mul_pd(rsq00,rinv00);
1404 /* Calculate table index by multiplying r with table scale and truncate to integer */
1405 rt = _mm_mul_pd(r00,vftabscale);
1406 vfitab = _mm_cvttpd_epi32(rt);
/* vfeps is the fractional part of rt, used as the interpolation variable below */
1408 vfeps = _mm_frcz_pd(rt);
1410 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1412 twovfeps = _mm_add_pd(vfeps,vfeps);
/* Index times 8: four dispersion values followed by four repulsion values
 * are read per table point (offsets +0, +2, then +4 via ifour) */
1413 vfitab = _mm_slli_epi32(vfitab,3);
1415 /* CUBIC SPLINE TABLE DISPERSION */
1416 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1417 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1418 GMX_MM_TRANSPOSE2_PD(Y,F);
1419 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1420 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1421 GMX_MM_TRANSPOSE2_PD(G,H);
1422 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1423 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1424 fvdw6 = _mm_mul_pd(c6_00,FF);
1426 /* CUBIC SPLINE TABLE REPULSION */
1427 vfitab = _mm_add_epi32(vfitab,ifour);
1428 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1429 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1430 GMX_MM_TRANSPOSE2_PD(Y,F);
1431 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1432 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1433 GMX_MM_TRANSPOSE2_PD(G,H);
1434 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1435 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1436 fvdw12 = _mm_mul_pd(c12_00,FF);
/* XOR with signbit flips the sign of both lanes of the scaled table force */
1437 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1441 /* Update vectorial force */
1442 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1443 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1444 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1446 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1447 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1448 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
/* Pairs 1-1 through 3-3: reaction-field electrostatics only. The nine
 * blocks below share one shape and differ only in the pair suffix. */
1450 /**************************
1451 * CALCULATE INTERACTIONS *
1452 **************************/
1454 if (gmx_mm_any_lt(rsq11,rcutoff2))
1457 /* REACTION-FIELD ELECTROSTATICS */
1458 felec = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
1460 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
/* Zero the lanes whose squared distance is not below rcutoff2 */
1464 fscal = _mm_and_pd(fscal,cutoff_mask);
1466 /* Update vectorial force */
1467 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1468 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1469 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1471 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1472 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1473 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1477 /**************************
1478 * CALCULATE INTERACTIONS *
1479 **************************/
1481 if (gmx_mm_any_lt(rsq12,rcutoff2))
1484 /* REACTION-FIELD ELECTROSTATICS */
1485 felec = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
1487 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1491 fscal = _mm_and_pd(fscal,cutoff_mask);
1493 /* Update vectorial force */
1494 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1495 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1496 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1498 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1499 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1500 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1504 /**************************
1505 * CALCULATE INTERACTIONS *
1506 **************************/
1508 if (gmx_mm_any_lt(rsq13,rcutoff2))
1511 /* REACTION-FIELD ELECTROSTATICS */
1512 felec = _mm_mul_pd(qq13,_mm_msub_pd(rinv13,rinvsq13,krf2));
1514 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1518 fscal = _mm_and_pd(fscal,cutoff_mask);
1520 /* Update vectorial force */
1521 fix1 = _mm_macc_pd(dx13,fscal,fix1);
1522 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
1523 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
1525 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
1526 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
1527 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
1531 /**************************
1532 * CALCULATE INTERACTIONS *
1533 **************************/
1535 if (gmx_mm_any_lt(rsq21,rcutoff2))
1538 /* REACTION-FIELD ELECTROSTATICS */
1539 felec = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
1541 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1545 fscal = _mm_and_pd(fscal,cutoff_mask);
1547 /* Update vectorial force */
1548 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1549 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1550 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1552 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1553 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1554 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1558 /**************************
1559 * CALCULATE INTERACTIONS *
1560 **************************/
1562 if (gmx_mm_any_lt(rsq22,rcutoff2))
1565 /* REACTION-FIELD ELECTROSTATICS */
1566 felec = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
1568 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1572 fscal = _mm_and_pd(fscal,cutoff_mask);
1574 /* Update vectorial force */
1575 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1576 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1577 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1579 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1580 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1581 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1585 /**************************
1586 * CALCULATE INTERACTIONS *
1587 **************************/
1589 if (gmx_mm_any_lt(rsq23,rcutoff2))
1592 /* REACTION-FIELD ELECTROSTATICS */
1593 felec = _mm_mul_pd(qq23,_mm_msub_pd(rinv23,rinvsq23,krf2));
1595 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1599 fscal = _mm_and_pd(fscal,cutoff_mask);
1601 /* Update vectorial force */
1602 fix2 = _mm_macc_pd(dx23,fscal,fix2);
1603 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
1604 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
1606 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
1607 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
1608 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
1612 /**************************
1613 * CALCULATE INTERACTIONS *
1614 **************************/
1616 if (gmx_mm_any_lt(rsq31,rcutoff2))
1619 /* REACTION-FIELD ELECTROSTATICS */
1620 felec = _mm_mul_pd(qq31,_mm_msub_pd(rinv31,rinvsq31,krf2));
1622 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1626 fscal = _mm_and_pd(fscal,cutoff_mask);
1628 /* Update vectorial force */
1629 fix3 = _mm_macc_pd(dx31,fscal,fix3);
1630 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
1631 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
1633 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
1634 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
1635 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
1639 /**************************
1640 * CALCULATE INTERACTIONS *
1641 **************************/
1643 if (gmx_mm_any_lt(rsq32,rcutoff2))
1646 /* REACTION-FIELD ELECTROSTATICS */
1647 felec = _mm_mul_pd(qq32,_mm_msub_pd(rinv32,rinvsq32,krf2));
1649 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1653 fscal = _mm_and_pd(fscal,cutoff_mask);
1655 /* Update vectorial force */
1656 fix3 = _mm_macc_pd(dx32,fscal,fix3);
1657 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
1658 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
1660 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
1661 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
1662 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
1666 /**************************
1667 * CALCULATE INTERACTIONS *
1668 **************************/
1670 if (gmx_mm_any_lt(rsq33,rcutoff2))
1673 /* REACTION-FIELD ELECTROSTATICS */
1674 felec = _mm_mul_pd(qq33,_mm_msub_pd(rinv33,rinvsq33,krf2));
1676 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1680 fscal = _mm_and_pd(fscal,cutoff_mask);
1682 /* Update vectorial force */
1683 fix3 = _mm_macc_pd(dx33,fscal,fix3);
1684 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
1685 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
1687 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
1688 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
1689 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
1693 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1695 /* Inner loop uses 351 flops */
/* Remainder: a single j entry is left when the paired loop above stopped
 * before j_index_end; the single-pointer load/store helpers are used for it */
1698 if(jidx<j_index_end)
1702 j_coord_offsetA = DIM*jnrA;
1704 /* load j atom coordinates */
1705 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1706 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1707 &jy2,&jz2,&jx3,&jy3,&jz3);
1709 /* Calculate displacement vector */
1710 dx00 = _mm_sub_pd(ix0,jx0);
1711 dy00 = _mm_sub_pd(iy0,jy0);
1712 dz00 = _mm_sub_pd(iz0,jz0);
1713 dx11 = _mm_sub_pd(ix1,jx1);
1714 dy11 = _mm_sub_pd(iy1,jy1);
1715 dz11 = _mm_sub_pd(iz1,jz1);
1716 dx12 = _mm_sub_pd(ix1,jx2);
1717 dy12 = _mm_sub_pd(iy1,jy2);
1718 dz12 = _mm_sub_pd(iz1,jz2);
1719 dx13 = _mm_sub_pd(ix1,jx3);
1720 dy13 = _mm_sub_pd(iy1,jy3);
1721 dz13 = _mm_sub_pd(iz1,jz3);
1722 dx21 = _mm_sub_pd(ix2,jx1);
1723 dy21 = _mm_sub_pd(iy2,jy1);
1724 dz21 = _mm_sub_pd(iz2,jz1);
1725 dx22 = _mm_sub_pd(ix2,jx2);
1726 dy22 = _mm_sub_pd(iy2,jy2);
1727 dz22 = _mm_sub_pd(iz2,jz2);
1728 dx23 = _mm_sub_pd(ix2,jx3);
1729 dy23 = _mm_sub_pd(iy2,jy3);
1730 dz23 = _mm_sub_pd(iz2,jz3);
1731 dx31 = _mm_sub_pd(ix3,jx1);
1732 dy31 = _mm_sub_pd(iy3,jy1);
1733 dz31 = _mm_sub_pd(iz3,jz1);
1734 dx32 = _mm_sub_pd(ix3,jx2);
1735 dy32 = _mm_sub_pd(iy3,jy2);
1736 dz32 = _mm_sub_pd(iz3,jz2);
1737 dx33 = _mm_sub_pd(ix3,jx3);
1738 dy33 = _mm_sub_pd(iy3,jy3);
1739 dz33 = _mm_sub_pd(iz3,jz3);
1741 /* Calculate squared distance and things based on it */
1742 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1743 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1744 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1745 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1746 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1747 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1748 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1749 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1750 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1751 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1753 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1754 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1755 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1756 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1757 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1758 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1759 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1760 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1761 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1762 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1764 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1765 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1766 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1767 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1768 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1769 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1770 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1771 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1772 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1774 fjx0 = _mm_setzero_pd();
1775 fjy0 = _mm_setzero_pd();
1776 fjz0 = _mm_setzero_pd();
1777 fjx1 = _mm_setzero_pd();
1778 fjy1 = _mm_setzero_pd();
1779 fjz1 = _mm_setzero_pd();
1780 fjx2 = _mm_setzero_pd();
1781 fjy2 = _mm_setzero_pd();
1782 fjz2 = _mm_setzero_pd();
1783 fjx3 = _mm_setzero_pd();
1784 fjy3 = _mm_setzero_pd();
1785 fjz3 = _mm_setzero_pd();
1787 /**************************
1788 * CALCULATE INTERACTIONS *
1789 **************************/
1791 r00 = _mm_mul_pd(rsq00,rinv00);
1793 /* Calculate table index by multiplying r with table scale and truncate to integer */
1794 rt = _mm_mul_pd(r00,vftabscale);
1795 vfitab = _mm_cvttpd_epi32(rt);
1797 vfeps = _mm_frcz_pd(rt);
1799 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1801 twovfeps = _mm_add_pd(vfeps,vfeps);
1802 vfitab = _mm_slli_epi32(vfitab,3);
1804 /* CUBIC SPLINE TABLE DISPERSION */
/* Only table index 0 is used here; the row that the paired loop loads from
 * index 1 is replaced by zeros */
1805 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1806 F = _mm_setzero_pd();
1807 GMX_MM_TRANSPOSE2_PD(Y,F);
1808 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1809 H = _mm_setzero_pd();
1810 GMX_MM_TRANSPOSE2_PD(G,H);
1811 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1812 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1813 fvdw6 = _mm_mul_pd(c6_00,FF);
1815 /* CUBIC SPLINE TABLE REPULSION */
1816 vfitab = _mm_add_epi32(vfitab,ifour);
1817 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1818 F = _mm_setzero_pd();
1819 GMX_MM_TRANSPOSE2_PD(Y,F);
1820 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1821 H = _mm_setzero_pd();
1822 GMX_MM_TRANSPOSE2_PD(G,H);
1823 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1824 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1825 fvdw12 = _mm_mul_pd(c12_00,FF);
1826 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
/* Keep the low lane of fscal and force the high lane to zero, so the unused
 * second lane adds nothing to the accumulators (repeated in every block below) */
1830 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1832 /* Update vectorial force */
1833 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1834 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1835 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1837 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1838 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1839 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1841 /**************************
1842 * CALCULATE INTERACTIONS *
1843 **************************/
1845 if (gmx_mm_any_lt(rsq11,rcutoff2))
1848 /* REACTION-FIELD ELECTROSTATICS */
1849 felec = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
1851 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1855 fscal = _mm_and_pd(fscal,cutoff_mask);
1857 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1859 /* Update vectorial force */
1860 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1861 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1862 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1864 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1865 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1866 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1870 /**************************
1871 * CALCULATE INTERACTIONS *
1872 **************************/
1874 if (gmx_mm_any_lt(rsq12,rcutoff2))
1877 /* REACTION-FIELD ELECTROSTATICS */
1878 felec = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
1880 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1884 fscal = _mm_and_pd(fscal,cutoff_mask);
1886 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1888 /* Update vectorial force */
1889 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1890 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1891 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1893 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1894 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1895 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1899 /**************************
1900 * CALCULATE INTERACTIONS *
1901 **************************/
1903 if (gmx_mm_any_lt(rsq13,rcutoff2))
1906 /* REACTION-FIELD ELECTROSTATICS */
1907 felec = _mm_mul_pd(qq13,_mm_msub_pd(rinv13,rinvsq13,krf2));
1909 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1913 fscal = _mm_and_pd(fscal,cutoff_mask);
1915 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1917 /* Update vectorial force */
1918 fix1 = _mm_macc_pd(dx13,fscal,fix1);
1919 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
1920 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
1922 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
1923 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
1924 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
1928 /**************************
1929 * CALCULATE INTERACTIONS *
1930 **************************/
1932 if (gmx_mm_any_lt(rsq21,rcutoff2))
1935 /* REACTION-FIELD ELECTROSTATICS */
1936 felec = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
1938 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1942 fscal = _mm_and_pd(fscal,cutoff_mask);
1944 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1946 /* Update vectorial force */
1947 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1948 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1949 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1951 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1952 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1953 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1957 /**************************
1958 * CALCULATE INTERACTIONS *
1959 **************************/
1961 if (gmx_mm_any_lt(rsq22,rcutoff2))
1964 /* REACTION-FIELD ELECTROSTATICS */
1965 felec = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
1967 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1971 fscal = _mm_and_pd(fscal,cutoff_mask);
1973 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1975 /* Update vectorial force */
1976 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1977 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1978 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1980 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1981 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1982 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1986 /**************************
1987 * CALCULATE INTERACTIONS *
1988 **************************/
1990 if (gmx_mm_any_lt(rsq23,rcutoff2))
1993 /* REACTION-FIELD ELECTROSTATICS */
1994 felec = _mm_mul_pd(qq23,_mm_msub_pd(rinv23,rinvsq23,krf2));
1996 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2000 fscal = _mm_and_pd(fscal,cutoff_mask);
2002 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2004 /* Update vectorial force */
2005 fix2 = _mm_macc_pd(dx23,fscal,fix2);
2006 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
2007 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
2009 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
2010 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
2011 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
2015 /**************************
2016 * CALCULATE INTERACTIONS *
2017 **************************/
2019 if (gmx_mm_any_lt(rsq31,rcutoff2))
2022 /* REACTION-FIELD ELECTROSTATICS */
2023 felec = _mm_mul_pd(qq31,_mm_msub_pd(rinv31,rinvsq31,krf2));
2025 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2029 fscal = _mm_and_pd(fscal,cutoff_mask);
2031 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2033 /* Update vectorial force */
2034 fix3 = _mm_macc_pd(dx31,fscal,fix3);
2035 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
2036 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
2038 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
2039 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
2040 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
2044 /**************************
2045 * CALCULATE INTERACTIONS *
2046 **************************/
2048 if (gmx_mm_any_lt(rsq32,rcutoff2))
2051 /* REACTION-FIELD ELECTROSTATICS */
2052 felec = _mm_mul_pd(qq32,_mm_msub_pd(rinv32,rinvsq32,krf2));
2054 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2058 fscal = _mm_and_pd(fscal,cutoff_mask);
2060 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2062 /* Update vectorial force */
2063 fix3 = _mm_macc_pd(dx32,fscal,fix3);
2064 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
2065 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
2067 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
2068 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
2069 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
2073 /**************************
2074 * CALCULATE INTERACTIONS *
2075 **************************/
2077 if (gmx_mm_any_lt(rsq33,rcutoff2))
2080 /* REACTION-FIELD ELECTROSTATICS */
2081 felec = _mm_mul_pd(qq33,_mm_msub_pd(rinv33,rinvsq33,krf2));
2083 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2087 fscal = _mm_and_pd(fscal,cutoff_mask);
2089 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2091 /* Update vectorial force */
2092 fix3 = _mm_macc_pd(dx33,fscal,fix3);
2093 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
2094 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
2096 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
2097 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
2098 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
2102 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2104 /* Inner loop uses 351 flops */
2107 /* End of innermost loop */
/* Hand the accumulated i forces to the update helper together with the
 * force and shift-force destinations for this list entry */
2109 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2110 f+i_coord_offset,fshift+i_shift_offset);
2112 /* Increment number of inner iterations */
2113 inneriter += j_index_end - j_index_start;
2115 /* Outer loop uses 24 flops */
2118 /* Increment number of outer iterations */
2121 /* Update outer/inner flops */
/* Flop estimate: 24 per outer iteration plus 351 per inner iteration */
2123 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*351);