2 * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
/*
 * Water-water nonbonded kernel (three sites on the i molecule against three
 * sites on each j molecule) that accumulates both potential energies and forces.
 *
 * Electrostatics: reaction-field expression with an explicit cutoff, evaluated
 * for all nine site pairs. Van der Waals: cubic-spline table lookup, evaluated
 * for the 0-0 site pair only.
 *
 * Arguments as used by the code visible in this block:
 *   nlist       - neighbor list; jindex, shift and iinr[0] are read here
 *   xx, ff      - coordinate and force arrays; the body works on the flat
 *                 pointers x and f (presumably derived from these - confirm)
 *   fr          - shift vectors, shift forces, epsfac, reaction-field constants
 *                 (ic->k_rf, ic->c_rf), ntype and rcoulomb
 *   mdatoms     - per-atom charges (chargeA) and type indices (typeA)
 *   kernel_data - VdW table (data, scale) and the energygrp_elec/energygrp_vdw
 *                 accumulators
 *   nrnb        - flop accounting, updated once at the end through inc_nrnb()
 *
 * SIMD layout: every __m128d holds two doubles, one per j entry (A and B).
 * The suffix comment below mentions "four positions"; that wording does not
 * match the two-lane double-precision registers used here.
 */
44 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
96 __m128i ifour = _mm_set1_epi32(4);
97 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
99 __m128d dummy_mask,cutoff_mask;
100 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101 __m128d one = _mm_set1_pd(1.0);
102 __m128d two = _mm_set1_pd(2.0);
108 jindex = nlist->jindex;
110 shiftidx = nlist->shift;
112 shiftvec = fr->shift_vec[0];
113 fshift = fr->fshift[0];
114 facel = _mm_set1_pd(fr->epsfac);
115 charge = mdatoms->chargeA;
116 krf = _mm_set1_pd(fr->ic->k_rf);
117 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
118 crf = _mm_set1_pd(fr->ic->c_rf);
119 nvdwtype = fr->ntype;
121 vdwtype = mdatoms->typeA;
123 vftab = kernel_data->table_vdw->data;
124 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
126 /* Setup water-specific parameters */
/* i-site charges come from the first i entry of the list and are pre-multiplied by facel (fr->epsfac). */
127 inr = nlist->iinr[0];
128 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
129 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
130 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
131 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
/* The j-site charges and VdW type are read from the same first i entry, so one
 * parameter set is reused for every molecule - assumes all waters handled by
 * this kernel are identical; TODO confirm against the caller. */
133 jq0 = _mm_set1_pd(charge[inr+0]);
134 jq1 = _mm_set1_pd(charge[inr+1]);
135 jq2 = _mm_set1_pd(charge[inr+2]);
136 vdwjidx0A = 2*vdwtype[inr+0];
/* Loop-invariant pair parameters: charge products for all nine site pairs,
 * c6/c12 for the 0-0 pair only. */
137 qq00 = _mm_mul_pd(iq0,jq0);
138 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
139 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
140 qq01 = _mm_mul_pd(iq0,jq1);
141 qq02 = _mm_mul_pd(iq0,jq2);
142 qq10 = _mm_mul_pd(iq1,jq0);
143 qq11 = _mm_mul_pd(iq1,jq1);
144 qq12 = _mm_mul_pd(iq1,jq2);
145 qq20 = _mm_mul_pd(iq2,jq0);
146 qq21 = _mm_mul_pd(iq2,jq1);
147 qq22 = _mm_mul_pd(iq2,jq2);
149 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150 rcutoff_scalar = fr->rcoulomb;
151 rcutoff = _mm_set1_pd(rcutoff_scalar);
/* Comparisons below are done on squared distances, so keep rcutoff^2. */
152 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
154 /* Avoid stupid compiler warnings */
162 /* Start outer loop over neighborlists */
163 for(iidx=0; iidx<nri; iidx++)
165 /* Load shift vector for this list */
166 i_shift_offset = DIM*shiftidx[iidx];
168 /* Load limits for loop over neighbors */
169 j_index_start = jindex[iidx];
170 j_index_end = jindex[iidx+1];
172 /* Get outer coordinate index */
174 i_coord_offset = DIM*inr;
176 /* Load i particle coords and add shift vector */
177 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
178 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* Clear the i-force accumulators for the three i sites. */
180 fix0 = _mm_setzero_pd();
181 fiy0 = _mm_setzero_pd();
182 fiz0 = _mm_setzero_pd();
183 fix1 = _mm_setzero_pd();
184 fiy1 = _mm_setzero_pd();
185 fiz1 = _mm_setzero_pd();
186 fix2 = _mm_setzero_pd();
187 fiy2 = _mm_setzero_pd();
188 fiz2 = _mm_setzero_pd();
190 /* Reset potential sums */
191 velecsum = _mm_setzero_pd();
192 vvdwsum = _mm_setzero_pd();
194 /* Start inner kernel loop */
/* Two j entries (lanes A and B) are consumed per iteration; the loop stops
 * while at least two entries remain. The section after this loop processes a
 * single j entry with the 1ptr load/store helpers, presumably the odd remainder. */
195 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
198 /* Get j neighbor index, and coordinate index */
201 j_coord_offsetA = DIM*jnrA;
202 j_coord_offsetB = DIM*jnrB;
204 /* load j atom coordinates */
205 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
206 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
208 /* Calculate displacement vector */
209 dx00 = _mm_sub_pd(ix0,jx0);
210 dy00 = _mm_sub_pd(iy0,jy0);
211 dz00 = _mm_sub_pd(iz0,jz0);
212 dx01 = _mm_sub_pd(ix0,jx1);
213 dy01 = _mm_sub_pd(iy0,jy1);
214 dz01 = _mm_sub_pd(iz0,jz1);
215 dx02 = _mm_sub_pd(ix0,jx2);
216 dy02 = _mm_sub_pd(iy0,jy2);
217 dz02 = _mm_sub_pd(iz0,jz2);
218 dx10 = _mm_sub_pd(ix1,jx0);
219 dy10 = _mm_sub_pd(iy1,jy0);
220 dz10 = _mm_sub_pd(iz1,jz0);
221 dx11 = _mm_sub_pd(ix1,jx1);
222 dy11 = _mm_sub_pd(iy1,jy1);
223 dz11 = _mm_sub_pd(iz1,jz1);
224 dx12 = _mm_sub_pd(ix1,jx2);
225 dy12 = _mm_sub_pd(iy1,jy2);
226 dz12 = _mm_sub_pd(iz1,jz2);
227 dx20 = _mm_sub_pd(ix2,jx0);
228 dy20 = _mm_sub_pd(iy2,jy0);
229 dz20 = _mm_sub_pd(iz2,jz0);
230 dx21 = _mm_sub_pd(ix2,jx1);
231 dy21 = _mm_sub_pd(iy2,jy1);
232 dz21 = _mm_sub_pd(iz2,jz1);
233 dx22 = _mm_sub_pd(ix2,jx2);
234 dy22 = _mm_sub_pd(iy2,jy2);
235 dz22 = _mm_sub_pd(iz2,jz2);
237 /* Calculate squared distance and things based on it */
238 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
239 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
240 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
241 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
242 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
243 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
244 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
245 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
246 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
248 rinv00 = gmx_mm_invsqrt_pd(rsq00);
249 rinv01 = gmx_mm_invsqrt_pd(rsq01);
250 rinv02 = gmx_mm_invsqrt_pd(rsq02);
251 rinv10 = gmx_mm_invsqrt_pd(rsq10);
252 rinv11 = gmx_mm_invsqrt_pd(rsq11);
253 rinv12 = gmx_mm_invsqrt_pd(rsq12);
254 rinv20 = gmx_mm_invsqrt_pd(rsq20);
255 rinv21 = gmx_mm_invsqrt_pd(rsq21);
256 rinv22 = gmx_mm_invsqrt_pd(rsq22);
258 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
259 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
260 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
261 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
262 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
263 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
264 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
265 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
266 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
/* j-force accumulators are reset for every j pair and handed to the decrement helper after the nine pair blocks. */
268 fjx0 = _mm_setzero_pd();
269 fjy0 = _mm_setzero_pd();
270 fjz0 = _mm_setzero_pd();
271 fjx1 = _mm_setzero_pd();
272 fjy1 = _mm_setzero_pd();
273 fjz1 = _mm_setzero_pd();
274 fjx2 = _mm_setzero_pd();
275 fjy2 = _mm_setzero_pd();
276 fjz2 = _mm_setzero_pd();
278 /**************************
279 * CALCULATE INTERACTIONS *
280 **************************/
/* Pair 0-0 is the only pair with both reaction-field and tabulated VdW terms. */
282 if (gmx_mm_any_lt(rsq00,rcutoff2))
/* r00 = rsq00*rinv00, i.e. the distance itself given rinv00 = 1/sqrt(rsq00) as the helper name indicates. */
285 r00 = _mm_mul_pd(rsq00,rinv00);
287 /* Calculate table index by multiplying r with table scale and truncate to integer */
288 rt = _mm_mul_pd(r00,vftabscale);
289 vfitab = _mm_cvttpd_epi32(rt);
/* NOTE(review): vfeps is assigned twice in a row below; both forms give the
 * fractional part of rt. They look like alternative code paths selected by
 * preprocessor conditionals that are not visible here - confirm. */
291 vfeps = _mm_frcz_pd(rt);
293 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
295 twovfeps = _mm_add_pd(vfeps,vfeps);
/* Shift by 3 = multiply by 8: each table point spans 8 doubles, 4 read for
 * dispersion (offsets 0..3) and 4 for repulsion (offset +4, added via ifour). */
296 vfitab = _mm_slli_epi32(vfitab,3);
298 /* REACTION-FIELD ELECTROSTATICS */
/* velec = qq*(rinv + krf*rsq - crf); felec = qq*(rinv*rinvsq - 2*krf) */
299 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_macc_pd(krf,rsq00,rinv00),crf));
300 felec = _mm_mul_pd(qq00,_mm_msub_pd(rinv00,rinvsq00,krf2));
302 /* CUBIC SPLINE TABLE DISPERSION */
/* One aligned load per lane fetches two consecutive table values; the
 * transpose macro regroups them so Y, F, G and H each hold both lanes. */
303 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
304 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
305 GMX_MM_TRANSPOSE2_PD(Y,F);
306 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
307 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
308 GMX_MM_TRANSPOSE2_PD(G,H);
/* Fp = F + eps*(G + eps*H); VV = Y + eps*Fp; FF = Fp + eps*(G + 2*eps*H) */
309 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
310 VV = _mm_macc_pd(vfeps,Fp,Y);
311 vvdw6 = _mm_mul_pd(c6_00,VV);
312 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
313 fvdw6 = _mm_mul_pd(c6_00,FF);
315 /* CUBIC SPLINE TABLE REPULSION */
316 vfitab = _mm_add_epi32(vfitab,ifour);
317 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
318 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
319 GMX_MM_TRANSPOSE2_PD(Y,F);
320 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
321 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
322 GMX_MM_TRANSPOSE2_PD(G,H);
323 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
324 VV = _mm_macc_pd(vfeps,Fp,Y);
325 vvdw12 = _mm_mul_pd(c12_00,VV);
326 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
327 fvdw12 = _mm_mul_pd(c12_00,FF);
328 vvdw = _mm_add_pd(vvdw12,vvdw6);
/* fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00; the xor with signbit flips the sign of both lanes. */
329 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
/* Per-lane mask: all bits set where rsq < rcutoff^2. The and operations below zero the lanes outside the cutoff. */
331 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
333 /* Update potential sum for this i atom from the interaction with this j atom. */
334 velec = _mm_and_pd(velec,cutoff_mask);
335 velecsum = _mm_add_pd(velecsum,velec);
336 vvdw = _mm_and_pd(vvdw,cutoff_mask);
337 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
339 fscal = _mm_add_pd(felec,fvdw);
341 fscal = _mm_and_pd(fscal,cutoff_mask);
343 /* Update vectorial force */
344 fix0 = _mm_macc_pd(dx00,fscal,fix0);
345 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
346 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
348 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
349 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
350 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
354 /**************************
355 * CALCULATE INTERACTIONS *
356 **************************/
358 if (gmx_mm_any_lt(rsq01,rcutoff2))
361 /* REACTION-FIELD ELECTROSTATICS */
362 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_macc_pd(krf,rsq01,rinv01),crf));
363 felec = _mm_mul_pd(qq01,_mm_msub_pd(rinv01,rinvsq01,krf2));
365 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
367 /* Update potential sum for this i atom from the interaction with this j atom. */
368 velec = _mm_and_pd(velec,cutoff_mask);
369 velecsum = _mm_add_pd(velecsum,velec);
/* NOTE(review): for this electrostatics-only pair fscal is expected to equal
 * felec before masking, but no such assignment is visible at this point (the
 * same pattern repeats for the remaining electrostatics-only pairs) - confirm
 * it is present in the full file. */
373 fscal = _mm_and_pd(fscal,cutoff_mask);
375 /* Update vectorial force */
376 fix0 = _mm_macc_pd(dx01,fscal,fix0);
377 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
378 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
380 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
381 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
382 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
386 /**************************
387 * CALCULATE INTERACTIONS *
388 **************************/
390 if (gmx_mm_any_lt(rsq02,rcutoff2))
393 /* REACTION-FIELD ELECTROSTATICS */
394 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_macc_pd(krf,rsq02,rinv02),crf));
395 felec = _mm_mul_pd(qq02,_mm_msub_pd(rinv02,rinvsq02,krf2));
397 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velec = _mm_and_pd(velec,cutoff_mask);
401 velecsum = _mm_add_pd(velecsum,velec);
405 fscal = _mm_and_pd(fscal,cutoff_mask);
407 /* Update vectorial force */
408 fix0 = _mm_macc_pd(dx02,fscal,fix0);
409 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
410 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
412 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
413 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
414 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 if (gmx_mm_any_lt(rsq10,rcutoff2))
425 /* REACTION-FIELD ELECTROSTATICS */
426 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_macc_pd(krf,rsq10,rinv10),crf));
427 felec = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
429 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
431 /* Update potential sum for this i atom from the interaction with this j atom. */
432 velec = _mm_and_pd(velec,cutoff_mask);
433 velecsum = _mm_add_pd(velecsum,velec);
437 fscal = _mm_and_pd(fscal,cutoff_mask);
439 /* Update vectorial force */
440 fix1 = _mm_macc_pd(dx10,fscal,fix1);
441 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
442 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
444 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
445 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
446 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
450 /**************************
451 * CALCULATE INTERACTIONS *
452 **************************/
454 if (gmx_mm_any_lt(rsq11,rcutoff2))
457 /* REACTION-FIELD ELECTROSTATICS */
458 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_macc_pd(krf,rsq11,rinv11),crf));
459 felec = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
461 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
463 /* Update potential sum for this i atom from the interaction with this j atom. */
464 velec = _mm_and_pd(velec,cutoff_mask);
465 velecsum = _mm_add_pd(velecsum,velec);
469 fscal = _mm_and_pd(fscal,cutoff_mask);
471 /* Update vectorial force */
472 fix1 = _mm_macc_pd(dx11,fscal,fix1);
473 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
474 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
476 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
477 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
478 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
482 /**************************
483 * CALCULATE INTERACTIONS *
484 **************************/
486 if (gmx_mm_any_lt(rsq12,rcutoff2))
489 /* REACTION-FIELD ELECTROSTATICS */
490 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_macc_pd(krf,rsq12,rinv12),crf));
491 felec = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
493 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
495 /* Update potential sum for this i atom from the interaction with this j atom. */
496 velec = _mm_and_pd(velec,cutoff_mask);
497 velecsum = _mm_add_pd(velecsum,velec);
501 fscal = _mm_and_pd(fscal,cutoff_mask);
503 /* Update vectorial force */
504 fix1 = _mm_macc_pd(dx12,fscal,fix1);
505 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
506 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
508 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
509 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
510 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
514 /**************************
515 * CALCULATE INTERACTIONS *
516 **************************/
518 if (gmx_mm_any_lt(rsq20,rcutoff2))
521 /* REACTION-FIELD ELECTROSTATICS */
522 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_macc_pd(krf,rsq20,rinv20),crf));
523 felec = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
525 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
527 /* Update potential sum for this i atom from the interaction with this j atom. */
528 velec = _mm_and_pd(velec,cutoff_mask);
529 velecsum = _mm_add_pd(velecsum,velec);
533 fscal = _mm_and_pd(fscal,cutoff_mask);
535 /* Update vectorial force */
536 fix2 = _mm_macc_pd(dx20,fscal,fix2);
537 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
538 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
540 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
541 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
542 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
546 /**************************
547 * CALCULATE INTERACTIONS *
548 **************************/
550 if (gmx_mm_any_lt(rsq21,rcutoff2))
553 /* REACTION-FIELD ELECTROSTATICS */
554 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_macc_pd(krf,rsq21,rinv21),crf));
555 felec = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
557 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
559 /* Update potential sum for this i atom from the interaction with this j atom. */
560 velec = _mm_and_pd(velec,cutoff_mask);
561 velecsum = _mm_add_pd(velecsum,velec);
565 fscal = _mm_and_pd(fscal,cutoff_mask);
567 /* Update vectorial force */
568 fix2 = _mm_macc_pd(dx21,fscal,fix2);
569 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
570 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
572 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
573 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
574 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
578 /**************************
579 * CALCULATE INTERACTIONS *
580 **************************/
582 if (gmx_mm_any_lt(rsq22,rcutoff2))
585 /* REACTION-FIELD ELECTROSTATICS */
586 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_macc_pd(krf,rsq22,rinv22),crf));
587 felec = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
589 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
591 /* Update potential sum for this i atom from the interaction with this j atom. */
592 velec = _mm_and_pd(velec,cutoff_mask);
593 velecsum = _mm_add_pd(velecsum,velec);
597 fscal = _mm_and_pd(fscal,cutoff_mask);
599 /* Update vectorial force */
600 fix2 = _mm_macc_pd(dx22,fscal,fix2);
601 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
602 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
604 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
605 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
606 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
/* Hand the accumulated j-site forces for both lanes to the helper that updates f for entries A and B. */
610 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
612 /* Inner loop uses 387 flops */
/* Single-j section: only entry A is loaded and stored (1ptr helpers). Energies
 * and fscal have their upper lane cleared with _mm_unpacklo_pd(..., zero) so
 * the unused lane contributes nothing. */
619 j_coord_offsetA = DIM*jnrA;
621 /* load j atom coordinates */
622 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
623 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
625 /* Calculate displacement vector */
626 dx00 = _mm_sub_pd(ix0,jx0);
627 dy00 = _mm_sub_pd(iy0,jy0);
628 dz00 = _mm_sub_pd(iz0,jz0);
629 dx01 = _mm_sub_pd(ix0,jx1);
630 dy01 = _mm_sub_pd(iy0,jy1);
631 dz01 = _mm_sub_pd(iz0,jz1);
632 dx02 = _mm_sub_pd(ix0,jx2);
633 dy02 = _mm_sub_pd(iy0,jy2);
634 dz02 = _mm_sub_pd(iz0,jz2);
635 dx10 = _mm_sub_pd(ix1,jx0);
636 dy10 = _mm_sub_pd(iy1,jy0);
637 dz10 = _mm_sub_pd(iz1,jz0);
638 dx11 = _mm_sub_pd(ix1,jx1);
639 dy11 = _mm_sub_pd(iy1,jy1);
640 dz11 = _mm_sub_pd(iz1,jz1);
641 dx12 = _mm_sub_pd(ix1,jx2);
642 dy12 = _mm_sub_pd(iy1,jy2);
643 dz12 = _mm_sub_pd(iz1,jz2);
644 dx20 = _mm_sub_pd(ix2,jx0);
645 dy20 = _mm_sub_pd(iy2,jy0);
646 dz20 = _mm_sub_pd(iz2,jz0);
647 dx21 = _mm_sub_pd(ix2,jx1);
648 dy21 = _mm_sub_pd(iy2,jy1);
649 dz21 = _mm_sub_pd(iz2,jz1);
650 dx22 = _mm_sub_pd(ix2,jx2);
651 dy22 = _mm_sub_pd(iy2,jy2);
652 dz22 = _mm_sub_pd(iz2,jz2);
654 /* Calculate squared distance and things based on it */
655 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
656 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
657 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
658 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
659 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
660 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
661 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
662 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
663 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
665 rinv00 = gmx_mm_invsqrt_pd(rsq00);
666 rinv01 = gmx_mm_invsqrt_pd(rsq01);
667 rinv02 = gmx_mm_invsqrt_pd(rsq02);
668 rinv10 = gmx_mm_invsqrt_pd(rsq10);
669 rinv11 = gmx_mm_invsqrt_pd(rsq11);
670 rinv12 = gmx_mm_invsqrt_pd(rsq12);
671 rinv20 = gmx_mm_invsqrt_pd(rsq20);
672 rinv21 = gmx_mm_invsqrt_pd(rsq21);
673 rinv22 = gmx_mm_invsqrt_pd(rsq22);
675 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
676 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
677 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
678 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
679 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
680 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
681 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
682 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
683 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
685 fjx0 = _mm_setzero_pd();
686 fjy0 = _mm_setzero_pd();
687 fjz0 = _mm_setzero_pd();
688 fjx1 = _mm_setzero_pd();
689 fjy1 = _mm_setzero_pd();
690 fjz1 = _mm_setzero_pd();
691 fjx2 = _mm_setzero_pd();
692 fjy2 = _mm_setzero_pd();
693 fjz2 = _mm_setzero_pd();
695 /**************************
696 * CALCULATE INTERACTIONS *
697 **************************/
699 if (gmx_mm_any_lt(rsq00,rcutoff2))
702 r00 = _mm_mul_pd(rsq00,rinv00);
704 /* Calculate table index by multiplying r with table scale and truncate to integer */
705 rt = _mm_mul_pd(r00,vftabscale);
706 vfitab = _mm_cvttpd_epi32(rt);
/* NOTE(review): same back-to-back vfeps assignments as in the two-j loop - presumably alternative preprocessor paths; confirm. */
708 vfeps = _mm_frcz_pd(rt);
710 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
712 twovfeps = _mm_add_pd(vfeps,vfeps);
713 vfitab = _mm_slli_epi32(vfitab,3);
715 /* REACTION-FIELD ELECTROSTATICS */
716 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_macc_pd(krf,rsq00,rinv00),crf));
717 felec = _mm_mul_pd(qq00,_mm_msub_pd(rinv00,rinvsq00,krf2));
719 /* CUBIC SPLINE TABLE DISPERSION */
/* Only table index 0 is valid here, so the second lane's table row is replaced by zeros before the transpose. */
720 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
721 F = _mm_setzero_pd();
722 GMX_MM_TRANSPOSE2_PD(Y,F);
723 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
724 H = _mm_setzero_pd();
725 GMX_MM_TRANSPOSE2_PD(G,H);
726 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
727 VV = _mm_macc_pd(vfeps,Fp,Y);
728 vvdw6 = _mm_mul_pd(c6_00,VV);
729 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
730 fvdw6 = _mm_mul_pd(c6_00,FF);
732 /* CUBIC SPLINE TABLE REPULSION */
733 vfitab = _mm_add_epi32(vfitab,ifour);
734 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
735 F = _mm_setzero_pd();
736 GMX_MM_TRANSPOSE2_PD(Y,F);
737 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
738 H = _mm_setzero_pd();
739 GMX_MM_TRANSPOSE2_PD(G,H);
740 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
741 VV = _mm_macc_pd(vfeps,Fp,Y);
742 vvdw12 = _mm_mul_pd(c12_00,VV);
743 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
744 fvdw12 = _mm_mul_pd(c12_00,FF);
745 vvdw = _mm_add_pd(vvdw12,vvdw6);
746 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
748 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
750 /* Update potential sum for this i atom from the interaction with this j atom. */
751 velec = _mm_and_pd(velec,cutoff_mask);
/* Keep lane 0 and force the upper lane to zero before accumulating. */
752 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
753 velecsum = _mm_add_pd(velecsum,velec);
754 vvdw = _mm_and_pd(vvdw,cutoff_mask);
755 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
756 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
758 fscal = _mm_add_pd(felec,fvdw);
760 fscal = _mm_and_pd(fscal,cutoff_mask);
762 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
764 /* Update vectorial force */
765 fix0 = _mm_macc_pd(dx00,fscal,fix0);
766 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
767 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
769 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
770 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
771 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
775 /**************************
776 * CALCULATE INTERACTIONS *
777 **************************/
779 if (gmx_mm_any_lt(rsq01,rcutoff2))
782 /* REACTION-FIELD ELECTROSTATICS */
783 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_macc_pd(krf,rsq01,rinv01),crf));
784 felec = _mm_mul_pd(qq01,_mm_msub_pd(rinv01,rinvsq01,krf2));
786 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
788 /* Update potential sum for this i atom from the interaction with this j atom. */
789 velec = _mm_and_pd(velec,cutoff_mask);
790 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
791 velecsum = _mm_add_pd(velecsum,velec);
795 fscal = _mm_and_pd(fscal,cutoff_mask);
797 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
799 /* Update vectorial force */
800 fix0 = _mm_macc_pd(dx01,fscal,fix0);
801 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
802 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
804 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
805 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
806 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
810 /**************************
811 * CALCULATE INTERACTIONS *
812 **************************/
814 if (gmx_mm_any_lt(rsq02,rcutoff2))
817 /* REACTION-FIELD ELECTROSTATICS */
818 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_macc_pd(krf,rsq02,rinv02),crf));
819 felec = _mm_mul_pd(qq02,_mm_msub_pd(rinv02,rinvsq02,krf2));
821 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
823 /* Update potential sum for this i atom from the interaction with this j atom. */
824 velec = _mm_and_pd(velec,cutoff_mask);
825 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
826 velecsum = _mm_add_pd(velecsum,velec);
830 fscal = _mm_and_pd(fscal,cutoff_mask);
832 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
834 /* Update vectorial force */
835 fix0 = _mm_macc_pd(dx02,fscal,fix0);
836 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
837 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
839 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
840 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
841 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
845 /**************************
846 * CALCULATE INTERACTIONS *
847 **************************/
849 if (gmx_mm_any_lt(rsq10,rcutoff2))
852 /* REACTION-FIELD ELECTROSTATICS */
853 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_macc_pd(krf,rsq10,rinv10),crf));
854 felec = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
856 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
858 /* Update potential sum for this i atom from the interaction with this j atom. */
859 velec = _mm_and_pd(velec,cutoff_mask);
860 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
861 velecsum = _mm_add_pd(velecsum,velec);
865 fscal = _mm_and_pd(fscal,cutoff_mask);
867 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
869 /* Update vectorial force */
870 fix1 = _mm_macc_pd(dx10,fscal,fix1);
871 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
872 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
874 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
875 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
876 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
880 /**************************
881 * CALCULATE INTERACTIONS *
882 **************************/
884 if (gmx_mm_any_lt(rsq11,rcutoff2))
887 /* REACTION-FIELD ELECTROSTATICS */
888 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_macc_pd(krf,rsq11,rinv11),crf));
889 felec = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
891 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
893 /* Update potential sum for this i atom from the interaction with this j atom. */
894 velec = _mm_and_pd(velec,cutoff_mask);
895 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
896 velecsum = _mm_add_pd(velecsum,velec);
900 fscal = _mm_and_pd(fscal,cutoff_mask);
902 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
904 /* Update vectorial force */
905 fix1 = _mm_macc_pd(dx11,fscal,fix1);
906 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
907 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
909 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
910 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
911 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
915 /**************************
916 * CALCULATE INTERACTIONS *
917 **************************/
919 if (gmx_mm_any_lt(rsq12,rcutoff2))
922 /* REACTION-FIELD ELECTROSTATICS */
923 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_macc_pd(krf,rsq12,rinv12),crf));
924 felec = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
926 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
928 /* Update potential sum for this i atom from the interaction with this j atom. */
929 velec = _mm_and_pd(velec,cutoff_mask);
930 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
931 velecsum = _mm_add_pd(velecsum,velec);
935 fscal = _mm_and_pd(fscal,cutoff_mask);
937 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
939 /* Update vectorial force */
940 fix1 = _mm_macc_pd(dx12,fscal,fix1);
941 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
942 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
944 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
945 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
946 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
950 /**************************
951 * CALCULATE INTERACTIONS *
952 **************************/
954 if (gmx_mm_any_lt(rsq20,rcutoff2))
957 /* REACTION-FIELD ELECTROSTATICS */
958 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_macc_pd(krf,rsq20,rinv20),crf));
959 felec = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
961 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
963 /* Update potential sum for this i atom from the interaction with this j atom. */
964 velec = _mm_and_pd(velec,cutoff_mask);
965 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
966 velecsum = _mm_add_pd(velecsum,velec);
970 fscal = _mm_and_pd(fscal,cutoff_mask);
972 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
974 /* Update vectorial force */
975 fix2 = _mm_macc_pd(dx20,fscal,fix2);
976 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
977 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
979 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
980 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
981 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
985 /**************************
986 * CALCULATE INTERACTIONS *
987 **************************/
989 if (gmx_mm_any_lt(rsq21,rcutoff2))
992 /* REACTION-FIELD ELECTROSTATICS */
993 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_macc_pd(krf,rsq21,rinv21),crf));
994 felec = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
996 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
998 /* Update potential sum for this i atom from the interaction with this j atom. */
999 velec = _mm_and_pd(velec,cutoff_mask);
1000 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1001 velecsum = _mm_add_pd(velecsum,velec);
1005 fscal = _mm_and_pd(fscal,cutoff_mask);
1007 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1009 /* Update vectorial force */
1010 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1011 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1012 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1014 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1015 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1016 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1020 /**************************
1021 * CALCULATE INTERACTIONS *
1022 **************************/
1024 if (gmx_mm_any_lt(rsq22,rcutoff2))
1027 /* REACTION-FIELD ELECTROSTATICS */
1028 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_macc_pd(krf,rsq22,rinv22),crf));
1029 felec = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
1031 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1033 /* Update potential sum for this i atom from the interaction with this j atom. */
1034 velec = _mm_and_pd(velec,cutoff_mask);
1035 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1036 velecsum = _mm_add_pd(velecsum,velec);
1040 fscal = _mm_and_pd(fscal,cutoff_mask);
1042 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1044 /* Update vectorial force */
1045 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1046 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1047 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1049 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1050 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1051 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1055 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1057 /* Inner loop uses 387 flops */
1060 /* End of innermost loop */
/* Pass the accumulated i-site forces to the helper together with the i force slot and this list's shift-force slot. */
1062 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1063 f+i_coord_offset,fshift+i_shift_offset);
1066 /* Update potential energies */
1067 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1068 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1070 /* Increment number of inner iterations */
1071 inneriter += j_index_end - j_index_start;
1073 /* Outer loop uses 20 flops */
1076 /* Increment number of outer iterations */
1079 /* Update outer/inner flops */
/* The flop estimate uses the per-iteration costs quoted above: 20 per outer and 387 per inner iteration. */
1081 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*387);
1084 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1085 * Electrostatics interaction: ReactionField
1086 * VdW interaction: CubicSplineTable
1087 * Geometry: Water3-Water3
1088 * Calculate force/pot: Force
/*
 * Force-only water3-water3 kernel: reaction-field electrostatics with an
 * explicit cutoff for all nine atom pairs, plus cubic-spline-table Van der
 * Waals for the 0-0 atom pair only. No energy-group potentials are updated
 * in this kernel.
 *
 * All SIMD values are __m128d (two doubles), so the main inner loop handles
 * two j waters per iteration (suffixes A and B, one per lane) and a trailing
 * block handles a single leftover j water in lane A.
 *
 * Inputs read in the visible code:
 *   nlist       - jindex, shift and iinr[0] of the neighbor list.
 *   fr          - shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype,
 *                 nbfp and rcoulomb.
 *   mdatoms     - chargeA and typeA.
 *   kernel_data - table_vdw->data and table_vdw->scale.
 *   nrnb        - receives the flop estimate through inc_nrnb().
 *   xx, ff      - presumably the coordinate and force arrays behind the
 *                 local x and f pointers (assignment not visible here;
 *                 TODO confirm).
 */
1091 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1092 (t_nblist * gmx_restrict nlist,
1093 rvec * gmx_restrict xx,
1094 rvec * gmx_restrict ff,
1095 t_forcerec * gmx_restrict fr,
1096 t_mdatoms * gmx_restrict mdatoms,
1097 nb_kernel_data_t * gmx_restrict kernel_data,
1098 t_nrnb * gmx_restrict nrnb)
1100 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1101 * just 0 for non-waters.
1102 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1103 * jnr indices corresponding to data put in the four positions in the SIMD register.
1105 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1106 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1108 int j_coord_offsetA,j_coord_offsetB;
1109 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1110 real rcutoff_scalar;
1111 real *shiftvec,*fshift,*x,*f;
1112 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1114 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1116 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1118 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1119 int vdwjidx0A,vdwjidx0B;
1120 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1121 int vdwjidx1A,vdwjidx1B;
1122 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1123 int vdwjidx2A,vdwjidx2B;
1124 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1125 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1126 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1127 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1128 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1129 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1130 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1131 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1132 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1133 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1134 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1137 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1140 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1141 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1143 __m128i ifour = _mm_set1_epi32(4);
1144 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1146 __m128d dummy_mask,cutoff_mask;
1147 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1148 __m128d one = _mm_set1_pd(1.0);
1149 __m128d two = _mm_set1_pd(2.0);
1155 jindex = nlist->jindex;
1157 shiftidx = nlist->shift;
1159 shiftvec = fr->shift_vec[0];
1160 fshift = fr->fshift[0];
1161 facel = _mm_set1_pd(fr->epsfac);
1162 charge = mdatoms->chargeA;
1163 krf = _mm_set1_pd(fr->ic->k_rf);
1164 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1165 crf = _mm_set1_pd(fr->ic->c_rf);
1166 nvdwtype = fr->ntype;
1167 vdwparam = fr->nbfp;
1168 vdwtype = mdatoms->typeA;
1170 vftab = kernel_data->table_vdw->data;
1171 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1173 /* Setup water-specific parameters */
/* Both the i and the j charges, and the single c6/c12 pair, are taken from
 * the first i water in the list (nlist->iinr[0]); this treats every water in
 * the list as having the same parameters. The i charges are pre-scaled by
 * facel, so each qqXY below already contains that factor. */
1174 inr = nlist->iinr[0];
1175 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1176 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1177 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1178 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1180 jq0 = _mm_set1_pd(charge[inr+0]);
1181 jq1 = _mm_set1_pd(charge[inr+1]);
1182 jq2 = _mm_set1_pd(charge[inr+2]);
1183 vdwjidx0A = 2*vdwtype[inr+0];
1184 qq00 = _mm_mul_pd(iq0,jq0);
1185 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1186 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1187 qq01 = _mm_mul_pd(iq0,jq1);
1188 qq02 = _mm_mul_pd(iq0,jq2);
1189 qq10 = _mm_mul_pd(iq1,jq0);
1190 qq11 = _mm_mul_pd(iq1,jq1);
1191 qq12 = _mm_mul_pd(iq1,jq2);
1192 qq20 = _mm_mul_pd(iq2,jq0);
1193 qq21 = _mm_mul_pd(iq2,jq1);
1194 qq22 = _mm_mul_pd(iq2,jq2);
1196 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1197 rcutoff_scalar = fr->rcoulomb;
1198 rcutoff = _mm_set1_pd(rcutoff_scalar);
/* The cutoff is compared against squared distances, so keep it squared. */
1199 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1201 /* Avoid stupid compiler warnings */
1203 j_coord_offsetA = 0;
1204 j_coord_offsetB = 0;
1209 /* Start outer loop over neighborlists */
1210 for(iidx=0; iidx<nri; iidx++)
1212 /* Load shift vector for this list */
1213 i_shift_offset = DIM*shiftidx[iidx];
1215 /* Load limits for loop over neighbors */
1216 j_index_start = jindex[iidx];
1217 j_index_end = jindex[iidx+1];
1219 /* Get outer coordinate index */
1221 i_coord_offset = DIM*inr;
1223 /* Load i particle coords and add shift vector */
1224 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1225 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* Reset the per-i-water force accumulators; they are handed to
 * gmx_mm_update_iforce_3atom_swizzle_pd after the j loops. */
1227 fix0 = _mm_setzero_pd();
1228 fiy0 = _mm_setzero_pd();
1229 fiz0 = _mm_setzero_pd();
1230 fix1 = _mm_setzero_pd();
1231 fiy1 = _mm_setzero_pd();
1232 fiz1 = _mm_setzero_pd();
1233 fix2 = _mm_setzero_pd();
1234 fiy2 = _mm_setzero_pd();
1235 fiz2 = _mm_setzero_pd();
1237 /* Start inner kernel loop */
/* Main j loop: consumes two neighbor entries per iteration (jidx and jidx+1),
 * one per SIMD lane. It stops while at least two entries remain, so a single
 * leftover entry is handled by the trailing block further down. */
1238 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1241 /* Get j neighbor index, and coordinate index */
1243 jnrB = jjnr[jidx+1];
1244 j_coord_offsetA = DIM*jnrA;
1245 j_coord_offsetB = DIM*jnrB;
1247 /* load j atom coordinates */
1248 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1249 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1251 /* Calculate displacement vector */
1252 dx00 = _mm_sub_pd(ix0,jx0);
1253 dy00 = _mm_sub_pd(iy0,jy0);
1254 dz00 = _mm_sub_pd(iz0,jz0);
1255 dx01 = _mm_sub_pd(ix0,jx1);
1256 dy01 = _mm_sub_pd(iy0,jy1);
1257 dz01 = _mm_sub_pd(iz0,jz1);
1258 dx02 = _mm_sub_pd(ix0,jx2);
1259 dy02 = _mm_sub_pd(iy0,jy2);
1260 dz02 = _mm_sub_pd(iz0,jz2);
1261 dx10 = _mm_sub_pd(ix1,jx0);
1262 dy10 = _mm_sub_pd(iy1,jy0);
1263 dz10 = _mm_sub_pd(iz1,jz0);
1264 dx11 = _mm_sub_pd(ix1,jx1);
1265 dy11 = _mm_sub_pd(iy1,jy1);
1266 dz11 = _mm_sub_pd(iz1,jz1);
1267 dx12 = _mm_sub_pd(ix1,jx2);
1268 dy12 = _mm_sub_pd(iy1,jy2);
1269 dz12 = _mm_sub_pd(iz1,jz2);
1270 dx20 = _mm_sub_pd(ix2,jx0);
1271 dy20 = _mm_sub_pd(iy2,jy0);
1272 dz20 = _mm_sub_pd(iz2,jz0);
1273 dx21 = _mm_sub_pd(ix2,jx1);
1274 dy21 = _mm_sub_pd(iy2,jy1);
1275 dz21 = _mm_sub_pd(iz2,jz1);
1276 dx22 = _mm_sub_pd(ix2,jx2);
1277 dy22 = _mm_sub_pd(iy2,jy2);
1278 dz22 = _mm_sub_pd(iz2,jz2);
1280 /* Calculate squared distance and things based on it */
1281 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1282 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1283 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1284 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1285 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1286 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1287 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1288 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1289 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1291 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1292 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1293 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1294 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1295 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1296 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1297 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1298 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1299 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1301 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1302 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1303 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1304 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1305 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1306 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1307 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1308 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1309 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
/* Reset the j force accumulators for this pair of j waters; they are passed
 * to gmx_mm_decrement_3rvec_2ptr_swizzle_pd at the end of the iteration. */
1311 fjx0 = _mm_setzero_pd();
1312 fjy0 = _mm_setzero_pd();
1313 fjz0 = _mm_setzero_pd();
1314 fjx1 = _mm_setzero_pd();
1315 fjy1 = _mm_setzero_pd();
1316 fjz1 = _mm_setzero_pd();
1317 fjx2 = _mm_setzero_pd();
1318 fjy2 = _mm_setzero_pd();
1319 fjz2 = _mm_setzero_pd();
1321 /**************************
1322 * CALCULATE INTERACTIONS *
1323 **************************/
/* Pair 0-0: the only pair that also gets the tabulated Van der Waals term.
 * NOTE(review): gmx_mm_any_lt presumably tests whether any lane is inside
 * the cutoff so the whole pair can be skipped otherwise - confirm against
 * the helper's definition. */
1325 if (gmx_mm_any_lt(rsq00,rcutoff2))
1328 r00 = _mm_mul_pd(rsq00,rinv00);
1330 /* Calculate table index by multiplying r with table scale and truncate to integer */
1331 rt = _mm_mul_pd(r00,vftabscale);
1332 vfitab = _mm_cvttpd_epi32(rt);
/* NOTE(review): the next two statements both assign vfeps (fractional part
 * of rt); presumably they are alternatives selected by a preprocessor
 * conditional that is not visible in this view - confirm. */
1334 vfeps = _mm_frcz_pd(rt);
1336 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1338 twovfeps = _mm_add_pd(vfeps,vfeps);
/* Multiply the table index by 8: the lookups below read two doubles at
 * offsets 0 and 2 for dispersion and, after adding ifour, at offsets 4 and 6
 * for repulsion. */
1339 vfitab = _mm_slli_epi32(vfitab,3);
1341 /* REACTION-FIELD ELECTROSTATICS */
/* felec = qq * (rinv*rinvsq - krf2), where krf2 is 2*k_rf. */
1342 felec = _mm_mul_pd(qq00,_mm_msub_pd(rinv00,rinvsq00,krf2));
1344 /* CUBIC SPLINE TABLE DISPERSION */
/* One aligned two-double load per lane, then a 2x2 transpose so each of
 * Y,F (and G,H) holds the same table entry for both lanes. */
1345 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1346 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1347 GMX_MM_TRANSPOSE2_PD(Y,F);
1348 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1349 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1350 GMX_MM_TRANSPOSE2_PD(G,H);
1351 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1352 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1353 fvdw6 = _mm_mul_pd(c6_00,FF);
1355 /* CUBIC SPLINE TABLE REPULSION */
1356 vfitab = _mm_add_epi32(vfitab,ifour);
1357 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1358 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1359 GMX_MM_TRANSPOSE2_PD(Y,F);
1360 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1361 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1362 GMX_MM_TRANSPOSE2_PD(G,H);
1363 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1364 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1365 fvdw12 = _mm_mul_pd(c12_00,FF);
/* fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00; the XOR with signbit flips the
 * sign bit of both lanes. */
1366 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
/* Per-lane mask: all ones where rsq00 < rcutoff2, zero elsewhere. */
1368 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1370 fscal = _mm_add_pd(felec,fvdw);
/* Zero the scalar force in lanes that are outside the cutoff. */
1372 fscal = _mm_and_pd(fscal,cutoff_mask);
1374 /* Update vectorial force */
1375 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1376 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1377 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1379 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1380 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1381 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1385 /**************************
1386 * CALCULATE INTERACTIONS *
1387 **************************/
/* Pairs 0-1 through 2-2: reaction-field electrostatics only, following the
 * same mask-and-accumulate pattern as pair 0-0. */
1389 if (gmx_mm_any_lt(rsq01,rcutoff2))
1392 /* REACTION-FIELD ELECTROSTATICS */
1393 felec = _mm_mul_pd(qq01,_mm_msub_pd(rinv01,rinvsq01,krf2));
1395 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1399 fscal = _mm_and_pd(fscal,cutoff_mask);
1401 /* Update vectorial force */
1402 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1403 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1404 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1406 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1407 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1408 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1412 /**************************
1413 * CALCULATE INTERACTIONS *
1414 **************************/
1416 if (gmx_mm_any_lt(rsq02,rcutoff2))
1419 /* REACTION-FIELD ELECTROSTATICS */
1420 felec = _mm_mul_pd(qq02,_mm_msub_pd(rinv02,rinvsq02,krf2));
1422 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1426 fscal = _mm_and_pd(fscal,cutoff_mask);
1428 /* Update vectorial force */
1429 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1430 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1431 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1433 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1434 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1435 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1439 /**************************
1440 * CALCULATE INTERACTIONS *
1441 **************************/
1443 if (gmx_mm_any_lt(rsq10,rcutoff2))
1446 /* REACTION-FIELD ELECTROSTATICS */
1447 felec = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
1449 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1453 fscal = _mm_and_pd(fscal,cutoff_mask);
1455 /* Update vectorial force */
1456 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1457 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1458 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1460 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1461 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1462 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1466 /**************************
1467 * CALCULATE INTERACTIONS *
1468 **************************/
1470 if (gmx_mm_any_lt(rsq11,rcutoff2))
1473 /* REACTION-FIELD ELECTROSTATICS */
1474 felec = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
1476 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1480 fscal = _mm_and_pd(fscal,cutoff_mask);
1482 /* Update vectorial force */
1483 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1484 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1485 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1487 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1488 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1489 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1493 /**************************
1494 * CALCULATE INTERACTIONS *
1495 **************************/
1497 if (gmx_mm_any_lt(rsq12,rcutoff2))
1500 /* REACTION-FIELD ELECTROSTATICS */
1501 felec = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
1503 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1507 fscal = _mm_and_pd(fscal,cutoff_mask);
1509 /* Update vectorial force */
1510 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1511 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1512 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1514 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1515 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1516 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1520 /**************************
1521 * CALCULATE INTERACTIONS *
1522 **************************/
1524 if (gmx_mm_any_lt(rsq20,rcutoff2))
1527 /* REACTION-FIELD ELECTROSTATICS */
1528 felec = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
1530 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1534 fscal = _mm_and_pd(fscal,cutoff_mask);
1536 /* Update vectorial force */
1537 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1538 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1539 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1541 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1542 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1543 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1547 /**************************
1548 * CALCULATE INTERACTIONS *
1549 **************************/
1551 if (gmx_mm_any_lt(rsq21,rcutoff2))
1554 /* REACTION-FIELD ELECTROSTATICS */
1555 felec = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
1557 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1561 fscal = _mm_and_pd(fscal,cutoff_mask);
1563 /* Update vectorial force */
1564 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1565 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1566 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1568 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1569 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1570 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1574 /**************************
1575 * CALCULATE INTERACTIONS *
1576 **************************/
1578 if (gmx_mm_any_lt(rsq22,rcutoff2))
1581 /* REACTION-FIELD ELECTROSTATICS */
1582 felec = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
1584 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1588 fscal = _mm_and_pd(fscal,cutoff_mask);
1590 /* Update vectorial force */
1591 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1592 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1593 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1595 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1596 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1597 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
/* Hand the accumulated j contributions to the force array at both j offsets.
 * NOTE(review): going by its name the helper subtracts them from f, giving
 * the j atoms the opposite sign of the i atoms - confirm in the helper. */
1601 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1603 /* Inner loop uses 324 flops */
/* Remainder: the main loop advances in steps of two, so at most one neighbor
 * entry is left. Only lane A carries real data here. */
1606 if(jidx<j_index_end)
1610 j_coord_offsetA = DIM*jnrA;
1612 /* load j atom coordinates */
1613 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1614 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1616 /* Calculate displacement vector */
1617 dx00 = _mm_sub_pd(ix0,jx0);
1618 dy00 = _mm_sub_pd(iy0,jy0);
1619 dz00 = _mm_sub_pd(iz0,jz0);
1620 dx01 = _mm_sub_pd(ix0,jx1);
1621 dy01 = _mm_sub_pd(iy0,jy1);
1622 dz01 = _mm_sub_pd(iz0,jz1);
1623 dx02 = _mm_sub_pd(ix0,jx2);
1624 dy02 = _mm_sub_pd(iy0,jy2);
1625 dz02 = _mm_sub_pd(iz0,jz2);
1626 dx10 = _mm_sub_pd(ix1,jx0);
1627 dy10 = _mm_sub_pd(iy1,jy0);
1628 dz10 = _mm_sub_pd(iz1,jz0);
1629 dx11 = _mm_sub_pd(ix1,jx1);
1630 dy11 = _mm_sub_pd(iy1,jy1);
1631 dz11 = _mm_sub_pd(iz1,jz1);
1632 dx12 = _mm_sub_pd(ix1,jx2);
1633 dy12 = _mm_sub_pd(iy1,jy2);
1634 dz12 = _mm_sub_pd(iz1,jz2);
1635 dx20 = _mm_sub_pd(ix2,jx0);
1636 dy20 = _mm_sub_pd(iy2,jy0);
1637 dz20 = _mm_sub_pd(iz2,jz0);
1638 dx21 = _mm_sub_pd(ix2,jx1);
1639 dy21 = _mm_sub_pd(iy2,jy1);
1640 dz21 = _mm_sub_pd(iz2,jz1);
1641 dx22 = _mm_sub_pd(ix2,jx2);
1642 dy22 = _mm_sub_pd(iy2,jy2);
1643 dz22 = _mm_sub_pd(iz2,jz2);
1645 /* Calculate squared distance and things based on it */
1646 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1647 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1648 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1649 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1650 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1651 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1652 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1653 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1654 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1656 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1657 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1658 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1659 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1660 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1661 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1662 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1663 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1664 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1666 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1667 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1668 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1669 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1670 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1671 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1672 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1673 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1674 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1676 fjx0 = _mm_setzero_pd();
1677 fjy0 = _mm_setzero_pd();
1678 fjz0 = _mm_setzero_pd();
1679 fjx1 = _mm_setzero_pd();
1680 fjy1 = _mm_setzero_pd();
1681 fjz1 = _mm_setzero_pd();
1682 fjx2 = _mm_setzero_pd();
1683 fjy2 = _mm_setzero_pd();
1684 fjz2 = _mm_setzero_pd();
1686 /**************************
1687 * CALCULATE INTERACTIONS *
1688 **************************/
1690 if (gmx_mm_any_lt(rsq00,rcutoff2))
1693 r00 = _mm_mul_pd(rsq00,rinv00);
1695 /* Calculate table index by multiplying r with table scale and truncate to integer */
1696 rt = _mm_mul_pd(r00,vftabscale);
1697 vfitab = _mm_cvttpd_epi32(rt);
1699 vfeps = _mm_frcz_pd(rt);
1701 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1703 twovfeps = _mm_add_pd(vfeps,vfeps);
1704 vfitab = _mm_slli_epi32(vfitab,3);
1706 /* REACTION-FIELD ELECTROSTATICS */
1707 felec = _mm_mul_pd(qq00,_mm_msub_pd(rinv00,rinvsq00,krf2));
1709 /* CUBIC SPLINE TABLE DISPERSION */
/* Only table index 0 is looked up in this block; zeros stand in for the
 * second lane's table data before the transpose. */
1710 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1711 F = _mm_setzero_pd();
1712 GMX_MM_TRANSPOSE2_PD(Y,F);
1713 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1714 H = _mm_setzero_pd();
1715 GMX_MM_TRANSPOSE2_PD(G,H);
1716 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1717 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1718 fvdw6 = _mm_mul_pd(c6_00,FF);
1720 /* CUBIC SPLINE TABLE REPULSION */
1721 vfitab = _mm_add_epi32(vfitab,ifour);
1722 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1723 F = _mm_setzero_pd();
1724 GMX_MM_TRANSPOSE2_PD(Y,F);
1725 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1726 H = _mm_setzero_pd();
1727 GMX_MM_TRANSPOSE2_PD(G,H);
1728 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1729 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1730 fvdw12 = _mm_mul_pd(c12_00,FF);
1731 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1733 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1735 fscal = _mm_add_pd(felec,fvdw);
1737 fscal = _mm_and_pd(fscal,cutoff_mask);
/* Keep the low lane and force the high lane to zero, so the unused second
 * lane adds nothing to the force accumulators. The same step follows every
 * cutoff mask in this remainder block. */
1739 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1741 /* Update vectorial force */
1742 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1743 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1744 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1746 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1747 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1748 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1752 /**************************
1753 * CALCULATE INTERACTIONS *
1754 **************************/
1756 if (gmx_mm_any_lt(rsq01,rcutoff2))
1759 /* REACTION-FIELD ELECTROSTATICS */
1760 felec = _mm_mul_pd(qq01,_mm_msub_pd(rinv01,rinvsq01,krf2));
1762 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1766 fscal = _mm_and_pd(fscal,cutoff_mask);
1768 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1770 /* Update vectorial force */
1771 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1772 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1773 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1775 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1776 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1777 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1781 /**************************
1782 * CALCULATE INTERACTIONS *
1783 **************************/
1785 if (gmx_mm_any_lt(rsq02,rcutoff2))
1788 /* REACTION-FIELD ELECTROSTATICS */
1789 felec = _mm_mul_pd(qq02,_mm_msub_pd(rinv02,rinvsq02,krf2));
1791 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1795 fscal = _mm_and_pd(fscal,cutoff_mask);
1797 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1799 /* Update vectorial force */
1800 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1801 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1802 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1804 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1805 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1806 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1810 /**************************
1811 * CALCULATE INTERACTIONS *
1812 **************************/
1814 if (gmx_mm_any_lt(rsq10,rcutoff2))
1817 /* REACTION-FIELD ELECTROSTATICS */
1818 felec = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
1820 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1824 fscal = _mm_and_pd(fscal,cutoff_mask);
1826 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1828 /* Update vectorial force */
1829 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1830 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1831 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1833 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1834 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1835 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1839 /**************************
1840 * CALCULATE INTERACTIONS *
1841 **************************/
1843 if (gmx_mm_any_lt(rsq11,rcutoff2))
1846 /* REACTION-FIELD ELECTROSTATICS */
1847 felec = _mm_mul_pd(qq11,_mm_msub_pd(rinv11,rinvsq11,krf2));
1849 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1853 fscal = _mm_and_pd(fscal,cutoff_mask);
1855 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1857 /* Update vectorial force */
1858 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1859 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1860 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1862 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1863 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1864 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1868 /**************************
1869 * CALCULATE INTERACTIONS *
1870 **************************/
1872 if (gmx_mm_any_lt(rsq12,rcutoff2))
1875 /* REACTION-FIELD ELECTROSTATICS */
1876 felec = _mm_mul_pd(qq12,_mm_msub_pd(rinv12,rinvsq12,krf2));
1878 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1882 fscal = _mm_and_pd(fscal,cutoff_mask);
1884 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1886 /* Update vectorial force */
1887 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1888 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1889 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1891 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1892 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1893 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1897 /**************************
1898 * CALCULATE INTERACTIONS *
1899 **************************/
1901 if (gmx_mm_any_lt(rsq20,rcutoff2))
1904 /* REACTION-FIELD ELECTROSTATICS */
1905 felec = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
1907 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1911 fscal = _mm_and_pd(fscal,cutoff_mask);
1913 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1915 /* Update vectorial force */
1916 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1917 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1918 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1920 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1921 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1922 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1926 /**************************
1927 * CALCULATE INTERACTIONS *
1928 **************************/
1930 if (gmx_mm_any_lt(rsq21,rcutoff2))
1933 /* REACTION-FIELD ELECTROSTATICS */
1934 felec = _mm_mul_pd(qq21,_mm_msub_pd(rinv21,rinvsq21,krf2));
1936 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1940 fscal = _mm_and_pd(fscal,cutoff_mask);
1942 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1944 /* Update vectorial force */
1945 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1946 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1947 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1949 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1950 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1951 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1955 /**************************
1956 * CALCULATE INTERACTIONS *
1957 **************************/
1959 if (gmx_mm_any_lt(rsq22,rcutoff2))
1962 /* REACTION-FIELD ELECTROSTATICS */
1963 felec = _mm_mul_pd(qq22,_mm_msub_pd(rinv22,rinvsq22,krf2));
1965 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1969 fscal = _mm_and_pd(fscal,cutoff_mask);
1971 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1973 /* Update vectorial force */
1974 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1975 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1976 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1978 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1979 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1980 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1984 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1986 /* Inner loop uses 324 flops */
1989 /* End of innermost loop */
/* Hand the accumulated i-water forces to the helper together with the force
 * and shift-force destinations for this list entry.
 * NOTE(review): presumably it reduces the two lanes and adds the result to
 * both arrays - confirm in the helper. */
1991 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1992 f+i_coord_offset,fshift+i_shift_offset);
1994 /* Increment number of inner iterations */
1995 inneriter += j_index_end - j_index_start;
1997 /* Outer loop uses 18 flops */
2000 /* Increment number of outer iterations */
2003 /* Update outer/inner flops */
/* Report the flop estimate using the per-iteration costs quoted in the
 * comments above (18 per outer, 324 per inner iteration). */
2005 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*324);