2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B;
77 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B;
79 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B;
81 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
101 __m128i ifour = _mm_set1_epi32(4);
102 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
104 __m128d dummy_mask,cutoff_mask;
105 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
106 __m128d one = _mm_set1_pd(1.0);
107 __m128d two = _mm_set1_pd(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm_set1_pd(fr->epsfac);
120 charge = mdatoms->chargeA;
121 krf = _mm_set1_pd(fr->ic->k_rf);
122 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
123 crf = _mm_set1_pd(fr->ic->c_rf);
124 nvdwtype = fr->ntype;
126 vdwtype = mdatoms->typeA;
128 vftab = kernel_data->table_vdw->data;
129 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
134 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
135 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
136 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
138 jq1 = _mm_set1_pd(charge[inr+1]);
139 jq2 = _mm_set1_pd(charge[inr+2]);
140 jq3 = _mm_set1_pd(charge[inr+3]);
141 vdwjidx0A = 2*vdwtype[inr+0];
142 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
143 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
144 qq11 = _mm_mul_pd(iq1,jq1);
145 qq12 = _mm_mul_pd(iq1,jq2);
146 qq13 = _mm_mul_pd(iq1,jq3);
147 qq21 = _mm_mul_pd(iq2,jq1);
148 qq22 = _mm_mul_pd(iq2,jq2);
149 qq23 = _mm_mul_pd(iq2,jq3);
150 qq31 = _mm_mul_pd(iq3,jq1);
151 qq32 = _mm_mul_pd(iq3,jq2);
152 qq33 = _mm_mul_pd(iq3,jq3);
154 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155 rcutoff_scalar = fr->rcoulomb;
156 rcutoff = _mm_set1_pd(rcutoff_scalar);
157 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
159 /* Avoid stupid compiler warnings */
167 /* Start outer loop over neighborlists */
168 for(iidx=0; iidx<nri; iidx++)
170 /* Load shift vector for this list */
171 i_shift_offset = DIM*shiftidx[iidx];
173 /* Load limits for loop over neighbors */
174 j_index_start = jindex[iidx];
175 j_index_end = jindex[iidx+1];
177 /* Get outer coordinate index */
179 i_coord_offset = DIM*inr;
181 /* Load i particle coords and add shift vector */
182 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
183 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
185 fix0 = _mm_setzero_pd();
186 fiy0 = _mm_setzero_pd();
187 fiz0 = _mm_setzero_pd();
188 fix1 = _mm_setzero_pd();
189 fiy1 = _mm_setzero_pd();
190 fiz1 = _mm_setzero_pd();
191 fix2 = _mm_setzero_pd();
192 fiy2 = _mm_setzero_pd();
193 fiz2 = _mm_setzero_pd();
194 fix3 = _mm_setzero_pd();
195 fiy3 = _mm_setzero_pd();
196 fiz3 = _mm_setzero_pd();
198 /* Reset potential sums */
199 velecsum = _mm_setzero_pd();
200 vvdwsum = _mm_setzero_pd();
202 /* Start inner kernel loop */
203 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
206 /* Get j neighbor index, and coordinate index */
209 j_coord_offsetA = DIM*jnrA;
210 j_coord_offsetB = DIM*jnrB;
212 /* load j atom coordinates */
213 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
215 &jy2,&jz2,&jx3,&jy3,&jz3);
217 /* Calculate displacement vector */
218 dx00 = _mm_sub_pd(ix0,jx0);
219 dy00 = _mm_sub_pd(iy0,jy0);
220 dz00 = _mm_sub_pd(iz0,jz0);
221 dx11 = _mm_sub_pd(ix1,jx1);
222 dy11 = _mm_sub_pd(iy1,jy1);
223 dz11 = _mm_sub_pd(iz1,jz1);
224 dx12 = _mm_sub_pd(ix1,jx2);
225 dy12 = _mm_sub_pd(iy1,jy2);
226 dz12 = _mm_sub_pd(iz1,jz2);
227 dx13 = _mm_sub_pd(ix1,jx3);
228 dy13 = _mm_sub_pd(iy1,jy3);
229 dz13 = _mm_sub_pd(iz1,jz3);
230 dx21 = _mm_sub_pd(ix2,jx1);
231 dy21 = _mm_sub_pd(iy2,jy1);
232 dz21 = _mm_sub_pd(iz2,jz1);
233 dx22 = _mm_sub_pd(ix2,jx2);
234 dy22 = _mm_sub_pd(iy2,jy2);
235 dz22 = _mm_sub_pd(iz2,jz2);
236 dx23 = _mm_sub_pd(ix2,jx3);
237 dy23 = _mm_sub_pd(iy2,jy3);
238 dz23 = _mm_sub_pd(iz2,jz3);
239 dx31 = _mm_sub_pd(ix3,jx1);
240 dy31 = _mm_sub_pd(iy3,jy1);
241 dz31 = _mm_sub_pd(iz3,jz1);
242 dx32 = _mm_sub_pd(ix3,jx2);
243 dy32 = _mm_sub_pd(iy3,jy2);
244 dz32 = _mm_sub_pd(iz3,jz2);
245 dx33 = _mm_sub_pd(ix3,jx3);
246 dy33 = _mm_sub_pd(iy3,jy3);
247 dz33 = _mm_sub_pd(iz3,jz3);
249 /* Calculate squared distance and things based on it */
250 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
251 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
252 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
253 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
254 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
255 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
256 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
257 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
258 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
259 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
261 rinv00 = gmx_mm_invsqrt_pd(rsq00);
262 rinv11 = gmx_mm_invsqrt_pd(rsq11);
263 rinv12 = gmx_mm_invsqrt_pd(rsq12);
264 rinv13 = gmx_mm_invsqrt_pd(rsq13);
265 rinv21 = gmx_mm_invsqrt_pd(rsq21);
266 rinv22 = gmx_mm_invsqrt_pd(rsq22);
267 rinv23 = gmx_mm_invsqrt_pd(rsq23);
268 rinv31 = gmx_mm_invsqrt_pd(rsq31);
269 rinv32 = gmx_mm_invsqrt_pd(rsq32);
270 rinv33 = gmx_mm_invsqrt_pd(rsq33);
272 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
273 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
274 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
275 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
276 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
277 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
278 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
279 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
280 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
282 fjx0 = _mm_setzero_pd();
283 fjy0 = _mm_setzero_pd();
284 fjz0 = _mm_setzero_pd();
285 fjx1 = _mm_setzero_pd();
286 fjy1 = _mm_setzero_pd();
287 fjz1 = _mm_setzero_pd();
288 fjx2 = _mm_setzero_pd();
289 fjy2 = _mm_setzero_pd();
290 fjz2 = _mm_setzero_pd();
291 fjx3 = _mm_setzero_pd();
292 fjy3 = _mm_setzero_pd();
293 fjz3 = _mm_setzero_pd();
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 r00 = _mm_mul_pd(rsq00,rinv00);
301 /* Calculate table index by multiplying r with table scale and truncate to integer */
302 rt = _mm_mul_pd(r00,vftabscale);
303 vfitab = _mm_cvttpd_epi32(rt);
304 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
305 vfitab = _mm_slli_epi32(vfitab,3);
307 /* CUBIC SPLINE TABLE DISPERSION */
308 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
309 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
310 GMX_MM_TRANSPOSE2_PD(Y,F);
311 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
312 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
313 GMX_MM_TRANSPOSE2_PD(G,H);
314 Heps = _mm_mul_pd(vfeps,H);
315 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
316 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
317 vvdw6 = _mm_mul_pd(c6_00,VV);
318 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
319 fvdw6 = _mm_mul_pd(c6_00,FF);
321 /* CUBIC SPLINE TABLE REPULSION */
322 vfitab = _mm_add_epi32(vfitab,ifour);
323 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
324 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
325 GMX_MM_TRANSPOSE2_PD(Y,F);
326 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
327 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
328 GMX_MM_TRANSPOSE2_PD(G,H);
329 Heps = _mm_mul_pd(vfeps,H);
330 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
331 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
332 vvdw12 = _mm_mul_pd(c12_00,VV);
333 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
334 fvdw12 = _mm_mul_pd(c12_00,FF);
335 vvdw = _mm_add_pd(vvdw12,vvdw6);
336 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
338 /* Update potential sum for this i atom from the interaction with this j atom. */
339 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
343 /* Calculate temporary vectorial force */
344 tx = _mm_mul_pd(fscal,dx00);
345 ty = _mm_mul_pd(fscal,dy00);
346 tz = _mm_mul_pd(fscal,dz00);
348 /* Update vectorial force */
349 fix0 = _mm_add_pd(fix0,tx);
350 fiy0 = _mm_add_pd(fiy0,ty);
351 fiz0 = _mm_add_pd(fiz0,tz);
353 fjx0 = _mm_add_pd(fjx0,tx);
354 fjy0 = _mm_add_pd(fjy0,ty);
355 fjz0 = _mm_add_pd(fjz0,tz);
357 /**************************
358 * CALCULATE INTERACTIONS *
359 **************************/
361 if (gmx_mm_any_lt(rsq11,rcutoff2))
364 /* REACTION-FIELD ELECTROSTATICS */
365 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
366 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
368 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
370 /* Update potential sum for this i atom from the interaction with this j atom. */
371 velec = _mm_and_pd(velec,cutoff_mask);
372 velecsum = _mm_add_pd(velecsum,velec);
376 fscal = _mm_and_pd(fscal,cutoff_mask);
378 /* Calculate temporary vectorial force */
379 tx = _mm_mul_pd(fscal,dx11);
380 ty = _mm_mul_pd(fscal,dy11);
381 tz = _mm_mul_pd(fscal,dz11);
383 /* Update vectorial force */
384 fix1 = _mm_add_pd(fix1,tx);
385 fiy1 = _mm_add_pd(fiy1,ty);
386 fiz1 = _mm_add_pd(fiz1,tz);
388 fjx1 = _mm_add_pd(fjx1,tx);
389 fjy1 = _mm_add_pd(fjy1,ty);
390 fjz1 = _mm_add_pd(fjz1,tz);
394 /**************************
395 * CALCULATE INTERACTIONS *
396 **************************/
398 if (gmx_mm_any_lt(rsq12,rcutoff2))
401 /* REACTION-FIELD ELECTROSTATICS */
402 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
403 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
405 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
407 /* Update potential sum for this i atom from the interaction with this j atom. */
408 velec = _mm_and_pd(velec,cutoff_mask);
409 velecsum = _mm_add_pd(velecsum,velec);
413 fscal = _mm_and_pd(fscal,cutoff_mask);
415 /* Calculate temporary vectorial force */
416 tx = _mm_mul_pd(fscal,dx12);
417 ty = _mm_mul_pd(fscal,dy12);
418 tz = _mm_mul_pd(fscal,dz12);
420 /* Update vectorial force */
421 fix1 = _mm_add_pd(fix1,tx);
422 fiy1 = _mm_add_pd(fiy1,ty);
423 fiz1 = _mm_add_pd(fiz1,tz);
425 fjx2 = _mm_add_pd(fjx2,tx);
426 fjy2 = _mm_add_pd(fjy2,ty);
427 fjz2 = _mm_add_pd(fjz2,tz);
431 /**************************
432 * CALCULATE INTERACTIONS *
433 **************************/
435 if (gmx_mm_any_lt(rsq13,rcutoff2))
438 /* REACTION-FIELD ELECTROSTATICS */
439 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
440 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
442 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
444 /* Update potential sum for this i atom from the interaction with this j atom. */
445 velec = _mm_and_pd(velec,cutoff_mask);
446 velecsum = _mm_add_pd(velecsum,velec);
450 fscal = _mm_and_pd(fscal,cutoff_mask);
452 /* Calculate temporary vectorial force */
453 tx = _mm_mul_pd(fscal,dx13);
454 ty = _mm_mul_pd(fscal,dy13);
455 tz = _mm_mul_pd(fscal,dz13);
457 /* Update vectorial force */
458 fix1 = _mm_add_pd(fix1,tx);
459 fiy1 = _mm_add_pd(fiy1,ty);
460 fiz1 = _mm_add_pd(fiz1,tz);
462 fjx3 = _mm_add_pd(fjx3,tx);
463 fjy3 = _mm_add_pd(fjy3,ty);
464 fjz3 = _mm_add_pd(fjz3,tz);
468 /**************************
469 * CALCULATE INTERACTIONS *
470 **************************/
472 if (gmx_mm_any_lt(rsq21,rcutoff2))
475 /* REACTION-FIELD ELECTROSTATICS */
476 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
477 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
479 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
481 /* Update potential sum for this i atom from the interaction with this j atom. */
482 velec = _mm_and_pd(velec,cutoff_mask);
483 velecsum = _mm_add_pd(velecsum,velec);
487 fscal = _mm_and_pd(fscal,cutoff_mask);
489 /* Calculate temporary vectorial force */
490 tx = _mm_mul_pd(fscal,dx21);
491 ty = _mm_mul_pd(fscal,dy21);
492 tz = _mm_mul_pd(fscal,dz21);
494 /* Update vectorial force */
495 fix2 = _mm_add_pd(fix2,tx);
496 fiy2 = _mm_add_pd(fiy2,ty);
497 fiz2 = _mm_add_pd(fiz2,tz);
499 fjx1 = _mm_add_pd(fjx1,tx);
500 fjy1 = _mm_add_pd(fjy1,ty);
501 fjz1 = _mm_add_pd(fjz1,tz);
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 if (gmx_mm_any_lt(rsq22,rcutoff2))
512 /* REACTION-FIELD ELECTROSTATICS */
513 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
514 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
516 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
518 /* Update potential sum for this i atom from the interaction with this j atom. */
519 velec = _mm_and_pd(velec,cutoff_mask);
520 velecsum = _mm_add_pd(velecsum,velec);
524 fscal = _mm_and_pd(fscal,cutoff_mask);
526 /* Calculate temporary vectorial force */
527 tx = _mm_mul_pd(fscal,dx22);
528 ty = _mm_mul_pd(fscal,dy22);
529 tz = _mm_mul_pd(fscal,dz22);
531 /* Update vectorial force */
532 fix2 = _mm_add_pd(fix2,tx);
533 fiy2 = _mm_add_pd(fiy2,ty);
534 fiz2 = _mm_add_pd(fiz2,tz);
536 fjx2 = _mm_add_pd(fjx2,tx);
537 fjy2 = _mm_add_pd(fjy2,ty);
538 fjz2 = _mm_add_pd(fjz2,tz);
542 /**************************
543 * CALCULATE INTERACTIONS *
544 **************************/
546 if (gmx_mm_any_lt(rsq23,rcutoff2))
549 /* REACTION-FIELD ELECTROSTATICS */
550 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
551 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
553 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
555 /* Update potential sum for this i atom from the interaction with this j atom. */
556 velec = _mm_and_pd(velec,cutoff_mask);
557 velecsum = _mm_add_pd(velecsum,velec);
561 fscal = _mm_and_pd(fscal,cutoff_mask);
563 /* Calculate temporary vectorial force */
564 tx = _mm_mul_pd(fscal,dx23);
565 ty = _mm_mul_pd(fscal,dy23);
566 tz = _mm_mul_pd(fscal,dz23);
568 /* Update vectorial force */
569 fix2 = _mm_add_pd(fix2,tx);
570 fiy2 = _mm_add_pd(fiy2,ty);
571 fiz2 = _mm_add_pd(fiz2,tz);
573 fjx3 = _mm_add_pd(fjx3,tx);
574 fjy3 = _mm_add_pd(fjy3,ty);
575 fjz3 = _mm_add_pd(fjz3,tz);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 if (gmx_mm_any_lt(rsq31,rcutoff2))
586 /* REACTION-FIELD ELECTROSTATICS */
587 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
588 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
590 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
592 /* Update potential sum for this i atom from the interaction with this j atom. */
593 velec = _mm_and_pd(velec,cutoff_mask);
594 velecsum = _mm_add_pd(velecsum,velec);
598 fscal = _mm_and_pd(fscal,cutoff_mask);
600 /* Calculate temporary vectorial force */
601 tx = _mm_mul_pd(fscal,dx31);
602 ty = _mm_mul_pd(fscal,dy31);
603 tz = _mm_mul_pd(fscal,dz31);
605 /* Update vectorial force */
606 fix3 = _mm_add_pd(fix3,tx);
607 fiy3 = _mm_add_pd(fiy3,ty);
608 fiz3 = _mm_add_pd(fiz3,tz);
610 fjx1 = _mm_add_pd(fjx1,tx);
611 fjy1 = _mm_add_pd(fjy1,ty);
612 fjz1 = _mm_add_pd(fjz1,tz);
616 /**************************
617 * CALCULATE INTERACTIONS *
618 **************************/
620 if (gmx_mm_any_lt(rsq32,rcutoff2))
623 /* REACTION-FIELD ELECTROSTATICS */
624 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
625 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
627 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
629 /* Update potential sum for this i atom from the interaction with this j atom. */
630 velec = _mm_and_pd(velec,cutoff_mask);
631 velecsum = _mm_add_pd(velecsum,velec);
635 fscal = _mm_and_pd(fscal,cutoff_mask);
637 /* Calculate temporary vectorial force */
638 tx = _mm_mul_pd(fscal,dx32);
639 ty = _mm_mul_pd(fscal,dy32);
640 tz = _mm_mul_pd(fscal,dz32);
642 /* Update vectorial force */
643 fix3 = _mm_add_pd(fix3,tx);
644 fiy3 = _mm_add_pd(fiy3,ty);
645 fiz3 = _mm_add_pd(fiz3,tz);
647 fjx2 = _mm_add_pd(fjx2,tx);
648 fjy2 = _mm_add_pd(fjy2,ty);
649 fjz2 = _mm_add_pd(fjz2,tz);
653 /**************************
654 * CALCULATE INTERACTIONS *
655 **************************/
657 if (gmx_mm_any_lt(rsq33,rcutoff2))
660 /* REACTION-FIELD ELECTROSTATICS */
661 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
662 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
664 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
666 /* Update potential sum for this i atom from the interaction with this j atom. */
667 velec = _mm_and_pd(velec,cutoff_mask);
668 velecsum = _mm_add_pd(velecsum,velec);
672 fscal = _mm_and_pd(fscal,cutoff_mask);
674 /* Calculate temporary vectorial force */
675 tx = _mm_mul_pd(fscal,dx33);
676 ty = _mm_mul_pd(fscal,dy33);
677 tz = _mm_mul_pd(fscal,dz33);
679 /* Update vectorial force */
680 fix3 = _mm_add_pd(fix3,tx);
681 fiy3 = _mm_add_pd(fiy3,ty);
682 fiz3 = _mm_add_pd(fiz3,tz);
684 fjx3 = _mm_add_pd(fjx3,tx);
685 fjy3 = _mm_add_pd(fjy3,ty);
686 fjz3 = _mm_add_pd(fjz3,tz);
690 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
692 /* Inner loop uses 383 flops */
699 j_coord_offsetA = DIM*jnrA;
701 /* load j atom coordinates */
702 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
703 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
704 &jy2,&jz2,&jx3,&jy3,&jz3);
706 /* Calculate displacement vector */
707 dx00 = _mm_sub_pd(ix0,jx0);
708 dy00 = _mm_sub_pd(iy0,jy0);
709 dz00 = _mm_sub_pd(iz0,jz0);
710 dx11 = _mm_sub_pd(ix1,jx1);
711 dy11 = _mm_sub_pd(iy1,jy1);
712 dz11 = _mm_sub_pd(iz1,jz1);
713 dx12 = _mm_sub_pd(ix1,jx2);
714 dy12 = _mm_sub_pd(iy1,jy2);
715 dz12 = _mm_sub_pd(iz1,jz2);
716 dx13 = _mm_sub_pd(ix1,jx3);
717 dy13 = _mm_sub_pd(iy1,jy3);
718 dz13 = _mm_sub_pd(iz1,jz3);
719 dx21 = _mm_sub_pd(ix2,jx1);
720 dy21 = _mm_sub_pd(iy2,jy1);
721 dz21 = _mm_sub_pd(iz2,jz1);
722 dx22 = _mm_sub_pd(ix2,jx2);
723 dy22 = _mm_sub_pd(iy2,jy2);
724 dz22 = _mm_sub_pd(iz2,jz2);
725 dx23 = _mm_sub_pd(ix2,jx3);
726 dy23 = _mm_sub_pd(iy2,jy3);
727 dz23 = _mm_sub_pd(iz2,jz3);
728 dx31 = _mm_sub_pd(ix3,jx1);
729 dy31 = _mm_sub_pd(iy3,jy1);
730 dz31 = _mm_sub_pd(iz3,jz1);
731 dx32 = _mm_sub_pd(ix3,jx2);
732 dy32 = _mm_sub_pd(iy3,jy2);
733 dz32 = _mm_sub_pd(iz3,jz2);
734 dx33 = _mm_sub_pd(ix3,jx3);
735 dy33 = _mm_sub_pd(iy3,jy3);
736 dz33 = _mm_sub_pd(iz3,jz3);
738 /* Calculate squared distance and things based on it */
739 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
740 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
741 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
742 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
743 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
744 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
745 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
746 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
747 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
748 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
750 rinv00 = gmx_mm_invsqrt_pd(rsq00);
751 rinv11 = gmx_mm_invsqrt_pd(rsq11);
752 rinv12 = gmx_mm_invsqrt_pd(rsq12);
753 rinv13 = gmx_mm_invsqrt_pd(rsq13);
754 rinv21 = gmx_mm_invsqrt_pd(rsq21);
755 rinv22 = gmx_mm_invsqrt_pd(rsq22);
756 rinv23 = gmx_mm_invsqrt_pd(rsq23);
757 rinv31 = gmx_mm_invsqrt_pd(rsq31);
758 rinv32 = gmx_mm_invsqrt_pd(rsq32);
759 rinv33 = gmx_mm_invsqrt_pd(rsq33);
761 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
762 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
763 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
764 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
765 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
766 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
767 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
768 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
769 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
771 fjx0 = _mm_setzero_pd();
772 fjy0 = _mm_setzero_pd();
773 fjz0 = _mm_setzero_pd();
774 fjx1 = _mm_setzero_pd();
775 fjy1 = _mm_setzero_pd();
776 fjz1 = _mm_setzero_pd();
777 fjx2 = _mm_setzero_pd();
778 fjy2 = _mm_setzero_pd();
779 fjz2 = _mm_setzero_pd();
780 fjx3 = _mm_setzero_pd();
781 fjy3 = _mm_setzero_pd();
782 fjz3 = _mm_setzero_pd();
784 /**************************
785 * CALCULATE INTERACTIONS *
786 **************************/
788 r00 = _mm_mul_pd(rsq00,rinv00);
790 /* Calculate table index by multiplying r with table scale and truncate to integer */
791 rt = _mm_mul_pd(r00,vftabscale);
792 vfitab = _mm_cvttpd_epi32(rt);
793 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
794 vfitab = _mm_slli_epi32(vfitab,3);
796 /* CUBIC SPLINE TABLE DISPERSION */
797 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
798 F = _mm_setzero_pd();
799 GMX_MM_TRANSPOSE2_PD(Y,F);
800 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
801 H = _mm_setzero_pd();
802 GMX_MM_TRANSPOSE2_PD(G,H);
803 Heps = _mm_mul_pd(vfeps,H);
804 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
805 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
806 vvdw6 = _mm_mul_pd(c6_00,VV);
807 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
808 fvdw6 = _mm_mul_pd(c6_00,FF);
810 /* CUBIC SPLINE TABLE REPULSION */
811 vfitab = _mm_add_epi32(vfitab,ifour);
812 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
813 F = _mm_setzero_pd();
814 GMX_MM_TRANSPOSE2_PD(Y,F);
815 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
816 H = _mm_setzero_pd();
817 GMX_MM_TRANSPOSE2_PD(G,H);
818 Heps = _mm_mul_pd(vfeps,H);
819 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
820 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
821 vvdw12 = _mm_mul_pd(c12_00,VV);
822 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
823 fvdw12 = _mm_mul_pd(c12_00,FF);
824 vvdw = _mm_add_pd(vvdw12,vvdw6);
825 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
827 /* Update potential sum for this i atom from the interaction with this j atom. */
828 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
829 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
833 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
835 /* Calculate temporary vectorial force */
836 tx = _mm_mul_pd(fscal,dx00);
837 ty = _mm_mul_pd(fscal,dy00);
838 tz = _mm_mul_pd(fscal,dz00);
840 /* Update vectorial force */
841 fix0 = _mm_add_pd(fix0,tx);
842 fiy0 = _mm_add_pd(fiy0,ty);
843 fiz0 = _mm_add_pd(fiz0,tz);
845 fjx0 = _mm_add_pd(fjx0,tx);
846 fjy0 = _mm_add_pd(fjy0,ty);
847 fjz0 = _mm_add_pd(fjz0,tz);
849 /**************************
850 * CALCULATE INTERACTIONS *
851 **************************/
853 if (gmx_mm_any_lt(rsq11,rcutoff2))
856 /* REACTION-FIELD ELECTROSTATICS */
857 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
858 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
860 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
862 /* Update potential sum for this i atom from the interaction with this j atom. */
863 velec = _mm_and_pd(velec,cutoff_mask);
864 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
865 velecsum = _mm_add_pd(velecsum,velec);
869 fscal = _mm_and_pd(fscal,cutoff_mask);
871 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
873 /* Calculate temporary vectorial force */
874 tx = _mm_mul_pd(fscal,dx11);
875 ty = _mm_mul_pd(fscal,dy11);
876 tz = _mm_mul_pd(fscal,dz11);
878 /* Update vectorial force */
879 fix1 = _mm_add_pd(fix1,tx);
880 fiy1 = _mm_add_pd(fiy1,ty);
881 fiz1 = _mm_add_pd(fiz1,tz);
883 fjx1 = _mm_add_pd(fjx1,tx);
884 fjy1 = _mm_add_pd(fjy1,ty);
885 fjz1 = _mm_add_pd(fjz1,tz);
889 /**************************
890 * CALCULATE INTERACTIONS *
891 **************************/
893 if (gmx_mm_any_lt(rsq12,rcutoff2))
896 /* REACTION-FIELD ELECTROSTATICS */
897 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
898 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
900 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
902 /* Update potential sum for this i atom from the interaction with this j atom. */
903 velec = _mm_and_pd(velec,cutoff_mask);
904 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
905 velecsum = _mm_add_pd(velecsum,velec);
909 fscal = _mm_and_pd(fscal,cutoff_mask);
911 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
913 /* Calculate temporary vectorial force */
914 tx = _mm_mul_pd(fscal,dx12);
915 ty = _mm_mul_pd(fscal,dy12);
916 tz = _mm_mul_pd(fscal,dz12);
918 /* Update vectorial force */
919 fix1 = _mm_add_pd(fix1,tx);
920 fiy1 = _mm_add_pd(fiy1,ty);
921 fiz1 = _mm_add_pd(fiz1,tz);
923 fjx2 = _mm_add_pd(fjx2,tx);
924 fjy2 = _mm_add_pd(fjy2,ty);
925 fjz2 = _mm_add_pd(fjz2,tz);
929 /**************************
930 * CALCULATE INTERACTIONS *
931 **************************/
933 if (gmx_mm_any_lt(rsq13,rcutoff2))
936 /* REACTION-FIELD ELECTROSTATICS */
937 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
938 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
940 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
942 /* Update potential sum for this i atom from the interaction with this j atom. */
943 velec = _mm_and_pd(velec,cutoff_mask);
944 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
945 velecsum = _mm_add_pd(velecsum,velec);
949 fscal = _mm_and_pd(fscal,cutoff_mask);
951 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
953 /* Calculate temporary vectorial force */
954 tx = _mm_mul_pd(fscal,dx13);
955 ty = _mm_mul_pd(fscal,dy13);
956 tz = _mm_mul_pd(fscal,dz13);
958 /* Update vectorial force */
959 fix1 = _mm_add_pd(fix1,tx);
960 fiy1 = _mm_add_pd(fiy1,ty);
961 fiz1 = _mm_add_pd(fiz1,tz);
963 fjx3 = _mm_add_pd(fjx3,tx);
964 fjy3 = _mm_add_pd(fjy3,ty);
965 fjz3 = _mm_add_pd(fjz3,tz);
969 /**************************
970 * CALCULATE INTERACTIONS *
971 **************************/
973 if (gmx_mm_any_lt(rsq21,rcutoff2))
976 /* REACTION-FIELD ELECTROSTATICS */
977 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
978 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
980 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
982 /* Update potential sum for this i atom from the interaction with this j atom. */
983 velec = _mm_and_pd(velec,cutoff_mask);
984 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
985 velecsum = _mm_add_pd(velecsum,velec);
989 fscal = _mm_and_pd(fscal,cutoff_mask);
991 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
993 /* Calculate temporary vectorial force */
994 tx = _mm_mul_pd(fscal,dx21);
995 ty = _mm_mul_pd(fscal,dy21);
996 tz = _mm_mul_pd(fscal,dz21);
998 /* Update vectorial force */
999 fix2 = _mm_add_pd(fix2,tx);
1000 fiy2 = _mm_add_pd(fiy2,ty);
1001 fiz2 = _mm_add_pd(fiz2,tz);
1003 fjx1 = _mm_add_pd(fjx1,tx);
1004 fjy1 = _mm_add_pd(fjy1,ty);
1005 fjz1 = _mm_add_pd(fjz1,tz);
1009 /**************************
1010 * CALCULATE INTERACTIONS *
1011 **************************/
1013 if (gmx_mm_any_lt(rsq22,rcutoff2))
1016 /* REACTION-FIELD ELECTROSTATICS */
1017 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
1018 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1020 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1022 /* Update potential sum for this i atom from the interaction with this j atom. */
1023 velec = _mm_and_pd(velec,cutoff_mask);
1024 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1025 velecsum = _mm_add_pd(velecsum,velec);
1029 fscal = _mm_and_pd(fscal,cutoff_mask);
1031 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1033 /* Calculate temporary vectorial force */
1034 tx = _mm_mul_pd(fscal,dx22);
1035 ty = _mm_mul_pd(fscal,dy22);
1036 tz = _mm_mul_pd(fscal,dz22);
1038 /* Update vectorial force */
1039 fix2 = _mm_add_pd(fix2,tx);
1040 fiy2 = _mm_add_pd(fiy2,ty);
1041 fiz2 = _mm_add_pd(fiz2,tz);
1043 fjx2 = _mm_add_pd(fjx2,tx);
1044 fjy2 = _mm_add_pd(fjy2,ty);
1045 fjz2 = _mm_add_pd(fjz2,tz);
1049 /**************************
1050 * CALCULATE INTERACTIONS *
1051 **************************/
1053 if (gmx_mm_any_lt(rsq23,rcutoff2))
1056 /* REACTION-FIELD ELECTROSTATICS */
1057 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
1058 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1060 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1062 /* Update potential sum for this i atom from the interaction with this j atom. */
1063 velec = _mm_and_pd(velec,cutoff_mask);
1064 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1065 velecsum = _mm_add_pd(velecsum,velec);
1069 fscal = _mm_and_pd(fscal,cutoff_mask);
1071 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1073 /* Calculate temporary vectorial force */
1074 tx = _mm_mul_pd(fscal,dx23);
1075 ty = _mm_mul_pd(fscal,dy23);
1076 tz = _mm_mul_pd(fscal,dz23);
1078 /* Update vectorial force */
1079 fix2 = _mm_add_pd(fix2,tx);
1080 fiy2 = _mm_add_pd(fiy2,ty);
1081 fiz2 = _mm_add_pd(fiz2,tz);
1083 fjx3 = _mm_add_pd(fjx3,tx);
1084 fjy3 = _mm_add_pd(fjy3,ty);
1085 fjz3 = _mm_add_pd(fjz3,tz);
1089 /**************************
1090 * CALCULATE INTERACTIONS *
1091 **************************/
1093 if (gmx_mm_any_lt(rsq31,rcutoff2))
1096 /* REACTION-FIELD ELECTROSTATICS */
1097 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
1098 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1100 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1102 /* Update potential sum for this i atom from the interaction with this j atom. */
1103 velec = _mm_and_pd(velec,cutoff_mask);
1104 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1105 velecsum = _mm_add_pd(velecsum,velec);
1109 fscal = _mm_and_pd(fscal,cutoff_mask);
1111 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1113 /* Calculate temporary vectorial force */
1114 tx = _mm_mul_pd(fscal,dx31);
1115 ty = _mm_mul_pd(fscal,dy31);
1116 tz = _mm_mul_pd(fscal,dz31);
1118 /* Update vectorial force */
1119 fix3 = _mm_add_pd(fix3,tx);
1120 fiy3 = _mm_add_pd(fiy3,ty);
1121 fiz3 = _mm_add_pd(fiz3,tz);
1123 fjx1 = _mm_add_pd(fjx1,tx);
1124 fjy1 = _mm_add_pd(fjy1,ty);
1125 fjz1 = _mm_add_pd(fjz1,tz);
1129 /**************************
1130 * CALCULATE INTERACTIONS *
1131 **************************/
1133 if (gmx_mm_any_lt(rsq32,rcutoff2))
1136 /* REACTION-FIELD ELECTROSTATICS */
1137 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
1138 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1140 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1142 /* Update potential sum for this i atom from the interaction with this j atom. */
1143 velec = _mm_and_pd(velec,cutoff_mask);
1144 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1145 velecsum = _mm_add_pd(velecsum,velec);
1149 fscal = _mm_and_pd(fscal,cutoff_mask);
1151 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1153 /* Calculate temporary vectorial force */
1154 tx = _mm_mul_pd(fscal,dx32);
1155 ty = _mm_mul_pd(fscal,dy32);
1156 tz = _mm_mul_pd(fscal,dz32);
1158 /* Update vectorial force */
1159 fix3 = _mm_add_pd(fix3,tx);
1160 fiy3 = _mm_add_pd(fiy3,ty);
1161 fiz3 = _mm_add_pd(fiz3,tz);
1163 fjx2 = _mm_add_pd(fjx2,tx);
1164 fjy2 = _mm_add_pd(fjy2,ty);
1165 fjz2 = _mm_add_pd(fjz2,tz);
1169 /**************************
1170 * CALCULATE INTERACTIONS *
1171 **************************/
1173 if (gmx_mm_any_lt(rsq33,rcutoff2))
1176 /* REACTION-FIELD ELECTROSTATICS */
1177 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
1178 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1180 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1182 /* Update potential sum for this i atom from the interaction with this j atom. */
1183 velec = _mm_and_pd(velec,cutoff_mask);
1184 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1185 velecsum = _mm_add_pd(velecsum,velec);
1189 fscal = _mm_and_pd(fscal,cutoff_mask);
1191 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1193 /* Calculate temporary vectorial force */
1194 tx = _mm_mul_pd(fscal,dx33);
1195 ty = _mm_mul_pd(fscal,dy33);
1196 tz = _mm_mul_pd(fscal,dz33);
1198 /* Update vectorial force */
1199 fix3 = _mm_add_pd(fix3,tx);
1200 fiy3 = _mm_add_pd(fiy3,ty);
1201 fiz3 = _mm_add_pd(fiz3,tz);
1203 fjx3 = _mm_add_pd(fjx3,tx);
1204 fjy3 = _mm_add_pd(fjy3,ty);
1205 fjz3 = _mm_add_pd(fjz3,tz);
1209 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1211 /* Inner loop uses 383 flops */
1214 /* End of innermost loop */
1216 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1217 f+i_coord_offset,fshift+i_shift_offset);
1220 /* Update potential energies */
1221 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1222 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1224 /* Increment number of inner iterations */
1225 inneriter += j_index_end - j_index_start;
1227 /* Outer loop uses 26 flops */
1230 /* Increment number of outer iterations */
1233 /* Update outer/inner flops */
1235 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*383);
1238 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_double
1239 * Electrostatics interaction: ReactionField
1240 * VdW interaction: CubicSplineTable
1241 * Geometry: Water4-Water4
1242 * Calculate force/pot: Force
1245 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_double
1246 (t_nblist * gmx_restrict nlist,
1247 rvec * gmx_restrict xx,
1248 rvec * gmx_restrict ff,
1249 t_forcerec * gmx_restrict fr,
1250 t_mdatoms * gmx_restrict mdatoms,
1251 nb_kernel_data_t * gmx_restrict kernel_data,
1252 t_nrnb * gmx_restrict nrnb)
1254 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1255 * just 0 for non-waters.
1256 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1257 * jnr indices corresponding to data put in the four positions in the SIMD register.
1259 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1260 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1262 int j_coord_offsetA,j_coord_offsetB;
1263 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1264 real rcutoff_scalar;
1265 real *shiftvec,*fshift,*x,*f;
1266 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1268 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1270 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1272 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1274 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1275 int vdwjidx0A,vdwjidx0B;
1276 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1277 int vdwjidx1A,vdwjidx1B;
1278 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1279 int vdwjidx2A,vdwjidx2B;
1280 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1281 int vdwjidx3A,vdwjidx3B;
1282 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1283 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1284 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1285 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1286 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1287 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1288 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1289 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1290 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1291 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1292 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1293 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1296 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1299 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1300 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1302 __m128i ifour = _mm_set1_epi32(4);
1303 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1305 __m128d dummy_mask,cutoff_mask;
1306 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1307 __m128d one = _mm_set1_pd(1.0);
1308 __m128d two = _mm_set1_pd(2.0);
1314 jindex = nlist->jindex;
1316 shiftidx = nlist->shift;
1318 shiftvec = fr->shift_vec[0];
1319 fshift = fr->fshift[0];
1320 facel = _mm_set1_pd(fr->epsfac);
1321 charge = mdatoms->chargeA;
1322 krf = _mm_set1_pd(fr->ic->k_rf);
1323 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1324 crf = _mm_set1_pd(fr->ic->c_rf);
1325 nvdwtype = fr->ntype;
1326 vdwparam = fr->nbfp;
1327 vdwtype = mdatoms->typeA;
1329 vftab = kernel_data->table_vdw->data;
1330 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1332 /* Setup water-specific parameters */
1333 inr = nlist->iinr[0];
1334 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1335 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1336 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1337 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1339 jq1 = _mm_set1_pd(charge[inr+1]);
1340 jq2 = _mm_set1_pd(charge[inr+2]);
1341 jq3 = _mm_set1_pd(charge[inr+3]);
1342 vdwjidx0A = 2*vdwtype[inr+0];
1343 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1344 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1345 qq11 = _mm_mul_pd(iq1,jq1);
1346 qq12 = _mm_mul_pd(iq1,jq2);
1347 qq13 = _mm_mul_pd(iq1,jq3);
1348 qq21 = _mm_mul_pd(iq2,jq1);
1349 qq22 = _mm_mul_pd(iq2,jq2);
1350 qq23 = _mm_mul_pd(iq2,jq3);
1351 qq31 = _mm_mul_pd(iq3,jq1);
1352 qq32 = _mm_mul_pd(iq3,jq2);
1353 qq33 = _mm_mul_pd(iq3,jq3);
1355 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1356 rcutoff_scalar = fr->rcoulomb;
1357 rcutoff = _mm_set1_pd(rcutoff_scalar);
1358 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1360 /* Avoid stupid compiler warnings */
1362 j_coord_offsetA = 0;
1363 j_coord_offsetB = 0;
1368 /* Start outer loop over neighborlists */
1369 for(iidx=0; iidx<nri; iidx++)
1371 /* Load shift vector for this list */
1372 i_shift_offset = DIM*shiftidx[iidx];
1374 /* Load limits for loop over neighbors */
1375 j_index_start = jindex[iidx];
1376 j_index_end = jindex[iidx+1];
1378 /* Get outer coordinate index */
1380 i_coord_offset = DIM*inr;
1382 /* Load i particle coords and add shift vector */
1383 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1384 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1386 fix0 = _mm_setzero_pd();
1387 fiy0 = _mm_setzero_pd();
1388 fiz0 = _mm_setzero_pd();
1389 fix1 = _mm_setzero_pd();
1390 fiy1 = _mm_setzero_pd();
1391 fiz1 = _mm_setzero_pd();
1392 fix2 = _mm_setzero_pd();
1393 fiy2 = _mm_setzero_pd();
1394 fiz2 = _mm_setzero_pd();
1395 fix3 = _mm_setzero_pd();
1396 fiy3 = _mm_setzero_pd();
1397 fiz3 = _mm_setzero_pd();
1399 /* Start inner kernel loop */
1400 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1403 /* Get j neighbor index, and coordinate index */
1405 jnrB = jjnr[jidx+1];
1406 j_coord_offsetA = DIM*jnrA;
1407 j_coord_offsetB = DIM*jnrB;
1409 /* load j atom coordinates */
1410 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1411 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1412 &jy2,&jz2,&jx3,&jy3,&jz3);
1414 /* Calculate displacement vector */
1415 dx00 = _mm_sub_pd(ix0,jx0);
1416 dy00 = _mm_sub_pd(iy0,jy0);
1417 dz00 = _mm_sub_pd(iz0,jz0);
1418 dx11 = _mm_sub_pd(ix1,jx1);
1419 dy11 = _mm_sub_pd(iy1,jy1);
1420 dz11 = _mm_sub_pd(iz1,jz1);
1421 dx12 = _mm_sub_pd(ix1,jx2);
1422 dy12 = _mm_sub_pd(iy1,jy2);
1423 dz12 = _mm_sub_pd(iz1,jz2);
1424 dx13 = _mm_sub_pd(ix1,jx3);
1425 dy13 = _mm_sub_pd(iy1,jy3);
1426 dz13 = _mm_sub_pd(iz1,jz3);
1427 dx21 = _mm_sub_pd(ix2,jx1);
1428 dy21 = _mm_sub_pd(iy2,jy1);
1429 dz21 = _mm_sub_pd(iz2,jz1);
1430 dx22 = _mm_sub_pd(ix2,jx2);
1431 dy22 = _mm_sub_pd(iy2,jy2);
1432 dz22 = _mm_sub_pd(iz2,jz2);
1433 dx23 = _mm_sub_pd(ix2,jx3);
1434 dy23 = _mm_sub_pd(iy2,jy3);
1435 dz23 = _mm_sub_pd(iz2,jz3);
1436 dx31 = _mm_sub_pd(ix3,jx1);
1437 dy31 = _mm_sub_pd(iy3,jy1);
1438 dz31 = _mm_sub_pd(iz3,jz1);
1439 dx32 = _mm_sub_pd(ix3,jx2);
1440 dy32 = _mm_sub_pd(iy3,jy2);
1441 dz32 = _mm_sub_pd(iz3,jz2);
1442 dx33 = _mm_sub_pd(ix3,jx3);
1443 dy33 = _mm_sub_pd(iy3,jy3);
1444 dz33 = _mm_sub_pd(iz3,jz3);
1446 /* Calculate squared distance and things based on it */
1447 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1448 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1449 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1450 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1451 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1452 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1453 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1454 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1455 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1456 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1458 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1459 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1460 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1461 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1462 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1463 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1464 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1465 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1466 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1467 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1469 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1470 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1471 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1472 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1473 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1474 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1475 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1476 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1477 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1479 fjx0 = _mm_setzero_pd();
1480 fjy0 = _mm_setzero_pd();
1481 fjz0 = _mm_setzero_pd();
1482 fjx1 = _mm_setzero_pd();
1483 fjy1 = _mm_setzero_pd();
1484 fjz1 = _mm_setzero_pd();
1485 fjx2 = _mm_setzero_pd();
1486 fjy2 = _mm_setzero_pd();
1487 fjz2 = _mm_setzero_pd();
1488 fjx3 = _mm_setzero_pd();
1489 fjy3 = _mm_setzero_pd();
1490 fjz3 = _mm_setzero_pd();
1492 /**************************
1493 * CALCULATE INTERACTIONS *
1494 **************************/
1496 r00 = _mm_mul_pd(rsq00,rinv00);
1498 /* Calculate table index by multiplying r with table scale and truncate to integer */
1499 rt = _mm_mul_pd(r00,vftabscale);
1500 vfitab = _mm_cvttpd_epi32(rt);
1501 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1502 vfitab = _mm_slli_epi32(vfitab,3);
1504 /* CUBIC SPLINE TABLE DISPERSION */
1505 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1506 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1507 GMX_MM_TRANSPOSE2_PD(Y,F);
1508 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1509 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1510 GMX_MM_TRANSPOSE2_PD(G,H);
1511 Heps = _mm_mul_pd(vfeps,H);
1512 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1513 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1514 fvdw6 = _mm_mul_pd(c6_00,FF);
1516 /* CUBIC SPLINE TABLE REPULSION */
1517 vfitab = _mm_add_epi32(vfitab,ifour);
1518 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1519 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1520 GMX_MM_TRANSPOSE2_PD(Y,F);
1521 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1522 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1523 GMX_MM_TRANSPOSE2_PD(G,H);
1524 Heps = _mm_mul_pd(vfeps,H);
1525 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1526 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1527 fvdw12 = _mm_mul_pd(c12_00,FF);
1528 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1532 /* Calculate temporary vectorial force */
1533 tx = _mm_mul_pd(fscal,dx00);
1534 ty = _mm_mul_pd(fscal,dy00);
1535 tz = _mm_mul_pd(fscal,dz00);
1537 /* Update vectorial force */
1538 fix0 = _mm_add_pd(fix0,tx);
1539 fiy0 = _mm_add_pd(fiy0,ty);
1540 fiz0 = _mm_add_pd(fiz0,tz);
1542 fjx0 = _mm_add_pd(fjx0,tx);
1543 fjy0 = _mm_add_pd(fjy0,ty);
1544 fjz0 = _mm_add_pd(fjz0,tz);
1546 /**************************
1547 * CALCULATE INTERACTIONS *
1548 **************************/
1550 if (gmx_mm_any_lt(rsq11,rcutoff2))
1553 /* REACTION-FIELD ELECTROSTATICS */
1554 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1556 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1560 fscal = _mm_and_pd(fscal,cutoff_mask);
1562 /* Calculate temporary vectorial force */
1563 tx = _mm_mul_pd(fscal,dx11);
1564 ty = _mm_mul_pd(fscal,dy11);
1565 tz = _mm_mul_pd(fscal,dz11);
1567 /* Update vectorial force */
1568 fix1 = _mm_add_pd(fix1,tx);
1569 fiy1 = _mm_add_pd(fiy1,ty);
1570 fiz1 = _mm_add_pd(fiz1,tz);
1572 fjx1 = _mm_add_pd(fjx1,tx);
1573 fjy1 = _mm_add_pd(fjy1,ty);
1574 fjz1 = _mm_add_pd(fjz1,tz);
1578 /**************************
1579 * CALCULATE INTERACTIONS *
1580 **************************/
1582 if (gmx_mm_any_lt(rsq12,rcutoff2))
1585 /* REACTION-FIELD ELECTROSTATICS */
1586 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1588 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1592 fscal = _mm_and_pd(fscal,cutoff_mask);
1594 /* Calculate temporary vectorial force */
1595 tx = _mm_mul_pd(fscal,dx12);
1596 ty = _mm_mul_pd(fscal,dy12);
1597 tz = _mm_mul_pd(fscal,dz12);
1599 /* Update vectorial force */
1600 fix1 = _mm_add_pd(fix1,tx);
1601 fiy1 = _mm_add_pd(fiy1,ty);
1602 fiz1 = _mm_add_pd(fiz1,tz);
1604 fjx2 = _mm_add_pd(fjx2,tx);
1605 fjy2 = _mm_add_pd(fjy2,ty);
1606 fjz2 = _mm_add_pd(fjz2,tz);
1610 /**************************
1611 * CALCULATE INTERACTIONS *
1612 **************************/
1614 if (gmx_mm_any_lt(rsq13,rcutoff2))
1617 /* REACTION-FIELD ELECTROSTATICS */
1618 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
1620 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1624 fscal = _mm_and_pd(fscal,cutoff_mask);
1626 /* Calculate temporary vectorial force */
1627 tx = _mm_mul_pd(fscal,dx13);
1628 ty = _mm_mul_pd(fscal,dy13);
1629 tz = _mm_mul_pd(fscal,dz13);
1631 /* Update vectorial force */
1632 fix1 = _mm_add_pd(fix1,tx);
1633 fiy1 = _mm_add_pd(fiy1,ty);
1634 fiz1 = _mm_add_pd(fiz1,tz);
1636 fjx3 = _mm_add_pd(fjx3,tx);
1637 fjy3 = _mm_add_pd(fjy3,ty);
1638 fjz3 = _mm_add_pd(fjz3,tz);
1642 /**************************
1643 * CALCULATE INTERACTIONS *
1644 **************************/
1646 if (gmx_mm_any_lt(rsq21,rcutoff2))
1649 /* REACTION-FIELD ELECTROSTATICS */
1650 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1652 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1656 fscal = _mm_and_pd(fscal,cutoff_mask);
1658 /* Calculate temporary vectorial force */
1659 tx = _mm_mul_pd(fscal,dx21);
1660 ty = _mm_mul_pd(fscal,dy21);
1661 tz = _mm_mul_pd(fscal,dz21);
1663 /* Update vectorial force */
1664 fix2 = _mm_add_pd(fix2,tx);
1665 fiy2 = _mm_add_pd(fiy2,ty);
1666 fiz2 = _mm_add_pd(fiz2,tz);
1668 fjx1 = _mm_add_pd(fjx1,tx);
1669 fjy1 = _mm_add_pd(fjy1,ty);
1670 fjz1 = _mm_add_pd(fjz1,tz);
1674 /**************************
1675 * CALCULATE INTERACTIONS *
1676 **************************/
1678 if (gmx_mm_any_lt(rsq22,rcutoff2))
1681 /* REACTION-FIELD ELECTROSTATICS */
1682 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1684 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1688 fscal = _mm_and_pd(fscal,cutoff_mask);
1690 /* Calculate temporary vectorial force */
1691 tx = _mm_mul_pd(fscal,dx22);
1692 ty = _mm_mul_pd(fscal,dy22);
1693 tz = _mm_mul_pd(fscal,dz22);
1695 /* Update vectorial force */
1696 fix2 = _mm_add_pd(fix2,tx);
1697 fiy2 = _mm_add_pd(fiy2,ty);
1698 fiz2 = _mm_add_pd(fiz2,tz);
1700 fjx2 = _mm_add_pd(fjx2,tx);
1701 fjy2 = _mm_add_pd(fjy2,ty);
1702 fjz2 = _mm_add_pd(fjz2,tz);
1706 /**************************
1707 * CALCULATE INTERACTIONS *
1708 **************************/
1710 if (gmx_mm_any_lt(rsq23,rcutoff2))
1713 /* REACTION-FIELD ELECTROSTATICS */
1714 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1716 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1720 fscal = _mm_and_pd(fscal,cutoff_mask);
1722 /* Calculate temporary vectorial force */
1723 tx = _mm_mul_pd(fscal,dx23);
1724 ty = _mm_mul_pd(fscal,dy23);
1725 tz = _mm_mul_pd(fscal,dz23);
1727 /* Update vectorial force */
1728 fix2 = _mm_add_pd(fix2,tx);
1729 fiy2 = _mm_add_pd(fiy2,ty);
1730 fiz2 = _mm_add_pd(fiz2,tz);
1732 fjx3 = _mm_add_pd(fjx3,tx);
1733 fjy3 = _mm_add_pd(fjy3,ty);
1734 fjz3 = _mm_add_pd(fjz3,tz);
1738 /**************************
1739 * CALCULATE INTERACTIONS *
1740 **************************/
1742 if (gmx_mm_any_lt(rsq31,rcutoff2))
1745 /* REACTION-FIELD ELECTROSTATICS */
1746 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1748 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1752 fscal = _mm_and_pd(fscal,cutoff_mask);
1754 /* Calculate temporary vectorial force */
1755 tx = _mm_mul_pd(fscal,dx31);
1756 ty = _mm_mul_pd(fscal,dy31);
1757 tz = _mm_mul_pd(fscal,dz31);
1759 /* Update vectorial force */
1760 fix3 = _mm_add_pd(fix3,tx);
1761 fiy3 = _mm_add_pd(fiy3,ty);
1762 fiz3 = _mm_add_pd(fiz3,tz);
1764 fjx1 = _mm_add_pd(fjx1,tx);
1765 fjy1 = _mm_add_pd(fjy1,ty);
1766 fjz1 = _mm_add_pd(fjz1,tz);
1770 /**************************
1771 * CALCULATE INTERACTIONS *
1772 **************************/
1774 if (gmx_mm_any_lt(rsq32,rcutoff2))
1777 /* REACTION-FIELD ELECTROSTATICS */
1778 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1780 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1784 fscal = _mm_and_pd(fscal,cutoff_mask);
1786 /* Calculate temporary vectorial force */
1787 tx = _mm_mul_pd(fscal,dx32);
1788 ty = _mm_mul_pd(fscal,dy32);
1789 tz = _mm_mul_pd(fscal,dz32);
1791 /* Update vectorial force */
1792 fix3 = _mm_add_pd(fix3,tx);
1793 fiy3 = _mm_add_pd(fiy3,ty);
1794 fiz3 = _mm_add_pd(fiz3,tz);
1796 fjx2 = _mm_add_pd(fjx2,tx);
1797 fjy2 = _mm_add_pd(fjy2,ty);
1798 fjz2 = _mm_add_pd(fjz2,tz);
1802 /**************************
1803 * CALCULATE INTERACTIONS *
1804 **************************/
1806 if (gmx_mm_any_lt(rsq33,rcutoff2))
1809 /* REACTION-FIELD ELECTROSTATICS */
1810 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1812 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1816 fscal = _mm_and_pd(fscal,cutoff_mask);
1818 /* Calculate temporary vectorial force */
1819 tx = _mm_mul_pd(fscal,dx33);
1820 ty = _mm_mul_pd(fscal,dy33);
1821 tz = _mm_mul_pd(fscal,dz33);
1823 /* Update vectorial force */
1824 fix3 = _mm_add_pd(fix3,tx);
1825 fiy3 = _mm_add_pd(fiy3,ty);
1826 fiz3 = _mm_add_pd(fiz3,tz);
1828 fjx3 = _mm_add_pd(fjx3,tx);
1829 fjy3 = _mm_add_pd(fjy3,ty);
1830 fjz3 = _mm_add_pd(fjz3,tz);
1834 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1836 /* Inner loop uses 321 flops */
1839 if(jidx<j_index_end)
1843 j_coord_offsetA = DIM*jnrA;
1845 /* load j atom coordinates */
1846 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1847 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1848 &jy2,&jz2,&jx3,&jy3,&jz3);
1850 /* Calculate displacement vector */
1851 dx00 = _mm_sub_pd(ix0,jx0);
1852 dy00 = _mm_sub_pd(iy0,jy0);
1853 dz00 = _mm_sub_pd(iz0,jz0);
1854 dx11 = _mm_sub_pd(ix1,jx1);
1855 dy11 = _mm_sub_pd(iy1,jy1);
1856 dz11 = _mm_sub_pd(iz1,jz1);
1857 dx12 = _mm_sub_pd(ix1,jx2);
1858 dy12 = _mm_sub_pd(iy1,jy2);
1859 dz12 = _mm_sub_pd(iz1,jz2);
1860 dx13 = _mm_sub_pd(ix1,jx3);
1861 dy13 = _mm_sub_pd(iy1,jy3);
1862 dz13 = _mm_sub_pd(iz1,jz3);
1863 dx21 = _mm_sub_pd(ix2,jx1);
1864 dy21 = _mm_sub_pd(iy2,jy1);
1865 dz21 = _mm_sub_pd(iz2,jz1);
1866 dx22 = _mm_sub_pd(ix2,jx2);
1867 dy22 = _mm_sub_pd(iy2,jy2);
1868 dz22 = _mm_sub_pd(iz2,jz2);
1869 dx23 = _mm_sub_pd(ix2,jx3);
1870 dy23 = _mm_sub_pd(iy2,jy3);
1871 dz23 = _mm_sub_pd(iz2,jz3);
1872 dx31 = _mm_sub_pd(ix3,jx1);
1873 dy31 = _mm_sub_pd(iy3,jy1);
1874 dz31 = _mm_sub_pd(iz3,jz1);
1875 dx32 = _mm_sub_pd(ix3,jx2);
1876 dy32 = _mm_sub_pd(iy3,jy2);
1877 dz32 = _mm_sub_pd(iz3,jz2);
1878 dx33 = _mm_sub_pd(ix3,jx3);
1879 dy33 = _mm_sub_pd(iy3,jy3);
1880 dz33 = _mm_sub_pd(iz3,jz3);
1882 /* Calculate squared distance and things based on it */
1883 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1884 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1885 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1886 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1887 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1888 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1889 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1890 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1891 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1892 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1894 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1895 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1896 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1897 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1898 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1899 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1900 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1901 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1902 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1903 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1905 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1906 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1907 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1908 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1909 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1910 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1911 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1912 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1913 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1915 fjx0 = _mm_setzero_pd();
1916 fjy0 = _mm_setzero_pd();
1917 fjz0 = _mm_setzero_pd();
1918 fjx1 = _mm_setzero_pd();
1919 fjy1 = _mm_setzero_pd();
1920 fjz1 = _mm_setzero_pd();
1921 fjx2 = _mm_setzero_pd();
1922 fjy2 = _mm_setzero_pd();
1923 fjz2 = _mm_setzero_pd();
1924 fjx3 = _mm_setzero_pd();
1925 fjy3 = _mm_setzero_pd();
1926 fjz3 = _mm_setzero_pd();
1928 /**************************
1929 * CALCULATE INTERACTIONS *
1930 **************************/
1932 r00 = _mm_mul_pd(rsq00,rinv00);
1934 /* Calculate table index by multiplying r with table scale and truncate to integer */
1935 rt = _mm_mul_pd(r00,vftabscale);
1936 vfitab = _mm_cvttpd_epi32(rt);
1937 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1938 vfitab = _mm_slli_epi32(vfitab,3);
1940 /* CUBIC SPLINE TABLE DISPERSION */
1941 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1942 F = _mm_setzero_pd();
1943 GMX_MM_TRANSPOSE2_PD(Y,F);
1944 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1945 H = _mm_setzero_pd();
1946 GMX_MM_TRANSPOSE2_PD(G,H);
1947 Heps = _mm_mul_pd(vfeps,H);
1948 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1949 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1950 fvdw6 = _mm_mul_pd(c6_00,FF);
1952 /* CUBIC SPLINE TABLE REPULSION */
1953 vfitab = _mm_add_epi32(vfitab,ifour);
1954 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1955 F = _mm_setzero_pd();
1956 GMX_MM_TRANSPOSE2_PD(Y,F);
1957 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1958 H = _mm_setzero_pd();
1959 GMX_MM_TRANSPOSE2_PD(G,H);
1960 Heps = _mm_mul_pd(vfeps,H);
1961 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1962 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1963 fvdw12 = _mm_mul_pd(c12_00,FF);
1964 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1968 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1970 /* Calculate temporary vectorial force */
1971 tx = _mm_mul_pd(fscal,dx00);
1972 ty = _mm_mul_pd(fscal,dy00);
1973 tz = _mm_mul_pd(fscal,dz00);
1975 /* Update vectorial force */
1976 fix0 = _mm_add_pd(fix0,tx);
1977 fiy0 = _mm_add_pd(fiy0,ty);
1978 fiz0 = _mm_add_pd(fiz0,tz);
1980 fjx0 = _mm_add_pd(fjx0,tx);
1981 fjy0 = _mm_add_pd(fjy0,ty);
1982 fjz0 = _mm_add_pd(fjz0,tz);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 if (gmx_mm_any_lt(rsq11,rcutoff2))
1991 /* REACTION-FIELD ELECTROSTATICS */
1992 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1994 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1998 fscal = _mm_and_pd(fscal,cutoff_mask);
2000 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2002 /* Calculate temporary vectorial force */
2003 tx = _mm_mul_pd(fscal,dx11);
2004 ty = _mm_mul_pd(fscal,dy11);
2005 tz = _mm_mul_pd(fscal,dz11);
2007 /* Update vectorial force */
2008 fix1 = _mm_add_pd(fix1,tx);
2009 fiy1 = _mm_add_pd(fiy1,ty);
2010 fiz1 = _mm_add_pd(fiz1,tz);
2012 fjx1 = _mm_add_pd(fjx1,tx);
2013 fjy1 = _mm_add_pd(fjy1,ty);
2014 fjz1 = _mm_add_pd(fjz1,tz);
2018 /**************************
2019 * CALCULATE INTERACTIONS *
2020 **************************/
2022 if (gmx_mm_any_lt(rsq12,rcutoff2))
2025 /* REACTION-FIELD ELECTROSTATICS */
2026 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
2028 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2032 fscal = _mm_and_pd(fscal,cutoff_mask);
2034 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2036 /* Calculate temporary vectorial force */
2037 tx = _mm_mul_pd(fscal,dx12);
2038 ty = _mm_mul_pd(fscal,dy12);
2039 tz = _mm_mul_pd(fscal,dz12);
2041 /* Update vectorial force */
2042 fix1 = _mm_add_pd(fix1,tx);
2043 fiy1 = _mm_add_pd(fiy1,ty);
2044 fiz1 = _mm_add_pd(fiz1,tz);
2046 fjx2 = _mm_add_pd(fjx2,tx);
2047 fjy2 = _mm_add_pd(fjy2,ty);
2048 fjz2 = _mm_add_pd(fjz2,tz);
2052 /**************************
2053 * CALCULATE INTERACTIONS *
2054 **************************/
2056 if (gmx_mm_any_lt(rsq13,rcutoff2))
2059 /* REACTION-FIELD ELECTROSTATICS */
2060 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
2062 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2066 fscal = _mm_and_pd(fscal,cutoff_mask);
2068 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2070 /* Calculate temporary vectorial force */
2071 tx = _mm_mul_pd(fscal,dx13);
2072 ty = _mm_mul_pd(fscal,dy13);
2073 tz = _mm_mul_pd(fscal,dz13);
2075 /* Update vectorial force */
2076 fix1 = _mm_add_pd(fix1,tx);
2077 fiy1 = _mm_add_pd(fiy1,ty);
2078 fiz1 = _mm_add_pd(fiz1,tz);
2080 fjx3 = _mm_add_pd(fjx3,tx);
2081 fjy3 = _mm_add_pd(fjy3,ty);
2082 fjz3 = _mm_add_pd(fjz3,tz);
2086 /**************************
2087 * CALCULATE INTERACTIONS *
2088 **************************/
2090 if (gmx_mm_any_lt(rsq21,rcutoff2))
2093 /* REACTION-FIELD ELECTROSTATICS */
2094 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
2096 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2100 fscal = _mm_and_pd(fscal,cutoff_mask);
2102 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2104 /* Calculate temporary vectorial force */
2105 tx = _mm_mul_pd(fscal,dx21);
2106 ty = _mm_mul_pd(fscal,dy21);
2107 tz = _mm_mul_pd(fscal,dz21);
2109 /* Update vectorial force */
2110 fix2 = _mm_add_pd(fix2,tx);
2111 fiy2 = _mm_add_pd(fiy2,ty);
2112 fiz2 = _mm_add_pd(fiz2,tz);
2114 fjx1 = _mm_add_pd(fjx1,tx);
2115 fjy1 = _mm_add_pd(fjy1,ty);
2116 fjz1 = _mm_add_pd(fjz1,tz);
2120 /**************************
2121 * CALCULATE INTERACTIONS *
2122 **************************/
2124 if (gmx_mm_any_lt(rsq22,rcutoff2))
2127 /* REACTION-FIELD ELECTROSTATICS */
2128 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
2130 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2134 fscal = _mm_and_pd(fscal,cutoff_mask);
2136 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2138 /* Calculate temporary vectorial force */
2139 tx = _mm_mul_pd(fscal,dx22);
2140 ty = _mm_mul_pd(fscal,dy22);
2141 tz = _mm_mul_pd(fscal,dz22);
2143 /* Update vectorial force */
2144 fix2 = _mm_add_pd(fix2,tx);
2145 fiy2 = _mm_add_pd(fiy2,ty);
2146 fiz2 = _mm_add_pd(fiz2,tz);
2148 fjx2 = _mm_add_pd(fjx2,tx);
2149 fjy2 = _mm_add_pd(fjy2,ty);
2150 fjz2 = _mm_add_pd(fjz2,tz);
2154 /**************************
2155 * CALCULATE INTERACTIONS *
2156 **************************/
2158 if (gmx_mm_any_lt(rsq23,rcutoff2))
2161 /* REACTION-FIELD ELECTROSTATICS */
2162 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
2164 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2168 fscal = _mm_and_pd(fscal,cutoff_mask);
2170 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2172 /* Calculate temporary vectorial force */
2173 tx = _mm_mul_pd(fscal,dx23);
2174 ty = _mm_mul_pd(fscal,dy23);
2175 tz = _mm_mul_pd(fscal,dz23);
2177 /* Update vectorial force */
2178 fix2 = _mm_add_pd(fix2,tx);
2179 fiy2 = _mm_add_pd(fiy2,ty);
2180 fiz2 = _mm_add_pd(fiz2,tz);
2182 fjx3 = _mm_add_pd(fjx3,tx);
2183 fjy3 = _mm_add_pd(fjy3,ty);
2184 fjz3 = _mm_add_pd(fjz3,tz);
2188 /**************************
2189 * CALCULATE INTERACTIONS *
2190 **************************/
2192 if (gmx_mm_any_lt(rsq31,rcutoff2))
2195 /* REACTION-FIELD ELECTROSTATICS */
2196 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
2198 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2202 fscal = _mm_and_pd(fscal,cutoff_mask);
2204 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2206 /* Calculate temporary vectorial force */
2207 tx = _mm_mul_pd(fscal,dx31);
2208 ty = _mm_mul_pd(fscal,dy31);
2209 tz = _mm_mul_pd(fscal,dz31);
2211 /* Update vectorial force */
2212 fix3 = _mm_add_pd(fix3,tx);
2213 fiy3 = _mm_add_pd(fiy3,ty);
2214 fiz3 = _mm_add_pd(fiz3,tz);
2216 fjx1 = _mm_add_pd(fjx1,tx);
2217 fjy1 = _mm_add_pd(fjy1,ty);
2218 fjz1 = _mm_add_pd(fjz1,tz);
2222 /**************************
2223 * CALCULATE INTERACTIONS *
2224 **************************/
2226 if (gmx_mm_any_lt(rsq32,rcutoff2))
2229 /* REACTION-FIELD ELECTROSTATICS */
2230 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
2232 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2236 fscal = _mm_and_pd(fscal,cutoff_mask);
2238 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2240 /* Calculate temporary vectorial force */
2241 tx = _mm_mul_pd(fscal,dx32);
2242 ty = _mm_mul_pd(fscal,dy32);
2243 tz = _mm_mul_pd(fscal,dz32);
2245 /* Update vectorial force */
2246 fix3 = _mm_add_pd(fix3,tx);
2247 fiy3 = _mm_add_pd(fiy3,ty);
2248 fiz3 = _mm_add_pd(fiz3,tz);
2250 fjx2 = _mm_add_pd(fjx2,tx);
2251 fjy2 = _mm_add_pd(fjy2,ty);
2252 fjz2 = _mm_add_pd(fjz2,tz);
2256 /**************************
2257 * CALCULATE INTERACTIONS *
2258 **************************/
2260 if (gmx_mm_any_lt(rsq33,rcutoff2))
2263 /* REACTION-FIELD ELECTROSTATICS */
2264 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
2266 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2270 fscal = _mm_and_pd(fscal,cutoff_mask);
2272 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2274 /* Calculate temporary vectorial force */
2275 tx = _mm_mul_pd(fscal,dx33);
2276 ty = _mm_mul_pd(fscal,dy33);
2277 tz = _mm_mul_pd(fscal,dz33);
2279 /* Update vectorial force */
2280 fix3 = _mm_add_pd(fix3,tx);
2281 fiy3 = _mm_add_pd(fiy3,ty);
2282 fiz3 = _mm_add_pd(fiz3,tz);
2284 fjx3 = _mm_add_pd(fjx3,tx);
2285 fjy3 = _mm_add_pd(fjy3,ty);
2286 fjz3 = _mm_add_pd(fjz3,tz);
2290 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2292 /* Inner loop uses 321 flops */
2295 /* End of innermost loop */
2297 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2298 f+i_coord_offset,fshift+i_shift_offset);
2300 /* Increment number of inner iterations */
2301 inneriter += j_index_end - j_index_start;
2303 /* Outer loop uses 24 flops */
2306 /* Increment number of outer iterations */
2309 /* Update outer/inner flops */
2311 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*321);