2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse2_double.h"
48 #include "kernelutil_x86_sse2_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_double
52 * Electrostatics interaction: ReactionField
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_VF_sse2_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 int vdwjidx3A,vdwjidx3B;
95 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
96 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
109 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
113 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
115 __m128i ifour = _mm_set1_epi32(4);
116 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
118 __m128d dummy_mask,cutoff_mask;
119 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
120 __m128d one = _mm_set1_pd(1.0);
121 __m128d two = _mm_set1_pd(2.0);
127 jindex = nlist->jindex;
129 shiftidx = nlist->shift;
131 shiftvec = fr->shift_vec[0];
132 fshift = fr->fshift[0];
133 facel = _mm_set1_pd(fr->epsfac);
134 charge = mdatoms->chargeA;
135 krf = _mm_set1_pd(fr->ic->k_rf);
136 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
137 crf = _mm_set1_pd(fr->ic->c_rf);
138 nvdwtype = fr->ntype;
140 vdwtype = mdatoms->typeA;
142 vftab = kernel_data->table_vdw->data;
143 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
145 /* Setup water-specific parameters */
146 inr = nlist->iinr[0];
147 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
148 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
149 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
150 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
152 jq1 = _mm_set1_pd(charge[inr+1]);
153 jq2 = _mm_set1_pd(charge[inr+2]);
154 jq3 = _mm_set1_pd(charge[inr+3]);
155 vdwjidx0A = 2*vdwtype[inr+0];
156 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
157 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
158 qq11 = _mm_mul_pd(iq1,jq1);
159 qq12 = _mm_mul_pd(iq1,jq2);
160 qq13 = _mm_mul_pd(iq1,jq3);
161 qq21 = _mm_mul_pd(iq2,jq1);
162 qq22 = _mm_mul_pd(iq2,jq2);
163 qq23 = _mm_mul_pd(iq2,jq3);
164 qq31 = _mm_mul_pd(iq3,jq1);
165 qq32 = _mm_mul_pd(iq3,jq2);
166 qq33 = _mm_mul_pd(iq3,jq3);
168 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169 rcutoff_scalar = fr->rcoulomb;
170 rcutoff = _mm_set1_pd(rcutoff_scalar);
171 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
173 /* Avoid stupid compiler warnings */
181 /* Start outer loop over neighborlists */
182 for(iidx=0; iidx<nri; iidx++)
184 /* Load shift vector for this list */
185 i_shift_offset = DIM*shiftidx[iidx];
187 /* Load limits for loop over neighbors */
188 j_index_start = jindex[iidx];
189 j_index_end = jindex[iidx+1];
191 /* Get outer coordinate index */
193 i_coord_offset = DIM*inr;
195 /* Load i particle coords and add shift vector */
196 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
197 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
199 fix0 = _mm_setzero_pd();
200 fiy0 = _mm_setzero_pd();
201 fiz0 = _mm_setzero_pd();
202 fix1 = _mm_setzero_pd();
203 fiy1 = _mm_setzero_pd();
204 fiz1 = _mm_setzero_pd();
205 fix2 = _mm_setzero_pd();
206 fiy2 = _mm_setzero_pd();
207 fiz2 = _mm_setzero_pd();
208 fix3 = _mm_setzero_pd();
209 fiy3 = _mm_setzero_pd();
210 fiz3 = _mm_setzero_pd();
212 /* Reset potential sums */
213 velecsum = _mm_setzero_pd();
214 vvdwsum = _mm_setzero_pd();
216 /* Start inner kernel loop */
217 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
220 /* Get j neighbor index, and coordinate index */
223 j_coord_offsetA = DIM*jnrA;
224 j_coord_offsetB = DIM*jnrB;
226 /* load j atom coordinates */
227 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
228 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
229 &jy2,&jz2,&jx3,&jy3,&jz3);
231 /* Calculate displacement vector */
232 dx00 = _mm_sub_pd(ix0,jx0);
233 dy00 = _mm_sub_pd(iy0,jy0);
234 dz00 = _mm_sub_pd(iz0,jz0);
235 dx11 = _mm_sub_pd(ix1,jx1);
236 dy11 = _mm_sub_pd(iy1,jy1);
237 dz11 = _mm_sub_pd(iz1,jz1);
238 dx12 = _mm_sub_pd(ix1,jx2);
239 dy12 = _mm_sub_pd(iy1,jy2);
240 dz12 = _mm_sub_pd(iz1,jz2);
241 dx13 = _mm_sub_pd(ix1,jx3);
242 dy13 = _mm_sub_pd(iy1,jy3);
243 dz13 = _mm_sub_pd(iz1,jz3);
244 dx21 = _mm_sub_pd(ix2,jx1);
245 dy21 = _mm_sub_pd(iy2,jy1);
246 dz21 = _mm_sub_pd(iz2,jz1);
247 dx22 = _mm_sub_pd(ix2,jx2);
248 dy22 = _mm_sub_pd(iy2,jy2);
249 dz22 = _mm_sub_pd(iz2,jz2);
250 dx23 = _mm_sub_pd(ix2,jx3);
251 dy23 = _mm_sub_pd(iy2,jy3);
252 dz23 = _mm_sub_pd(iz2,jz3);
253 dx31 = _mm_sub_pd(ix3,jx1);
254 dy31 = _mm_sub_pd(iy3,jy1);
255 dz31 = _mm_sub_pd(iz3,jz1);
256 dx32 = _mm_sub_pd(ix3,jx2);
257 dy32 = _mm_sub_pd(iy3,jy2);
258 dz32 = _mm_sub_pd(iz3,jz2);
259 dx33 = _mm_sub_pd(ix3,jx3);
260 dy33 = _mm_sub_pd(iy3,jy3);
261 dz33 = _mm_sub_pd(iz3,jz3);
263 /* Calculate squared distance and things based on it */
264 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
265 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
266 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
267 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
268 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
269 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
270 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
271 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
272 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
273 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
275 rinv00 = gmx_mm_invsqrt_pd(rsq00);
276 rinv11 = gmx_mm_invsqrt_pd(rsq11);
277 rinv12 = gmx_mm_invsqrt_pd(rsq12);
278 rinv13 = gmx_mm_invsqrt_pd(rsq13);
279 rinv21 = gmx_mm_invsqrt_pd(rsq21);
280 rinv22 = gmx_mm_invsqrt_pd(rsq22);
281 rinv23 = gmx_mm_invsqrt_pd(rsq23);
282 rinv31 = gmx_mm_invsqrt_pd(rsq31);
283 rinv32 = gmx_mm_invsqrt_pd(rsq32);
284 rinv33 = gmx_mm_invsqrt_pd(rsq33);
286 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
287 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
288 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
289 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
290 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
291 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
292 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
293 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
294 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
296 fjx0 = _mm_setzero_pd();
297 fjy0 = _mm_setzero_pd();
298 fjz0 = _mm_setzero_pd();
299 fjx1 = _mm_setzero_pd();
300 fjy1 = _mm_setzero_pd();
301 fjz1 = _mm_setzero_pd();
302 fjx2 = _mm_setzero_pd();
303 fjy2 = _mm_setzero_pd();
304 fjz2 = _mm_setzero_pd();
305 fjx3 = _mm_setzero_pd();
306 fjy3 = _mm_setzero_pd();
307 fjz3 = _mm_setzero_pd();
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 r00 = _mm_mul_pd(rsq00,rinv00);
315 /* Calculate table index by multiplying r with table scale and truncate to integer */
316 rt = _mm_mul_pd(r00,vftabscale);
317 vfitab = _mm_cvttpd_epi32(rt);
318 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
319 vfitab = _mm_slli_epi32(vfitab,3);
321 /* CUBIC SPLINE TABLE DISPERSION */
322 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
323 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
324 GMX_MM_TRANSPOSE2_PD(Y,F);
325 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
326 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
327 GMX_MM_TRANSPOSE2_PD(G,H);
328 Heps = _mm_mul_pd(vfeps,H);
329 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
330 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
331 vvdw6 = _mm_mul_pd(c6_00,VV);
332 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
333 fvdw6 = _mm_mul_pd(c6_00,FF);
335 /* CUBIC SPLINE TABLE REPULSION */
336 vfitab = _mm_add_epi32(vfitab,ifour);
337 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
338 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
339 GMX_MM_TRANSPOSE2_PD(Y,F);
340 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
341 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
342 GMX_MM_TRANSPOSE2_PD(G,H);
343 Heps = _mm_mul_pd(vfeps,H);
344 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
345 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
346 vvdw12 = _mm_mul_pd(c12_00,VV);
347 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
348 fvdw12 = _mm_mul_pd(c12_00,FF);
349 vvdw = _mm_add_pd(vvdw12,vvdw6);
350 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
357 /* Calculate temporary vectorial force */
358 tx = _mm_mul_pd(fscal,dx00);
359 ty = _mm_mul_pd(fscal,dy00);
360 tz = _mm_mul_pd(fscal,dz00);
362 /* Update vectorial force */
363 fix0 = _mm_add_pd(fix0,tx);
364 fiy0 = _mm_add_pd(fiy0,ty);
365 fiz0 = _mm_add_pd(fiz0,tz);
367 fjx0 = _mm_add_pd(fjx0,tx);
368 fjy0 = _mm_add_pd(fjy0,ty);
369 fjz0 = _mm_add_pd(fjz0,tz);
371 /**************************
372 * CALCULATE INTERACTIONS *
373 **************************/
375 if (gmx_mm_any_lt(rsq11,rcutoff2))
378 /* REACTION-FIELD ELECTROSTATICS */
379 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
380 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
382 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
384 /* Update potential sum for this i atom from the interaction with this j atom. */
385 velec = _mm_and_pd(velec,cutoff_mask);
386 velecsum = _mm_add_pd(velecsum,velec);
390 fscal = _mm_and_pd(fscal,cutoff_mask);
392 /* Calculate temporary vectorial force */
393 tx = _mm_mul_pd(fscal,dx11);
394 ty = _mm_mul_pd(fscal,dy11);
395 tz = _mm_mul_pd(fscal,dz11);
397 /* Update vectorial force */
398 fix1 = _mm_add_pd(fix1,tx);
399 fiy1 = _mm_add_pd(fiy1,ty);
400 fiz1 = _mm_add_pd(fiz1,tz);
402 fjx1 = _mm_add_pd(fjx1,tx);
403 fjy1 = _mm_add_pd(fjy1,ty);
404 fjz1 = _mm_add_pd(fjz1,tz);
408 /**************************
409 * CALCULATE INTERACTIONS *
410 **************************/
412 if (gmx_mm_any_lt(rsq12,rcutoff2))
415 /* REACTION-FIELD ELECTROSTATICS */
416 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
417 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
419 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
421 /* Update potential sum for this i atom from the interaction with this j atom. */
422 velec = _mm_and_pd(velec,cutoff_mask);
423 velecsum = _mm_add_pd(velecsum,velec);
427 fscal = _mm_and_pd(fscal,cutoff_mask);
429 /* Calculate temporary vectorial force */
430 tx = _mm_mul_pd(fscal,dx12);
431 ty = _mm_mul_pd(fscal,dy12);
432 tz = _mm_mul_pd(fscal,dz12);
434 /* Update vectorial force */
435 fix1 = _mm_add_pd(fix1,tx);
436 fiy1 = _mm_add_pd(fiy1,ty);
437 fiz1 = _mm_add_pd(fiz1,tz);
439 fjx2 = _mm_add_pd(fjx2,tx);
440 fjy2 = _mm_add_pd(fjy2,ty);
441 fjz2 = _mm_add_pd(fjz2,tz);
445 /**************************
446 * CALCULATE INTERACTIONS *
447 **************************/
449 if (gmx_mm_any_lt(rsq13,rcutoff2))
452 /* REACTION-FIELD ELECTROSTATICS */
453 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
454 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
456 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velec = _mm_and_pd(velec,cutoff_mask);
460 velecsum = _mm_add_pd(velecsum,velec);
464 fscal = _mm_and_pd(fscal,cutoff_mask);
466 /* Calculate temporary vectorial force */
467 tx = _mm_mul_pd(fscal,dx13);
468 ty = _mm_mul_pd(fscal,dy13);
469 tz = _mm_mul_pd(fscal,dz13);
471 /* Update vectorial force */
472 fix1 = _mm_add_pd(fix1,tx);
473 fiy1 = _mm_add_pd(fiy1,ty);
474 fiz1 = _mm_add_pd(fiz1,tz);
476 fjx3 = _mm_add_pd(fjx3,tx);
477 fjy3 = _mm_add_pd(fjy3,ty);
478 fjz3 = _mm_add_pd(fjz3,tz);
482 /**************************
483 * CALCULATE INTERACTIONS *
484 **************************/
486 if (gmx_mm_any_lt(rsq21,rcutoff2))
489 /* REACTION-FIELD ELECTROSTATICS */
490 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
491 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
493 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
495 /* Update potential sum for this i atom from the interaction with this j atom. */
496 velec = _mm_and_pd(velec,cutoff_mask);
497 velecsum = _mm_add_pd(velecsum,velec);
501 fscal = _mm_and_pd(fscal,cutoff_mask);
503 /* Calculate temporary vectorial force */
504 tx = _mm_mul_pd(fscal,dx21);
505 ty = _mm_mul_pd(fscal,dy21);
506 tz = _mm_mul_pd(fscal,dz21);
508 /* Update vectorial force */
509 fix2 = _mm_add_pd(fix2,tx);
510 fiy2 = _mm_add_pd(fiy2,ty);
511 fiz2 = _mm_add_pd(fiz2,tz);
513 fjx1 = _mm_add_pd(fjx1,tx);
514 fjy1 = _mm_add_pd(fjy1,ty);
515 fjz1 = _mm_add_pd(fjz1,tz);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 if (gmx_mm_any_lt(rsq22,rcutoff2))
526 /* REACTION-FIELD ELECTROSTATICS */
527 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
528 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
530 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
532 /* Update potential sum for this i atom from the interaction with this j atom. */
533 velec = _mm_and_pd(velec,cutoff_mask);
534 velecsum = _mm_add_pd(velecsum,velec);
538 fscal = _mm_and_pd(fscal,cutoff_mask);
540 /* Calculate temporary vectorial force */
541 tx = _mm_mul_pd(fscal,dx22);
542 ty = _mm_mul_pd(fscal,dy22);
543 tz = _mm_mul_pd(fscal,dz22);
545 /* Update vectorial force */
546 fix2 = _mm_add_pd(fix2,tx);
547 fiy2 = _mm_add_pd(fiy2,ty);
548 fiz2 = _mm_add_pd(fiz2,tz);
550 fjx2 = _mm_add_pd(fjx2,tx);
551 fjy2 = _mm_add_pd(fjy2,ty);
552 fjz2 = _mm_add_pd(fjz2,tz);
556 /**************************
557 * CALCULATE INTERACTIONS *
558 **************************/
560 if (gmx_mm_any_lt(rsq23,rcutoff2))
563 /* REACTION-FIELD ELECTROSTATICS */
564 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
565 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
567 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
569 /* Update potential sum for this i atom from the interaction with this j atom. */
570 velec = _mm_and_pd(velec,cutoff_mask);
571 velecsum = _mm_add_pd(velecsum,velec);
575 fscal = _mm_and_pd(fscal,cutoff_mask);
577 /* Calculate temporary vectorial force */
578 tx = _mm_mul_pd(fscal,dx23);
579 ty = _mm_mul_pd(fscal,dy23);
580 tz = _mm_mul_pd(fscal,dz23);
582 /* Update vectorial force */
583 fix2 = _mm_add_pd(fix2,tx);
584 fiy2 = _mm_add_pd(fiy2,ty);
585 fiz2 = _mm_add_pd(fiz2,tz);
587 fjx3 = _mm_add_pd(fjx3,tx);
588 fjy3 = _mm_add_pd(fjy3,ty);
589 fjz3 = _mm_add_pd(fjz3,tz);
593 /**************************
594 * CALCULATE INTERACTIONS *
595 **************************/
597 if (gmx_mm_any_lt(rsq31,rcutoff2))
600 /* REACTION-FIELD ELECTROSTATICS */
601 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
602 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
604 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
606 /* Update potential sum for this i atom from the interaction with this j atom. */
607 velec = _mm_and_pd(velec,cutoff_mask);
608 velecsum = _mm_add_pd(velecsum,velec);
612 fscal = _mm_and_pd(fscal,cutoff_mask);
614 /* Calculate temporary vectorial force */
615 tx = _mm_mul_pd(fscal,dx31);
616 ty = _mm_mul_pd(fscal,dy31);
617 tz = _mm_mul_pd(fscal,dz31);
619 /* Update vectorial force */
620 fix3 = _mm_add_pd(fix3,tx);
621 fiy3 = _mm_add_pd(fiy3,ty);
622 fiz3 = _mm_add_pd(fiz3,tz);
624 fjx1 = _mm_add_pd(fjx1,tx);
625 fjy1 = _mm_add_pd(fjy1,ty);
626 fjz1 = _mm_add_pd(fjz1,tz);
630 /**************************
631 * CALCULATE INTERACTIONS *
632 **************************/
634 if (gmx_mm_any_lt(rsq32,rcutoff2))
637 /* REACTION-FIELD ELECTROSTATICS */
638 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
639 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
641 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
643 /* Update potential sum for this i atom from the interaction with this j atom. */
644 velec = _mm_and_pd(velec,cutoff_mask);
645 velecsum = _mm_add_pd(velecsum,velec);
649 fscal = _mm_and_pd(fscal,cutoff_mask);
651 /* Calculate temporary vectorial force */
652 tx = _mm_mul_pd(fscal,dx32);
653 ty = _mm_mul_pd(fscal,dy32);
654 tz = _mm_mul_pd(fscal,dz32);
656 /* Update vectorial force */
657 fix3 = _mm_add_pd(fix3,tx);
658 fiy3 = _mm_add_pd(fiy3,ty);
659 fiz3 = _mm_add_pd(fiz3,tz);
661 fjx2 = _mm_add_pd(fjx2,tx);
662 fjy2 = _mm_add_pd(fjy2,ty);
663 fjz2 = _mm_add_pd(fjz2,tz);
667 /**************************
668 * CALCULATE INTERACTIONS *
669 **************************/
671 if (gmx_mm_any_lt(rsq33,rcutoff2))
674 /* REACTION-FIELD ELECTROSTATICS */
675 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
676 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
678 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
680 /* Update potential sum for this i atom from the interaction with this j atom. */
681 velec = _mm_and_pd(velec,cutoff_mask);
682 velecsum = _mm_add_pd(velecsum,velec);
686 fscal = _mm_and_pd(fscal,cutoff_mask);
688 /* Calculate temporary vectorial force */
689 tx = _mm_mul_pd(fscal,dx33);
690 ty = _mm_mul_pd(fscal,dy33);
691 tz = _mm_mul_pd(fscal,dz33);
693 /* Update vectorial force */
694 fix3 = _mm_add_pd(fix3,tx);
695 fiy3 = _mm_add_pd(fiy3,ty);
696 fiz3 = _mm_add_pd(fiz3,tz);
698 fjx3 = _mm_add_pd(fjx3,tx);
699 fjy3 = _mm_add_pd(fjy3,ty);
700 fjz3 = _mm_add_pd(fjz3,tz);
704 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
706 /* Inner loop uses 383 flops */
713 j_coord_offsetA = DIM*jnrA;
715 /* load j atom coordinates */
716 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
717 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
718 &jy2,&jz2,&jx3,&jy3,&jz3);
720 /* Calculate displacement vector */
721 dx00 = _mm_sub_pd(ix0,jx0);
722 dy00 = _mm_sub_pd(iy0,jy0);
723 dz00 = _mm_sub_pd(iz0,jz0);
724 dx11 = _mm_sub_pd(ix1,jx1);
725 dy11 = _mm_sub_pd(iy1,jy1);
726 dz11 = _mm_sub_pd(iz1,jz1);
727 dx12 = _mm_sub_pd(ix1,jx2);
728 dy12 = _mm_sub_pd(iy1,jy2);
729 dz12 = _mm_sub_pd(iz1,jz2);
730 dx13 = _mm_sub_pd(ix1,jx3);
731 dy13 = _mm_sub_pd(iy1,jy3);
732 dz13 = _mm_sub_pd(iz1,jz3);
733 dx21 = _mm_sub_pd(ix2,jx1);
734 dy21 = _mm_sub_pd(iy2,jy1);
735 dz21 = _mm_sub_pd(iz2,jz1);
736 dx22 = _mm_sub_pd(ix2,jx2);
737 dy22 = _mm_sub_pd(iy2,jy2);
738 dz22 = _mm_sub_pd(iz2,jz2);
739 dx23 = _mm_sub_pd(ix2,jx3);
740 dy23 = _mm_sub_pd(iy2,jy3);
741 dz23 = _mm_sub_pd(iz2,jz3);
742 dx31 = _mm_sub_pd(ix3,jx1);
743 dy31 = _mm_sub_pd(iy3,jy1);
744 dz31 = _mm_sub_pd(iz3,jz1);
745 dx32 = _mm_sub_pd(ix3,jx2);
746 dy32 = _mm_sub_pd(iy3,jy2);
747 dz32 = _mm_sub_pd(iz3,jz2);
748 dx33 = _mm_sub_pd(ix3,jx3);
749 dy33 = _mm_sub_pd(iy3,jy3);
750 dz33 = _mm_sub_pd(iz3,jz3);
752 /* Calculate squared distance and things based on it */
753 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
754 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
755 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
756 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
757 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
758 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
759 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
760 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
761 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
762 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
764 rinv00 = gmx_mm_invsqrt_pd(rsq00);
765 rinv11 = gmx_mm_invsqrt_pd(rsq11);
766 rinv12 = gmx_mm_invsqrt_pd(rsq12);
767 rinv13 = gmx_mm_invsqrt_pd(rsq13);
768 rinv21 = gmx_mm_invsqrt_pd(rsq21);
769 rinv22 = gmx_mm_invsqrt_pd(rsq22);
770 rinv23 = gmx_mm_invsqrt_pd(rsq23);
771 rinv31 = gmx_mm_invsqrt_pd(rsq31);
772 rinv32 = gmx_mm_invsqrt_pd(rsq32);
773 rinv33 = gmx_mm_invsqrt_pd(rsq33);
775 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
776 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
777 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
778 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
779 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
780 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
781 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
782 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
783 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
785 fjx0 = _mm_setzero_pd();
786 fjy0 = _mm_setzero_pd();
787 fjz0 = _mm_setzero_pd();
788 fjx1 = _mm_setzero_pd();
789 fjy1 = _mm_setzero_pd();
790 fjz1 = _mm_setzero_pd();
791 fjx2 = _mm_setzero_pd();
792 fjy2 = _mm_setzero_pd();
793 fjz2 = _mm_setzero_pd();
794 fjx3 = _mm_setzero_pd();
795 fjy3 = _mm_setzero_pd();
796 fjz3 = _mm_setzero_pd();
798 /**************************
799 * CALCULATE INTERACTIONS *
800 **************************/
802 r00 = _mm_mul_pd(rsq00,rinv00);
804 /* Calculate table index by multiplying r with table scale and truncate to integer */
805 rt = _mm_mul_pd(r00,vftabscale);
806 vfitab = _mm_cvttpd_epi32(rt);
807 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
808 vfitab = _mm_slli_epi32(vfitab,3);
810 /* CUBIC SPLINE TABLE DISPERSION */
811 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
812 F = _mm_setzero_pd();
813 GMX_MM_TRANSPOSE2_PD(Y,F);
814 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
815 H = _mm_setzero_pd();
816 GMX_MM_TRANSPOSE2_PD(G,H);
817 Heps = _mm_mul_pd(vfeps,H);
818 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
819 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
820 vvdw6 = _mm_mul_pd(c6_00,VV);
821 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
822 fvdw6 = _mm_mul_pd(c6_00,FF);
824 /* CUBIC SPLINE TABLE REPULSION */
825 vfitab = _mm_add_epi32(vfitab,ifour);
826 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
827 F = _mm_setzero_pd();
828 GMX_MM_TRANSPOSE2_PD(Y,F);
829 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
830 H = _mm_setzero_pd();
831 GMX_MM_TRANSPOSE2_PD(G,H);
832 Heps = _mm_mul_pd(vfeps,H);
833 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
834 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
835 vvdw12 = _mm_mul_pd(c12_00,VV);
836 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
837 fvdw12 = _mm_mul_pd(c12_00,FF);
838 vvdw = _mm_add_pd(vvdw12,vvdw6);
839 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
841 /* Update potential sum for this i atom from the interaction with this j atom. */
842 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
843 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
847 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
849 /* Calculate temporary vectorial force */
850 tx = _mm_mul_pd(fscal,dx00);
851 ty = _mm_mul_pd(fscal,dy00);
852 tz = _mm_mul_pd(fscal,dz00);
854 /* Update vectorial force */
855 fix0 = _mm_add_pd(fix0,tx);
856 fiy0 = _mm_add_pd(fiy0,ty);
857 fiz0 = _mm_add_pd(fiz0,tz);
859 fjx0 = _mm_add_pd(fjx0,tx);
860 fjy0 = _mm_add_pd(fjy0,ty);
861 fjz0 = _mm_add_pd(fjz0,tz);
863 /**************************
864 * CALCULATE INTERACTIONS *
865 **************************/
867 if (gmx_mm_any_lt(rsq11,rcutoff2))
870 /* REACTION-FIELD ELECTROSTATICS */
871 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_add_pd(rinv11,_mm_mul_pd(krf,rsq11)),crf));
872 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
874 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
876 /* Update potential sum for this i atom from the interaction with this j atom. */
877 velec = _mm_and_pd(velec,cutoff_mask);
878 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
879 velecsum = _mm_add_pd(velecsum,velec);
883 fscal = _mm_and_pd(fscal,cutoff_mask);
885 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
887 /* Calculate temporary vectorial force */
888 tx = _mm_mul_pd(fscal,dx11);
889 ty = _mm_mul_pd(fscal,dy11);
890 tz = _mm_mul_pd(fscal,dz11);
892 /* Update vectorial force */
893 fix1 = _mm_add_pd(fix1,tx);
894 fiy1 = _mm_add_pd(fiy1,ty);
895 fiz1 = _mm_add_pd(fiz1,tz);
897 fjx1 = _mm_add_pd(fjx1,tx);
898 fjy1 = _mm_add_pd(fjy1,ty);
899 fjz1 = _mm_add_pd(fjz1,tz);
903 /**************************
904 * CALCULATE INTERACTIONS *
905 **************************/
907 if (gmx_mm_any_lt(rsq12,rcutoff2))
910 /* REACTION-FIELD ELECTROSTATICS */
911 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_add_pd(rinv12,_mm_mul_pd(krf,rsq12)),crf));
912 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
914 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
916 /* Update potential sum for this i atom from the interaction with this j atom. */
917 velec = _mm_and_pd(velec,cutoff_mask);
918 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
919 velecsum = _mm_add_pd(velecsum,velec);
923 fscal = _mm_and_pd(fscal,cutoff_mask);
925 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
927 /* Calculate temporary vectorial force */
928 tx = _mm_mul_pd(fscal,dx12);
929 ty = _mm_mul_pd(fscal,dy12);
930 tz = _mm_mul_pd(fscal,dz12);
932 /* Update vectorial force */
933 fix1 = _mm_add_pd(fix1,tx);
934 fiy1 = _mm_add_pd(fiy1,ty);
935 fiz1 = _mm_add_pd(fiz1,tz);
937 fjx2 = _mm_add_pd(fjx2,tx);
938 fjy2 = _mm_add_pd(fjy2,ty);
939 fjz2 = _mm_add_pd(fjz2,tz);
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 if (gmx_mm_any_lt(rsq13,rcutoff2))
950 /* REACTION-FIELD ELECTROSTATICS */
951 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_add_pd(rinv13,_mm_mul_pd(krf,rsq13)),crf));
952 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
954 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
956 /* Update potential sum for this i atom from the interaction with this j atom. */
957 velec = _mm_and_pd(velec,cutoff_mask);
958 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
959 velecsum = _mm_add_pd(velecsum,velec);
963 fscal = _mm_and_pd(fscal,cutoff_mask);
965 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
967 /* Calculate temporary vectorial force */
968 tx = _mm_mul_pd(fscal,dx13);
969 ty = _mm_mul_pd(fscal,dy13);
970 tz = _mm_mul_pd(fscal,dz13);
972 /* Update vectorial force */
973 fix1 = _mm_add_pd(fix1,tx);
974 fiy1 = _mm_add_pd(fiy1,ty);
975 fiz1 = _mm_add_pd(fiz1,tz);
977 fjx3 = _mm_add_pd(fjx3,tx);
978 fjy3 = _mm_add_pd(fjy3,ty);
979 fjz3 = _mm_add_pd(fjz3,tz);
983 /**************************
984 * CALCULATE INTERACTIONS *
985 **************************/
987 if (gmx_mm_any_lt(rsq21,rcutoff2))
990 /* REACTION-FIELD ELECTROSTATICS */
991 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_add_pd(rinv21,_mm_mul_pd(krf,rsq21)),crf));
992 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
994 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
996 /* Update potential sum for this i atom from the interaction with this j atom. */
997 velec = _mm_and_pd(velec,cutoff_mask);
998 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
999 velecsum = _mm_add_pd(velecsum,velec);
1003 fscal = _mm_and_pd(fscal,cutoff_mask);
1005 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1007 /* Calculate temporary vectorial force */
1008 tx = _mm_mul_pd(fscal,dx21);
1009 ty = _mm_mul_pd(fscal,dy21);
1010 tz = _mm_mul_pd(fscal,dz21);
1012 /* Update vectorial force */
1013 fix2 = _mm_add_pd(fix2,tx);
1014 fiy2 = _mm_add_pd(fiy2,ty);
1015 fiz2 = _mm_add_pd(fiz2,tz);
1017 fjx1 = _mm_add_pd(fjx1,tx);
1018 fjy1 = _mm_add_pd(fjy1,ty);
1019 fjz1 = _mm_add_pd(fjz1,tz);
1023 /**************************
1024 * CALCULATE INTERACTIONS *
1025 **************************/
1027 if (gmx_mm_any_lt(rsq22,rcutoff2))
1030 /* REACTION-FIELD ELECTROSTATICS */
1031 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_add_pd(rinv22,_mm_mul_pd(krf,rsq22)),crf));
1032 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1034 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1036 /* Update potential sum for this i atom from the interaction with this j atom. */
1037 velec = _mm_and_pd(velec,cutoff_mask);
1038 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1039 velecsum = _mm_add_pd(velecsum,velec);
1043 fscal = _mm_and_pd(fscal,cutoff_mask);
1045 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1047 /* Calculate temporary vectorial force */
1048 tx = _mm_mul_pd(fscal,dx22);
1049 ty = _mm_mul_pd(fscal,dy22);
1050 tz = _mm_mul_pd(fscal,dz22);
1052 /* Update vectorial force */
1053 fix2 = _mm_add_pd(fix2,tx);
1054 fiy2 = _mm_add_pd(fiy2,ty);
1055 fiz2 = _mm_add_pd(fiz2,tz);
1057 fjx2 = _mm_add_pd(fjx2,tx);
1058 fjy2 = _mm_add_pd(fjy2,ty);
1059 fjz2 = _mm_add_pd(fjz2,tz);
1063 /**************************
1064 * CALCULATE INTERACTIONS *
1065 **************************/
1067 if (gmx_mm_any_lt(rsq23,rcutoff2))
1070 /* REACTION-FIELD ELECTROSTATICS */
1071 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_add_pd(rinv23,_mm_mul_pd(krf,rsq23)),crf));
1072 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1074 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1076 /* Update potential sum for this i atom from the interaction with this j atom. */
1077 velec = _mm_and_pd(velec,cutoff_mask);
1078 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1079 velecsum = _mm_add_pd(velecsum,velec);
1083 fscal = _mm_and_pd(fscal,cutoff_mask);
1085 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1087 /* Calculate temporary vectorial force */
1088 tx = _mm_mul_pd(fscal,dx23);
1089 ty = _mm_mul_pd(fscal,dy23);
1090 tz = _mm_mul_pd(fscal,dz23);
1092 /* Update vectorial force */
1093 fix2 = _mm_add_pd(fix2,tx);
1094 fiy2 = _mm_add_pd(fiy2,ty);
1095 fiz2 = _mm_add_pd(fiz2,tz);
1097 fjx3 = _mm_add_pd(fjx3,tx);
1098 fjy3 = _mm_add_pd(fjy3,ty);
1099 fjz3 = _mm_add_pd(fjz3,tz);
1103 /**************************
1104 * CALCULATE INTERACTIONS *
1105 **************************/
1107 if (gmx_mm_any_lt(rsq31,rcutoff2))
1110 /* REACTION-FIELD ELECTROSTATICS */
1111 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_add_pd(rinv31,_mm_mul_pd(krf,rsq31)),crf));
1112 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1114 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1116 /* Update potential sum for this i atom from the interaction with this j atom. */
1117 velec = _mm_and_pd(velec,cutoff_mask);
1118 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1119 velecsum = _mm_add_pd(velecsum,velec);
1123 fscal = _mm_and_pd(fscal,cutoff_mask);
1125 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1127 /* Calculate temporary vectorial force */
1128 tx = _mm_mul_pd(fscal,dx31);
1129 ty = _mm_mul_pd(fscal,dy31);
1130 tz = _mm_mul_pd(fscal,dz31);
1132 /* Update vectorial force */
1133 fix3 = _mm_add_pd(fix3,tx);
1134 fiy3 = _mm_add_pd(fiy3,ty);
1135 fiz3 = _mm_add_pd(fiz3,tz);
1137 fjx1 = _mm_add_pd(fjx1,tx);
1138 fjy1 = _mm_add_pd(fjy1,ty);
1139 fjz1 = _mm_add_pd(fjz1,tz);
1143 /**************************
1144 * CALCULATE INTERACTIONS *
1145 **************************/
1147 if (gmx_mm_any_lt(rsq32,rcutoff2))
1150 /* REACTION-FIELD ELECTROSTATICS */
1151 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_add_pd(rinv32,_mm_mul_pd(krf,rsq32)),crf));
1152 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1154 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1156 /* Update potential sum for this i atom from the interaction with this j atom. */
1157 velec = _mm_and_pd(velec,cutoff_mask);
1158 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1159 velecsum = _mm_add_pd(velecsum,velec);
1163 fscal = _mm_and_pd(fscal,cutoff_mask);
1165 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1167 /* Calculate temporary vectorial force */
1168 tx = _mm_mul_pd(fscal,dx32);
1169 ty = _mm_mul_pd(fscal,dy32);
1170 tz = _mm_mul_pd(fscal,dz32);
1172 /* Update vectorial force */
1173 fix3 = _mm_add_pd(fix3,tx);
1174 fiy3 = _mm_add_pd(fiy3,ty);
1175 fiz3 = _mm_add_pd(fiz3,tz);
1177 fjx2 = _mm_add_pd(fjx2,tx);
1178 fjy2 = _mm_add_pd(fjy2,ty);
1179 fjz2 = _mm_add_pd(fjz2,tz);
1183 /**************************
1184 * CALCULATE INTERACTIONS *
1185 **************************/
1187 if (gmx_mm_any_lt(rsq33,rcutoff2))
1190 /* REACTION-FIELD ELECTROSTATICS */
1191 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_add_pd(rinv33,_mm_mul_pd(krf,rsq33)),crf));
1192 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1194 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1196 /* Update potential sum for this i atom from the interaction with this j atom. */
1197 velec = _mm_and_pd(velec,cutoff_mask);
1198 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1199 velecsum = _mm_add_pd(velecsum,velec);
1203 fscal = _mm_and_pd(fscal,cutoff_mask);
1205 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1207 /* Calculate temporary vectorial force */
1208 tx = _mm_mul_pd(fscal,dx33);
1209 ty = _mm_mul_pd(fscal,dy33);
1210 tz = _mm_mul_pd(fscal,dz33);
1212 /* Update vectorial force */
1213 fix3 = _mm_add_pd(fix3,tx);
1214 fiy3 = _mm_add_pd(fiy3,ty);
1215 fiz3 = _mm_add_pd(fiz3,tz);
1217 fjx3 = _mm_add_pd(fjx3,tx);
1218 fjy3 = _mm_add_pd(fjy3,ty);
1219 fjz3 = _mm_add_pd(fjz3,tz);
1223 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1225 /* Inner loop uses 383 flops */
1228 /* End of innermost loop */
1230 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1231 f+i_coord_offset,fshift+i_shift_offset);
1234 /* Update potential energies */
1235 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1236 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1238 /* Increment number of inner iterations */
1239 inneriter += j_index_end - j_index_start;
1241 /* Outer loop uses 26 flops */
1244 /* Increment number of outer iterations */
1247 /* Update outer/inner flops */
1249 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*383);
1252 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_double
1253 * Electrostatics interaction: ReactionField
1254 * VdW interaction: CubicSplineTable
1255 * Geometry: Water4-Water4
1256 * Calculate force/pot: Force
1259 nb_kernel_ElecRFCut_VdwCSTab_GeomW4W4_F_sse2_double
1260 (t_nblist * gmx_restrict nlist,
1261 rvec * gmx_restrict xx,
1262 rvec * gmx_restrict ff,
1263 t_forcerec * gmx_restrict fr,
1264 t_mdatoms * gmx_restrict mdatoms,
1265 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1266 t_nrnb * gmx_restrict nrnb)
1268 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1269 * just 0 for non-waters.
1270 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1271 * jnr indices corresponding to data put in the four positions in the SIMD register.
1273 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1274 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1276 int j_coord_offsetA,j_coord_offsetB;
1277 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1278 real rcutoff_scalar;
1279 real *shiftvec,*fshift,*x,*f;
1280 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1282 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1284 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1286 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1288 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1289 int vdwjidx0A,vdwjidx0B;
1290 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1291 int vdwjidx1A,vdwjidx1B;
1292 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1293 int vdwjidx2A,vdwjidx2B;
1294 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1295 int vdwjidx3A,vdwjidx3B;
1296 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1297 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1298 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1299 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1300 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1301 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1302 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1303 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1304 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1305 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1306 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1307 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1310 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1313 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1314 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1316 __m128i ifour = _mm_set1_epi32(4);
1317 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1319 __m128d dummy_mask,cutoff_mask;
1320 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1321 __m128d one = _mm_set1_pd(1.0);
1322 __m128d two = _mm_set1_pd(2.0);
1328 jindex = nlist->jindex;
1330 shiftidx = nlist->shift;
1332 shiftvec = fr->shift_vec[0];
1333 fshift = fr->fshift[0];
1334 facel = _mm_set1_pd(fr->epsfac);
1335 charge = mdatoms->chargeA;
1336 krf = _mm_set1_pd(fr->ic->k_rf);
1337 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
1338 crf = _mm_set1_pd(fr->ic->c_rf);
1339 nvdwtype = fr->ntype;
1340 vdwparam = fr->nbfp;
1341 vdwtype = mdatoms->typeA;
1343 vftab = kernel_data->table_vdw->data;
1344 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1346 /* Setup water-specific parameters */
1347 inr = nlist->iinr[0];
1348 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1349 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1350 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1351 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1353 jq1 = _mm_set1_pd(charge[inr+1]);
1354 jq2 = _mm_set1_pd(charge[inr+2]);
1355 jq3 = _mm_set1_pd(charge[inr+3]);
1356 vdwjidx0A = 2*vdwtype[inr+0];
1357 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1358 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1359 qq11 = _mm_mul_pd(iq1,jq1);
1360 qq12 = _mm_mul_pd(iq1,jq2);
1361 qq13 = _mm_mul_pd(iq1,jq3);
1362 qq21 = _mm_mul_pd(iq2,jq1);
1363 qq22 = _mm_mul_pd(iq2,jq2);
1364 qq23 = _mm_mul_pd(iq2,jq3);
1365 qq31 = _mm_mul_pd(iq3,jq1);
1366 qq32 = _mm_mul_pd(iq3,jq2);
1367 qq33 = _mm_mul_pd(iq3,jq3);
1369 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1370 rcutoff_scalar = fr->rcoulomb;
1371 rcutoff = _mm_set1_pd(rcutoff_scalar);
1372 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1374 /* Avoid stupid compiler warnings */
1376 j_coord_offsetA = 0;
1377 j_coord_offsetB = 0;
1382 /* Start outer loop over neighborlists */
1383 for(iidx=0; iidx<nri; iidx++)
1385 /* Load shift vector for this list */
1386 i_shift_offset = DIM*shiftidx[iidx];
1388 /* Load limits for loop over neighbors */
1389 j_index_start = jindex[iidx];
1390 j_index_end = jindex[iidx+1];
1392 /* Get outer coordinate index */
1394 i_coord_offset = DIM*inr;
1396 /* Load i particle coords and add shift vector */
1397 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1398 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1400 fix0 = _mm_setzero_pd();
1401 fiy0 = _mm_setzero_pd();
1402 fiz0 = _mm_setzero_pd();
1403 fix1 = _mm_setzero_pd();
1404 fiy1 = _mm_setzero_pd();
1405 fiz1 = _mm_setzero_pd();
1406 fix2 = _mm_setzero_pd();
1407 fiy2 = _mm_setzero_pd();
1408 fiz2 = _mm_setzero_pd();
1409 fix3 = _mm_setzero_pd();
1410 fiy3 = _mm_setzero_pd();
1411 fiz3 = _mm_setzero_pd();
1413 /* Start inner kernel loop */
1414 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1417 /* Get j neighbor index, and coordinate index */
1419 jnrB = jjnr[jidx+1];
1420 j_coord_offsetA = DIM*jnrA;
1421 j_coord_offsetB = DIM*jnrB;
1423 /* load j atom coordinates */
1424 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1425 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1426 &jy2,&jz2,&jx3,&jy3,&jz3);
1428 /* Calculate displacement vector */
1429 dx00 = _mm_sub_pd(ix0,jx0);
1430 dy00 = _mm_sub_pd(iy0,jy0);
1431 dz00 = _mm_sub_pd(iz0,jz0);
1432 dx11 = _mm_sub_pd(ix1,jx1);
1433 dy11 = _mm_sub_pd(iy1,jy1);
1434 dz11 = _mm_sub_pd(iz1,jz1);
1435 dx12 = _mm_sub_pd(ix1,jx2);
1436 dy12 = _mm_sub_pd(iy1,jy2);
1437 dz12 = _mm_sub_pd(iz1,jz2);
1438 dx13 = _mm_sub_pd(ix1,jx3);
1439 dy13 = _mm_sub_pd(iy1,jy3);
1440 dz13 = _mm_sub_pd(iz1,jz3);
1441 dx21 = _mm_sub_pd(ix2,jx1);
1442 dy21 = _mm_sub_pd(iy2,jy1);
1443 dz21 = _mm_sub_pd(iz2,jz1);
1444 dx22 = _mm_sub_pd(ix2,jx2);
1445 dy22 = _mm_sub_pd(iy2,jy2);
1446 dz22 = _mm_sub_pd(iz2,jz2);
1447 dx23 = _mm_sub_pd(ix2,jx3);
1448 dy23 = _mm_sub_pd(iy2,jy3);
1449 dz23 = _mm_sub_pd(iz2,jz3);
1450 dx31 = _mm_sub_pd(ix3,jx1);
1451 dy31 = _mm_sub_pd(iy3,jy1);
1452 dz31 = _mm_sub_pd(iz3,jz1);
1453 dx32 = _mm_sub_pd(ix3,jx2);
1454 dy32 = _mm_sub_pd(iy3,jy2);
1455 dz32 = _mm_sub_pd(iz3,jz2);
1456 dx33 = _mm_sub_pd(ix3,jx3);
1457 dy33 = _mm_sub_pd(iy3,jy3);
1458 dz33 = _mm_sub_pd(iz3,jz3);
1460 /* Calculate squared distance and things based on it */
1461 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1462 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1463 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1464 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1465 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1466 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1467 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1468 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1469 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1470 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1472 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1473 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1474 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1475 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1476 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1477 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1478 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1479 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1480 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1481 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1483 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1484 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1485 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1486 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1487 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1488 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1489 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1490 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1491 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1493 fjx0 = _mm_setzero_pd();
1494 fjy0 = _mm_setzero_pd();
1495 fjz0 = _mm_setzero_pd();
1496 fjx1 = _mm_setzero_pd();
1497 fjy1 = _mm_setzero_pd();
1498 fjz1 = _mm_setzero_pd();
1499 fjx2 = _mm_setzero_pd();
1500 fjy2 = _mm_setzero_pd();
1501 fjz2 = _mm_setzero_pd();
1502 fjx3 = _mm_setzero_pd();
1503 fjy3 = _mm_setzero_pd();
1504 fjz3 = _mm_setzero_pd();
1506 /**************************
1507 * CALCULATE INTERACTIONS *
1508 **************************/
1510 r00 = _mm_mul_pd(rsq00,rinv00);
1512 /* Calculate table index by multiplying r with table scale and truncate to integer */
1513 rt = _mm_mul_pd(r00,vftabscale);
1514 vfitab = _mm_cvttpd_epi32(rt);
1515 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1516 vfitab = _mm_slli_epi32(vfitab,3);
1518 /* CUBIC SPLINE TABLE DISPERSION */
1519 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1520 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1521 GMX_MM_TRANSPOSE2_PD(Y,F);
1522 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1523 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1524 GMX_MM_TRANSPOSE2_PD(G,H);
1525 Heps = _mm_mul_pd(vfeps,H);
1526 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1527 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1528 fvdw6 = _mm_mul_pd(c6_00,FF);
1530 /* CUBIC SPLINE TABLE REPULSION */
1531 vfitab = _mm_add_epi32(vfitab,ifour);
1532 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1533 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1534 GMX_MM_TRANSPOSE2_PD(Y,F);
1535 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1536 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1537 GMX_MM_TRANSPOSE2_PD(G,H);
1538 Heps = _mm_mul_pd(vfeps,H);
1539 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1540 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1541 fvdw12 = _mm_mul_pd(c12_00,FF);
1542 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1546 /* Calculate temporary vectorial force */
1547 tx = _mm_mul_pd(fscal,dx00);
1548 ty = _mm_mul_pd(fscal,dy00);
1549 tz = _mm_mul_pd(fscal,dz00);
1551 /* Update vectorial force */
1552 fix0 = _mm_add_pd(fix0,tx);
1553 fiy0 = _mm_add_pd(fiy0,ty);
1554 fiz0 = _mm_add_pd(fiz0,tz);
1556 fjx0 = _mm_add_pd(fjx0,tx);
1557 fjy0 = _mm_add_pd(fjy0,ty);
1558 fjz0 = _mm_add_pd(fjz0,tz);
1560 /**************************
1561 * CALCULATE INTERACTIONS *
1562 **************************/
1564 if (gmx_mm_any_lt(rsq11,rcutoff2))
1567 /* REACTION-FIELD ELECTROSTATICS */
1568 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
1570 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1574 fscal = _mm_and_pd(fscal,cutoff_mask);
1576 /* Calculate temporary vectorial force */
1577 tx = _mm_mul_pd(fscal,dx11);
1578 ty = _mm_mul_pd(fscal,dy11);
1579 tz = _mm_mul_pd(fscal,dz11);
1581 /* Update vectorial force */
1582 fix1 = _mm_add_pd(fix1,tx);
1583 fiy1 = _mm_add_pd(fiy1,ty);
1584 fiz1 = _mm_add_pd(fiz1,tz);
1586 fjx1 = _mm_add_pd(fjx1,tx);
1587 fjy1 = _mm_add_pd(fjy1,ty);
1588 fjz1 = _mm_add_pd(fjz1,tz);
1592 /**************************
1593 * CALCULATE INTERACTIONS *
1594 **************************/
1596 if (gmx_mm_any_lt(rsq12,rcutoff2))
1599 /* REACTION-FIELD ELECTROSTATICS */
1600 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
1602 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1606 fscal = _mm_and_pd(fscal,cutoff_mask);
1608 /* Calculate temporary vectorial force */
1609 tx = _mm_mul_pd(fscal,dx12);
1610 ty = _mm_mul_pd(fscal,dy12);
1611 tz = _mm_mul_pd(fscal,dz12);
1613 /* Update vectorial force */
1614 fix1 = _mm_add_pd(fix1,tx);
1615 fiy1 = _mm_add_pd(fiy1,ty);
1616 fiz1 = _mm_add_pd(fiz1,tz);
1618 fjx2 = _mm_add_pd(fjx2,tx);
1619 fjy2 = _mm_add_pd(fjy2,ty);
1620 fjz2 = _mm_add_pd(fjz2,tz);
1624 /**************************
1625 * CALCULATE INTERACTIONS *
1626 **************************/
1628 if (gmx_mm_any_lt(rsq13,rcutoff2))
1631 /* REACTION-FIELD ELECTROSTATICS */
1632 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
1634 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1638 fscal = _mm_and_pd(fscal,cutoff_mask);
1640 /* Calculate temporary vectorial force */
1641 tx = _mm_mul_pd(fscal,dx13);
1642 ty = _mm_mul_pd(fscal,dy13);
1643 tz = _mm_mul_pd(fscal,dz13);
1645 /* Update vectorial force */
1646 fix1 = _mm_add_pd(fix1,tx);
1647 fiy1 = _mm_add_pd(fiy1,ty);
1648 fiz1 = _mm_add_pd(fiz1,tz);
1650 fjx3 = _mm_add_pd(fjx3,tx);
1651 fjy3 = _mm_add_pd(fjy3,ty);
1652 fjz3 = _mm_add_pd(fjz3,tz);
1656 /**************************
1657 * CALCULATE INTERACTIONS *
1658 **************************/
1660 if (gmx_mm_any_lt(rsq21,rcutoff2))
1663 /* REACTION-FIELD ELECTROSTATICS */
1664 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
1666 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1670 fscal = _mm_and_pd(fscal,cutoff_mask);
1672 /* Calculate temporary vectorial force */
1673 tx = _mm_mul_pd(fscal,dx21);
1674 ty = _mm_mul_pd(fscal,dy21);
1675 tz = _mm_mul_pd(fscal,dz21);
1677 /* Update vectorial force */
1678 fix2 = _mm_add_pd(fix2,tx);
1679 fiy2 = _mm_add_pd(fiy2,ty);
1680 fiz2 = _mm_add_pd(fiz2,tz);
1682 fjx1 = _mm_add_pd(fjx1,tx);
1683 fjy1 = _mm_add_pd(fjy1,ty);
1684 fjz1 = _mm_add_pd(fjz1,tz);
1688 /**************************
1689 * CALCULATE INTERACTIONS *
1690 **************************/
1692 if (gmx_mm_any_lt(rsq22,rcutoff2))
1695 /* REACTION-FIELD ELECTROSTATICS */
1696 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
1698 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1702 fscal = _mm_and_pd(fscal,cutoff_mask);
1704 /* Calculate temporary vectorial force */
1705 tx = _mm_mul_pd(fscal,dx22);
1706 ty = _mm_mul_pd(fscal,dy22);
1707 tz = _mm_mul_pd(fscal,dz22);
1709 /* Update vectorial force */
1710 fix2 = _mm_add_pd(fix2,tx);
1711 fiy2 = _mm_add_pd(fiy2,ty);
1712 fiz2 = _mm_add_pd(fiz2,tz);
1714 fjx2 = _mm_add_pd(fjx2,tx);
1715 fjy2 = _mm_add_pd(fjy2,ty);
1716 fjz2 = _mm_add_pd(fjz2,tz);
1720 /**************************
1721 * CALCULATE INTERACTIONS *
1722 **************************/
1724 if (gmx_mm_any_lt(rsq23,rcutoff2))
1727 /* REACTION-FIELD ELECTROSTATICS */
1728 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
1730 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1734 fscal = _mm_and_pd(fscal,cutoff_mask);
1736 /* Calculate temporary vectorial force */
1737 tx = _mm_mul_pd(fscal,dx23);
1738 ty = _mm_mul_pd(fscal,dy23);
1739 tz = _mm_mul_pd(fscal,dz23);
1741 /* Update vectorial force */
1742 fix2 = _mm_add_pd(fix2,tx);
1743 fiy2 = _mm_add_pd(fiy2,ty);
1744 fiz2 = _mm_add_pd(fiz2,tz);
1746 fjx3 = _mm_add_pd(fjx3,tx);
1747 fjy3 = _mm_add_pd(fjy3,ty);
1748 fjz3 = _mm_add_pd(fjz3,tz);
1752 /**************************
1753 * CALCULATE INTERACTIONS *
1754 **************************/
1756 if (gmx_mm_any_lt(rsq31,rcutoff2))
1759 /* REACTION-FIELD ELECTROSTATICS */
1760 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
1762 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1766 fscal = _mm_and_pd(fscal,cutoff_mask);
1768 /* Calculate temporary vectorial force */
1769 tx = _mm_mul_pd(fscal,dx31);
1770 ty = _mm_mul_pd(fscal,dy31);
1771 tz = _mm_mul_pd(fscal,dz31);
1773 /* Update vectorial force */
1774 fix3 = _mm_add_pd(fix3,tx);
1775 fiy3 = _mm_add_pd(fiy3,ty);
1776 fiz3 = _mm_add_pd(fiz3,tz);
1778 fjx1 = _mm_add_pd(fjx1,tx);
1779 fjy1 = _mm_add_pd(fjy1,ty);
1780 fjz1 = _mm_add_pd(fjz1,tz);
1784 /**************************
1785 * CALCULATE INTERACTIONS *
1786 **************************/
1788 if (gmx_mm_any_lt(rsq32,rcutoff2))
1791 /* REACTION-FIELD ELECTROSTATICS */
1792 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
1794 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1798 fscal = _mm_and_pd(fscal,cutoff_mask);
1800 /* Calculate temporary vectorial force */
1801 tx = _mm_mul_pd(fscal,dx32);
1802 ty = _mm_mul_pd(fscal,dy32);
1803 tz = _mm_mul_pd(fscal,dz32);
1805 /* Update vectorial force */
1806 fix3 = _mm_add_pd(fix3,tx);
1807 fiy3 = _mm_add_pd(fiy3,ty);
1808 fiz3 = _mm_add_pd(fiz3,tz);
1810 fjx2 = _mm_add_pd(fjx2,tx);
1811 fjy2 = _mm_add_pd(fjy2,ty);
1812 fjz2 = _mm_add_pd(fjz2,tz);
1816 /**************************
1817 * CALCULATE INTERACTIONS *
1818 **************************/
1820 if (gmx_mm_any_lt(rsq33,rcutoff2))
1823 /* REACTION-FIELD ELECTROSTATICS */
1824 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
1826 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1830 fscal = _mm_and_pd(fscal,cutoff_mask);
1832 /* Calculate temporary vectorial force */
1833 tx = _mm_mul_pd(fscal,dx33);
1834 ty = _mm_mul_pd(fscal,dy33);
1835 tz = _mm_mul_pd(fscal,dz33);
1837 /* Update vectorial force */
1838 fix3 = _mm_add_pd(fix3,tx);
1839 fiy3 = _mm_add_pd(fiy3,ty);
1840 fiz3 = _mm_add_pd(fiz3,tz);
1842 fjx3 = _mm_add_pd(fjx3,tx);
1843 fjy3 = _mm_add_pd(fjy3,ty);
1844 fjz3 = _mm_add_pd(fjz3,tz);
1848 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1850 /* Inner loop uses 321 flops */
1853 if(jidx<j_index_end)
1857 j_coord_offsetA = DIM*jnrA;
1859 /* load j atom coordinates */
1860 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1861 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1862 &jy2,&jz2,&jx3,&jy3,&jz3);
1864 /* Calculate displacement vector */
1865 dx00 = _mm_sub_pd(ix0,jx0);
1866 dy00 = _mm_sub_pd(iy0,jy0);
1867 dz00 = _mm_sub_pd(iz0,jz0);
1868 dx11 = _mm_sub_pd(ix1,jx1);
1869 dy11 = _mm_sub_pd(iy1,jy1);
1870 dz11 = _mm_sub_pd(iz1,jz1);
1871 dx12 = _mm_sub_pd(ix1,jx2);
1872 dy12 = _mm_sub_pd(iy1,jy2);
1873 dz12 = _mm_sub_pd(iz1,jz2);
1874 dx13 = _mm_sub_pd(ix1,jx3);
1875 dy13 = _mm_sub_pd(iy1,jy3);
1876 dz13 = _mm_sub_pd(iz1,jz3);
1877 dx21 = _mm_sub_pd(ix2,jx1);
1878 dy21 = _mm_sub_pd(iy2,jy1);
1879 dz21 = _mm_sub_pd(iz2,jz1);
1880 dx22 = _mm_sub_pd(ix2,jx2);
1881 dy22 = _mm_sub_pd(iy2,jy2);
1882 dz22 = _mm_sub_pd(iz2,jz2);
1883 dx23 = _mm_sub_pd(ix2,jx3);
1884 dy23 = _mm_sub_pd(iy2,jy3);
1885 dz23 = _mm_sub_pd(iz2,jz3);
1886 dx31 = _mm_sub_pd(ix3,jx1);
1887 dy31 = _mm_sub_pd(iy3,jy1);
1888 dz31 = _mm_sub_pd(iz3,jz1);
1889 dx32 = _mm_sub_pd(ix3,jx2);
1890 dy32 = _mm_sub_pd(iy3,jy2);
1891 dz32 = _mm_sub_pd(iz3,jz2);
1892 dx33 = _mm_sub_pd(ix3,jx3);
1893 dy33 = _mm_sub_pd(iy3,jy3);
1894 dz33 = _mm_sub_pd(iz3,jz3);
1896 /* Calculate squared distance and things based on it */
1897 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1898 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1899 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1900 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1901 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1902 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1903 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1904 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1905 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1906 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1908 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1909 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1910 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1911 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1912 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1913 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1914 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1915 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1916 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1917 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1919 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1920 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1921 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1922 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1923 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1924 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1925 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1926 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1927 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1929 fjx0 = _mm_setzero_pd();
1930 fjy0 = _mm_setzero_pd();
1931 fjz0 = _mm_setzero_pd();
1932 fjx1 = _mm_setzero_pd();
1933 fjy1 = _mm_setzero_pd();
1934 fjz1 = _mm_setzero_pd();
1935 fjx2 = _mm_setzero_pd();
1936 fjy2 = _mm_setzero_pd();
1937 fjz2 = _mm_setzero_pd();
1938 fjx3 = _mm_setzero_pd();
1939 fjy3 = _mm_setzero_pd();
1940 fjz3 = _mm_setzero_pd();
1942 /**************************
1943 * CALCULATE INTERACTIONS *
1944 **************************/
1946 r00 = _mm_mul_pd(rsq00,rinv00);
1948 /* Calculate table index by multiplying r with table scale and truncate to integer */
1949 rt = _mm_mul_pd(r00,vftabscale);
1950 vfitab = _mm_cvttpd_epi32(rt);
1951 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1952 vfitab = _mm_slli_epi32(vfitab,3);
1954 /* CUBIC SPLINE TABLE DISPERSION */
1955 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1956 F = _mm_setzero_pd();
1957 GMX_MM_TRANSPOSE2_PD(Y,F);
1958 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1959 H = _mm_setzero_pd();
1960 GMX_MM_TRANSPOSE2_PD(G,H);
1961 Heps = _mm_mul_pd(vfeps,H);
1962 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1963 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1964 fvdw6 = _mm_mul_pd(c6_00,FF);
1966 /* CUBIC SPLINE TABLE REPULSION */
1967 vfitab = _mm_add_epi32(vfitab,ifour);
1968 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1969 F = _mm_setzero_pd();
1970 GMX_MM_TRANSPOSE2_PD(Y,F);
1971 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1972 H = _mm_setzero_pd();
1973 GMX_MM_TRANSPOSE2_PD(G,H);
1974 Heps = _mm_mul_pd(vfeps,H);
1975 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1976 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1977 fvdw12 = _mm_mul_pd(c12_00,FF);
1978 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1982 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1984 /* Calculate temporary vectorial force */
1985 tx = _mm_mul_pd(fscal,dx00);
1986 ty = _mm_mul_pd(fscal,dy00);
1987 tz = _mm_mul_pd(fscal,dz00);
1989 /* Update vectorial force */
1990 fix0 = _mm_add_pd(fix0,tx);
1991 fiy0 = _mm_add_pd(fiy0,ty);
1992 fiz0 = _mm_add_pd(fiz0,tz);
1994 fjx0 = _mm_add_pd(fjx0,tx);
1995 fjy0 = _mm_add_pd(fjy0,ty);
1996 fjz0 = _mm_add_pd(fjz0,tz);
1998 /**************************
1999 * CALCULATE INTERACTIONS *
2000 **************************/
2002 if (gmx_mm_any_lt(rsq11,rcutoff2))
2005 /* REACTION-FIELD ELECTROSTATICS */
2006 felec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_mul_pd(rinv11,rinvsq11),krf2));
2008 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2012 fscal = _mm_and_pd(fscal,cutoff_mask);
2014 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2016 /* Calculate temporary vectorial force */
2017 tx = _mm_mul_pd(fscal,dx11);
2018 ty = _mm_mul_pd(fscal,dy11);
2019 tz = _mm_mul_pd(fscal,dz11);
2021 /* Update vectorial force */
2022 fix1 = _mm_add_pd(fix1,tx);
2023 fiy1 = _mm_add_pd(fiy1,ty);
2024 fiz1 = _mm_add_pd(fiz1,tz);
2026 fjx1 = _mm_add_pd(fjx1,tx);
2027 fjy1 = _mm_add_pd(fjy1,ty);
2028 fjz1 = _mm_add_pd(fjz1,tz);
2032 /**************************
2033 * CALCULATE INTERACTIONS *
2034 **************************/
2036 if (gmx_mm_any_lt(rsq12,rcutoff2))
2039 /* REACTION-FIELD ELECTROSTATICS */
2040 felec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_mul_pd(rinv12,rinvsq12),krf2));
2042 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2046 fscal = _mm_and_pd(fscal,cutoff_mask);
2048 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2050 /* Calculate temporary vectorial force */
2051 tx = _mm_mul_pd(fscal,dx12);
2052 ty = _mm_mul_pd(fscal,dy12);
2053 tz = _mm_mul_pd(fscal,dz12);
2055 /* Update vectorial force */
2056 fix1 = _mm_add_pd(fix1,tx);
2057 fiy1 = _mm_add_pd(fiy1,ty);
2058 fiz1 = _mm_add_pd(fiz1,tz);
2060 fjx2 = _mm_add_pd(fjx2,tx);
2061 fjy2 = _mm_add_pd(fjy2,ty);
2062 fjz2 = _mm_add_pd(fjz2,tz);
2066 /**************************
2067 * CALCULATE INTERACTIONS *
2068 **************************/
2070 if (gmx_mm_any_lt(rsq13,rcutoff2))
2073 /* REACTION-FIELD ELECTROSTATICS */
2074 felec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_mul_pd(rinv13,rinvsq13),krf2));
2076 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2080 fscal = _mm_and_pd(fscal,cutoff_mask);
2082 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2084 /* Calculate temporary vectorial force */
2085 tx = _mm_mul_pd(fscal,dx13);
2086 ty = _mm_mul_pd(fscal,dy13);
2087 tz = _mm_mul_pd(fscal,dz13);
2089 /* Update vectorial force */
2090 fix1 = _mm_add_pd(fix1,tx);
2091 fiy1 = _mm_add_pd(fiy1,ty);
2092 fiz1 = _mm_add_pd(fiz1,tz);
2094 fjx3 = _mm_add_pd(fjx3,tx);
2095 fjy3 = _mm_add_pd(fjy3,ty);
2096 fjz3 = _mm_add_pd(fjz3,tz);
2100 /**************************
2101 * CALCULATE INTERACTIONS *
2102 **************************/
2104 if (gmx_mm_any_lt(rsq21,rcutoff2))
2107 /* REACTION-FIELD ELECTROSTATICS */
2108 felec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_mul_pd(rinv21,rinvsq21),krf2));
2110 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2114 fscal = _mm_and_pd(fscal,cutoff_mask);
2116 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2118 /* Calculate temporary vectorial force */
2119 tx = _mm_mul_pd(fscal,dx21);
2120 ty = _mm_mul_pd(fscal,dy21);
2121 tz = _mm_mul_pd(fscal,dz21);
2123 /* Update vectorial force */
2124 fix2 = _mm_add_pd(fix2,tx);
2125 fiy2 = _mm_add_pd(fiy2,ty);
2126 fiz2 = _mm_add_pd(fiz2,tz);
2128 fjx1 = _mm_add_pd(fjx1,tx);
2129 fjy1 = _mm_add_pd(fjy1,ty);
2130 fjz1 = _mm_add_pd(fjz1,tz);
2134 /**************************
2135 * CALCULATE INTERACTIONS *
2136 **************************/
2138 if (gmx_mm_any_lt(rsq22,rcutoff2))
2141 /* REACTION-FIELD ELECTROSTATICS */
2142 felec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_mul_pd(rinv22,rinvsq22),krf2));
2144 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2148 fscal = _mm_and_pd(fscal,cutoff_mask);
2150 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2152 /* Calculate temporary vectorial force */
2153 tx = _mm_mul_pd(fscal,dx22);
2154 ty = _mm_mul_pd(fscal,dy22);
2155 tz = _mm_mul_pd(fscal,dz22);
2157 /* Update vectorial force */
2158 fix2 = _mm_add_pd(fix2,tx);
2159 fiy2 = _mm_add_pd(fiy2,ty);
2160 fiz2 = _mm_add_pd(fiz2,tz);
2162 fjx2 = _mm_add_pd(fjx2,tx);
2163 fjy2 = _mm_add_pd(fjy2,ty);
2164 fjz2 = _mm_add_pd(fjz2,tz);
2168 /**************************
2169 * CALCULATE INTERACTIONS *
2170 **************************/
2172 if (gmx_mm_any_lt(rsq23,rcutoff2))
2175 /* REACTION-FIELD ELECTROSTATICS */
2176 felec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_mul_pd(rinv23,rinvsq23),krf2));
2178 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2182 fscal = _mm_and_pd(fscal,cutoff_mask);
2184 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2186 /* Calculate temporary vectorial force */
2187 tx = _mm_mul_pd(fscal,dx23);
2188 ty = _mm_mul_pd(fscal,dy23);
2189 tz = _mm_mul_pd(fscal,dz23);
2191 /* Update vectorial force */
2192 fix2 = _mm_add_pd(fix2,tx);
2193 fiy2 = _mm_add_pd(fiy2,ty);
2194 fiz2 = _mm_add_pd(fiz2,tz);
2196 fjx3 = _mm_add_pd(fjx3,tx);
2197 fjy3 = _mm_add_pd(fjy3,ty);
2198 fjz3 = _mm_add_pd(fjz3,tz);
2202 /**************************
2203 * CALCULATE INTERACTIONS *
2204 **************************/
2206 if (gmx_mm_any_lt(rsq31,rcutoff2))
2209 /* REACTION-FIELD ELECTROSTATICS */
2210 felec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_mul_pd(rinv31,rinvsq31),krf2));
2212 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2216 fscal = _mm_and_pd(fscal,cutoff_mask);
2218 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2220 /* Calculate temporary vectorial force */
2221 tx = _mm_mul_pd(fscal,dx31);
2222 ty = _mm_mul_pd(fscal,dy31);
2223 tz = _mm_mul_pd(fscal,dz31);
2225 /* Update vectorial force */
2226 fix3 = _mm_add_pd(fix3,tx);
2227 fiy3 = _mm_add_pd(fiy3,ty);
2228 fiz3 = _mm_add_pd(fiz3,tz);
2230 fjx1 = _mm_add_pd(fjx1,tx);
2231 fjy1 = _mm_add_pd(fjy1,ty);
2232 fjz1 = _mm_add_pd(fjz1,tz);
2236 /**************************
2237 * CALCULATE INTERACTIONS *
2238 **************************/
2240 if (gmx_mm_any_lt(rsq32,rcutoff2))
2243 /* REACTION-FIELD ELECTROSTATICS */
2244 felec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_mul_pd(rinv32,rinvsq32),krf2));
2246 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2250 fscal = _mm_and_pd(fscal,cutoff_mask);
2252 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2254 /* Calculate temporary vectorial force */
2255 tx = _mm_mul_pd(fscal,dx32);
2256 ty = _mm_mul_pd(fscal,dy32);
2257 tz = _mm_mul_pd(fscal,dz32);
2259 /* Update vectorial force */
2260 fix3 = _mm_add_pd(fix3,tx);
2261 fiy3 = _mm_add_pd(fiy3,ty);
2262 fiz3 = _mm_add_pd(fiz3,tz);
2264 fjx2 = _mm_add_pd(fjx2,tx);
2265 fjy2 = _mm_add_pd(fjy2,ty);
2266 fjz2 = _mm_add_pd(fjz2,tz);
2270 /**************************
2271 * CALCULATE INTERACTIONS *
2272 **************************/
2274 if (gmx_mm_any_lt(rsq33,rcutoff2))
2277 /* REACTION-FIELD ELECTROSTATICS */
2278 felec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_mul_pd(rinv33,rinvsq33),krf2));
2280 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2284 fscal = _mm_and_pd(fscal,cutoff_mask);
2286 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2288 /* Calculate temporary vectorial force */
2289 tx = _mm_mul_pd(fscal,dx33);
2290 ty = _mm_mul_pd(fscal,dy33);
2291 tz = _mm_mul_pd(fscal,dz33);
2293 /* Update vectorial force */
2294 fix3 = _mm_add_pd(fix3,tx);
2295 fiy3 = _mm_add_pd(fiy3,ty);
2296 fiz3 = _mm_add_pd(fiz3,tz);
2298 fjx3 = _mm_add_pd(fjx3,tx);
2299 fjy3 = _mm_add_pd(fjy3,ty);
2300 fjz3 = _mm_add_pd(fjz3,tz);
2304 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2306 /* Inner loop uses 321 flops */
2309 /* End of innermost loop */
2311 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2312 f+i_coord_offset,fshift+i_shift_offset);
2314 /* Increment number of inner iterations */
2315 inneriter += j_index_end - j_index_start;
2317 /* Outer loop uses 24 flops */
2320 /* Increment number of outer iterations */
2323 /* Update outer/inner flops */
2325 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*321);