2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B;
77 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B;
79 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B;
81 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
101 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
103 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
104 real rswitch_scalar,d_scalar;
105 __m128d dummy_mask,cutoff_mask;
106 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
107 __m128d one = _mm_set1_pd(1.0);
108 __m128d two = _mm_set1_pd(2.0);
114 jindex = nlist->jindex;
116 shiftidx = nlist->shift;
118 shiftvec = fr->shift_vec[0];
119 fshift = fr->fshift[0];
120 facel = _mm_set1_pd(fr->epsfac);
121 charge = mdatoms->chargeA;
122 nvdwtype = fr->ntype;
124 vdwtype = mdatoms->typeA;
126 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
127 ewtab = fr->ic->tabq_coul_FDV0;
128 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
129 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
134 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
135 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
136 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
138 jq1 = _mm_set1_pd(charge[inr+1]);
139 jq2 = _mm_set1_pd(charge[inr+2]);
140 jq3 = _mm_set1_pd(charge[inr+3]);
141 vdwjidx0A = 2*vdwtype[inr+0];
142 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
143 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
144 qq11 = _mm_mul_pd(iq1,jq1);
145 qq12 = _mm_mul_pd(iq1,jq2);
146 qq13 = _mm_mul_pd(iq1,jq3);
147 qq21 = _mm_mul_pd(iq2,jq1);
148 qq22 = _mm_mul_pd(iq2,jq2);
149 qq23 = _mm_mul_pd(iq2,jq3);
150 qq31 = _mm_mul_pd(iq3,jq1);
151 qq32 = _mm_mul_pd(iq3,jq2);
152 qq33 = _mm_mul_pd(iq3,jq3);
154 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155 rcutoff_scalar = fr->rcoulomb;
156 rcutoff = _mm_set1_pd(rcutoff_scalar);
157 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
159 rswitch_scalar = fr->rcoulomb_switch;
160 rswitch = _mm_set1_pd(rswitch_scalar);
161 /* Setup switch parameters */
162 d_scalar = rcutoff_scalar-rswitch_scalar;
163 d = _mm_set1_pd(d_scalar);
164 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
165 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
166 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
167 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
168 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
169 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
171 /* Avoid stupid compiler warnings */
179 /* Start outer loop over neighborlists */
180 for(iidx=0; iidx<nri; iidx++)
182 /* Load shift vector for this list */
183 i_shift_offset = DIM*shiftidx[iidx];
185 /* Load limits for loop over neighbors */
186 j_index_start = jindex[iidx];
187 j_index_end = jindex[iidx+1];
189 /* Get outer coordinate index */
191 i_coord_offset = DIM*inr;
193 /* Load i particle coords and add shift vector */
194 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
195 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
197 fix0 = _mm_setzero_pd();
198 fiy0 = _mm_setzero_pd();
199 fiz0 = _mm_setzero_pd();
200 fix1 = _mm_setzero_pd();
201 fiy1 = _mm_setzero_pd();
202 fiz1 = _mm_setzero_pd();
203 fix2 = _mm_setzero_pd();
204 fiy2 = _mm_setzero_pd();
205 fiz2 = _mm_setzero_pd();
206 fix3 = _mm_setzero_pd();
207 fiy3 = _mm_setzero_pd();
208 fiz3 = _mm_setzero_pd();
210 /* Reset potential sums */
211 velecsum = _mm_setzero_pd();
212 vvdwsum = _mm_setzero_pd();
214 /* Start inner kernel loop */
215 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
218 /* Get j neighbor index, and coordinate index */
221 j_coord_offsetA = DIM*jnrA;
222 j_coord_offsetB = DIM*jnrB;
224 /* load j atom coordinates */
225 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
226 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
227 &jy2,&jz2,&jx3,&jy3,&jz3);
229 /* Calculate displacement vector */
230 dx00 = _mm_sub_pd(ix0,jx0);
231 dy00 = _mm_sub_pd(iy0,jy0);
232 dz00 = _mm_sub_pd(iz0,jz0);
233 dx11 = _mm_sub_pd(ix1,jx1);
234 dy11 = _mm_sub_pd(iy1,jy1);
235 dz11 = _mm_sub_pd(iz1,jz1);
236 dx12 = _mm_sub_pd(ix1,jx2);
237 dy12 = _mm_sub_pd(iy1,jy2);
238 dz12 = _mm_sub_pd(iz1,jz2);
239 dx13 = _mm_sub_pd(ix1,jx3);
240 dy13 = _mm_sub_pd(iy1,jy3);
241 dz13 = _mm_sub_pd(iz1,jz3);
242 dx21 = _mm_sub_pd(ix2,jx1);
243 dy21 = _mm_sub_pd(iy2,jy1);
244 dz21 = _mm_sub_pd(iz2,jz1);
245 dx22 = _mm_sub_pd(ix2,jx2);
246 dy22 = _mm_sub_pd(iy2,jy2);
247 dz22 = _mm_sub_pd(iz2,jz2);
248 dx23 = _mm_sub_pd(ix2,jx3);
249 dy23 = _mm_sub_pd(iy2,jy3);
250 dz23 = _mm_sub_pd(iz2,jz3);
251 dx31 = _mm_sub_pd(ix3,jx1);
252 dy31 = _mm_sub_pd(iy3,jy1);
253 dz31 = _mm_sub_pd(iz3,jz1);
254 dx32 = _mm_sub_pd(ix3,jx2);
255 dy32 = _mm_sub_pd(iy3,jy2);
256 dz32 = _mm_sub_pd(iz3,jz2);
257 dx33 = _mm_sub_pd(ix3,jx3);
258 dy33 = _mm_sub_pd(iy3,jy3);
259 dz33 = _mm_sub_pd(iz3,jz3);
261 /* Calculate squared distance and things based on it */
262 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
263 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
264 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
265 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
266 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
267 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
268 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
269 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
270 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
271 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
273 rinv00 = gmx_mm_invsqrt_pd(rsq00);
274 rinv11 = gmx_mm_invsqrt_pd(rsq11);
275 rinv12 = gmx_mm_invsqrt_pd(rsq12);
276 rinv13 = gmx_mm_invsqrt_pd(rsq13);
277 rinv21 = gmx_mm_invsqrt_pd(rsq21);
278 rinv22 = gmx_mm_invsqrt_pd(rsq22);
279 rinv23 = gmx_mm_invsqrt_pd(rsq23);
280 rinv31 = gmx_mm_invsqrt_pd(rsq31);
281 rinv32 = gmx_mm_invsqrt_pd(rsq32);
282 rinv33 = gmx_mm_invsqrt_pd(rsq33);
284 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
285 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
286 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
287 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
288 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
289 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
290 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
291 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
292 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
293 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
295 fjx0 = _mm_setzero_pd();
296 fjy0 = _mm_setzero_pd();
297 fjz0 = _mm_setzero_pd();
298 fjx1 = _mm_setzero_pd();
299 fjy1 = _mm_setzero_pd();
300 fjz1 = _mm_setzero_pd();
301 fjx2 = _mm_setzero_pd();
302 fjy2 = _mm_setzero_pd();
303 fjz2 = _mm_setzero_pd();
304 fjx3 = _mm_setzero_pd();
305 fjy3 = _mm_setzero_pd();
306 fjz3 = _mm_setzero_pd();
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 if (gmx_mm_any_lt(rsq00,rcutoff2))
315 r00 = _mm_mul_pd(rsq00,rinv00);
317 /* LENNARD-JONES DISPERSION/REPULSION */
319 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
320 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
321 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
322 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
323 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
325 d = _mm_sub_pd(r00,rswitch);
326 d = _mm_max_pd(d,_mm_setzero_pd());
327 d2 = _mm_mul_pd(d,d);
328 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
330 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
332 /* Evaluate switch function */
333 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
334 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
335 vvdw = _mm_mul_pd(vvdw,sw);
336 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
338 /* Update potential sum for this i atom from the interaction with this j atom. */
339 vvdw = _mm_and_pd(vvdw,cutoff_mask);
340 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
344 fscal = _mm_and_pd(fscal,cutoff_mask);
346 /* Calculate temporary vectorial force */
347 tx = _mm_mul_pd(fscal,dx00);
348 ty = _mm_mul_pd(fscal,dy00);
349 tz = _mm_mul_pd(fscal,dz00);
351 /* Update vectorial force */
352 fix0 = _mm_add_pd(fix0,tx);
353 fiy0 = _mm_add_pd(fiy0,ty);
354 fiz0 = _mm_add_pd(fiz0,tz);
356 fjx0 = _mm_add_pd(fjx0,tx);
357 fjy0 = _mm_add_pd(fjy0,ty);
358 fjz0 = _mm_add_pd(fjz0,tz);
362 /**************************
363 * CALCULATE INTERACTIONS *
364 **************************/
366 if (gmx_mm_any_lt(rsq11,rcutoff2))
369 r11 = _mm_mul_pd(rsq11,rinv11);
371 /* EWALD ELECTROSTATICS */
373 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
374 ewrt = _mm_mul_pd(r11,ewtabscale);
375 ewitab = _mm_cvttpd_epi32(ewrt);
376 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
377 ewitab = _mm_slli_epi32(ewitab,2);
378 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
379 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
380 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
381 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
382 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
383 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
384 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
385 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
386 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
387 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
389 d = _mm_sub_pd(r11,rswitch);
390 d = _mm_max_pd(d,_mm_setzero_pd());
391 d2 = _mm_mul_pd(d,d);
392 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
394 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
396 /* Evaluate switch function */
397 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
398 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
399 velec = _mm_mul_pd(velec,sw);
400 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
402 /* Update potential sum for this i atom from the interaction with this j atom. */
403 velec = _mm_and_pd(velec,cutoff_mask);
404 velecsum = _mm_add_pd(velecsum,velec);
408 fscal = _mm_and_pd(fscal,cutoff_mask);
410 /* Calculate temporary vectorial force */
411 tx = _mm_mul_pd(fscal,dx11);
412 ty = _mm_mul_pd(fscal,dy11);
413 tz = _mm_mul_pd(fscal,dz11);
415 /* Update vectorial force */
416 fix1 = _mm_add_pd(fix1,tx);
417 fiy1 = _mm_add_pd(fiy1,ty);
418 fiz1 = _mm_add_pd(fiz1,tz);
420 fjx1 = _mm_add_pd(fjx1,tx);
421 fjy1 = _mm_add_pd(fjy1,ty);
422 fjz1 = _mm_add_pd(fjz1,tz);
426 /**************************
427 * CALCULATE INTERACTIONS *
428 **************************/
430 if (gmx_mm_any_lt(rsq12,rcutoff2))
433 r12 = _mm_mul_pd(rsq12,rinv12);
435 /* EWALD ELECTROSTATICS */
437 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
438 ewrt = _mm_mul_pd(r12,ewtabscale);
439 ewitab = _mm_cvttpd_epi32(ewrt);
440 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
441 ewitab = _mm_slli_epi32(ewitab,2);
442 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
443 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
444 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
445 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
446 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
447 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
448 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
449 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
450 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
451 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
453 d = _mm_sub_pd(r12,rswitch);
454 d = _mm_max_pd(d,_mm_setzero_pd());
455 d2 = _mm_mul_pd(d,d);
456 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
458 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
460 /* Evaluate switch function */
461 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
462 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
463 velec = _mm_mul_pd(velec,sw);
464 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
466 /* Update potential sum for this i atom from the interaction with this j atom. */
467 velec = _mm_and_pd(velec,cutoff_mask);
468 velecsum = _mm_add_pd(velecsum,velec);
472 fscal = _mm_and_pd(fscal,cutoff_mask);
474 /* Calculate temporary vectorial force */
475 tx = _mm_mul_pd(fscal,dx12);
476 ty = _mm_mul_pd(fscal,dy12);
477 tz = _mm_mul_pd(fscal,dz12);
479 /* Update vectorial force */
480 fix1 = _mm_add_pd(fix1,tx);
481 fiy1 = _mm_add_pd(fiy1,ty);
482 fiz1 = _mm_add_pd(fiz1,tz);
484 fjx2 = _mm_add_pd(fjx2,tx);
485 fjy2 = _mm_add_pd(fjy2,ty);
486 fjz2 = _mm_add_pd(fjz2,tz);
490 /**************************
491 * CALCULATE INTERACTIONS *
492 **************************/
494 if (gmx_mm_any_lt(rsq13,rcutoff2))
497 r13 = _mm_mul_pd(rsq13,rinv13);
499 /* EWALD ELECTROSTATICS */
501 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
502 ewrt = _mm_mul_pd(r13,ewtabscale);
503 ewitab = _mm_cvttpd_epi32(ewrt);
504 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
505 ewitab = _mm_slli_epi32(ewitab,2);
506 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
507 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
508 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
509 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
510 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
511 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
512 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
513 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
514 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
515 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
517 d = _mm_sub_pd(r13,rswitch);
518 d = _mm_max_pd(d,_mm_setzero_pd());
519 d2 = _mm_mul_pd(d,d);
520 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
522 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
524 /* Evaluate switch function */
525 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
526 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
527 velec = _mm_mul_pd(velec,sw);
528 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
530 /* Update potential sum for this i atom from the interaction with this j atom. */
531 velec = _mm_and_pd(velec,cutoff_mask);
532 velecsum = _mm_add_pd(velecsum,velec);
536 fscal = _mm_and_pd(fscal,cutoff_mask);
538 /* Calculate temporary vectorial force */
539 tx = _mm_mul_pd(fscal,dx13);
540 ty = _mm_mul_pd(fscal,dy13);
541 tz = _mm_mul_pd(fscal,dz13);
543 /* Update vectorial force */
544 fix1 = _mm_add_pd(fix1,tx);
545 fiy1 = _mm_add_pd(fiy1,ty);
546 fiz1 = _mm_add_pd(fiz1,tz);
548 fjx3 = _mm_add_pd(fjx3,tx);
549 fjy3 = _mm_add_pd(fjy3,ty);
550 fjz3 = _mm_add_pd(fjz3,tz);
554 /**************************
555 * CALCULATE INTERACTIONS *
556 **************************/
558 if (gmx_mm_any_lt(rsq21,rcutoff2))
561 r21 = _mm_mul_pd(rsq21,rinv21);
563 /* EWALD ELECTROSTATICS */
565 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
566 ewrt = _mm_mul_pd(r21,ewtabscale);
567 ewitab = _mm_cvttpd_epi32(ewrt);
568 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
569 ewitab = _mm_slli_epi32(ewitab,2);
570 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
571 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
572 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
573 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
574 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
575 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
576 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
577 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
578 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
579 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
581 d = _mm_sub_pd(r21,rswitch);
582 d = _mm_max_pd(d,_mm_setzero_pd());
583 d2 = _mm_mul_pd(d,d);
584 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
586 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
588 /* Evaluate switch function */
589 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
590 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
591 velec = _mm_mul_pd(velec,sw);
592 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
594 /* Update potential sum for this i atom from the interaction with this j atom. */
595 velec = _mm_and_pd(velec,cutoff_mask);
596 velecsum = _mm_add_pd(velecsum,velec);
600 fscal = _mm_and_pd(fscal,cutoff_mask);
602 /* Calculate temporary vectorial force */
603 tx = _mm_mul_pd(fscal,dx21);
604 ty = _mm_mul_pd(fscal,dy21);
605 tz = _mm_mul_pd(fscal,dz21);
607 /* Update vectorial force */
608 fix2 = _mm_add_pd(fix2,tx);
609 fiy2 = _mm_add_pd(fiy2,ty);
610 fiz2 = _mm_add_pd(fiz2,tz);
612 fjx1 = _mm_add_pd(fjx1,tx);
613 fjy1 = _mm_add_pd(fjy1,ty);
614 fjz1 = _mm_add_pd(fjz1,tz);
618 /**************************
619 * CALCULATE INTERACTIONS *
620 **************************/
622 if (gmx_mm_any_lt(rsq22,rcutoff2))
625 r22 = _mm_mul_pd(rsq22,rinv22);
627 /* EWALD ELECTROSTATICS */
629 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
630 ewrt = _mm_mul_pd(r22,ewtabscale);
631 ewitab = _mm_cvttpd_epi32(ewrt);
632 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
633 ewitab = _mm_slli_epi32(ewitab,2);
634 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
635 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
636 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
637 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
638 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
639 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
640 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
641 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
642 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
643 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
645 d = _mm_sub_pd(r22,rswitch);
646 d = _mm_max_pd(d,_mm_setzero_pd());
647 d2 = _mm_mul_pd(d,d);
648 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
650 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
652 /* Evaluate switch function */
653 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
654 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
655 velec = _mm_mul_pd(velec,sw);
656 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
658 /* Update potential sum for this i atom from the interaction with this j atom. */
659 velec = _mm_and_pd(velec,cutoff_mask);
660 velecsum = _mm_add_pd(velecsum,velec);
664 fscal = _mm_and_pd(fscal,cutoff_mask);
666 /* Calculate temporary vectorial force */
667 tx = _mm_mul_pd(fscal,dx22);
668 ty = _mm_mul_pd(fscal,dy22);
669 tz = _mm_mul_pd(fscal,dz22);
671 /* Update vectorial force */
672 fix2 = _mm_add_pd(fix2,tx);
673 fiy2 = _mm_add_pd(fiy2,ty);
674 fiz2 = _mm_add_pd(fiz2,tz);
676 fjx2 = _mm_add_pd(fjx2,tx);
677 fjy2 = _mm_add_pd(fjy2,ty);
678 fjz2 = _mm_add_pd(fjz2,tz);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 if (gmx_mm_any_lt(rsq23,rcutoff2))
689 r23 = _mm_mul_pd(rsq23,rinv23);
691 /* EWALD ELECTROSTATICS */
693 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
694 ewrt = _mm_mul_pd(r23,ewtabscale);
695 ewitab = _mm_cvttpd_epi32(ewrt);
696 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
697 ewitab = _mm_slli_epi32(ewitab,2);
698 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
699 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
700 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
701 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
702 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
703 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
704 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
705 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
706 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
707 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
709 d = _mm_sub_pd(r23,rswitch);
710 d = _mm_max_pd(d,_mm_setzero_pd());
711 d2 = _mm_mul_pd(d,d);
712 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
714 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
716 /* Evaluate switch function */
717 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
718 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
719 velec = _mm_mul_pd(velec,sw);
720 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
722 /* Update potential sum for this i atom from the interaction with this j atom. */
723 velec = _mm_and_pd(velec,cutoff_mask);
724 velecsum = _mm_add_pd(velecsum,velec);
728 fscal = _mm_and_pd(fscal,cutoff_mask);
730 /* Calculate temporary vectorial force */
731 tx = _mm_mul_pd(fscal,dx23);
732 ty = _mm_mul_pd(fscal,dy23);
733 tz = _mm_mul_pd(fscal,dz23);
735 /* Update vectorial force */
736 fix2 = _mm_add_pd(fix2,tx);
737 fiy2 = _mm_add_pd(fiy2,ty);
738 fiz2 = _mm_add_pd(fiz2,tz);
740 fjx3 = _mm_add_pd(fjx3,tx);
741 fjy3 = _mm_add_pd(fjy3,ty);
742 fjz3 = _mm_add_pd(fjz3,tz);
746 /**************************
747 * CALCULATE INTERACTIONS *
748 **************************/
750 if (gmx_mm_any_lt(rsq31,rcutoff2))
753 r31 = _mm_mul_pd(rsq31,rinv31);
755 /* EWALD ELECTROSTATICS */
757 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
758 ewrt = _mm_mul_pd(r31,ewtabscale);
759 ewitab = _mm_cvttpd_epi32(ewrt);
760 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
761 ewitab = _mm_slli_epi32(ewitab,2);
762 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
763 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
764 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
765 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
766 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
767 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
768 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
769 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
770 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
771 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
773 d = _mm_sub_pd(r31,rswitch);
774 d = _mm_max_pd(d,_mm_setzero_pd());
775 d2 = _mm_mul_pd(d,d);
776 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
778 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
780 /* Evaluate switch function */
781 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
782 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
783 velec = _mm_mul_pd(velec,sw);
784 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
786 /* Update potential sum for this i atom from the interaction with this j atom. */
787 velec = _mm_and_pd(velec,cutoff_mask);
788 velecsum = _mm_add_pd(velecsum,velec);
792 fscal = _mm_and_pd(fscal,cutoff_mask);
794 /* Calculate temporary vectorial force */
795 tx = _mm_mul_pd(fscal,dx31);
796 ty = _mm_mul_pd(fscal,dy31);
797 tz = _mm_mul_pd(fscal,dz31);
799 /* Update vectorial force */
800 fix3 = _mm_add_pd(fix3,tx);
801 fiy3 = _mm_add_pd(fiy3,ty);
802 fiz3 = _mm_add_pd(fiz3,tz);
804 fjx1 = _mm_add_pd(fjx1,tx);
805 fjy1 = _mm_add_pd(fjy1,ty);
806 fjz1 = _mm_add_pd(fjz1,tz);
810 /**************************
811 * CALCULATE INTERACTIONS *
812 **************************/
814 if (gmx_mm_any_lt(rsq32,rcutoff2))
817 r32 = _mm_mul_pd(rsq32,rinv32);
819 /* EWALD ELECTROSTATICS */
821 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
822 ewrt = _mm_mul_pd(r32,ewtabscale);
823 ewitab = _mm_cvttpd_epi32(ewrt);
824 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
825 ewitab = _mm_slli_epi32(ewitab,2);
826 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
827 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
828 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
829 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
830 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
831 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
832 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
833 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
834 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
835 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
837 d = _mm_sub_pd(r32,rswitch);
838 d = _mm_max_pd(d,_mm_setzero_pd());
839 d2 = _mm_mul_pd(d,d);
840 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
842 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
844 /* Evaluate switch function */
845 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
846 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
847 velec = _mm_mul_pd(velec,sw);
848 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
850 /* Update potential sum for this i atom from the interaction with this j atom. */
851 velec = _mm_and_pd(velec,cutoff_mask);
852 velecsum = _mm_add_pd(velecsum,velec);
856 fscal = _mm_and_pd(fscal,cutoff_mask);
858 /* Calculate temporary vectorial force */
859 tx = _mm_mul_pd(fscal,dx32);
860 ty = _mm_mul_pd(fscal,dy32);
861 tz = _mm_mul_pd(fscal,dz32);
863 /* Update vectorial force */
864 fix3 = _mm_add_pd(fix3,tx);
865 fiy3 = _mm_add_pd(fiy3,ty);
866 fiz3 = _mm_add_pd(fiz3,tz);
868 fjx2 = _mm_add_pd(fjx2,tx);
869 fjy2 = _mm_add_pd(fjy2,ty);
870 fjz2 = _mm_add_pd(fjz2,tz);
874 /**************************
875 * CALCULATE INTERACTIONS *
876 **************************/
878 if (gmx_mm_any_lt(rsq33,rcutoff2))
881 r33 = _mm_mul_pd(rsq33,rinv33);
883 /* EWALD ELECTROSTATICS */
885 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
886 ewrt = _mm_mul_pd(r33,ewtabscale);
887 ewitab = _mm_cvttpd_epi32(ewrt);
888 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
889 ewitab = _mm_slli_epi32(ewitab,2);
890 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
891 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
892 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
893 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
894 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
895 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
896 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
897 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
898 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
899 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
901 d = _mm_sub_pd(r33,rswitch);
902 d = _mm_max_pd(d,_mm_setzero_pd());
903 d2 = _mm_mul_pd(d,d);
904 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
906 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
908 /* Evaluate switch function */
909 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
910 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
911 velec = _mm_mul_pd(velec,sw);
912 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
914 /* Update potential sum for this i atom from the interaction with this j atom. */
915 velec = _mm_and_pd(velec,cutoff_mask);
916 velecsum = _mm_add_pd(velecsum,velec);
920 fscal = _mm_and_pd(fscal,cutoff_mask);
922 /* Calculate temporary vectorial force */
923 tx = _mm_mul_pd(fscal,dx33);
924 ty = _mm_mul_pd(fscal,dy33);
925 tz = _mm_mul_pd(fscal,dz33);
927 /* Update vectorial force */
928 fix3 = _mm_add_pd(fix3,tx);
929 fiy3 = _mm_add_pd(fiy3,ty);
930 fiz3 = _mm_add_pd(fiz3,tz);
932 fjx3 = _mm_add_pd(fjx3,tx);
933 fjy3 = _mm_add_pd(fjy3,ty);
934 fjz3 = _mm_add_pd(fjz3,tz);
938 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
940 /* Inner loop uses 647 flops */
947 j_coord_offsetA = DIM*jnrA;
949 /* load j atom coordinates */
950 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
951 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
952 &jy2,&jz2,&jx3,&jy3,&jz3);
954 /* Calculate displacement vector */
955 dx00 = _mm_sub_pd(ix0,jx0);
956 dy00 = _mm_sub_pd(iy0,jy0);
957 dz00 = _mm_sub_pd(iz0,jz0);
958 dx11 = _mm_sub_pd(ix1,jx1);
959 dy11 = _mm_sub_pd(iy1,jy1);
960 dz11 = _mm_sub_pd(iz1,jz1);
961 dx12 = _mm_sub_pd(ix1,jx2);
962 dy12 = _mm_sub_pd(iy1,jy2);
963 dz12 = _mm_sub_pd(iz1,jz2);
964 dx13 = _mm_sub_pd(ix1,jx3);
965 dy13 = _mm_sub_pd(iy1,jy3);
966 dz13 = _mm_sub_pd(iz1,jz3);
967 dx21 = _mm_sub_pd(ix2,jx1);
968 dy21 = _mm_sub_pd(iy2,jy1);
969 dz21 = _mm_sub_pd(iz2,jz1);
970 dx22 = _mm_sub_pd(ix2,jx2);
971 dy22 = _mm_sub_pd(iy2,jy2);
972 dz22 = _mm_sub_pd(iz2,jz2);
973 dx23 = _mm_sub_pd(ix2,jx3);
974 dy23 = _mm_sub_pd(iy2,jy3);
975 dz23 = _mm_sub_pd(iz2,jz3);
976 dx31 = _mm_sub_pd(ix3,jx1);
977 dy31 = _mm_sub_pd(iy3,jy1);
978 dz31 = _mm_sub_pd(iz3,jz1);
979 dx32 = _mm_sub_pd(ix3,jx2);
980 dy32 = _mm_sub_pd(iy3,jy2);
981 dz32 = _mm_sub_pd(iz3,jz2);
982 dx33 = _mm_sub_pd(ix3,jx3);
983 dy33 = _mm_sub_pd(iy3,jy3);
984 dz33 = _mm_sub_pd(iz3,jz3);
986 /* Calculate squared distance and things based on it */
987 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
988 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
989 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
990 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
991 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
992 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
993 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
994 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
995 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
996 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
998 rinv00 = gmx_mm_invsqrt_pd(rsq00);
999 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1000 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1001 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1002 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1003 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1004 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1005 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1006 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1007 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1009 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1010 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1011 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1012 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1013 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1014 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1015 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1016 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1017 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1018 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1020 fjx0 = _mm_setzero_pd();
1021 fjy0 = _mm_setzero_pd();
1022 fjz0 = _mm_setzero_pd();
1023 fjx1 = _mm_setzero_pd();
1024 fjy1 = _mm_setzero_pd();
1025 fjz1 = _mm_setzero_pd();
1026 fjx2 = _mm_setzero_pd();
1027 fjy2 = _mm_setzero_pd();
1028 fjz2 = _mm_setzero_pd();
1029 fjx3 = _mm_setzero_pd();
1030 fjy3 = _mm_setzero_pd();
1031 fjz3 = _mm_setzero_pd();
1033 /**************************
1034 * CALCULATE INTERACTIONS *
1035 **************************/
1037 if (gmx_mm_any_lt(rsq00,rcutoff2))
1040 r00 = _mm_mul_pd(rsq00,rinv00);
1042 /* LENNARD-JONES DISPERSION/REPULSION */
1044 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1045 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1046 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1047 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1048 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1050 d = _mm_sub_pd(r00,rswitch);
1051 d = _mm_max_pd(d,_mm_setzero_pd());
1052 d2 = _mm_mul_pd(d,d);
1053 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1055 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1057 /* Evaluate switch function */
1058 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1059 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1060 vvdw = _mm_mul_pd(vvdw,sw);
1061 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1063 /* Update potential sum for this i atom from the interaction with this j atom. */
1064 vvdw = _mm_and_pd(vvdw,cutoff_mask);
1065 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1066 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
1070 fscal = _mm_and_pd(fscal,cutoff_mask);
1072 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1074 /* Calculate temporary vectorial force */
1075 tx = _mm_mul_pd(fscal,dx00);
1076 ty = _mm_mul_pd(fscal,dy00);
1077 tz = _mm_mul_pd(fscal,dz00);
1079 /* Update vectorial force */
1080 fix0 = _mm_add_pd(fix0,tx);
1081 fiy0 = _mm_add_pd(fiy0,ty);
1082 fiz0 = _mm_add_pd(fiz0,tz);
1084 fjx0 = _mm_add_pd(fjx0,tx);
1085 fjy0 = _mm_add_pd(fjy0,ty);
1086 fjz0 = _mm_add_pd(fjz0,tz);
1090 /**************************
1091 * CALCULATE INTERACTIONS *
1092 **************************/
1094 if (gmx_mm_any_lt(rsq11,rcutoff2))
1097 r11 = _mm_mul_pd(rsq11,rinv11);
1099 /* EWALD ELECTROSTATICS */
1101 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1102 ewrt = _mm_mul_pd(r11,ewtabscale);
1103 ewitab = _mm_cvttpd_epi32(ewrt);
1104 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1105 ewitab = _mm_slli_epi32(ewitab,2);
1106 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1107 ewtabD = _mm_setzero_pd();
1108 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1109 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1110 ewtabFn = _mm_setzero_pd();
1111 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1112 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1113 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1114 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1115 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1117 d = _mm_sub_pd(r11,rswitch);
1118 d = _mm_max_pd(d,_mm_setzero_pd());
1119 d2 = _mm_mul_pd(d,d);
1120 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1122 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1124 /* Evaluate switch function */
1125 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1126 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1127 velec = _mm_mul_pd(velec,sw);
1128 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1130 /* Update potential sum for this i atom from the interaction with this j atom. */
1131 velec = _mm_and_pd(velec,cutoff_mask);
1132 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1133 velecsum = _mm_add_pd(velecsum,velec);
1137 fscal = _mm_and_pd(fscal,cutoff_mask);
1139 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1141 /* Calculate temporary vectorial force */
1142 tx = _mm_mul_pd(fscal,dx11);
1143 ty = _mm_mul_pd(fscal,dy11);
1144 tz = _mm_mul_pd(fscal,dz11);
1146 /* Update vectorial force */
1147 fix1 = _mm_add_pd(fix1,tx);
1148 fiy1 = _mm_add_pd(fiy1,ty);
1149 fiz1 = _mm_add_pd(fiz1,tz);
1151 fjx1 = _mm_add_pd(fjx1,tx);
1152 fjy1 = _mm_add_pd(fjy1,ty);
1153 fjz1 = _mm_add_pd(fjz1,tz);
1157 /**************************
1158 * CALCULATE INTERACTIONS *
1159 **************************/
1161 if (gmx_mm_any_lt(rsq12,rcutoff2))
1164 r12 = _mm_mul_pd(rsq12,rinv12);
1166 /* EWALD ELECTROSTATICS */
1168 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1169 ewrt = _mm_mul_pd(r12,ewtabscale);
1170 ewitab = _mm_cvttpd_epi32(ewrt);
1171 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1172 ewitab = _mm_slli_epi32(ewitab,2);
1173 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1174 ewtabD = _mm_setzero_pd();
1175 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1176 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1177 ewtabFn = _mm_setzero_pd();
1178 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1179 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1180 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1181 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1182 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1184 d = _mm_sub_pd(r12,rswitch);
1185 d = _mm_max_pd(d,_mm_setzero_pd());
1186 d2 = _mm_mul_pd(d,d);
1187 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1189 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1191 /* Evaluate switch function */
1192 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1193 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1194 velec = _mm_mul_pd(velec,sw);
1195 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1197 /* Update potential sum for this i atom from the interaction with this j atom. */
1198 velec = _mm_and_pd(velec,cutoff_mask);
1199 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1200 velecsum = _mm_add_pd(velecsum,velec);
1204 fscal = _mm_and_pd(fscal,cutoff_mask);
1206 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1208 /* Calculate temporary vectorial force */
1209 tx = _mm_mul_pd(fscal,dx12);
1210 ty = _mm_mul_pd(fscal,dy12);
1211 tz = _mm_mul_pd(fscal,dz12);
1213 /* Update vectorial force */
1214 fix1 = _mm_add_pd(fix1,tx);
1215 fiy1 = _mm_add_pd(fiy1,ty);
1216 fiz1 = _mm_add_pd(fiz1,tz);
1218 fjx2 = _mm_add_pd(fjx2,tx);
1219 fjy2 = _mm_add_pd(fjy2,ty);
1220 fjz2 = _mm_add_pd(fjz2,tz);
1224 /**************************
1225 * CALCULATE INTERACTIONS *
1226 **************************/
1228 if (gmx_mm_any_lt(rsq13,rcutoff2))
1231 r13 = _mm_mul_pd(rsq13,rinv13);
1233 /* EWALD ELECTROSTATICS */
1235 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1236 ewrt = _mm_mul_pd(r13,ewtabscale);
1237 ewitab = _mm_cvttpd_epi32(ewrt);
1238 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1239 ewitab = _mm_slli_epi32(ewitab,2);
1240 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1241 ewtabD = _mm_setzero_pd();
1242 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1243 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1244 ewtabFn = _mm_setzero_pd();
1245 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1246 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1247 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1248 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1249 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1251 d = _mm_sub_pd(r13,rswitch);
1252 d = _mm_max_pd(d,_mm_setzero_pd());
1253 d2 = _mm_mul_pd(d,d);
1254 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1256 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1258 /* Evaluate switch function */
1259 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1260 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1261 velec = _mm_mul_pd(velec,sw);
1262 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1264 /* Update potential sum for this i atom from the interaction with this j atom. */
1265 velec = _mm_and_pd(velec,cutoff_mask);
1266 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1267 velecsum = _mm_add_pd(velecsum,velec);
1271 fscal = _mm_and_pd(fscal,cutoff_mask);
1273 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1275 /* Calculate temporary vectorial force */
1276 tx = _mm_mul_pd(fscal,dx13);
1277 ty = _mm_mul_pd(fscal,dy13);
1278 tz = _mm_mul_pd(fscal,dz13);
1280 /* Update vectorial force */
1281 fix1 = _mm_add_pd(fix1,tx);
1282 fiy1 = _mm_add_pd(fiy1,ty);
1283 fiz1 = _mm_add_pd(fiz1,tz);
1285 fjx3 = _mm_add_pd(fjx3,tx);
1286 fjy3 = _mm_add_pd(fjy3,ty);
1287 fjz3 = _mm_add_pd(fjz3,tz);
1291 /**************************
1292 * CALCULATE INTERACTIONS *
1293 **************************/
1295 if (gmx_mm_any_lt(rsq21,rcutoff2))
1298 r21 = _mm_mul_pd(rsq21,rinv21);
1300 /* EWALD ELECTROSTATICS */
1302 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1303 ewrt = _mm_mul_pd(r21,ewtabscale);
1304 ewitab = _mm_cvttpd_epi32(ewrt);
1305 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1306 ewitab = _mm_slli_epi32(ewitab,2);
1307 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1308 ewtabD = _mm_setzero_pd();
1309 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1310 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1311 ewtabFn = _mm_setzero_pd();
1312 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1313 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1314 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1315 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1316 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1318 d = _mm_sub_pd(r21,rswitch);
1319 d = _mm_max_pd(d,_mm_setzero_pd());
1320 d2 = _mm_mul_pd(d,d);
1321 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1323 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1325 /* Evaluate switch function */
1326 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1327 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1328 velec = _mm_mul_pd(velec,sw);
1329 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1331 /* Update potential sum for this i atom from the interaction with this j atom. */
1332 velec = _mm_and_pd(velec,cutoff_mask);
1333 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1334 velecsum = _mm_add_pd(velecsum,velec);
1338 fscal = _mm_and_pd(fscal,cutoff_mask);
1340 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1342 /* Calculate temporary vectorial force */
1343 tx = _mm_mul_pd(fscal,dx21);
1344 ty = _mm_mul_pd(fscal,dy21);
1345 tz = _mm_mul_pd(fscal,dz21);
1347 /* Update vectorial force */
1348 fix2 = _mm_add_pd(fix2,tx);
1349 fiy2 = _mm_add_pd(fiy2,ty);
1350 fiz2 = _mm_add_pd(fiz2,tz);
1352 fjx1 = _mm_add_pd(fjx1,tx);
1353 fjy1 = _mm_add_pd(fjy1,ty);
1354 fjz1 = _mm_add_pd(fjz1,tz);
1358 /**************************
1359 * CALCULATE INTERACTIONS *
1360 **************************/
1362 if (gmx_mm_any_lt(rsq22,rcutoff2))
1365 r22 = _mm_mul_pd(rsq22,rinv22);
1367 /* EWALD ELECTROSTATICS */
1369 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1370 ewrt = _mm_mul_pd(r22,ewtabscale);
1371 ewitab = _mm_cvttpd_epi32(ewrt);
1372 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1373 ewitab = _mm_slli_epi32(ewitab,2);
1374 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1375 ewtabD = _mm_setzero_pd();
1376 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1377 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1378 ewtabFn = _mm_setzero_pd();
1379 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1380 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1381 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1382 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1383 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1385 d = _mm_sub_pd(r22,rswitch);
1386 d = _mm_max_pd(d,_mm_setzero_pd());
1387 d2 = _mm_mul_pd(d,d);
1388 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1390 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1392 /* Evaluate switch function */
1393 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1394 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1395 velec = _mm_mul_pd(velec,sw);
1396 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1398 /* Update potential sum for this i atom from the interaction with this j atom. */
1399 velec = _mm_and_pd(velec,cutoff_mask);
1400 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1401 velecsum = _mm_add_pd(velecsum,velec);
1405 fscal = _mm_and_pd(fscal,cutoff_mask);
1407 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1409 /* Calculate temporary vectorial force */
1410 tx = _mm_mul_pd(fscal,dx22);
1411 ty = _mm_mul_pd(fscal,dy22);
1412 tz = _mm_mul_pd(fscal,dz22);
1414 /* Update vectorial force */
1415 fix2 = _mm_add_pd(fix2,tx);
1416 fiy2 = _mm_add_pd(fiy2,ty);
1417 fiz2 = _mm_add_pd(fiz2,tz);
1419 fjx2 = _mm_add_pd(fjx2,tx);
1420 fjy2 = _mm_add_pd(fjy2,ty);
1421 fjz2 = _mm_add_pd(fjz2,tz);
1425 /**************************
1426 * CALCULATE INTERACTIONS *
1427 **************************/
1429 if (gmx_mm_any_lt(rsq23,rcutoff2))
1432 r23 = _mm_mul_pd(rsq23,rinv23);
1434 /* EWALD ELECTROSTATICS */
1436 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1437 ewrt = _mm_mul_pd(r23,ewtabscale);
1438 ewitab = _mm_cvttpd_epi32(ewrt);
1439 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1440 ewitab = _mm_slli_epi32(ewitab,2);
1441 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1442 ewtabD = _mm_setzero_pd();
1443 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1444 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1445 ewtabFn = _mm_setzero_pd();
1446 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1447 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1448 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1449 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1450 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1452 d = _mm_sub_pd(r23,rswitch);
1453 d = _mm_max_pd(d,_mm_setzero_pd());
1454 d2 = _mm_mul_pd(d,d);
1455 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1457 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1459 /* Evaluate switch function */
1460 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1461 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
1462 velec = _mm_mul_pd(velec,sw);
1463 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1465 /* Update potential sum for this i atom from the interaction with this j atom. */
1466 velec = _mm_and_pd(velec,cutoff_mask);
1467 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1468 velecsum = _mm_add_pd(velecsum,velec);
1472 fscal = _mm_and_pd(fscal,cutoff_mask);
1474 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1476 /* Calculate temporary vectorial force */
1477 tx = _mm_mul_pd(fscal,dx23);
1478 ty = _mm_mul_pd(fscal,dy23);
1479 tz = _mm_mul_pd(fscal,dz23);
1481 /* Update vectorial force */
1482 fix2 = _mm_add_pd(fix2,tx);
1483 fiy2 = _mm_add_pd(fiy2,ty);
1484 fiz2 = _mm_add_pd(fiz2,tz);
1486 fjx3 = _mm_add_pd(fjx3,tx);
1487 fjy3 = _mm_add_pd(fjy3,ty);
1488 fjz3 = _mm_add_pd(fjz3,tz);
1492 /**************************
1493 * CALCULATE INTERACTIONS *
1494 **************************/
1496 if (gmx_mm_any_lt(rsq31,rcutoff2))
1499 r31 = _mm_mul_pd(rsq31,rinv31);
1501 /* EWALD ELECTROSTATICS */
1503 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1504 ewrt = _mm_mul_pd(r31,ewtabscale);
1505 ewitab = _mm_cvttpd_epi32(ewrt);
1506 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1507 ewitab = _mm_slli_epi32(ewitab,2);
1508 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1509 ewtabD = _mm_setzero_pd();
1510 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1511 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1512 ewtabFn = _mm_setzero_pd();
1513 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1514 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1515 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1516 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1517 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1519 d = _mm_sub_pd(r31,rswitch);
1520 d = _mm_max_pd(d,_mm_setzero_pd());
1521 d2 = _mm_mul_pd(d,d);
1522 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1524 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1526 /* Evaluate switch function */
1527 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1528 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
1529 velec = _mm_mul_pd(velec,sw);
1530 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1532 /* Update potential sum for this i atom from the interaction with this j atom. */
1533 velec = _mm_and_pd(velec,cutoff_mask);
1534 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1535 velecsum = _mm_add_pd(velecsum,velec);
1539 fscal = _mm_and_pd(fscal,cutoff_mask);
1541 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1543 /* Calculate temporary vectorial force */
1544 tx = _mm_mul_pd(fscal,dx31);
1545 ty = _mm_mul_pd(fscal,dy31);
1546 tz = _mm_mul_pd(fscal,dz31);
1548 /* Update vectorial force */
1549 fix3 = _mm_add_pd(fix3,tx);
1550 fiy3 = _mm_add_pd(fiy3,ty);
1551 fiz3 = _mm_add_pd(fiz3,tz);
1553 fjx1 = _mm_add_pd(fjx1,tx);
1554 fjy1 = _mm_add_pd(fjy1,ty);
1555 fjz1 = _mm_add_pd(fjz1,tz);
1559 /**************************
1560 * CALCULATE INTERACTIONS *
1561 **************************/
1563 if (gmx_mm_any_lt(rsq32,rcutoff2))
1566 r32 = _mm_mul_pd(rsq32,rinv32);
1568 /* EWALD ELECTROSTATICS */
1570 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1571 ewrt = _mm_mul_pd(r32,ewtabscale);
1572 ewitab = _mm_cvttpd_epi32(ewrt);
1573 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1574 ewitab = _mm_slli_epi32(ewitab,2);
1575 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1576 ewtabD = _mm_setzero_pd();
1577 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1578 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1579 ewtabFn = _mm_setzero_pd();
1580 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1581 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1582 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1583 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1584 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1586 d = _mm_sub_pd(r32,rswitch);
1587 d = _mm_max_pd(d,_mm_setzero_pd());
1588 d2 = _mm_mul_pd(d,d);
1589 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1591 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1593 /* Evaluate switch function */
1594 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1595 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
1596 velec = _mm_mul_pd(velec,sw);
1597 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1599 /* Update potential sum for this i atom from the interaction with this j atom. */
1600 velec = _mm_and_pd(velec,cutoff_mask);
1601 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1602 velecsum = _mm_add_pd(velecsum,velec);
1606 fscal = _mm_and_pd(fscal,cutoff_mask);
1608 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1610 /* Calculate temporary vectorial force */
1611 tx = _mm_mul_pd(fscal,dx32);
1612 ty = _mm_mul_pd(fscal,dy32);
1613 tz = _mm_mul_pd(fscal,dz32);
1615 /* Update vectorial force */
1616 fix3 = _mm_add_pd(fix3,tx);
1617 fiy3 = _mm_add_pd(fiy3,ty);
1618 fiz3 = _mm_add_pd(fiz3,tz);
1620 fjx2 = _mm_add_pd(fjx2,tx);
1621 fjy2 = _mm_add_pd(fjy2,ty);
1622 fjz2 = _mm_add_pd(fjz2,tz);
1626 /**************************
1627 * CALCULATE INTERACTIONS *
1628 **************************/
1630 if (gmx_mm_any_lt(rsq33,rcutoff2))
1633 r33 = _mm_mul_pd(rsq33,rinv33);
1635 /* EWALD ELECTROSTATICS */
1637 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1638 ewrt = _mm_mul_pd(r33,ewtabscale);
1639 ewitab = _mm_cvttpd_epi32(ewrt);
1640 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1641 ewitab = _mm_slli_epi32(ewitab,2);
1642 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1643 ewtabD = _mm_setzero_pd();
1644 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1645 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1646 ewtabFn = _mm_setzero_pd();
1647 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1648 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1649 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1650 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1651 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1653 d = _mm_sub_pd(r33,rswitch);
1654 d = _mm_max_pd(d,_mm_setzero_pd());
1655 d2 = _mm_mul_pd(d,d);
1656 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1658 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1660 /* Evaluate switch function */
1661 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1662 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
1663 velec = _mm_mul_pd(velec,sw);
1664 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1666 /* Update potential sum for this i atom from the interaction with this j atom. */
1667 velec = _mm_and_pd(velec,cutoff_mask);
1668 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1669 velecsum = _mm_add_pd(velecsum,velec);
1673 fscal = _mm_and_pd(fscal,cutoff_mask);
1675 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1677 /* Calculate temporary vectorial force */
1678 tx = _mm_mul_pd(fscal,dx33);
1679 ty = _mm_mul_pd(fscal,dy33);
1680 tz = _mm_mul_pd(fscal,dz33);
1682 /* Update vectorial force */
1683 fix3 = _mm_add_pd(fix3,tx);
1684 fiy3 = _mm_add_pd(fiy3,ty);
1685 fiz3 = _mm_add_pd(fiz3,tz);
1687 fjx3 = _mm_add_pd(fjx3,tx);
1688 fjy3 = _mm_add_pd(fjy3,ty);
1689 fjz3 = _mm_add_pd(fjz3,tz);
1693 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1695 /* Inner loop uses 647 flops */
1698 /* End of innermost loop */
1700 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1701 f+i_coord_offset,fshift+i_shift_offset);
1704 /* Update potential energies */
1705 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1706 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1708 /* Increment number of inner iterations */
1709 inneriter += j_index_end - j_index_start;
1711 /* Outer loop uses 26 flops */
1714 /* Increment number of outer iterations */
1717 /* Update outer/inner flops */
1719 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*647);
1722 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double
1723 * Electrostatics interaction: Ewald
1724 * VdW interaction: LennardJones
1725 * Geometry: Water4-Water4
1726 * Calculate force/pot: Force
1729 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double
1730 (t_nblist * gmx_restrict nlist,
1731 rvec * gmx_restrict xx,
1732 rvec * gmx_restrict ff,
1733 t_forcerec * gmx_restrict fr,
1734 t_mdatoms * gmx_restrict mdatoms,
1735 nb_kernel_data_t * gmx_restrict kernel_data,
1736 t_nrnb * gmx_restrict nrnb)
1738 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1739 * just 0 for non-waters.
1740 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1741 * jnr indices corresponding to data put in the four positions in the SIMD register.
1743 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1744 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1746 int j_coord_offsetA,j_coord_offsetB;
1747 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1748 real rcutoff_scalar;
1749 real *shiftvec,*fshift,*x,*f;
1750 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1752 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1754 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1756 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1758 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1759 int vdwjidx0A,vdwjidx0B;
1760 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1761 int vdwjidx1A,vdwjidx1B;
1762 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1763 int vdwjidx2A,vdwjidx2B;
1764 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1765 int vdwjidx3A,vdwjidx3B;
1766 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1767 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1768 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1769 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1770 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1771 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1772 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1773 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1774 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1775 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1776 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1777 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1780 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1783 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1784 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1786 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1788 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1789 real rswitch_scalar,d_scalar;
1790 __m128d dummy_mask,cutoff_mask;
1791 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1792 __m128d one = _mm_set1_pd(1.0);
1793 __m128d two = _mm_set1_pd(2.0);
1799 jindex = nlist->jindex;
1801 shiftidx = nlist->shift;
1803 shiftvec = fr->shift_vec[0];
1804 fshift = fr->fshift[0];
1805 facel = _mm_set1_pd(fr->epsfac);
1806 charge = mdatoms->chargeA;
1807 nvdwtype = fr->ntype;
1808 vdwparam = fr->nbfp;
1809 vdwtype = mdatoms->typeA;
1811 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1812 ewtab = fr->ic->tabq_coul_FDV0;
1813 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1814 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1816 /* Setup water-specific parameters */
1817 inr = nlist->iinr[0];
1818 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1819 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1820 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1821 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1823 jq1 = _mm_set1_pd(charge[inr+1]);
1824 jq2 = _mm_set1_pd(charge[inr+2]);
1825 jq3 = _mm_set1_pd(charge[inr+3]);
1826 vdwjidx0A = 2*vdwtype[inr+0];
1827 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1828 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1829 qq11 = _mm_mul_pd(iq1,jq1);
1830 qq12 = _mm_mul_pd(iq1,jq2);
1831 qq13 = _mm_mul_pd(iq1,jq3);
1832 qq21 = _mm_mul_pd(iq2,jq1);
1833 qq22 = _mm_mul_pd(iq2,jq2);
1834 qq23 = _mm_mul_pd(iq2,jq3);
1835 qq31 = _mm_mul_pd(iq3,jq1);
1836 qq32 = _mm_mul_pd(iq3,jq2);
1837 qq33 = _mm_mul_pd(iq3,jq3);
1839 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1840 rcutoff_scalar = fr->rcoulomb;
1841 rcutoff = _mm_set1_pd(rcutoff_scalar);
1842 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1844 rswitch_scalar = fr->rcoulomb_switch;
1845 rswitch = _mm_set1_pd(rswitch_scalar);
1846 /* Setup switch parameters */
1847 d_scalar = rcutoff_scalar-rswitch_scalar;
1848 d = _mm_set1_pd(d_scalar);
1849 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1850 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1851 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1852 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1853 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1854 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1856 /* Avoid stupid compiler warnings */
1858 j_coord_offsetA = 0;
1859 j_coord_offsetB = 0;
1864 /* Start outer loop over neighborlists */
1865 for(iidx=0; iidx<nri; iidx++)
1867 /* Load shift vector for this list */
1868 i_shift_offset = DIM*shiftidx[iidx];
1870 /* Load limits for loop over neighbors */
1871 j_index_start = jindex[iidx];
1872 j_index_end = jindex[iidx+1];
1874 /* Get outer coordinate index */
1876 i_coord_offset = DIM*inr;
1878 /* Load i particle coords and add shift vector */
1879 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1880 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1882 fix0 = _mm_setzero_pd();
1883 fiy0 = _mm_setzero_pd();
1884 fiz0 = _mm_setzero_pd();
1885 fix1 = _mm_setzero_pd();
1886 fiy1 = _mm_setzero_pd();
1887 fiz1 = _mm_setzero_pd();
1888 fix2 = _mm_setzero_pd();
1889 fiy2 = _mm_setzero_pd();
1890 fiz2 = _mm_setzero_pd();
1891 fix3 = _mm_setzero_pd();
1892 fiy3 = _mm_setzero_pd();
1893 fiz3 = _mm_setzero_pd();
1895 /* Start inner kernel loop */
1896 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1899 /* Get j neighbor index, and coordinate index */
1901 jnrB = jjnr[jidx+1];
1902 j_coord_offsetA = DIM*jnrA;
1903 j_coord_offsetB = DIM*jnrB;
1905 /* load j atom coordinates */
1906 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1907 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1908 &jy2,&jz2,&jx3,&jy3,&jz3);
1910 /* Calculate displacement vector */
1911 dx00 = _mm_sub_pd(ix0,jx0);
1912 dy00 = _mm_sub_pd(iy0,jy0);
1913 dz00 = _mm_sub_pd(iz0,jz0);
1914 dx11 = _mm_sub_pd(ix1,jx1);
1915 dy11 = _mm_sub_pd(iy1,jy1);
1916 dz11 = _mm_sub_pd(iz1,jz1);
1917 dx12 = _mm_sub_pd(ix1,jx2);
1918 dy12 = _mm_sub_pd(iy1,jy2);
1919 dz12 = _mm_sub_pd(iz1,jz2);
1920 dx13 = _mm_sub_pd(ix1,jx3);
1921 dy13 = _mm_sub_pd(iy1,jy3);
1922 dz13 = _mm_sub_pd(iz1,jz3);
1923 dx21 = _mm_sub_pd(ix2,jx1);
1924 dy21 = _mm_sub_pd(iy2,jy1);
1925 dz21 = _mm_sub_pd(iz2,jz1);
1926 dx22 = _mm_sub_pd(ix2,jx2);
1927 dy22 = _mm_sub_pd(iy2,jy2);
1928 dz22 = _mm_sub_pd(iz2,jz2);
1929 dx23 = _mm_sub_pd(ix2,jx3);
1930 dy23 = _mm_sub_pd(iy2,jy3);
1931 dz23 = _mm_sub_pd(iz2,jz3);
1932 dx31 = _mm_sub_pd(ix3,jx1);
1933 dy31 = _mm_sub_pd(iy3,jy1);
1934 dz31 = _mm_sub_pd(iz3,jz1);
1935 dx32 = _mm_sub_pd(ix3,jx2);
1936 dy32 = _mm_sub_pd(iy3,jy2);
1937 dz32 = _mm_sub_pd(iz3,jz2);
1938 dx33 = _mm_sub_pd(ix3,jx3);
1939 dy33 = _mm_sub_pd(iy3,jy3);
1940 dz33 = _mm_sub_pd(iz3,jz3);
1942 /* Calculate squared distance and things based on it */
1943 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1944 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1945 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1946 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1947 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1948 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1949 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1950 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1951 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1952 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1954 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1955 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1956 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1957 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1958 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1959 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1960 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1961 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1962 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1963 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1965 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1966 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1967 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1968 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1969 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1970 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1971 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1972 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1973 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1974 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1976 fjx0 = _mm_setzero_pd();
1977 fjy0 = _mm_setzero_pd();
1978 fjz0 = _mm_setzero_pd();
1979 fjx1 = _mm_setzero_pd();
1980 fjy1 = _mm_setzero_pd();
1981 fjz1 = _mm_setzero_pd();
1982 fjx2 = _mm_setzero_pd();
1983 fjy2 = _mm_setzero_pd();
1984 fjz2 = _mm_setzero_pd();
1985 fjx3 = _mm_setzero_pd();
1986 fjy3 = _mm_setzero_pd();
1987 fjz3 = _mm_setzero_pd();
1989 /**************************
1990 * CALCULATE INTERACTIONS *
1991 **************************/
1993 if (gmx_mm_any_lt(rsq00,rcutoff2))
1996 r00 = _mm_mul_pd(rsq00,rinv00);
1998 /* LENNARD-JONES DISPERSION/REPULSION */
2000 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2001 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2002 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2003 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2004 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2006 d = _mm_sub_pd(r00,rswitch);
2007 d = _mm_max_pd(d,_mm_setzero_pd());
2008 d2 = _mm_mul_pd(d,d);
2009 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2011 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2013 /* Evaluate switch function */
2014 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2015 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2016 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2020 fscal = _mm_and_pd(fscal,cutoff_mask);
2022 /* Calculate temporary vectorial force */
2023 tx = _mm_mul_pd(fscal,dx00);
2024 ty = _mm_mul_pd(fscal,dy00);
2025 tz = _mm_mul_pd(fscal,dz00);
2027 /* Update vectorial force */
2028 fix0 = _mm_add_pd(fix0,tx);
2029 fiy0 = _mm_add_pd(fiy0,ty);
2030 fiz0 = _mm_add_pd(fiz0,tz);
2032 fjx0 = _mm_add_pd(fjx0,tx);
2033 fjy0 = _mm_add_pd(fjy0,ty);
2034 fjz0 = _mm_add_pd(fjz0,tz);
2038 /**************************
2039 * CALCULATE INTERACTIONS *
2040 **************************/
2042 if (gmx_mm_any_lt(rsq11,rcutoff2))
2045 r11 = _mm_mul_pd(rsq11,rinv11);
2047 /* EWALD ELECTROSTATICS */
2049 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2050 ewrt = _mm_mul_pd(r11,ewtabscale);
2051 ewitab = _mm_cvttpd_epi32(ewrt);
2052 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2053 ewitab = _mm_slli_epi32(ewitab,2);
2054 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2055 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2056 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2057 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2058 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2059 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2060 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2061 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2062 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2063 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2065 d = _mm_sub_pd(r11,rswitch);
2066 d = _mm_max_pd(d,_mm_setzero_pd());
2067 d2 = _mm_mul_pd(d,d);
2068 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2070 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2072 /* Evaluate switch function */
2073 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2074 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2075 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2079 fscal = _mm_and_pd(fscal,cutoff_mask);
2081 /* Calculate temporary vectorial force */
2082 tx = _mm_mul_pd(fscal,dx11);
2083 ty = _mm_mul_pd(fscal,dy11);
2084 tz = _mm_mul_pd(fscal,dz11);
2086 /* Update vectorial force */
2087 fix1 = _mm_add_pd(fix1,tx);
2088 fiy1 = _mm_add_pd(fiy1,ty);
2089 fiz1 = _mm_add_pd(fiz1,tz);
2091 fjx1 = _mm_add_pd(fjx1,tx);
2092 fjy1 = _mm_add_pd(fjy1,ty);
2093 fjz1 = _mm_add_pd(fjz1,tz);
2097 /**************************
2098 * CALCULATE INTERACTIONS *
2099 **************************/
2101 if (gmx_mm_any_lt(rsq12,rcutoff2))
2104 r12 = _mm_mul_pd(rsq12,rinv12);
2106 /* EWALD ELECTROSTATICS */
2108 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2109 ewrt = _mm_mul_pd(r12,ewtabscale);
2110 ewitab = _mm_cvttpd_epi32(ewrt);
2111 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2112 ewitab = _mm_slli_epi32(ewitab,2);
2113 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2114 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2115 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2116 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2117 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2118 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2119 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2120 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2121 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2122 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2124 d = _mm_sub_pd(r12,rswitch);
2125 d = _mm_max_pd(d,_mm_setzero_pd());
2126 d2 = _mm_mul_pd(d,d);
2127 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2129 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2131 /* Evaluate switch function */
2132 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2133 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2134 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2138 fscal = _mm_and_pd(fscal,cutoff_mask);
2140 /* Calculate temporary vectorial force */
2141 tx = _mm_mul_pd(fscal,dx12);
2142 ty = _mm_mul_pd(fscal,dy12);
2143 tz = _mm_mul_pd(fscal,dz12);
2145 /* Update vectorial force */
2146 fix1 = _mm_add_pd(fix1,tx);
2147 fiy1 = _mm_add_pd(fiy1,ty);
2148 fiz1 = _mm_add_pd(fiz1,tz);
2150 fjx2 = _mm_add_pd(fjx2,tx);
2151 fjy2 = _mm_add_pd(fjy2,ty);
2152 fjz2 = _mm_add_pd(fjz2,tz);
2156 /**************************
2157 * CALCULATE INTERACTIONS *
2158 **************************/
2160 if (gmx_mm_any_lt(rsq13,rcutoff2))
2163 r13 = _mm_mul_pd(rsq13,rinv13);
2165 /* EWALD ELECTROSTATICS */
2167 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2168 ewrt = _mm_mul_pd(r13,ewtabscale);
2169 ewitab = _mm_cvttpd_epi32(ewrt);
2170 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2171 ewitab = _mm_slli_epi32(ewitab,2);
2172 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2173 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2174 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2175 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2176 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2177 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2178 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2179 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2180 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2181 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2183 d = _mm_sub_pd(r13,rswitch);
2184 d = _mm_max_pd(d,_mm_setzero_pd());
2185 d2 = _mm_mul_pd(d,d);
2186 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2188 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2190 /* Evaluate switch function */
2191 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2192 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2193 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2197 fscal = _mm_and_pd(fscal,cutoff_mask);
2199 /* Calculate temporary vectorial force */
2200 tx = _mm_mul_pd(fscal,dx13);
2201 ty = _mm_mul_pd(fscal,dy13);
2202 tz = _mm_mul_pd(fscal,dz13);
2204 /* Update vectorial force */
2205 fix1 = _mm_add_pd(fix1,tx);
2206 fiy1 = _mm_add_pd(fiy1,ty);
2207 fiz1 = _mm_add_pd(fiz1,tz);
2209 fjx3 = _mm_add_pd(fjx3,tx);
2210 fjy3 = _mm_add_pd(fjy3,ty);
2211 fjz3 = _mm_add_pd(fjz3,tz);
2215 /**************************
2216 * CALCULATE INTERACTIONS *
2217 **************************/
2219 if (gmx_mm_any_lt(rsq21,rcutoff2))
2222 r21 = _mm_mul_pd(rsq21,rinv21);
2224 /* EWALD ELECTROSTATICS */
2226 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2227 ewrt = _mm_mul_pd(r21,ewtabscale);
2228 ewitab = _mm_cvttpd_epi32(ewrt);
2229 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2230 ewitab = _mm_slli_epi32(ewitab,2);
2231 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2232 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2233 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2234 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2235 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2236 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2237 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2238 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2239 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2240 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2242 d = _mm_sub_pd(r21,rswitch);
2243 d = _mm_max_pd(d,_mm_setzero_pd());
2244 d2 = _mm_mul_pd(d,d);
2245 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2247 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2249 /* Evaluate switch function */
2250 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2251 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2252 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2256 fscal = _mm_and_pd(fscal,cutoff_mask);
2258 /* Calculate temporary vectorial force */
2259 tx = _mm_mul_pd(fscal,dx21);
2260 ty = _mm_mul_pd(fscal,dy21);
2261 tz = _mm_mul_pd(fscal,dz21);
2263 /* Update vectorial force */
2264 fix2 = _mm_add_pd(fix2,tx);
2265 fiy2 = _mm_add_pd(fiy2,ty);
2266 fiz2 = _mm_add_pd(fiz2,tz);
2268 fjx1 = _mm_add_pd(fjx1,tx);
2269 fjy1 = _mm_add_pd(fjy1,ty);
2270 fjz1 = _mm_add_pd(fjz1,tz);
2274 /**************************
2275 * CALCULATE INTERACTIONS *
2276 **************************/
2278 if (gmx_mm_any_lt(rsq22,rcutoff2))
2281 r22 = _mm_mul_pd(rsq22,rinv22);
2283 /* EWALD ELECTROSTATICS */
2285 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2286 ewrt = _mm_mul_pd(r22,ewtabscale);
2287 ewitab = _mm_cvttpd_epi32(ewrt);
2288 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2289 ewitab = _mm_slli_epi32(ewitab,2);
2290 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2291 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2292 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2293 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2294 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2295 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2296 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2297 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2298 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2299 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2301 d = _mm_sub_pd(r22,rswitch);
2302 d = _mm_max_pd(d,_mm_setzero_pd());
2303 d2 = _mm_mul_pd(d,d);
2304 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2306 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2308 /* Evaluate switch function */
2309 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2310 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2311 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2315 fscal = _mm_and_pd(fscal,cutoff_mask);
2317 /* Calculate temporary vectorial force */
2318 tx = _mm_mul_pd(fscal,dx22);
2319 ty = _mm_mul_pd(fscal,dy22);
2320 tz = _mm_mul_pd(fscal,dz22);
2322 /* Update vectorial force */
2323 fix2 = _mm_add_pd(fix2,tx);
2324 fiy2 = _mm_add_pd(fiy2,ty);
2325 fiz2 = _mm_add_pd(fiz2,tz);
2327 fjx2 = _mm_add_pd(fjx2,tx);
2328 fjy2 = _mm_add_pd(fjy2,ty);
2329 fjz2 = _mm_add_pd(fjz2,tz);
2333 /**************************
2334 * CALCULATE INTERACTIONS *
2335 **************************/
2337 if (gmx_mm_any_lt(rsq23,rcutoff2))
2340 r23 = _mm_mul_pd(rsq23,rinv23);
2342 /* EWALD ELECTROSTATICS */
2344 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2345 ewrt = _mm_mul_pd(r23,ewtabscale);
2346 ewitab = _mm_cvttpd_epi32(ewrt);
2347 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2348 ewitab = _mm_slli_epi32(ewitab,2);
2349 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2350 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2351 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2352 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2353 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2354 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2355 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2356 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2357 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2358 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2360 d = _mm_sub_pd(r23,rswitch);
2361 d = _mm_max_pd(d,_mm_setzero_pd());
2362 d2 = _mm_mul_pd(d,d);
2363 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2365 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2367 /* Evaluate switch function */
2368 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2369 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2370 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2374 fscal = _mm_and_pd(fscal,cutoff_mask);
2376 /* Calculate temporary vectorial force */
2377 tx = _mm_mul_pd(fscal,dx23);
2378 ty = _mm_mul_pd(fscal,dy23);
2379 tz = _mm_mul_pd(fscal,dz23);
2381 /* Update vectorial force */
2382 fix2 = _mm_add_pd(fix2,tx);
2383 fiy2 = _mm_add_pd(fiy2,ty);
2384 fiz2 = _mm_add_pd(fiz2,tz);
2386 fjx3 = _mm_add_pd(fjx3,tx);
2387 fjy3 = _mm_add_pd(fjy3,ty);
2388 fjz3 = _mm_add_pd(fjz3,tz);
2392 /**************************
2393 * CALCULATE INTERACTIONS *
2394 **************************/
2396 if (gmx_mm_any_lt(rsq31,rcutoff2))
2399 r31 = _mm_mul_pd(rsq31,rinv31);
2401 /* EWALD ELECTROSTATICS */
2403 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2404 ewrt = _mm_mul_pd(r31,ewtabscale);
2405 ewitab = _mm_cvttpd_epi32(ewrt);
2406 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2407 ewitab = _mm_slli_epi32(ewitab,2);
2408 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2409 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2410 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2411 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2412 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2413 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2414 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2415 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2416 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2417 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2419 d = _mm_sub_pd(r31,rswitch);
2420 d = _mm_max_pd(d,_mm_setzero_pd());
2421 d2 = _mm_mul_pd(d,d);
2422 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2424 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2426 /* Evaluate switch function */
2427 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2428 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2429 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2433 fscal = _mm_and_pd(fscal,cutoff_mask);
2435 /* Calculate temporary vectorial force */
2436 tx = _mm_mul_pd(fscal,dx31);
2437 ty = _mm_mul_pd(fscal,dy31);
2438 tz = _mm_mul_pd(fscal,dz31);
2440 /* Update vectorial force */
2441 fix3 = _mm_add_pd(fix3,tx);
2442 fiy3 = _mm_add_pd(fiy3,ty);
2443 fiz3 = _mm_add_pd(fiz3,tz);
2445 fjx1 = _mm_add_pd(fjx1,tx);
2446 fjy1 = _mm_add_pd(fjy1,ty);
2447 fjz1 = _mm_add_pd(fjz1,tz);
2451 /**************************
2452 * CALCULATE INTERACTIONS *
2453 **************************/
2455 if (gmx_mm_any_lt(rsq32,rcutoff2))
2458 r32 = _mm_mul_pd(rsq32,rinv32);
2460 /* EWALD ELECTROSTATICS */
2462 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2463 ewrt = _mm_mul_pd(r32,ewtabscale);
2464 ewitab = _mm_cvttpd_epi32(ewrt);
2465 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2466 ewitab = _mm_slli_epi32(ewitab,2);
2467 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2468 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2469 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2470 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2471 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2472 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2473 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2474 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2475 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2476 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2478 d = _mm_sub_pd(r32,rswitch);
2479 d = _mm_max_pd(d,_mm_setzero_pd());
2480 d2 = _mm_mul_pd(d,d);
2481 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2483 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2485 /* Evaluate switch function */
2486 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2487 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2488 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2492 fscal = _mm_and_pd(fscal,cutoff_mask);
2494 /* Calculate temporary vectorial force */
2495 tx = _mm_mul_pd(fscal,dx32);
2496 ty = _mm_mul_pd(fscal,dy32);
2497 tz = _mm_mul_pd(fscal,dz32);
2499 /* Update vectorial force */
2500 fix3 = _mm_add_pd(fix3,tx);
2501 fiy3 = _mm_add_pd(fiy3,ty);
2502 fiz3 = _mm_add_pd(fiz3,tz);
2504 fjx2 = _mm_add_pd(fjx2,tx);
2505 fjy2 = _mm_add_pd(fjy2,ty);
2506 fjz2 = _mm_add_pd(fjz2,tz);
2510 /**************************
2511 * CALCULATE INTERACTIONS *
2512 **************************/
2514 if (gmx_mm_any_lt(rsq33,rcutoff2))
2517 r33 = _mm_mul_pd(rsq33,rinv33);
2519 /* EWALD ELECTROSTATICS */
2521 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2522 ewrt = _mm_mul_pd(r33,ewtabscale);
2523 ewitab = _mm_cvttpd_epi32(ewrt);
2524 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2525 ewitab = _mm_slli_epi32(ewitab,2);
2526 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2527 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2528 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2529 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2530 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2531 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2532 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2533 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2534 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2535 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2537 d = _mm_sub_pd(r33,rswitch);
2538 d = _mm_max_pd(d,_mm_setzero_pd());
2539 d2 = _mm_mul_pd(d,d);
2540 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2542 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2544 /* Evaluate switch function */
2545 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2546 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2547 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2551 fscal = _mm_and_pd(fscal,cutoff_mask);
2553 /* Calculate temporary vectorial force */
2554 tx = _mm_mul_pd(fscal,dx33);
2555 ty = _mm_mul_pd(fscal,dy33);
2556 tz = _mm_mul_pd(fscal,dz33);
2558 /* Update vectorial force */
2559 fix3 = _mm_add_pd(fix3,tx);
2560 fiy3 = _mm_add_pd(fiy3,ty);
2561 fiz3 = _mm_add_pd(fiz3,tz);
2563 fjx3 = _mm_add_pd(fjx3,tx);
2564 fjy3 = _mm_add_pd(fjy3,ty);
2565 fjz3 = _mm_add_pd(fjz3,tz);
2569 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2571 /* Inner loop uses 617 flops */
2574 if(jidx<j_index_end)
2578 j_coord_offsetA = DIM*jnrA;
2580 /* load j atom coordinates */
2581 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2582 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2583 &jy2,&jz2,&jx3,&jy3,&jz3);
2585 /* Calculate displacement vector */
2586 dx00 = _mm_sub_pd(ix0,jx0);
2587 dy00 = _mm_sub_pd(iy0,jy0);
2588 dz00 = _mm_sub_pd(iz0,jz0);
2589 dx11 = _mm_sub_pd(ix1,jx1);
2590 dy11 = _mm_sub_pd(iy1,jy1);
2591 dz11 = _mm_sub_pd(iz1,jz1);
2592 dx12 = _mm_sub_pd(ix1,jx2);
2593 dy12 = _mm_sub_pd(iy1,jy2);
2594 dz12 = _mm_sub_pd(iz1,jz2);
2595 dx13 = _mm_sub_pd(ix1,jx3);
2596 dy13 = _mm_sub_pd(iy1,jy3);
2597 dz13 = _mm_sub_pd(iz1,jz3);
2598 dx21 = _mm_sub_pd(ix2,jx1);
2599 dy21 = _mm_sub_pd(iy2,jy1);
2600 dz21 = _mm_sub_pd(iz2,jz1);
2601 dx22 = _mm_sub_pd(ix2,jx2);
2602 dy22 = _mm_sub_pd(iy2,jy2);
2603 dz22 = _mm_sub_pd(iz2,jz2);
2604 dx23 = _mm_sub_pd(ix2,jx3);
2605 dy23 = _mm_sub_pd(iy2,jy3);
2606 dz23 = _mm_sub_pd(iz2,jz3);
2607 dx31 = _mm_sub_pd(ix3,jx1);
2608 dy31 = _mm_sub_pd(iy3,jy1);
2609 dz31 = _mm_sub_pd(iz3,jz1);
2610 dx32 = _mm_sub_pd(ix3,jx2);
2611 dy32 = _mm_sub_pd(iy3,jy2);
2612 dz32 = _mm_sub_pd(iz3,jz2);
2613 dx33 = _mm_sub_pd(ix3,jx3);
2614 dy33 = _mm_sub_pd(iy3,jy3);
2615 dz33 = _mm_sub_pd(iz3,jz3);
2617 /* Calculate squared distance and things based on it */
2618 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2619 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2620 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2621 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2622 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2623 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2624 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2625 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2626 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2627 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2629 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2630 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2631 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2632 rinv13 = gmx_mm_invsqrt_pd(rsq13);
2633 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2634 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2635 rinv23 = gmx_mm_invsqrt_pd(rsq23);
2636 rinv31 = gmx_mm_invsqrt_pd(rsq31);
2637 rinv32 = gmx_mm_invsqrt_pd(rsq32);
2638 rinv33 = gmx_mm_invsqrt_pd(rsq33);
2640 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2641 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2642 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2643 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
2644 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2645 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2646 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
2647 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
2648 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
2649 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
2651 fjx0 = _mm_setzero_pd();
2652 fjy0 = _mm_setzero_pd();
2653 fjz0 = _mm_setzero_pd();
2654 fjx1 = _mm_setzero_pd();
2655 fjy1 = _mm_setzero_pd();
2656 fjz1 = _mm_setzero_pd();
2657 fjx2 = _mm_setzero_pd();
2658 fjy2 = _mm_setzero_pd();
2659 fjz2 = _mm_setzero_pd();
2660 fjx3 = _mm_setzero_pd();
2661 fjy3 = _mm_setzero_pd();
2662 fjz3 = _mm_setzero_pd();
2664 /**************************
2665 * CALCULATE INTERACTIONS *
2666 **************************/
2668 if (gmx_mm_any_lt(rsq00,rcutoff2))
2671 r00 = _mm_mul_pd(rsq00,rinv00);
2673 /* LENNARD-JONES DISPERSION/REPULSION */
2675 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2676 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2677 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2678 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2679 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2681 d = _mm_sub_pd(r00,rswitch);
2682 d = _mm_max_pd(d,_mm_setzero_pd());
2683 d2 = _mm_mul_pd(d,d);
2684 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2686 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2688 /* Evaluate switch function */
2689 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2690 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2691 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2695 fscal = _mm_and_pd(fscal,cutoff_mask);
2697 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2699 /* Calculate temporary vectorial force */
2700 tx = _mm_mul_pd(fscal,dx00);
2701 ty = _mm_mul_pd(fscal,dy00);
2702 tz = _mm_mul_pd(fscal,dz00);
2704 /* Update vectorial force */
2705 fix0 = _mm_add_pd(fix0,tx);
2706 fiy0 = _mm_add_pd(fiy0,ty);
2707 fiz0 = _mm_add_pd(fiz0,tz);
2709 fjx0 = _mm_add_pd(fjx0,tx);
2710 fjy0 = _mm_add_pd(fjy0,ty);
2711 fjz0 = _mm_add_pd(fjz0,tz);
2715 /**************************
2716 * CALCULATE INTERACTIONS *
2717 **************************/
2719 if (gmx_mm_any_lt(rsq11,rcutoff2))
2722 r11 = _mm_mul_pd(rsq11,rinv11);
2724 /* EWALD ELECTROSTATICS */
2726 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2727 ewrt = _mm_mul_pd(r11,ewtabscale);
2728 ewitab = _mm_cvttpd_epi32(ewrt);
2729 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2730 ewitab = _mm_slli_epi32(ewitab,2);
2731 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2732 ewtabD = _mm_setzero_pd();
2733 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2734 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2735 ewtabFn = _mm_setzero_pd();
2736 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2737 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2738 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2739 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2740 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2742 d = _mm_sub_pd(r11,rswitch);
2743 d = _mm_max_pd(d,_mm_setzero_pd());
2744 d2 = _mm_mul_pd(d,d);
2745 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2747 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2749 /* Evaluate switch function */
2750 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2751 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2752 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2756 fscal = _mm_and_pd(fscal,cutoff_mask);
2758 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2760 /* Calculate temporary vectorial force */
2761 tx = _mm_mul_pd(fscal,dx11);
2762 ty = _mm_mul_pd(fscal,dy11);
2763 tz = _mm_mul_pd(fscal,dz11);
2765 /* Update vectorial force */
2766 fix1 = _mm_add_pd(fix1,tx);
2767 fiy1 = _mm_add_pd(fiy1,ty);
2768 fiz1 = _mm_add_pd(fiz1,tz);
2770 fjx1 = _mm_add_pd(fjx1,tx);
2771 fjy1 = _mm_add_pd(fjy1,ty);
2772 fjz1 = _mm_add_pd(fjz1,tz);
2776 /**************************
2777 * CALCULATE INTERACTIONS *
2778 **************************/
2780 if (gmx_mm_any_lt(rsq12,rcutoff2))
2783 r12 = _mm_mul_pd(rsq12,rinv12);
2785 /* EWALD ELECTROSTATICS */
2787 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2788 ewrt = _mm_mul_pd(r12,ewtabscale);
2789 ewitab = _mm_cvttpd_epi32(ewrt);
2790 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2791 ewitab = _mm_slli_epi32(ewitab,2);
2792 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2793 ewtabD = _mm_setzero_pd();
2794 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2795 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2796 ewtabFn = _mm_setzero_pd();
2797 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2798 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2799 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2800 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2801 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2803 d = _mm_sub_pd(r12,rswitch);
2804 d = _mm_max_pd(d,_mm_setzero_pd());
2805 d2 = _mm_mul_pd(d,d);
2806 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2808 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2810 /* Evaluate switch function */
2811 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2812 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2813 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2817 fscal = _mm_and_pd(fscal,cutoff_mask);
2819 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2821 /* Calculate temporary vectorial force */
2822 tx = _mm_mul_pd(fscal,dx12);
2823 ty = _mm_mul_pd(fscal,dy12);
2824 tz = _mm_mul_pd(fscal,dz12);
2826 /* Update vectorial force */
2827 fix1 = _mm_add_pd(fix1,tx);
2828 fiy1 = _mm_add_pd(fiy1,ty);
2829 fiz1 = _mm_add_pd(fiz1,tz);
2831 fjx2 = _mm_add_pd(fjx2,tx);
2832 fjy2 = _mm_add_pd(fjy2,ty);
2833 fjz2 = _mm_add_pd(fjz2,tz);
2837 /**************************
2838 * CALCULATE INTERACTIONS *
2839 **************************/
2841 if (gmx_mm_any_lt(rsq13,rcutoff2))
2844 r13 = _mm_mul_pd(rsq13,rinv13);
2846 /* EWALD ELECTROSTATICS */
2848 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2849 ewrt = _mm_mul_pd(r13,ewtabscale);
2850 ewitab = _mm_cvttpd_epi32(ewrt);
2851 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2852 ewitab = _mm_slli_epi32(ewitab,2);
2853 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2854 ewtabD = _mm_setzero_pd();
2855 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2856 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2857 ewtabFn = _mm_setzero_pd();
2858 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2859 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2860 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2861 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2862 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2864 d = _mm_sub_pd(r13,rswitch);
2865 d = _mm_max_pd(d,_mm_setzero_pd());
2866 d2 = _mm_mul_pd(d,d);
2867 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2869 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2871 /* Evaluate switch function */
2872 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2873 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2874 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2878 fscal = _mm_and_pd(fscal,cutoff_mask);
2880 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2882 /* Calculate temporary vectorial force */
2883 tx = _mm_mul_pd(fscal,dx13);
2884 ty = _mm_mul_pd(fscal,dy13);
2885 tz = _mm_mul_pd(fscal,dz13);
2887 /* Update vectorial force */
2888 fix1 = _mm_add_pd(fix1,tx);
2889 fiy1 = _mm_add_pd(fiy1,ty);
2890 fiz1 = _mm_add_pd(fiz1,tz);
2892 fjx3 = _mm_add_pd(fjx3,tx);
2893 fjy3 = _mm_add_pd(fjy3,ty);
2894 fjz3 = _mm_add_pd(fjz3,tz);
2898 /**************************
2899 * CALCULATE INTERACTIONS *
2900 **************************/
2902 if (gmx_mm_any_lt(rsq21,rcutoff2))
2905 r21 = _mm_mul_pd(rsq21,rinv21);
2907 /* EWALD ELECTROSTATICS */
2909 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2910 ewrt = _mm_mul_pd(r21,ewtabscale);
2911 ewitab = _mm_cvttpd_epi32(ewrt);
2912 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2913 ewitab = _mm_slli_epi32(ewitab,2);
2914 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2915 ewtabD = _mm_setzero_pd();
2916 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2917 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2918 ewtabFn = _mm_setzero_pd();
2919 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2920 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2921 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2922 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2923 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2925 d = _mm_sub_pd(r21,rswitch);
2926 d = _mm_max_pd(d,_mm_setzero_pd());
2927 d2 = _mm_mul_pd(d,d);
2928 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2930 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2932 /* Evaluate switch function */
2933 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2934 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2935 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2939 fscal = _mm_and_pd(fscal,cutoff_mask);
2941 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2943 /* Calculate temporary vectorial force */
2944 tx = _mm_mul_pd(fscal,dx21);
2945 ty = _mm_mul_pd(fscal,dy21);
2946 tz = _mm_mul_pd(fscal,dz21);
2948 /* Update vectorial force */
2949 fix2 = _mm_add_pd(fix2,tx);
2950 fiy2 = _mm_add_pd(fiy2,ty);
2951 fiz2 = _mm_add_pd(fiz2,tz);
2953 fjx1 = _mm_add_pd(fjx1,tx);
2954 fjy1 = _mm_add_pd(fjy1,ty);
2955 fjz1 = _mm_add_pd(fjz1,tz);
2959 /**************************
2960 * CALCULATE INTERACTIONS *
2961 **************************/
2963 if (gmx_mm_any_lt(rsq22,rcutoff2))
2966 r22 = _mm_mul_pd(rsq22,rinv22);
2968 /* EWALD ELECTROSTATICS */
2970 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2971 ewrt = _mm_mul_pd(r22,ewtabscale);
2972 ewitab = _mm_cvttpd_epi32(ewrt);
2973 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2974 ewitab = _mm_slli_epi32(ewitab,2);
2975 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2976 ewtabD = _mm_setzero_pd();
2977 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2978 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2979 ewtabFn = _mm_setzero_pd();
2980 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2981 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2982 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2983 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2984 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2986 d = _mm_sub_pd(r22,rswitch);
2987 d = _mm_max_pd(d,_mm_setzero_pd());
2988 d2 = _mm_mul_pd(d,d);
2989 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2991 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2993 /* Evaluate switch function */
2994 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2995 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2996 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
3000 fscal = _mm_and_pd(fscal,cutoff_mask);
3002 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3004 /* Calculate temporary vectorial force */
3005 tx = _mm_mul_pd(fscal,dx22);
3006 ty = _mm_mul_pd(fscal,dy22);
3007 tz = _mm_mul_pd(fscal,dz22);
3009 /* Update vectorial force */
3010 fix2 = _mm_add_pd(fix2,tx);
3011 fiy2 = _mm_add_pd(fiy2,ty);
3012 fiz2 = _mm_add_pd(fiz2,tz);
3014 fjx2 = _mm_add_pd(fjx2,tx);
3015 fjy2 = _mm_add_pd(fjy2,ty);
3016 fjz2 = _mm_add_pd(fjz2,tz);
3020 /**************************
3021 * CALCULATE INTERACTIONS *
3022 **************************/
3024 if (gmx_mm_any_lt(rsq23,rcutoff2))
3027 r23 = _mm_mul_pd(rsq23,rinv23);
3029 /* EWALD ELECTROSTATICS */
3031 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3032 ewrt = _mm_mul_pd(r23,ewtabscale);
3033 ewitab = _mm_cvttpd_epi32(ewrt);
3034 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
3035 ewitab = _mm_slli_epi32(ewitab,2);
3036 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3037 ewtabD = _mm_setzero_pd();
3038 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3039 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3040 ewtabFn = _mm_setzero_pd();
3041 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3042 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3043 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3044 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
3045 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
3047 d = _mm_sub_pd(r23,rswitch);
3048 d = _mm_max_pd(d,_mm_setzero_pd());
3049 d2 = _mm_mul_pd(d,d);
3050 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3052 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3054 /* Evaluate switch function */
3055 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3056 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
3057 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
3061 fscal = _mm_and_pd(fscal,cutoff_mask);
3063 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3065 /* Calculate temporary vectorial force */
3066 tx = _mm_mul_pd(fscal,dx23);
3067 ty = _mm_mul_pd(fscal,dy23);
3068 tz = _mm_mul_pd(fscal,dz23);
3070 /* Update vectorial force */
3071 fix2 = _mm_add_pd(fix2,tx);
3072 fiy2 = _mm_add_pd(fiy2,ty);
3073 fiz2 = _mm_add_pd(fiz2,tz);
3075 fjx3 = _mm_add_pd(fjx3,tx);
3076 fjy3 = _mm_add_pd(fjy3,ty);
3077 fjz3 = _mm_add_pd(fjz3,tz);
3081 /**************************
3082 * CALCULATE INTERACTIONS *
3083 **************************/
3085 if (gmx_mm_any_lt(rsq31,rcutoff2))
3088 r31 = _mm_mul_pd(rsq31,rinv31);
3090 /* EWALD ELECTROSTATICS */
3092 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3093 ewrt = _mm_mul_pd(r31,ewtabscale);
3094 ewitab = _mm_cvttpd_epi32(ewrt);
3095 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
3096 ewitab = _mm_slli_epi32(ewitab,2);
3097 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3098 ewtabD = _mm_setzero_pd();
3099 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3100 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3101 ewtabFn = _mm_setzero_pd();
3102 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3103 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3104 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3105 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
3106 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
3108 d = _mm_sub_pd(r31,rswitch);
3109 d = _mm_max_pd(d,_mm_setzero_pd());
3110 d2 = _mm_mul_pd(d,d);
3111 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3113 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3115 /* Evaluate switch function */
3116 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3117 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
3118 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
3122 fscal = _mm_and_pd(fscal,cutoff_mask);
3124 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3126 /* Calculate temporary vectorial force */
3127 tx = _mm_mul_pd(fscal,dx31);
3128 ty = _mm_mul_pd(fscal,dy31);
3129 tz = _mm_mul_pd(fscal,dz31);
3131 /* Update vectorial force */
3132 fix3 = _mm_add_pd(fix3,tx);
3133 fiy3 = _mm_add_pd(fiy3,ty);
3134 fiz3 = _mm_add_pd(fiz3,tz);
3136 fjx1 = _mm_add_pd(fjx1,tx);
3137 fjy1 = _mm_add_pd(fjy1,ty);
3138 fjz1 = _mm_add_pd(fjz1,tz);
3142 /**************************
3143 * CALCULATE INTERACTIONS *
3144 **************************/
3146 if (gmx_mm_any_lt(rsq32,rcutoff2))
3149 r32 = _mm_mul_pd(rsq32,rinv32);
3151 /* EWALD ELECTROSTATICS */
3153 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3154 ewrt = _mm_mul_pd(r32,ewtabscale);
3155 ewitab = _mm_cvttpd_epi32(ewrt);
3156 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
3157 ewitab = _mm_slli_epi32(ewitab,2);
3158 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3159 ewtabD = _mm_setzero_pd();
3160 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3161 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3162 ewtabFn = _mm_setzero_pd();
3163 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3164 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3165 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3166 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
3167 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
3169 d = _mm_sub_pd(r32,rswitch);
3170 d = _mm_max_pd(d,_mm_setzero_pd());
3171 d2 = _mm_mul_pd(d,d);
3172 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3174 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3176 /* Evaluate switch function */
3177 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3178 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
3179 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
3183 fscal = _mm_and_pd(fscal,cutoff_mask);
3185 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3187 /* Calculate temporary vectorial force */
3188 tx = _mm_mul_pd(fscal,dx32);
3189 ty = _mm_mul_pd(fscal,dy32);
3190 tz = _mm_mul_pd(fscal,dz32);
3192 /* Update vectorial force */
3193 fix3 = _mm_add_pd(fix3,tx);
3194 fiy3 = _mm_add_pd(fiy3,ty);
3195 fiz3 = _mm_add_pd(fiz3,tz);
3197 fjx2 = _mm_add_pd(fjx2,tx);
3198 fjy2 = _mm_add_pd(fjy2,ty);
3199 fjz2 = _mm_add_pd(fjz2,tz);
3203 /**************************
3204 * CALCULATE INTERACTIONS *
3205 **************************/
3207 if (gmx_mm_any_lt(rsq33,rcutoff2))
3210 r33 = _mm_mul_pd(rsq33,rinv33);
3212 /* EWALD ELECTROSTATICS */
3214 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3215 ewrt = _mm_mul_pd(r33,ewtabscale);
3216 ewitab = _mm_cvttpd_epi32(ewrt);
3217 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
3218 ewitab = _mm_slli_epi32(ewitab,2);
3219 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3220 ewtabD = _mm_setzero_pd();
3221 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3222 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3223 ewtabFn = _mm_setzero_pd();
3224 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3225 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3226 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3227 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
3228 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
3230 d = _mm_sub_pd(r33,rswitch);
3231 d = _mm_max_pd(d,_mm_setzero_pd());
3232 d2 = _mm_mul_pd(d,d);
3233 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3235 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3237 /* Evaluate switch function */
3238 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3239 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
3240 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
3244 fscal = _mm_and_pd(fscal,cutoff_mask);
3246 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3248 /* Calculate temporary vectorial force */
3249 tx = _mm_mul_pd(fscal,dx33);
3250 ty = _mm_mul_pd(fscal,dy33);
3251 tz = _mm_mul_pd(fscal,dz33);
3253 /* Update vectorial force */
3254 fix3 = _mm_add_pd(fix3,tx);
3255 fiy3 = _mm_add_pd(fiy3,ty);
3256 fiz3 = _mm_add_pd(fiz3,tz);
3258 fjx3 = _mm_add_pd(fjx3,tx);
3259 fjy3 = _mm_add_pd(fjy3,ty);
3260 fjz3 = _mm_add_pd(fjz3,tz);
3264 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3266 /* Inner loop uses 617 flops */
3269 /* End of innermost loop */
3271 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3272 f+i_coord_offset,fshift+i_shift_offset);
3274 /* Increment number of inner iterations */
3275 inneriter += j_index_end - j_index_start;
3277 /* Outer loop uses 24 flops */
3280 /* Increment number of outer iterations */
3283 /* Update outer/inner flops */
3285 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*617);