2 * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B;
77 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B;
79 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B;
81 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
101 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
103 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
104 real rswitch_scalar,d_scalar;
105 __m128d dummy_mask,cutoff_mask;
106 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
107 __m128d one = _mm_set1_pd(1.0);
108 __m128d two = _mm_set1_pd(2.0);
114 jindex = nlist->jindex;
116 shiftidx = nlist->shift;
118 shiftvec = fr->shift_vec[0];
119 fshift = fr->fshift[0];
120 facel = _mm_set1_pd(fr->epsfac);
121 charge = mdatoms->chargeA;
122 nvdwtype = fr->ntype;
124 vdwtype = mdatoms->typeA;
126 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
127 ewtab = fr->ic->tabq_coul_FDV0;
128 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
129 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
134 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
135 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
136 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
138 jq1 = _mm_set1_pd(charge[inr+1]);
139 jq2 = _mm_set1_pd(charge[inr+2]);
140 jq3 = _mm_set1_pd(charge[inr+3]);
141 vdwjidx0A = 2*vdwtype[inr+0];
142 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
143 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
144 qq11 = _mm_mul_pd(iq1,jq1);
145 qq12 = _mm_mul_pd(iq1,jq2);
146 qq13 = _mm_mul_pd(iq1,jq3);
147 qq21 = _mm_mul_pd(iq2,jq1);
148 qq22 = _mm_mul_pd(iq2,jq2);
149 qq23 = _mm_mul_pd(iq2,jq3);
150 qq31 = _mm_mul_pd(iq3,jq1);
151 qq32 = _mm_mul_pd(iq3,jq2);
152 qq33 = _mm_mul_pd(iq3,jq3);
154 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155 rcutoff_scalar = fr->rcoulomb;
156 rcutoff = _mm_set1_pd(rcutoff_scalar);
157 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
159 rswitch_scalar = fr->rcoulomb_switch;
160 rswitch = _mm_set1_pd(rswitch_scalar);
161 /* Setup switch parameters */
162 d_scalar = rcutoff_scalar-rswitch_scalar;
163 d = _mm_set1_pd(d_scalar);
164 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
165 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
166 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
167 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
168 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
169 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
171 /* Avoid stupid compiler warnings */
179 /* Start outer loop over neighborlists */
180 for(iidx=0; iidx<nri; iidx++)
182 /* Load shift vector for this list */
183 i_shift_offset = DIM*shiftidx[iidx];
185 /* Load limits for loop over neighbors */
186 j_index_start = jindex[iidx];
187 j_index_end = jindex[iidx+1];
189 /* Get outer coordinate index */
191 i_coord_offset = DIM*inr;
193 /* Load i particle coords and add shift vector */
194 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
195 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
197 fix0 = _mm_setzero_pd();
198 fiy0 = _mm_setzero_pd();
199 fiz0 = _mm_setzero_pd();
200 fix1 = _mm_setzero_pd();
201 fiy1 = _mm_setzero_pd();
202 fiz1 = _mm_setzero_pd();
203 fix2 = _mm_setzero_pd();
204 fiy2 = _mm_setzero_pd();
205 fiz2 = _mm_setzero_pd();
206 fix3 = _mm_setzero_pd();
207 fiy3 = _mm_setzero_pd();
208 fiz3 = _mm_setzero_pd();
210 /* Reset potential sums */
211 velecsum = _mm_setzero_pd();
212 vvdwsum = _mm_setzero_pd();
214 /* Start inner kernel loop */
215 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
218 /* Get j neighbor index, and coordinate index */
221 j_coord_offsetA = DIM*jnrA;
222 j_coord_offsetB = DIM*jnrB;
224 /* load j atom coordinates */
225 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
226 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
227 &jy2,&jz2,&jx3,&jy3,&jz3);
229 /* Calculate displacement vector */
230 dx00 = _mm_sub_pd(ix0,jx0);
231 dy00 = _mm_sub_pd(iy0,jy0);
232 dz00 = _mm_sub_pd(iz0,jz0);
233 dx11 = _mm_sub_pd(ix1,jx1);
234 dy11 = _mm_sub_pd(iy1,jy1);
235 dz11 = _mm_sub_pd(iz1,jz1);
236 dx12 = _mm_sub_pd(ix1,jx2);
237 dy12 = _mm_sub_pd(iy1,jy2);
238 dz12 = _mm_sub_pd(iz1,jz2);
239 dx13 = _mm_sub_pd(ix1,jx3);
240 dy13 = _mm_sub_pd(iy1,jy3);
241 dz13 = _mm_sub_pd(iz1,jz3);
242 dx21 = _mm_sub_pd(ix2,jx1);
243 dy21 = _mm_sub_pd(iy2,jy1);
244 dz21 = _mm_sub_pd(iz2,jz1);
245 dx22 = _mm_sub_pd(ix2,jx2);
246 dy22 = _mm_sub_pd(iy2,jy2);
247 dz22 = _mm_sub_pd(iz2,jz2);
248 dx23 = _mm_sub_pd(ix2,jx3);
249 dy23 = _mm_sub_pd(iy2,jy3);
250 dz23 = _mm_sub_pd(iz2,jz3);
251 dx31 = _mm_sub_pd(ix3,jx1);
252 dy31 = _mm_sub_pd(iy3,jy1);
253 dz31 = _mm_sub_pd(iz3,jz1);
254 dx32 = _mm_sub_pd(ix3,jx2);
255 dy32 = _mm_sub_pd(iy3,jy2);
256 dz32 = _mm_sub_pd(iz3,jz2);
257 dx33 = _mm_sub_pd(ix3,jx3);
258 dy33 = _mm_sub_pd(iy3,jy3);
259 dz33 = _mm_sub_pd(iz3,jz3);
261 /* Calculate squared distance and things based on it */
262 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
263 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
264 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
265 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
266 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
267 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
268 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
269 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
270 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
271 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
273 rinv00 = gmx_mm_invsqrt_pd(rsq00);
274 rinv11 = gmx_mm_invsqrt_pd(rsq11);
275 rinv12 = gmx_mm_invsqrt_pd(rsq12);
276 rinv13 = gmx_mm_invsqrt_pd(rsq13);
277 rinv21 = gmx_mm_invsqrt_pd(rsq21);
278 rinv22 = gmx_mm_invsqrt_pd(rsq22);
279 rinv23 = gmx_mm_invsqrt_pd(rsq23);
280 rinv31 = gmx_mm_invsqrt_pd(rsq31);
281 rinv32 = gmx_mm_invsqrt_pd(rsq32);
282 rinv33 = gmx_mm_invsqrt_pd(rsq33);
284 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
285 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
286 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
287 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
288 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
289 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
290 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
291 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
292 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
293 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
295 fjx0 = _mm_setzero_pd();
296 fjy0 = _mm_setzero_pd();
297 fjz0 = _mm_setzero_pd();
298 fjx1 = _mm_setzero_pd();
299 fjy1 = _mm_setzero_pd();
300 fjz1 = _mm_setzero_pd();
301 fjx2 = _mm_setzero_pd();
302 fjy2 = _mm_setzero_pd();
303 fjz2 = _mm_setzero_pd();
304 fjx3 = _mm_setzero_pd();
305 fjy3 = _mm_setzero_pd();
306 fjz3 = _mm_setzero_pd();
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 if (gmx_mm_any_lt(rsq00,rcutoff2))
315 r00 = _mm_mul_pd(rsq00,rinv00);
317 /* LENNARD-JONES DISPERSION/REPULSION */
319 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
320 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
321 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
322 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
323 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
325 d = _mm_sub_pd(r00,rswitch);
326 d = _mm_max_pd(d,_mm_setzero_pd());
327 d2 = _mm_mul_pd(d,d);
328 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
330 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
332 /* Evaluate switch function */
333 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
334 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
335 vvdw = _mm_mul_pd(vvdw,sw);
336 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
338 /* Update potential sum for this i atom from the interaction with this j atom. */
339 vvdw = _mm_and_pd(vvdw,cutoff_mask);
340 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
344 fscal = _mm_and_pd(fscal,cutoff_mask);
346 /* Update vectorial force */
347 fix0 = _mm_macc_pd(dx00,fscal,fix0);
348 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
349 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
351 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
352 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
353 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
357 /**************************
358 * CALCULATE INTERACTIONS *
359 **************************/
361 if (gmx_mm_any_lt(rsq11,rcutoff2))
364 r11 = _mm_mul_pd(rsq11,rinv11);
366 /* EWALD ELECTROSTATICS */
368 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
369 ewrt = _mm_mul_pd(r11,ewtabscale);
370 ewitab = _mm_cvttpd_epi32(ewrt);
372 eweps = _mm_frcz_pd(ewrt);
374 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
376 twoeweps = _mm_add_pd(eweps,eweps);
377 ewitab = _mm_slli_epi32(ewitab,2);
378 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
379 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
380 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
381 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
382 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
383 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
384 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
385 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
386 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
387 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
389 d = _mm_sub_pd(r11,rswitch);
390 d = _mm_max_pd(d,_mm_setzero_pd());
391 d2 = _mm_mul_pd(d,d);
392 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
394 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
396 /* Evaluate switch function */
397 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
398 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
399 velec = _mm_mul_pd(velec,sw);
400 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
402 /* Update potential sum for this i atom from the interaction with this j atom. */
403 velec = _mm_and_pd(velec,cutoff_mask);
404 velecsum = _mm_add_pd(velecsum,velec);
408 fscal = _mm_and_pd(fscal,cutoff_mask);
410 /* Update vectorial force */
411 fix1 = _mm_macc_pd(dx11,fscal,fix1);
412 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
413 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
415 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
416 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
417 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
421 /**************************
422 * CALCULATE INTERACTIONS *
423 **************************/
425 if (gmx_mm_any_lt(rsq12,rcutoff2))
428 r12 = _mm_mul_pd(rsq12,rinv12);
430 /* EWALD ELECTROSTATICS */
432 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
433 ewrt = _mm_mul_pd(r12,ewtabscale);
434 ewitab = _mm_cvttpd_epi32(ewrt);
436 eweps = _mm_frcz_pd(ewrt);
438 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
440 twoeweps = _mm_add_pd(eweps,eweps);
441 ewitab = _mm_slli_epi32(ewitab,2);
442 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
443 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
444 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
445 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
446 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
447 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
448 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
449 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
450 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
451 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
453 d = _mm_sub_pd(r12,rswitch);
454 d = _mm_max_pd(d,_mm_setzero_pd());
455 d2 = _mm_mul_pd(d,d);
456 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
458 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
460 /* Evaluate switch function */
461 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
462 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
463 velec = _mm_mul_pd(velec,sw);
464 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
466 /* Update potential sum for this i atom from the interaction with this j atom. */
467 velec = _mm_and_pd(velec,cutoff_mask);
468 velecsum = _mm_add_pd(velecsum,velec);
472 fscal = _mm_and_pd(fscal,cutoff_mask);
474 /* Update vectorial force */
475 fix1 = _mm_macc_pd(dx12,fscal,fix1);
476 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
477 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
479 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
480 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
481 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
485 /**************************
486 * CALCULATE INTERACTIONS *
487 **************************/
489 if (gmx_mm_any_lt(rsq13,rcutoff2))
492 r13 = _mm_mul_pd(rsq13,rinv13);
494 /* EWALD ELECTROSTATICS */
496 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
497 ewrt = _mm_mul_pd(r13,ewtabscale);
498 ewitab = _mm_cvttpd_epi32(ewrt);
500 eweps = _mm_frcz_pd(ewrt);
502 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
504 twoeweps = _mm_add_pd(eweps,eweps);
505 ewitab = _mm_slli_epi32(ewitab,2);
506 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
507 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
508 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
509 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
510 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
511 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
512 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
513 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
514 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
515 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
517 d = _mm_sub_pd(r13,rswitch);
518 d = _mm_max_pd(d,_mm_setzero_pd());
519 d2 = _mm_mul_pd(d,d);
520 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
522 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
524 /* Evaluate switch function */
525 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
526 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
527 velec = _mm_mul_pd(velec,sw);
528 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
530 /* Update potential sum for this i atom from the interaction with this j atom. */
531 velec = _mm_and_pd(velec,cutoff_mask);
532 velecsum = _mm_add_pd(velecsum,velec);
536 fscal = _mm_and_pd(fscal,cutoff_mask);
538 /* Update vectorial force */
539 fix1 = _mm_macc_pd(dx13,fscal,fix1);
540 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
541 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
543 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
544 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
545 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
549 /**************************
550 * CALCULATE INTERACTIONS *
551 **************************/
553 if (gmx_mm_any_lt(rsq21,rcutoff2))
556 r21 = _mm_mul_pd(rsq21,rinv21);
558 /* EWALD ELECTROSTATICS */
560 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
561 ewrt = _mm_mul_pd(r21,ewtabscale);
562 ewitab = _mm_cvttpd_epi32(ewrt);
564 eweps = _mm_frcz_pd(ewrt);
566 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
568 twoeweps = _mm_add_pd(eweps,eweps);
569 ewitab = _mm_slli_epi32(ewitab,2);
570 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
571 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
572 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
573 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
574 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
575 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
576 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
577 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
578 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
579 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
581 d = _mm_sub_pd(r21,rswitch);
582 d = _mm_max_pd(d,_mm_setzero_pd());
583 d2 = _mm_mul_pd(d,d);
584 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
586 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
588 /* Evaluate switch function */
589 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
590 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
591 velec = _mm_mul_pd(velec,sw);
592 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
594 /* Update potential sum for this i atom from the interaction with this j atom. */
595 velec = _mm_and_pd(velec,cutoff_mask);
596 velecsum = _mm_add_pd(velecsum,velec);
600 fscal = _mm_and_pd(fscal,cutoff_mask);
602 /* Update vectorial force */
603 fix2 = _mm_macc_pd(dx21,fscal,fix2);
604 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
605 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
607 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
608 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
609 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
613 /**************************
614 * CALCULATE INTERACTIONS *
615 **************************/
617 if (gmx_mm_any_lt(rsq22,rcutoff2))
620 r22 = _mm_mul_pd(rsq22,rinv22);
622 /* EWALD ELECTROSTATICS */
624 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
625 ewrt = _mm_mul_pd(r22,ewtabscale);
626 ewitab = _mm_cvttpd_epi32(ewrt);
628 eweps = _mm_frcz_pd(ewrt);
630 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
632 twoeweps = _mm_add_pd(eweps,eweps);
633 ewitab = _mm_slli_epi32(ewitab,2);
634 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
635 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
636 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
637 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
638 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
639 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
640 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
641 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
642 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
643 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
645 d = _mm_sub_pd(r22,rswitch);
646 d = _mm_max_pd(d,_mm_setzero_pd());
647 d2 = _mm_mul_pd(d,d);
648 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
650 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
652 /* Evaluate switch function */
653 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
654 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
655 velec = _mm_mul_pd(velec,sw);
656 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
658 /* Update potential sum for this i atom from the interaction with this j atom. */
659 velec = _mm_and_pd(velec,cutoff_mask);
660 velecsum = _mm_add_pd(velecsum,velec);
664 fscal = _mm_and_pd(fscal,cutoff_mask);
666 /* Update vectorial force */
667 fix2 = _mm_macc_pd(dx22,fscal,fix2);
668 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
669 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
671 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
672 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
673 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
677 /**************************
678 * CALCULATE INTERACTIONS *
679 **************************/
681 if (gmx_mm_any_lt(rsq23,rcutoff2))
684 r23 = _mm_mul_pd(rsq23,rinv23);
686 /* EWALD ELECTROSTATICS */
688 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
689 ewrt = _mm_mul_pd(r23,ewtabscale);
690 ewitab = _mm_cvttpd_epi32(ewrt);
692 eweps = _mm_frcz_pd(ewrt);
694 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
696 twoeweps = _mm_add_pd(eweps,eweps);
697 ewitab = _mm_slli_epi32(ewitab,2);
698 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
699 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
700 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
701 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
702 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
703 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
704 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
705 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
706 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
707 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
709 d = _mm_sub_pd(r23,rswitch);
710 d = _mm_max_pd(d,_mm_setzero_pd());
711 d2 = _mm_mul_pd(d,d);
712 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
714 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
716 /* Evaluate switch function */
717 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
718 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
719 velec = _mm_mul_pd(velec,sw);
720 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
722 /* Update potential sum for this i atom from the interaction with this j atom. */
723 velec = _mm_and_pd(velec,cutoff_mask);
724 velecsum = _mm_add_pd(velecsum,velec);
728 fscal = _mm_and_pd(fscal,cutoff_mask);
730 /* Update vectorial force */
731 fix2 = _mm_macc_pd(dx23,fscal,fix2);
732 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
733 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
735 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
736 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
737 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
741 /**************************
742 * CALCULATE INTERACTIONS *
743 **************************/
745 if (gmx_mm_any_lt(rsq31,rcutoff2))
748 r31 = _mm_mul_pd(rsq31,rinv31);
750 /* EWALD ELECTROSTATICS */
752 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
753 ewrt = _mm_mul_pd(r31,ewtabscale);
754 ewitab = _mm_cvttpd_epi32(ewrt);
756 eweps = _mm_frcz_pd(ewrt);
758 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
760 twoeweps = _mm_add_pd(eweps,eweps);
761 ewitab = _mm_slli_epi32(ewitab,2);
762 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
763 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
764 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
765 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
766 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
767 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
768 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
769 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
770 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
771 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
773 d = _mm_sub_pd(r31,rswitch);
774 d = _mm_max_pd(d,_mm_setzero_pd());
775 d2 = _mm_mul_pd(d,d);
776 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
778 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
780 /* Evaluate switch function */
781 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
782 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
783 velec = _mm_mul_pd(velec,sw);
784 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
786 /* Update potential sum for this i atom from the interaction with this j atom. */
787 velec = _mm_and_pd(velec,cutoff_mask);
788 velecsum = _mm_add_pd(velecsum,velec);
792 fscal = _mm_and_pd(fscal,cutoff_mask);
794 /* Update vectorial force */
795 fix3 = _mm_macc_pd(dx31,fscal,fix3);
796 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
797 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
799 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
800 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
801 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
805 /**************************
806 * CALCULATE INTERACTIONS *
807 **************************/
809 if (gmx_mm_any_lt(rsq32,rcutoff2))
812 r32 = _mm_mul_pd(rsq32,rinv32);
814 /* EWALD ELECTROSTATICS */
816 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
817 ewrt = _mm_mul_pd(r32,ewtabscale);
818 ewitab = _mm_cvttpd_epi32(ewrt);
820 eweps = _mm_frcz_pd(ewrt);
822 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
824 twoeweps = _mm_add_pd(eweps,eweps);
825 ewitab = _mm_slli_epi32(ewitab,2);
826 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
827 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
828 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
829 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
830 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
831 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
832 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
833 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
834 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
835 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
837 d = _mm_sub_pd(r32,rswitch);
838 d = _mm_max_pd(d,_mm_setzero_pd());
839 d2 = _mm_mul_pd(d,d);
840 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
842 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
844 /* Evaluate switch function */
845 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
846 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
847 velec = _mm_mul_pd(velec,sw);
848 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
850 /* Update potential sum for this i atom from the interaction with this j atom. */
851 velec = _mm_and_pd(velec,cutoff_mask);
852 velecsum = _mm_add_pd(velecsum,velec);
856 fscal = _mm_and_pd(fscal,cutoff_mask);
858 /* Update vectorial force */
859 fix3 = _mm_macc_pd(dx32,fscal,fix3);
860 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
861 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
863 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
864 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
865 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
869 /**************************
870 * CALCULATE INTERACTIONS *
871 **************************/
873 if (gmx_mm_any_lt(rsq33,rcutoff2))
876 r33 = _mm_mul_pd(rsq33,rinv33);
878 /* EWALD ELECTROSTATICS */
880 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
881 ewrt = _mm_mul_pd(r33,ewtabscale);
882 ewitab = _mm_cvttpd_epi32(ewrt);
884 eweps = _mm_frcz_pd(ewrt);
886 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
888 twoeweps = _mm_add_pd(eweps,eweps);
889 ewitab = _mm_slli_epi32(ewitab,2);
890 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
891 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
892 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
893 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
894 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
895 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
896 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
897 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
898 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
899 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
901 d = _mm_sub_pd(r33,rswitch);
902 d = _mm_max_pd(d,_mm_setzero_pd());
903 d2 = _mm_mul_pd(d,d);
904 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
906 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
908 /* Evaluate switch function */
909 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
910 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
911 velec = _mm_mul_pd(velec,sw);
912 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
914 /* Update potential sum for this i atom from the interaction with this j atom. */
915 velec = _mm_and_pd(velec,cutoff_mask);
916 velecsum = _mm_add_pd(velecsum,velec);
920 fscal = _mm_and_pd(fscal,cutoff_mask);
922 /* Update vectorial force */
923 fix3 = _mm_macc_pd(dx33,fscal,fix3);
924 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
925 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
927 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
928 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
929 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
933 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
935 /* Inner loop uses 677 flops */
942 j_coord_offsetA = DIM*jnrA;
944 /* load j atom coordinates */
945 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
946 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
947 &jy2,&jz2,&jx3,&jy3,&jz3);
949 /* Calculate displacement vector */
950 dx00 = _mm_sub_pd(ix0,jx0);
951 dy00 = _mm_sub_pd(iy0,jy0);
952 dz00 = _mm_sub_pd(iz0,jz0);
953 dx11 = _mm_sub_pd(ix1,jx1);
954 dy11 = _mm_sub_pd(iy1,jy1);
955 dz11 = _mm_sub_pd(iz1,jz1);
956 dx12 = _mm_sub_pd(ix1,jx2);
957 dy12 = _mm_sub_pd(iy1,jy2);
958 dz12 = _mm_sub_pd(iz1,jz2);
959 dx13 = _mm_sub_pd(ix1,jx3);
960 dy13 = _mm_sub_pd(iy1,jy3);
961 dz13 = _mm_sub_pd(iz1,jz3);
962 dx21 = _mm_sub_pd(ix2,jx1);
963 dy21 = _mm_sub_pd(iy2,jy1);
964 dz21 = _mm_sub_pd(iz2,jz1);
965 dx22 = _mm_sub_pd(ix2,jx2);
966 dy22 = _mm_sub_pd(iy2,jy2);
967 dz22 = _mm_sub_pd(iz2,jz2);
968 dx23 = _mm_sub_pd(ix2,jx3);
969 dy23 = _mm_sub_pd(iy2,jy3);
970 dz23 = _mm_sub_pd(iz2,jz3);
971 dx31 = _mm_sub_pd(ix3,jx1);
972 dy31 = _mm_sub_pd(iy3,jy1);
973 dz31 = _mm_sub_pd(iz3,jz1);
974 dx32 = _mm_sub_pd(ix3,jx2);
975 dy32 = _mm_sub_pd(iy3,jy2);
976 dz32 = _mm_sub_pd(iz3,jz2);
977 dx33 = _mm_sub_pd(ix3,jx3);
978 dy33 = _mm_sub_pd(iy3,jy3);
979 dz33 = _mm_sub_pd(iz3,jz3);
981 /* Calculate squared distance and things based on it */
982 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
983 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
984 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
985 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
986 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
987 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
988 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
989 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
990 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
991 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
993 rinv00 = gmx_mm_invsqrt_pd(rsq00);
994 rinv11 = gmx_mm_invsqrt_pd(rsq11);
995 rinv12 = gmx_mm_invsqrt_pd(rsq12);
996 rinv13 = gmx_mm_invsqrt_pd(rsq13);
997 rinv21 = gmx_mm_invsqrt_pd(rsq21);
998 rinv22 = gmx_mm_invsqrt_pd(rsq22);
999 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1000 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1001 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1002 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1004 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1005 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1006 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1007 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1008 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1009 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1010 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1011 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1012 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1013 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1015 fjx0 = _mm_setzero_pd();
1016 fjy0 = _mm_setzero_pd();
1017 fjz0 = _mm_setzero_pd();
1018 fjx1 = _mm_setzero_pd();
1019 fjy1 = _mm_setzero_pd();
1020 fjz1 = _mm_setzero_pd();
1021 fjx2 = _mm_setzero_pd();
1022 fjy2 = _mm_setzero_pd();
1023 fjz2 = _mm_setzero_pd();
1024 fjx3 = _mm_setzero_pd();
1025 fjy3 = _mm_setzero_pd();
1026 fjz3 = _mm_setzero_pd();
1028 /**************************
1029 * CALCULATE INTERACTIONS *
1030 **************************/
1032 if (gmx_mm_any_lt(rsq00,rcutoff2))
1035 r00 = _mm_mul_pd(rsq00,rinv00);
1037 /* LENNARD-JONES DISPERSION/REPULSION */
1039 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1040 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1041 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1042 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
1043 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1045 d = _mm_sub_pd(r00,rswitch);
1046 d = _mm_max_pd(d,_mm_setzero_pd());
1047 d2 = _mm_mul_pd(d,d);
1048 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1050 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1052 /* Evaluate switch function */
1053 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1054 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1055 vvdw = _mm_mul_pd(vvdw,sw);
1056 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1058 /* Update potential sum for this i atom from the interaction with this j atom. */
1059 vvdw = _mm_and_pd(vvdw,cutoff_mask);
1060 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1061 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
1065 fscal = _mm_and_pd(fscal,cutoff_mask);
1067 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1069 /* Update vectorial force */
1070 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1071 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1072 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1074 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1075 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1076 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1080 /**************************
1081 * CALCULATE INTERACTIONS *
1082 **************************/
1084 if (gmx_mm_any_lt(rsq11,rcutoff2))
1087 r11 = _mm_mul_pd(rsq11,rinv11);
1089 /* EWALD ELECTROSTATICS */
1091 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1092 ewrt = _mm_mul_pd(r11,ewtabscale);
1093 ewitab = _mm_cvttpd_epi32(ewrt);
1095 eweps = _mm_frcz_pd(ewrt);
1097 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1099 twoeweps = _mm_add_pd(eweps,eweps);
1100 ewitab = _mm_slli_epi32(ewitab,2);
1101 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1102 ewtabD = _mm_setzero_pd();
1103 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1104 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1105 ewtabFn = _mm_setzero_pd();
1106 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1107 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1108 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1109 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1110 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1112 d = _mm_sub_pd(r11,rswitch);
1113 d = _mm_max_pd(d,_mm_setzero_pd());
1114 d2 = _mm_mul_pd(d,d);
1115 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1117 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1119 /* Evaluate switch function */
1120 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1121 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1122 velec = _mm_mul_pd(velec,sw);
1123 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1125 /* Update potential sum for this i atom from the interaction with this j atom. */
1126 velec = _mm_and_pd(velec,cutoff_mask);
1127 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1128 velecsum = _mm_add_pd(velecsum,velec);
1132 fscal = _mm_and_pd(fscal,cutoff_mask);
1134 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1136 /* Update vectorial force */
1137 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1138 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1139 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1141 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1142 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1143 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1147 /**************************
1148 * CALCULATE INTERACTIONS *
1149 **************************/
1151 if (gmx_mm_any_lt(rsq12,rcutoff2))
1154 r12 = _mm_mul_pd(rsq12,rinv12);
1156 /* EWALD ELECTROSTATICS */
1158 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1159 ewrt = _mm_mul_pd(r12,ewtabscale);
1160 ewitab = _mm_cvttpd_epi32(ewrt);
1162 eweps = _mm_frcz_pd(ewrt);
1164 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1166 twoeweps = _mm_add_pd(eweps,eweps);
1167 ewitab = _mm_slli_epi32(ewitab,2);
1168 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1169 ewtabD = _mm_setzero_pd();
1170 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1171 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1172 ewtabFn = _mm_setzero_pd();
1173 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1174 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1175 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1176 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1177 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1179 d = _mm_sub_pd(r12,rswitch);
1180 d = _mm_max_pd(d,_mm_setzero_pd());
1181 d2 = _mm_mul_pd(d,d);
1182 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1184 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1186 /* Evaluate switch function */
1187 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1188 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1189 velec = _mm_mul_pd(velec,sw);
1190 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1192 /* Update potential sum for this i atom from the interaction with this j atom. */
1193 velec = _mm_and_pd(velec,cutoff_mask);
1194 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1195 velecsum = _mm_add_pd(velecsum,velec);
1199 fscal = _mm_and_pd(fscal,cutoff_mask);
1201 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1203 /* Update vectorial force */
1204 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1205 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1206 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1208 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1209 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1210 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1214 /**************************
1215 * CALCULATE INTERACTIONS *
1216 **************************/
1218 if (gmx_mm_any_lt(rsq13,rcutoff2))
1221 r13 = _mm_mul_pd(rsq13,rinv13);
1223 /* EWALD ELECTROSTATICS */
1225 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1226 ewrt = _mm_mul_pd(r13,ewtabscale);
1227 ewitab = _mm_cvttpd_epi32(ewrt);
1229 eweps = _mm_frcz_pd(ewrt);
1231 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1233 twoeweps = _mm_add_pd(eweps,eweps);
1234 ewitab = _mm_slli_epi32(ewitab,2);
1235 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1236 ewtabD = _mm_setzero_pd();
1237 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1238 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1239 ewtabFn = _mm_setzero_pd();
1240 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1241 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1242 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1243 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1244 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1246 d = _mm_sub_pd(r13,rswitch);
1247 d = _mm_max_pd(d,_mm_setzero_pd());
1248 d2 = _mm_mul_pd(d,d);
1249 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1251 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1253 /* Evaluate switch function */
1254 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1255 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1256 velec = _mm_mul_pd(velec,sw);
1257 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1259 /* Update potential sum for this i atom from the interaction with this j atom. */
1260 velec = _mm_and_pd(velec,cutoff_mask);
1261 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1262 velecsum = _mm_add_pd(velecsum,velec);
1266 fscal = _mm_and_pd(fscal,cutoff_mask);
1268 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1270 /* Update vectorial force */
1271 fix1 = _mm_macc_pd(dx13,fscal,fix1);
1272 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
1273 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
1275 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
1276 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
1277 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
1281 /**************************
1282 * CALCULATE INTERACTIONS *
1283 **************************/
1285 if (gmx_mm_any_lt(rsq21,rcutoff2))
1288 r21 = _mm_mul_pd(rsq21,rinv21);
1290 /* EWALD ELECTROSTATICS */
1292 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1293 ewrt = _mm_mul_pd(r21,ewtabscale);
1294 ewitab = _mm_cvttpd_epi32(ewrt);
1296 eweps = _mm_frcz_pd(ewrt);
1298 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1300 twoeweps = _mm_add_pd(eweps,eweps);
1301 ewitab = _mm_slli_epi32(ewitab,2);
1302 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1303 ewtabD = _mm_setzero_pd();
1304 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1305 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1306 ewtabFn = _mm_setzero_pd();
1307 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1308 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1309 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1310 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1311 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1313 d = _mm_sub_pd(r21,rswitch);
1314 d = _mm_max_pd(d,_mm_setzero_pd());
1315 d2 = _mm_mul_pd(d,d);
1316 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1318 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1320 /* Evaluate switch function */
1321 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1322 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1323 velec = _mm_mul_pd(velec,sw);
1324 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1326 /* Update potential sum for this i atom from the interaction with this j atom. */
1327 velec = _mm_and_pd(velec,cutoff_mask);
1328 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1329 velecsum = _mm_add_pd(velecsum,velec);
1333 fscal = _mm_and_pd(fscal,cutoff_mask);
1335 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1337 /* Update vectorial force */
1338 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1339 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1340 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1342 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1343 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1344 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1348 /**************************
1349 * CALCULATE INTERACTIONS *
1350 **************************/
1352 if (gmx_mm_any_lt(rsq22,rcutoff2))
1355 r22 = _mm_mul_pd(rsq22,rinv22);
1357 /* EWALD ELECTROSTATICS */
1359 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1360 ewrt = _mm_mul_pd(r22,ewtabscale);
1361 ewitab = _mm_cvttpd_epi32(ewrt);
1363 eweps = _mm_frcz_pd(ewrt);
1365 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1367 twoeweps = _mm_add_pd(eweps,eweps);
1368 ewitab = _mm_slli_epi32(ewitab,2);
1369 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1370 ewtabD = _mm_setzero_pd();
1371 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1372 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1373 ewtabFn = _mm_setzero_pd();
1374 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1375 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1376 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1377 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1378 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1380 d = _mm_sub_pd(r22,rswitch);
1381 d = _mm_max_pd(d,_mm_setzero_pd());
1382 d2 = _mm_mul_pd(d,d);
1383 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1385 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1387 /* Evaluate switch function */
1388 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1389 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1390 velec = _mm_mul_pd(velec,sw);
1391 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1393 /* Update potential sum for this i atom from the interaction with this j atom. */
1394 velec = _mm_and_pd(velec,cutoff_mask);
1395 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1396 velecsum = _mm_add_pd(velecsum,velec);
1400 fscal = _mm_and_pd(fscal,cutoff_mask);
1402 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1404 /* Update vectorial force */
1405 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1406 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1407 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1409 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1410 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1411 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1415 /**************************
1416 * CALCULATE INTERACTIONS *
1417 **************************/
1419 if (gmx_mm_any_lt(rsq23,rcutoff2))
1422 r23 = _mm_mul_pd(rsq23,rinv23);
1424 /* EWALD ELECTROSTATICS */
1426 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1427 ewrt = _mm_mul_pd(r23,ewtabscale);
1428 ewitab = _mm_cvttpd_epi32(ewrt);
1430 eweps = _mm_frcz_pd(ewrt);
1432 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1434 twoeweps = _mm_add_pd(eweps,eweps);
1435 ewitab = _mm_slli_epi32(ewitab,2);
1436 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1437 ewtabD = _mm_setzero_pd();
1438 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1439 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1440 ewtabFn = _mm_setzero_pd();
1441 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1442 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1443 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1444 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1445 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1447 d = _mm_sub_pd(r23,rswitch);
1448 d = _mm_max_pd(d,_mm_setzero_pd());
1449 d2 = _mm_mul_pd(d,d);
1450 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1452 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1454 /* Evaluate switch function */
1455 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1456 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
1457 velec = _mm_mul_pd(velec,sw);
1458 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1460 /* Update potential sum for this i atom from the interaction with this j atom. */
1461 velec = _mm_and_pd(velec,cutoff_mask);
1462 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1463 velecsum = _mm_add_pd(velecsum,velec);
1467 fscal = _mm_and_pd(fscal,cutoff_mask);
1469 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1471 /* Update vectorial force */
1472 fix2 = _mm_macc_pd(dx23,fscal,fix2);
1473 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
1474 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
1476 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
1477 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
1478 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
1482 /**************************
1483 * CALCULATE INTERACTIONS *
1484 **************************/
1486 if (gmx_mm_any_lt(rsq31,rcutoff2))
1489 r31 = _mm_mul_pd(rsq31,rinv31);
1491 /* EWALD ELECTROSTATICS */
1493 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1494 ewrt = _mm_mul_pd(r31,ewtabscale);
1495 ewitab = _mm_cvttpd_epi32(ewrt);
1497 eweps = _mm_frcz_pd(ewrt);
1499 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1501 twoeweps = _mm_add_pd(eweps,eweps);
1502 ewitab = _mm_slli_epi32(ewitab,2);
1503 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1504 ewtabD = _mm_setzero_pd();
1505 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1506 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1507 ewtabFn = _mm_setzero_pd();
1508 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1509 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1510 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1511 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1512 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1514 d = _mm_sub_pd(r31,rswitch);
1515 d = _mm_max_pd(d,_mm_setzero_pd());
1516 d2 = _mm_mul_pd(d,d);
1517 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1519 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1521 /* Evaluate switch function */
1522 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1523 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
1524 velec = _mm_mul_pd(velec,sw);
1525 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1527 /* Update potential sum for this i atom from the interaction with this j atom. */
1528 velec = _mm_and_pd(velec,cutoff_mask);
1529 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1530 velecsum = _mm_add_pd(velecsum,velec);
1534 fscal = _mm_and_pd(fscal,cutoff_mask);
1536 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1538 /* Update vectorial force */
1539 fix3 = _mm_macc_pd(dx31,fscal,fix3);
1540 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
1541 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
1543 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
1544 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
1545 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
1549 /**************************
1550 * CALCULATE INTERACTIONS *
1551 **************************/
1553 if (gmx_mm_any_lt(rsq32,rcutoff2))
1556 r32 = _mm_mul_pd(rsq32,rinv32);
1558 /* EWALD ELECTROSTATICS */
1560 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1561 ewrt = _mm_mul_pd(r32,ewtabscale);
1562 ewitab = _mm_cvttpd_epi32(ewrt);
1564 eweps = _mm_frcz_pd(ewrt);
1566 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1568 twoeweps = _mm_add_pd(eweps,eweps);
1569 ewitab = _mm_slli_epi32(ewitab,2);
1570 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1571 ewtabD = _mm_setzero_pd();
1572 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1573 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1574 ewtabFn = _mm_setzero_pd();
1575 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1576 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1577 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1578 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1579 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1581 d = _mm_sub_pd(r32,rswitch);
1582 d = _mm_max_pd(d,_mm_setzero_pd());
1583 d2 = _mm_mul_pd(d,d);
1584 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1586 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1588 /* Evaluate switch function */
1589 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1590 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
1591 velec = _mm_mul_pd(velec,sw);
1592 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1594 /* Update potential sum for this i atom from the interaction with this j atom. */
1595 velec = _mm_and_pd(velec,cutoff_mask);
1596 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1597 velecsum = _mm_add_pd(velecsum,velec);
1601 fscal = _mm_and_pd(fscal,cutoff_mask);
1603 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1605 /* Update vectorial force */
1606 fix3 = _mm_macc_pd(dx32,fscal,fix3);
1607 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
1608 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
1610 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
1611 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
1612 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
1616 /**************************
1617 * CALCULATE INTERACTIONS *
1618 **************************/
1620 if (gmx_mm_any_lt(rsq33,rcutoff2))
1623 r33 = _mm_mul_pd(rsq33,rinv33);
1625 /* EWALD ELECTROSTATICS */
1627 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1628 ewrt = _mm_mul_pd(r33,ewtabscale);
1629 ewitab = _mm_cvttpd_epi32(ewrt);
1631 eweps = _mm_frcz_pd(ewrt);
1633 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1635 twoeweps = _mm_add_pd(eweps,eweps);
1636 ewitab = _mm_slli_epi32(ewitab,2);
1637 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1638 ewtabD = _mm_setzero_pd();
1639 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1640 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1641 ewtabFn = _mm_setzero_pd();
1642 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1643 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1644 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1645 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1646 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1648 d = _mm_sub_pd(r33,rswitch);
1649 d = _mm_max_pd(d,_mm_setzero_pd());
1650 d2 = _mm_mul_pd(d,d);
1651 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1653 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1655 /* Evaluate switch function */
1656 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1657 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
1658 velec = _mm_mul_pd(velec,sw);
1659 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1661 /* Update potential sum for this i atom from the interaction with this j atom. */
1662 velec = _mm_and_pd(velec,cutoff_mask);
1663 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1664 velecsum = _mm_add_pd(velecsum,velec);
1668 fscal = _mm_and_pd(fscal,cutoff_mask);
1670 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1672 /* Update vectorial force */
1673 fix3 = _mm_macc_pd(dx33,fscal,fix3);
1674 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
1675 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
1677 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
1678 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
1679 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
1683 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1685 /* Inner loop uses 677 flops */
1688 /* End of innermost loop */
1690 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1691 f+i_coord_offset,fshift+i_shift_offset);
1694 /* Update potential energies */
1695 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1696 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1698 /* Increment number of inner iterations */
1699 inneriter += j_index_end - j_index_start;
1701 /* Outer loop uses 26 flops */
1704 /* Increment number of outer iterations */
1707 /* Update outer/inner flops */
1709 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*677);
1712 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_double
1713 * Electrostatics interaction: Ewald
1714 * VdW interaction: LennardJones
1715 * Geometry: Water4-Water4
1716 * Calculate force/pot: Force
1719 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_double
1720 (t_nblist * gmx_restrict nlist,
1721 rvec * gmx_restrict xx,
1722 rvec * gmx_restrict ff,
1723 t_forcerec * gmx_restrict fr,
1724 t_mdatoms * gmx_restrict mdatoms,
1725 nb_kernel_data_t * gmx_restrict kernel_data,
1726 t_nrnb * gmx_restrict nrnb)
1728 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1729 * just 0 for non-waters.
1730 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1731 * jnr indices corresponding to data put in the four positions in the SIMD register.
1733 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1734 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1736 int j_coord_offsetA,j_coord_offsetB;
1737 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1738 real rcutoff_scalar;
1739 real *shiftvec,*fshift,*x,*f;
1740 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1742 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1744 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1746 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1748 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1749 int vdwjidx0A,vdwjidx0B;
1750 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1751 int vdwjidx1A,vdwjidx1B;
1752 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1753 int vdwjidx2A,vdwjidx2B;
1754 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1755 int vdwjidx3A,vdwjidx3B;
1756 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1757 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1758 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1759 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1760 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1761 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1762 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1763 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1764 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1765 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1766 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1767 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1770 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1773 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1774 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1776 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1778 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1779 real rswitch_scalar,d_scalar;
1780 __m128d dummy_mask,cutoff_mask;
1781 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1782 __m128d one = _mm_set1_pd(1.0);
1783 __m128d two = _mm_set1_pd(2.0);
1789 jindex = nlist->jindex;
1791 shiftidx = nlist->shift;
1793 shiftvec = fr->shift_vec[0];
1794 fshift = fr->fshift[0];
1795 facel = _mm_set1_pd(fr->epsfac);
1796 charge = mdatoms->chargeA;
1797 nvdwtype = fr->ntype;
1798 vdwparam = fr->nbfp;
1799 vdwtype = mdatoms->typeA;
1801 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1802 ewtab = fr->ic->tabq_coul_FDV0;
1803 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1804 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1806 /* Setup water-specific parameters */
1807 inr = nlist->iinr[0];
1808 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1809 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1810 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1811 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1813 jq1 = _mm_set1_pd(charge[inr+1]);
1814 jq2 = _mm_set1_pd(charge[inr+2]);
1815 jq3 = _mm_set1_pd(charge[inr+3]);
1816 vdwjidx0A = 2*vdwtype[inr+0];
1817 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1818 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1819 qq11 = _mm_mul_pd(iq1,jq1);
1820 qq12 = _mm_mul_pd(iq1,jq2);
1821 qq13 = _mm_mul_pd(iq1,jq3);
1822 qq21 = _mm_mul_pd(iq2,jq1);
1823 qq22 = _mm_mul_pd(iq2,jq2);
1824 qq23 = _mm_mul_pd(iq2,jq3);
1825 qq31 = _mm_mul_pd(iq3,jq1);
1826 qq32 = _mm_mul_pd(iq3,jq2);
1827 qq33 = _mm_mul_pd(iq3,jq3);
1829 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1830 rcutoff_scalar = fr->rcoulomb;
1831 rcutoff = _mm_set1_pd(rcutoff_scalar);
1832 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1834 rswitch_scalar = fr->rcoulomb_switch;
1835 rswitch = _mm_set1_pd(rswitch_scalar);
1836 /* Setup switch parameters */
1837 d_scalar = rcutoff_scalar-rswitch_scalar;
1838 d = _mm_set1_pd(d_scalar);
1839 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1840 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1841 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1842 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1843 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1844 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1846 /* Avoid stupid compiler warnings */
1848 j_coord_offsetA = 0;
1849 j_coord_offsetB = 0;
1854 /* Start outer loop over neighborlists */
1855 for(iidx=0; iidx<nri; iidx++)
1857 /* Load shift vector for this list */
1858 i_shift_offset = DIM*shiftidx[iidx];
1860 /* Load limits for loop over neighbors */
1861 j_index_start = jindex[iidx];
1862 j_index_end = jindex[iidx+1];
1864 /* Get outer coordinate index */
1866 i_coord_offset = DIM*inr;
1868 /* Load i particle coords and add shift vector */
1869 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1870 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1872 fix0 = _mm_setzero_pd();
1873 fiy0 = _mm_setzero_pd();
1874 fiz0 = _mm_setzero_pd();
1875 fix1 = _mm_setzero_pd();
1876 fiy1 = _mm_setzero_pd();
1877 fiz1 = _mm_setzero_pd();
1878 fix2 = _mm_setzero_pd();
1879 fiy2 = _mm_setzero_pd();
1880 fiz2 = _mm_setzero_pd();
1881 fix3 = _mm_setzero_pd();
1882 fiy3 = _mm_setzero_pd();
1883 fiz3 = _mm_setzero_pd();
1885 /* Start inner kernel loop */
1886 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1889 /* Get j neighbor index, and coordinate index */
1891 jnrB = jjnr[jidx+1];
1892 j_coord_offsetA = DIM*jnrA;
1893 j_coord_offsetB = DIM*jnrB;
1895 /* load j atom coordinates */
1896 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1897 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1898 &jy2,&jz2,&jx3,&jy3,&jz3);
1900 /* Calculate displacement vector */
1901 dx00 = _mm_sub_pd(ix0,jx0);
1902 dy00 = _mm_sub_pd(iy0,jy0);
1903 dz00 = _mm_sub_pd(iz0,jz0);
1904 dx11 = _mm_sub_pd(ix1,jx1);
1905 dy11 = _mm_sub_pd(iy1,jy1);
1906 dz11 = _mm_sub_pd(iz1,jz1);
1907 dx12 = _mm_sub_pd(ix1,jx2);
1908 dy12 = _mm_sub_pd(iy1,jy2);
1909 dz12 = _mm_sub_pd(iz1,jz2);
1910 dx13 = _mm_sub_pd(ix1,jx3);
1911 dy13 = _mm_sub_pd(iy1,jy3);
1912 dz13 = _mm_sub_pd(iz1,jz3);
1913 dx21 = _mm_sub_pd(ix2,jx1);
1914 dy21 = _mm_sub_pd(iy2,jy1);
1915 dz21 = _mm_sub_pd(iz2,jz1);
1916 dx22 = _mm_sub_pd(ix2,jx2);
1917 dy22 = _mm_sub_pd(iy2,jy2);
1918 dz22 = _mm_sub_pd(iz2,jz2);
1919 dx23 = _mm_sub_pd(ix2,jx3);
1920 dy23 = _mm_sub_pd(iy2,jy3);
1921 dz23 = _mm_sub_pd(iz2,jz3);
1922 dx31 = _mm_sub_pd(ix3,jx1);
1923 dy31 = _mm_sub_pd(iy3,jy1);
1924 dz31 = _mm_sub_pd(iz3,jz1);
1925 dx32 = _mm_sub_pd(ix3,jx2);
1926 dy32 = _mm_sub_pd(iy3,jy2);
1927 dz32 = _mm_sub_pd(iz3,jz2);
1928 dx33 = _mm_sub_pd(ix3,jx3);
1929 dy33 = _mm_sub_pd(iy3,jy3);
1930 dz33 = _mm_sub_pd(iz3,jz3);
1932 /* Calculate squared distance and things based on it */
1933 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1934 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1935 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1936 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1937 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1938 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1939 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1940 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1941 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1942 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1944 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1945 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1946 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1947 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1948 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1949 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1950 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1951 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1952 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1953 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1955 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1956 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1957 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1958 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1959 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1960 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1961 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1962 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1963 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1964 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1966 fjx0 = _mm_setzero_pd();
1967 fjy0 = _mm_setzero_pd();
1968 fjz0 = _mm_setzero_pd();
1969 fjx1 = _mm_setzero_pd();
1970 fjy1 = _mm_setzero_pd();
1971 fjz1 = _mm_setzero_pd();
1972 fjx2 = _mm_setzero_pd();
1973 fjy2 = _mm_setzero_pd();
1974 fjz2 = _mm_setzero_pd();
1975 fjx3 = _mm_setzero_pd();
1976 fjy3 = _mm_setzero_pd();
1977 fjz3 = _mm_setzero_pd();
1979 /**************************
1980 * CALCULATE INTERACTIONS *
1981 **************************/
1983 if (gmx_mm_any_lt(rsq00,rcutoff2))
1986 r00 = _mm_mul_pd(rsq00,rinv00);
1988 /* LENNARD-JONES DISPERSION/REPULSION */
1990 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1991 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1992 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1993 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
1994 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1996 d = _mm_sub_pd(r00,rswitch);
1997 d = _mm_max_pd(d,_mm_setzero_pd());
1998 d2 = _mm_mul_pd(d,d);
1999 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2001 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2003 /* Evaluate switch function */
2004 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2005 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2006 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2010 fscal = _mm_and_pd(fscal,cutoff_mask);
2012 /* Update vectorial force */
2013 fix0 = _mm_macc_pd(dx00,fscal,fix0);
2014 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
2015 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
2017 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
2018 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
2019 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
2023 /**************************
2024 * CALCULATE INTERACTIONS *
2025 **************************/
2027 if (gmx_mm_any_lt(rsq11,rcutoff2))
2030 r11 = _mm_mul_pd(rsq11,rinv11);
2032 /* EWALD ELECTROSTATICS */
2034 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2035 ewrt = _mm_mul_pd(r11,ewtabscale);
2036 ewitab = _mm_cvttpd_epi32(ewrt);
2038 eweps = _mm_frcz_pd(ewrt);
2040 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2042 twoeweps = _mm_add_pd(eweps,eweps);
2043 ewitab = _mm_slli_epi32(ewitab,2);
2044 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2045 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2046 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2047 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2048 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2049 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2050 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2051 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2052 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2053 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2055 d = _mm_sub_pd(r11,rswitch);
2056 d = _mm_max_pd(d,_mm_setzero_pd());
2057 d2 = _mm_mul_pd(d,d);
2058 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2060 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2062 /* Evaluate switch function */
2063 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2064 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2065 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2069 fscal = _mm_and_pd(fscal,cutoff_mask);
2071 /* Update vectorial force */
2072 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2073 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2074 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2076 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2077 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2078 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2082 /**************************
2083 * CALCULATE INTERACTIONS *
2084 **************************/
2086 if (gmx_mm_any_lt(rsq12,rcutoff2))
2089 r12 = _mm_mul_pd(rsq12,rinv12);
2091 /* EWALD ELECTROSTATICS */
2093 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2094 ewrt = _mm_mul_pd(r12,ewtabscale);
2095 ewitab = _mm_cvttpd_epi32(ewrt);
2097 eweps = _mm_frcz_pd(ewrt);
2099 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2101 twoeweps = _mm_add_pd(eweps,eweps);
2102 ewitab = _mm_slli_epi32(ewitab,2);
2103 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2104 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2105 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2106 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2107 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2108 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2109 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2110 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2111 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2112 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2114 d = _mm_sub_pd(r12,rswitch);
2115 d = _mm_max_pd(d,_mm_setzero_pd());
2116 d2 = _mm_mul_pd(d,d);
2117 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2119 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2121 /* Evaluate switch function */
2122 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2123 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2124 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2128 fscal = _mm_and_pd(fscal,cutoff_mask);
2130 /* Update vectorial force */
2131 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2132 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2133 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2135 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2136 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2137 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2141 /**************************
2142 * CALCULATE INTERACTIONS *
2143 **************************/
2145 if (gmx_mm_any_lt(rsq13,rcutoff2))
2148 r13 = _mm_mul_pd(rsq13,rinv13);
2150 /* EWALD ELECTROSTATICS */
2152 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2153 ewrt = _mm_mul_pd(r13,ewtabscale);
2154 ewitab = _mm_cvttpd_epi32(ewrt);
2156 eweps = _mm_frcz_pd(ewrt);
2158 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2160 twoeweps = _mm_add_pd(eweps,eweps);
2161 ewitab = _mm_slli_epi32(ewitab,2);
2162 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2163 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2164 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2165 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2166 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2167 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2168 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2169 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2170 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2171 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2173 d = _mm_sub_pd(r13,rswitch);
2174 d = _mm_max_pd(d,_mm_setzero_pd());
2175 d2 = _mm_mul_pd(d,d);
2176 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2178 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2180 /* Evaluate switch function */
2181 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2182 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2183 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2187 fscal = _mm_and_pd(fscal,cutoff_mask);
2189 /* Update vectorial force */
2190 fix1 = _mm_macc_pd(dx13,fscal,fix1);
2191 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
2192 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
2194 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
2195 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
2196 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
2200 /**************************
2201 * CALCULATE INTERACTIONS *
2202 **************************/
2204 if (gmx_mm_any_lt(rsq21,rcutoff2))
2207 r21 = _mm_mul_pd(rsq21,rinv21);
2209 /* EWALD ELECTROSTATICS */
2211 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2212 ewrt = _mm_mul_pd(r21,ewtabscale);
2213 ewitab = _mm_cvttpd_epi32(ewrt);
2215 eweps = _mm_frcz_pd(ewrt);
2217 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2219 twoeweps = _mm_add_pd(eweps,eweps);
2220 ewitab = _mm_slli_epi32(ewitab,2);
2221 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2222 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2223 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2224 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2225 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2226 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2227 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2228 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2229 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2230 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2232 d = _mm_sub_pd(r21,rswitch);
2233 d = _mm_max_pd(d,_mm_setzero_pd());
2234 d2 = _mm_mul_pd(d,d);
2235 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2237 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2239 /* Evaluate switch function */
2240 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2241 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2242 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2246 fscal = _mm_and_pd(fscal,cutoff_mask);
2248 /* Update vectorial force */
2249 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2250 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2251 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2253 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2254 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2255 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2259 /**************************
2260 * CALCULATE INTERACTIONS *
2261 **************************/
2263 if (gmx_mm_any_lt(rsq22,rcutoff2))
2266 r22 = _mm_mul_pd(rsq22,rinv22);
2268 /* EWALD ELECTROSTATICS */
2270 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2271 ewrt = _mm_mul_pd(r22,ewtabscale);
2272 ewitab = _mm_cvttpd_epi32(ewrt);
2274 eweps = _mm_frcz_pd(ewrt);
2276 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2278 twoeweps = _mm_add_pd(eweps,eweps);
2279 ewitab = _mm_slli_epi32(ewitab,2);
2280 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2281 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2282 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2283 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2284 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2285 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2286 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2287 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2288 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2289 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2291 d = _mm_sub_pd(r22,rswitch);
2292 d = _mm_max_pd(d,_mm_setzero_pd());
2293 d2 = _mm_mul_pd(d,d);
2294 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2296 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2298 /* Evaluate switch function */
2299 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2300 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2301 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2305 fscal = _mm_and_pd(fscal,cutoff_mask);
2307 /* Update vectorial force */
2308 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2309 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2310 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2312 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2313 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2314 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2318 /**************************
2319 * CALCULATE INTERACTIONS *
2320 **************************/
2322 if (gmx_mm_any_lt(rsq23,rcutoff2))
2325 r23 = _mm_mul_pd(rsq23,rinv23);
2327 /* EWALD ELECTROSTATICS */
2329 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2330 ewrt = _mm_mul_pd(r23,ewtabscale);
2331 ewitab = _mm_cvttpd_epi32(ewrt);
2333 eweps = _mm_frcz_pd(ewrt);
2335 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2337 twoeweps = _mm_add_pd(eweps,eweps);
2338 ewitab = _mm_slli_epi32(ewitab,2);
2339 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2340 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2341 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2342 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2343 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2344 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2345 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2346 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2347 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2348 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2350 d = _mm_sub_pd(r23,rswitch);
2351 d = _mm_max_pd(d,_mm_setzero_pd());
2352 d2 = _mm_mul_pd(d,d);
2353 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2355 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2357 /* Evaluate switch function */
2358 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2359 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2360 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2364 fscal = _mm_and_pd(fscal,cutoff_mask);
2366 /* Update vectorial force */
2367 fix2 = _mm_macc_pd(dx23,fscal,fix2);
2368 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
2369 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
2371 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
2372 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
2373 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
2377 /**************************
2378 * CALCULATE INTERACTIONS *
2379 **************************/
2381 if (gmx_mm_any_lt(rsq31,rcutoff2))
2384 r31 = _mm_mul_pd(rsq31,rinv31);
2386 /* EWALD ELECTROSTATICS */
2388 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2389 ewrt = _mm_mul_pd(r31,ewtabscale);
2390 ewitab = _mm_cvttpd_epi32(ewrt);
2392 eweps = _mm_frcz_pd(ewrt);
2394 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2396 twoeweps = _mm_add_pd(eweps,eweps);
2397 ewitab = _mm_slli_epi32(ewitab,2);
2398 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2399 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2400 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2401 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2402 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2403 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2404 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2405 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2406 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2407 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2409 d = _mm_sub_pd(r31,rswitch);
2410 d = _mm_max_pd(d,_mm_setzero_pd());
2411 d2 = _mm_mul_pd(d,d);
2412 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2414 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2416 /* Evaluate switch function */
2417 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2418 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2419 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2423 fscal = _mm_and_pd(fscal,cutoff_mask);
2425 /* Update vectorial force */
2426 fix3 = _mm_macc_pd(dx31,fscal,fix3);
2427 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
2428 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
2430 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
2431 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
2432 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
2436 /**************************
2437 * CALCULATE INTERACTIONS *
2438 **************************/
2440 if (gmx_mm_any_lt(rsq32,rcutoff2))
2443 r32 = _mm_mul_pd(rsq32,rinv32);
2445 /* EWALD ELECTROSTATICS */
2447 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2448 ewrt = _mm_mul_pd(r32,ewtabscale);
2449 ewitab = _mm_cvttpd_epi32(ewrt);
2451 eweps = _mm_frcz_pd(ewrt);
2453 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2455 twoeweps = _mm_add_pd(eweps,eweps);
2456 ewitab = _mm_slli_epi32(ewitab,2);
2457 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2458 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2459 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2460 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2461 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2462 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2463 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2464 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2465 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2466 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2468 d = _mm_sub_pd(r32,rswitch);
2469 d = _mm_max_pd(d,_mm_setzero_pd());
2470 d2 = _mm_mul_pd(d,d);
2471 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2473 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2475 /* Evaluate switch function */
2476 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2477 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2478 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2482 fscal = _mm_and_pd(fscal,cutoff_mask);
2484 /* Update vectorial force */
2485 fix3 = _mm_macc_pd(dx32,fscal,fix3);
2486 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
2487 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
2489 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
2490 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
2491 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
2495 /**************************
2496 * CALCULATE INTERACTIONS *
2497 **************************/
2499 if (gmx_mm_any_lt(rsq33,rcutoff2))
2502 r33 = _mm_mul_pd(rsq33,rinv33);
2504 /* EWALD ELECTROSTATICS */
2506 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2507 ewrt = _mm_mul_pd(r33,ewtabscale);
2508 ewitab = _mm_cvttpd_epi32(ewrt);
2510 eweps = _mm_frcz_pd(ewrt);
2512 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2514 twoeweps = _mm_add_pd(eweps,eweps);
2515 ewitab = _mm_slli_epi32(ewitab,2);
2516 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2517 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2518 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2519 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2520 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2521 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2522 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2523 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2524 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2525 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2527 d = _mm_sub_pd(r33,rswitch);
2528 d = _mm_max_pd(d,_mm_setzero_pd());
2529 d2 = _mm_mul_pd(d,d);
2530 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2532 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2534 /* Evaluate switch function */
2535 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2536 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2537 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2541 fscal = _mm_and_pd(fscal,cutoff_mask);
2543 /* Update vectorial force */
2544 fix3 = _mm_macc_pd(dx33,fscal,fix3);
2545 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
2546 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
2548 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
2549 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
2550 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
2554 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2556 /* Inner loop uses 647 flops */
2559 if(jidx<j_index_end)
2563 j_coord_offsetA = DIM*jnrA;
2565 /* load j atom coordinates */
2566 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2567 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2568 &jy2,&jz2,&jx3,&jy3,&jz3);
2570 /* Calculate displacement vector */
2571 dx00 = _mm_sub_pd(ix0,jx0);
2572 dy00 = _mm_sub_pd(iy0,jy0);
2573 dz00 = _mm_sub_pd(iz0,jz0);
2574 dx11 = _mm_sub_pd(ix1,jx1);
2575 dy11 = _mm_sub_pd(iy1,jy1);
2576 dz11 = _mm_sub_pd(iz1,jz1);
2577 dx12 = _mm_sub_pd(ix1,jx2);
2578 dy12 = _mm_sub_pd(iy1,jy2);
2579 dz12 = _mm_sub_pd(iz1,jz2);
2580 dx13 = _mm_sub_pd(ix1,jx3);
2581 dy13 = _mm_sub_pd(iy1,jy3);
2582 dz13 = _mm_sub_pd(iz1,jz3);
2583 dx21 = _mm_sub_pd(ix2,jx1);
2584 dy21 = _mm_sub_pd(iy2,jy1);
2585 dz21 = _mm_sub_pd(iz2,jz1);
2586 dx22 = _mm_sub_pd(ix2,jx2);
2587 dy22 = _mm_sub_pd(iy2,jy2);
2588 dz22 = _mm_sub_pd(iz2,jz2);
2589 dx23 = _mm_sub_pd(ix2,jx3);
2590 dy23 = _mm_sub_pd(iy2,jy3);
2591 dz23 = _mm_sub_pd(iz2,jz3);
2592 dx31 = _mm_sub_pd(ix3,jx1);
2593 dy31 = _mm_sub_pd(iy3,jy1);
2594 dz31 = _mm_sub_pd(iz3,jz1);
2595 dx32 = _mm_sub_pd(ix3,jx2);
2596 dy32 = _mm_sub_pd(iy3,jy2);
2597 dz32 = _mm_sub_pd(iz3,jz2);
2598 dx33 = _mm_sub_pd(ix3,jx3);
2599 dy33 = _mm_sub_pd(iy3,jy3);
2600 dz33 = _mm_sub_pd(iz3,jz3);
2602 /* Calculate squared distance and things based on it */
2603 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2604 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2605 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2606 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2607 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2608 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2609 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2610 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2611 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2612 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2614 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2615 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2616 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2617 rinv13 = gmx_mm_invsqrt_pd(rsq13);
2618 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2619 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2620 rinv23 = gmx_mm_invsqrt_pd(rsq23);
2621 rinv31 = gmx_mm_invsqrt_pd(rsq31);
2622 rinv32 = gmx_mm_invsqrt_pd(rsq32);
2623 rinv33 = gmx_mm_invsqrt_pd(rsq33);
2625 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2626 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2627 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2628 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
2629 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2630 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2631 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
2632 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
2633 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
2634 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
2636 fjx0 = _mm_setzero_pd();
2637 fjy0 = _mm_setzero_pd();
2638 fjz0 = _mm_setzero_pd();
2639 fjx1 = _mm_setzero_pd();
2640 fjy1 = _mm_setzero_pd();
2641 fjz1 = _mm_setzero_pd();
2642 fjx2 = _mm_setzero_pd();
2643 fjy2 = _mm_setzero_pd();
2644 fjz2 = _mm_setzero_pd();
2645 fjx3 = _mm_setzero_pd();
2646 fjy3 = _mm_setzero_pd();
2647 fjz3 = _mm_setzero_pd();
2649 /**************************
2650 * CALCULATE INTERACTIONS *
2651 **************************/
2653 if (gmx_mm_any_lt(rsq00,rcutoff2))
2656 r00 = _mm_mul_pd(rsq00,rinv00);
2658 /* LENNARD-JONES DISPERSION/REPULSION */
2660 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2661 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2662 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2663 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
2664 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2666 d = _mm_sub_pd(r00,rswitch);
2667 d = _mm_max_pd(d,_mm_setzero_pd());
2668 d2 = _mm_mul_pd(d,d);
2669 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2671 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2673 /* Evaluate switch function */
2674 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2675 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2676 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2680 fscal = _mm_and_pd(fscal,cutoff_mask);
2682 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2684 /* Update vectorial force */
2685 fix0 = _mm_macc_pd(dx00,fscal,fix0);
2686 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
2687 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
2689 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
2690 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
2691 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
2695 /**************************
2696 * CALCULATE INTERACTIONS *
2697 **************************/
2699 if (gmx_mm_any_lt(rsq11,rcutoff2))
2702 r11 = _mm_mul_pd(rsq11,rinv11);
2704 /* EWALD ELECTROSTATICS */
2706 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2707 ewrt = _mm_mul_pd(r11,ewtabscale);
2708 ewitab = _mm_cvttpd_epi32(ewrt);
2710 eweps = _mm_frcz_pd(ewrt);
2712 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2714 twoeweps = _mm_add_pd(eweps,eweps);
2715 ewitab = _mm_slli_epi32(ewitab,2);
2716 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2717 ewtabD = _mm_setzero_pd();
2718 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2719 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2720 ewtabFn = _mm_setzero_pd();
2721 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2722 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2723 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2724 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2725 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2727 d = _mm_sub_pd(r11,rswitch);
2728 d = _mm_max_pd(d,_mm_setzero_pd());
2729 d2 = _mm_mul_pd(d,d);
2730 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2732 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2734 /* Evaluate switch function */
2735 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2736 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2737 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2741 fscal = _mm_and_pd(fscal,cutoff_mask);
2743 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2745 /* Update vectorial force */
2746 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2747 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2748 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2750 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2751 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2752 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2756 /**************************
2757 * CALCULATE INTERACTIONS *
2758 **************************/
2760 if (gmx_mm_any_lt(rsq12,rcutoff2))
2763 r12 = _mm_mul_pd(rsq12,rinv12);
2765 /* EWALD ELECTROSTATICS */
2767 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2768 ewrt = _mm_mul_pd(r12,ewtabscale);
2769 ewitab = _mm_cvttpd_epi32(ewrt);
2771 eweps = _mm_frcz_pd(ewrt);
2773 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2775 twoeweps = _mm_add_pd(eweps,eweps);
2776 ewitab = _mm_slli_epi32(ewitab,2);
2777 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2778 ewtabD = _mm_setzero_pd();
2779 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2780 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2781 ewtabFn = _mm_setzero_pd();
2782 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2783 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2784 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2785 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2786 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2788 d = _mm_sub_pd(r12,rswitch);
2789 d = _mm_max_pd(d,_mm_setzero_pd());
2790 d2 = _mm_mul_pd(d,d);
2791 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2793 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2795 /* Evaluate switch function */
2796 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2797 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2798 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2802 fscal = _mm_and_pd(fscal,cutoff_mask);
2804 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2806 /* Update vectorial force */
2807 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2808 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2809 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2811 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2812 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2813 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2817 /**************************
2818 * CALCULATE INTERACTIONS *
2819 **************************/
2821 if (gmx_mm_any_lt(rsq13,rcutoff2))
2824 r13 = _mm_mul_pd(rsq13,rinv13);
2826 /* EWALD ELECTROSTATICS */
2828 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2829 ewrt = _mm_mul_pd(r13,ewtabscale);
2830 ewitab = _mm_cvttpd_epi32(ewrt);
2832 eweps = _mm_frcz_pd(ewrt);
2834 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2836 twoeweps = _mm_add_pd(eweps,eweps);
2837 ewitab = _mm_slli_epi32(ewitab,2);
2838 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2839 ewtabD = _mm_setzero_pd();
2840 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2841 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2842 ewtabFn = _mm_setzero_pd();
2843 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2844 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2845 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2846 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2847 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2849 d = _mm_sub_pd(r13,rswitch);
2850 d = _mm_max_pd(d,_mm_setzero_pd());
2851 d2 = _mm_mul_pd(d,d);
2852 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2854 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2856 /* Evaluate switch function */
2857 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2858 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2859 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2863 fscal = _mm_and_pd(fscal,cutoff_mask);
2865 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2867 /* Update vectorial force */
2868 fix1 = _mm_macc_pd(dx13,fscal,fix1);
2869 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
2870 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
2872 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
2873 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
2874 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
2878 /**************************
2879 * CALCULATE INTERACTIONS *
2880 **************************/
2882 if (gmx_mm_any_lt(rsq21,rcutoff2))
2885 r21 = _mm_mul_pd(rsq21,rinv21);
2887 /* EWALD ELECTROSTATICS */
2889 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2890 ewrt = _mm_mul_pd(r21,ewtabscale);
2891 ewitab = _mm_cvttpd_epi32(ewrt);
2893 eweps = _mm_frcz_pd(ewrt);
2895 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2897 twoeweps = _mm_add_pd(eweps,eweps);
2898 ewitab = _mm_slli_epi32(ewitab,2);
2899 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2900 ewtabD = _mm_setzero_pd();
2901 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2902 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2903 ewtabFn = _mm_setzero_pd();
2904 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2905 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2906 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2907 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2908 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2910 d = _mm_sub_pd(r21,rswitch);
2911 d = _mm_max_pd(d,_mm_setzero_pd());
2912 d2 = _mm_mul_pd(d,d);
2913 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2915 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2917 /* Evaluate switch function */
2918 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2919 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2920 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2924 fscal = _mm_and_pd(fscal,cutoff_mask);
2926 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2928 /* Update vectorial force */
2929 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2930 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2931 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2933 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2934 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2935 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2939 /**************************
2940 * CALCULATE INTERACTIONS *
2941 **************************/
2943 if (gmx_mm_any_lt(rsq22,rcutoff2))
2946 r22 = _mm_mul_pd(rsq22,rinv22);
2948 /* EWALD ELECTROSTATICS */
2950 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2951 ewrt = _mm_mul_pd(r22,ewtabscale);
2952 ewitab = _mm_cvttpd_epi32(ewrt);
2954 eweps = _mm_frcz_pd(ewrt);
2956 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2958 twoeweps = _mm_add_pd(eweps,eweps);
2959 ewitab = _mm_slli_epi32(ewitab,2);
2960 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2961 ewtabD = _mm_setzero_pd();
2962 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2963 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2964 ewtabFn = _mm_setzero_pd();
2965 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2966 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2967 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2968 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2969 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2971 d = _mm_sub_pd(r22,rswitch);
2972 d = _mm_max_pd(d,_mm_setzero_pd());
2973 d2 = _mm_mul_pd(d,d);
2974 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2976 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2978 /* Evaluate switch function */
2979 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2980 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2981 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2985 fscal = _mm_and_pd(fscal,cutoff_mask);
2987 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2989 /* Update vectorial force */
2990 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2991 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2992 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2994 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2995 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2996 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
3000 /**************************
3001 * CALCULATE INTERACTIONS *
3002 **************************/
3004 if (gmx_mm_any_lt(rsq23,rcutoff2))
3007 r23 = _mm_mul_pd(rsq23,rinv23);
3009 /* EWALD ELECTROSTATICS */
3011 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3012 ewrt = _mm_mul_pd(r23,ewtabscale);
3013 ewitab = _mm_cvttpd_epi32(ewrt);
3015 eweps = _mm_frcz_pd(ewrt);
3017 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3019 twoeweps = _mm_add_pd(eweps,eweps);
3020 ewitab = _mm_slli_epi32(ewitab,2);
3021 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3022 ewtabD = _mm_setzero_pd();
3023 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3024 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3025 ewtabFn = _mm_setzero_pd();
3026 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3027 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
3028 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3029 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
3030 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
3032 d = _mm_sub_pd(r23,rswitch);
3033 d = _mm_max_pd(d,_mm_setzero_pd());
3034 d2 = _mm_mul_pd(d,d);
3035 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3037 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3039 /* Evaluate switch function */
3040 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3041 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
3042 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
3046 fscal = _mm_and_pd(fscal,cutoff_mask);
3048 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3050 /* Update vectorial force */
3051 fix2 = _mm_macc_pd(dx23,fscal,fix2);
3052 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
3053 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
3055 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
3056 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
3057 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
3061 /**************************
3062 * CALCULATE INTERACTIONS *
3063 **************************/
3065 if (gmx_mm_any_lt(rsq31,rcutoff2))
3068 r31 = _mm_mul_pd(rsq31,rinv31);
3070 /* EWALD ELECTROSTATICS */
3072 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3073 ewrt = _mm_mul_pd(r31,ewtabscale);
3074 ewitab = _mm_cvttpd_epi32(ewrt);
3076 eweps = _mm_frcz_pd(ewrt);
3078 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3080 twoeweps = _mm_add_pd(eweps,eweps);
3081 ewitab = _mm_slli_epi32(ewitab,2);
3082 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3083 ewtabD = _mm_setzero_pd();
3084 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3085 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3086 ewtabFn = _mm_setzero_pd();
3087 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3088 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
3089 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3090 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
3091 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
3093 d = _mm_sub_pd(r31,rswitch);
3094 d = _mm_max_pd(d,_mm_setzero_pd());
3095 d2 = _mm_mul_pd(d,d);
3096 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3098 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3100 /* Evaluate switch function */
3101 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3102 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
3103 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
3107 fscal = _mm_and_pd(fscal,cutoff_mask);
3109 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3111 /* Update vectorial force */
3112 fix3 = _mm_macc_pd(dx31,fscal,fix3);
3113 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
3114 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
3116 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
3117 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
3118 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
3122 /**************************
3123 * CALCULATE INTERACTIONS *
3124 **************************/
3126 if (gmx_mm_any_lt(rsq32,rcutoff2))
3129 r32 = _mm_mul_pd(rsq32,rinv32);
3131 /* EWALD ELECTROSTATICS */
3133 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3134 ewrt = _mm_mul_pd(r32,ewtabscale);
3135 ewitab = _mm_cvttpd_epi32(ewrt);
3137 eweps = _mm_frcz_pd(ewrt);
3139 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3141 twoeweps = _mm_add_pd(eweps,eweps);
3142 ewitab = _mm_slli_epi32(ewitab,2);
3143 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3144 ewtabD = _mm_setzero_pd();
3145 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3146 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3147 ewtabFn = _mm_setzero_pd();
3148 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3149 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
3150 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3151 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
3152 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
3154 d = _mm_sub_pd(r32,rswitch);
3155 d = _mm_max_pd(d,_mm_setzero_pd());
3156 d2 = _mm_mul_pd(d,d);
3157 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3159 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3161 /* Evaluate switch function */
3162 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3163 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
3164 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
3168 fscal = _mm_and_pd(fscal,cutoff_mask);
3170 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3172 /* Update vectorial force */
3173 fix3 = _mm_macc_pd(dx32,fscal,fix3);
3174 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
3175 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
3177 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
3178 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
3179 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
3183 /**************************
3184 * CALCULATE INTERACTIONS *
3185 **************************/
3187 if (gmx_mm_any_lt(rsq33,rcutoff2))
3190 r33 = _mm_mul_pd(rsq33,rinv33);
3192 /* EWALD ELECTROSTATICS */
3194 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3195 ewrt = _mm_mul_pd(r33,ewtabscale);
3196 ewitab = _mm_cvttpd_epi32(ewrt);
3198 eweps = _mm_frcz_pd(ewrt);
3200 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3202 twoeweps = _mm_add_pd(eweps,eweps);
3203 ewitab = _mm_slli_epi32(ewitab,2);
3204 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3205 ewtabD = _mm_setzero_pd();
3206 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3207 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3208 ewtabFn = _mm_setzero_pd();
3209 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3210 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
3211 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3212 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
3213 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
3215 d = _mm_sub_pd(r33,rswitch);
3216 d = _mm_max_pd(d,_mm_setzero_pd());
3217 d2 = _mm_mul_pd(d,d);
3218 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3220 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3222 /* Evaluate switch function */
3223 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3224 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
3225 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
3229 fscal = _mm_and_pd(fscal,cutoff_mask);
3231 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3233 /* Update vectorial force */
3234 fix3 = _mm_macc_pd(dx33,fscal,fix3);
3235 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
3236 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
3238 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
3239 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
3240 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
3244 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3246 /* Inner loop uses 647 flops */
3249 /* End of innermost loop */
3251 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3252 f+i_coord_offset,fshift+i_shift_offset);
3254 /* Increment number of inner iterations */
3255 inneriter += j_index_end - j_index_start;
3257 /* Outer loop uses 24 flops */
3260 /* Increment number of outer iterations */
3263 /* Update outer/inner flops */
3265 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*647);