2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
96 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
98 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
99 real rswitch_scalar,d_scalar;
100 __m128d dummy_mask,cutoff_mask;
101 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
102 __m128d one = _mm_set1_pd(1.0);
103 __m128d two = _mm_set1_pd(2.0);
109 jindex = nlist->jindex;
111 shiftidx = nlist->shift;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm_set1_pd(fr->epsfac);
116 charge = mdatoms->chargeA;
117 nvdwtype = fr->ntype;
119 vdwtype = mdatoms->typeA;
121 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
122 ewtab = fr->ic->tabq_coul_FDV0;
123 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
124 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
126 /* Setup water-specific parameters */
127 inr = nlist->iinr[0];
128 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
129 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
130 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
131 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
133 jq0 = _mm_set1_pd(charge[inr+0]);
134 jq1 = _mm_set1_pd(charge[inr+1]);
135 jq2 = _mm_set1_pd(charge[inr+2]);
136 vdwjidx0A = 2*vdwtype[inr+0];
137 qq00 = _mm_mul_pd(iq0,jq0);
138 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
139 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
140 qq01 = _mm_mul_pd(iq0,jq1);
141 qq02 = _mm_mul_pd(iq0,jq2);
142 qq10 = _mm_mul_pd(iq1,jq0);
143 qq11 = _mm_mul_pd(iq1,jq1);
144 qq12 = _mm_mul_pd(iq1,jq2);
145 qq20 = _mm_mul_pd(iq2,jq0);
146 qq21 = _mm_mul_pd(iq2,jq1);
147 qq22 = _mm_mul_pd(iq2,jq2);
149 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150 rcutoff_scalar = fr->rcoulomb;
151 rcutoff = _mm_set1_pd(rcutoff_scalar);
152 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
154 rswitch_scalar = fr->rcoulomb_switch;
155 rswitch = _mm_set1_pd(rswitch_scalar);
156 /* Setup switch parameters */
157 d_scalar = rcutoff_scalar-rswitch_scalar;
158 d = _mm_set1_pd(d_scalar);
159 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
160 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
161 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
162 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
163 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
166 /* Avoid stupid compiler warnings */
174 /* Start outer loop over neighborlists */
175 for(iidx=0; iidx<nri; iidx++)
177 /* Load shift vector for this list */
178 i_shift_offset = DIM*shiftidx[iidx];
180 /* Load limits for loop over neighbors */
181 j_index_start = jindex[iidx];
182 j_index_end = jindex[iidx+1];
184 /* Get outer coordinate index */
186 i_coord_offset = DIM*inr;
188 /* Load i particle coords and add shift vector */
189 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
190 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
192 fix0 = _mm_setzero_pd();
193 fiy0 = _mm_setzero_pd();
194 fiz0 = _mm_setzero_pd();
195 fix1 = _mm_setzero_pd();
196 fiy1 = _mm_setzero_pd();
197 fiz1 = _mm_setzero_pd();
198 fix2 = _mm_setzero_pd();
199 fiy2 = _mm_setzero_pd();
200 fiz2 = _mm_setzero_pd();
202 /* Reset potential sums */
203 velecsum = _mm_setzero_pd();
204 vvdwsum = _mm_setzero_pd();
206 /* Start inner kernel loop */
207 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
210 /* Get j neighbor index, and coordinate index */
213 j_coord_offsetA = DIM*jnrA;
214 j_coord_offsetB = DIM*jnrB;
216 /* load j atom coordinates */
217 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
218 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
220 /* Calculate displacement vector */
221 dx00 = _mm_sub_pd(ix0,jx0);
222 dy00 = _mm_sub_pd(iy0,jy0);
223 dz00 = _mm_sub_pd(iz0,jz0);
224 dx01 = _mm_sub_pd(ix0,jx1);
225 dy01 = _mm_sub_pd(iy0,jy1);
226 dz01 = _mm_sub_pd(iz0,jz1);
227 dx02 = _mm_sub_pd(ix0,jx2);
228 dy02 = _mm_sub_pd(iy0,jy2);
229 dz02 = _mm_sub_pd(iz0,jz2);
230 dx10 = _mm_sub_pd(ix1,jx0);
231 dy10 = _mm_sub_pd(iy1,jy0);
232 dz10 = _mm_sub_pd(iz1,jz0);
233 dx11 = _mm_sub_pd(ix1,jx1);
234 dy11 = _mm_sub_pd(iy1,jy1);
235 dz11 = _mm_sub_pd(iz1,jz1);
236 dx12 = _mm_sub_pd(ix1,jx2);
237 dy12 = _mm_sub_pd(iy1,jy2);
238 dz12 = _mm_sub_pd(iz1,jz2);
239 dx20 = _mm_sub_pd(ix2,jx0);
240 dy20 = _mm_sub_pd(iy2,jy0);
241 dz20 = _mm_sub_pd(iz2,jz0);
242 dx21 = _mm_sub_pd(ix2,jx1);
243 dy21 = _mm_sub_pd(iy2,jy1);
244 dz21 = _mm_sub_pd(iz2,jz1);
245 dx22 = _mm_sub_pd(ix2,jx2);
246 dy22 = _mm_sub_pd(iy2,jy2);
247 dz22 = _mm_sub_pd(iz2,jz2);
249 /* Calculate squared distance and things based on it */
250 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
251 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
252 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
253 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
254 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
255 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
256 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
257 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
258 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
260 rinv00 = gmx_mm_invsqrt_pd(rsq00);
261 rinv01 = gmx_mm_invsqrt_pd(rsq01);
262 rinv02 = gmx_mm_invsqrt_pd(rsq02);
263 rinv10 = gmx_mm_invsqrt_pd(rsq10);
264 rinv11 = gmx_mm_invsqrt_pd(rsq11);
265 rinv12 = gmx_mm_invsqrt_pd(rsq12);
266 rinv20 = gmx_mm_invsqrt_pd(rsq20);
267 rinv21 = gmx_mm_invsqrt_pd(rsq21);
268 rinv22 = gmx_mm_invsqrt_pd(rsq22);
270 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
271 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
272 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
273 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
274 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
275 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
276 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
277 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
278 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
280 fjx0 = _mm_setzero_pd();
281 fjy0 = _mm_setzero_pd();
282 fjz0 = _mm_setzero_pd();
283 fjx1 = _mm_setzero_pd();
284 fjy1 = _mm_setzero_pd();
285 fjz1 = _mm_setzero_pd();
286 fjx2 = _mm_setzero_pd();
287 fjy2 = _mm_setzero_pd();
288 fjz2 = _mm_setzero_pd();
290 /**************************
291 * CALCULATE INTERACTIONS *
292 **************************/
294 if (gmx_mm_any_lt(rsq00,rcutoff2))
297 r00 = _mm_mul_pd(rsq00,rinv00);
299 /* EWALD ELECTROSTATICS */
301 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
302 ewrt = _mm_mul_pd(r00,ewtabscale);
303 ewitab = _mm_cvttpd_epi32(ewrt);
304 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
305 ewitab = _mm_slli_epi32(ewitab,2);
306 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
307 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
308 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
309 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
310 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
311 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
312 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
313 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
314 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
315 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
317 /* LENNARD-JONES DISPERSION/REPULSION */
319 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
320 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
321 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
322 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
323 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
325 d = _mm_sub_pd(r00,rswitch);
326 d = _mm_max_pd(d,_mm_setzero_pd());
327 d2 = _mm_mul_pd(d,d);
328 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
330 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
332 /* Evaluate switch function */
333 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
334 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
335 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
336 velec = _mm_mul_pd(velec,sw);
337 vvdw = _mm_mul_pd(vvdw,sw);
338 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
340 /* Update potential sum for this i atom from the interaction with this j atom. */
341 velec = _mm_and_pd(velec,cutoff_mask);
342 velecsum = _mm_add_pd(velecsum,velec);
343 vvdw = _mm_and_pd(vvdw,cutoff_mask);
344 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
346 fscal = _mm_add_pd(felec,fvdw);
348 fscal = _mm_and_pd(fscal,cutoff_mask);
350 /* Calculate temporary vectorial force */
351 tx = _mm_mul_pd(fscal,dx00);
352 ty = _mm_mul_pd(fscal,dy00);
353 tz = _mm_mul_pd(fscal,dz00);
355 /* Update vectorial force */
356 fix0 = _mm_add_pd(fix0,tx);
357 fiy0 = _mm_add_pd(fiy0,ty);
358 fiz0 = _mm_add_pd(fiz0,tz);
360 fjx0 = _mm_add_pd(fjx0,tx);
361 fjy0 = _mm_add_pd(fjy0,ty);
362 fjz0 = _mm_add_pd(fjz0,tz);
366 /**************************
367 * CALCULATE INTERACTIONS *
368 **************************/
370 if (gmx_mm_any_lt(rsq01,rcutoff2))
373 r01 = _mm_mul_pd(rsq01,rinv01);
375 /* EWALD ELECTROSTATICS */
377 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
378 ewrt = _mm_mul_pd(r01,ewtabscale);
379 ewitab = _mm_cvttpd_epi32(ewrt);
380 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
381 ewitab = _mm_slli_epi32(ewitab,2);
382 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
383 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
384 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
385 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
386 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
387 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
388 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
389 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
390 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
391 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
393 d = _mm_sub_pd(r01,rswitch);
394 d = _mm_max_pd(d,_mm_setzero_pd());
395 d2 = _mm_mul_pd(d,d);
396 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
398 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
400 /* Evaluate switch function */
401 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
402 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
403 velec = _mm_mul_pd(velec,sw);
404 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
406 /* Update potential sum for this i atom from the interaction with this j atom. */
407 velec = _mm_and_pd(velec,cutoff_mask);
408 velecsum = _mm_add_pd(velecsum,velec);
412 fscal = _mm_and_pd(fscal,cutoff_mask);
414 /* Calculate temporary vectorial force */
415 tx = _mm_mul_pd(fscal,dx01);
416 ty = _mm_mul_pd(fscal,dy01);
417 tz = _mm_mul_pd(fscal,dz01);
419 /* Update vectorial force */
420 fix0 = _mm_add_pd(fix0,tx);
421 fiy0 = _mm_add_pd(fiy0,ty);
422 fiz0 = _mm_add_pd(fiz0,tz);
424 fjx1 = _mm_add_pd(fjx1,tx);
425 fjy1 = _mm_add_pd(fjy1,ty);
426 fjz1 = _mm_add_pd(fjz1,tz);
430 /**************************
431 * CALCULATE INTERACTIONS *
432 **************************/
434 if (gmx_mm_any_lt(rsq02,rcutoff2))
437 r02 = _mm_mul_pd(rsq02,rinv02);
439 /* EWALD ELECTROSTATICS */
441 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
442 ewrt = _mm_mul_pd(r02,ewtabscale);
443 ewitab = _mm_cvttpd_epi32(ewrt);
444 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
445 ewitab = _mm_slli_epi32(ewitab,2);
446 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
447 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
448 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
449 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
450 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
451 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
452 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
453 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
454 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
455 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
457 d = _mm_sub_pd(r02,rswitch);
458 d = _mm_max_pd(d,_mm_setzero_pd());
459 d2 = _mm_mul_pd(d,d);
460 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
462 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
464 /* Evaluate switch function */
465 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
466 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
467 velec = _mm_mul_pd(velec,sw);
468 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velec = _mm_and_pd(velec,cutoff_mask);
472 velecsum = _mm_add_pd(velecsum,velec);
476 fscal = _mm_and_pd(fscal,cutoff_mask);
478 /* Calculate temporary vectorial force */
479 tx = _mm_mul_pd(fscal,dx02);
480 ty = _mm_mul_pd(fscal,dy02);
481 tz = _mm_mul_pd(fscal,dz02);
483 /* Update vectorial force */
484 fix0 = _mm_add_pd(fix0,tx);
485 fiy0 = _mm_add_pd(fiy0,ty);
486 fiz0 = _mm_add_pd(fiz0,tz);
488 fjx2 = _mm_add_pd(fjx2,tx);
489 fjy2 = _mm_add_pd(fjy2,ty);
490 fjz2 = _mm_add_pd(fjz2,tz);
494 /**************************
495 * CALCULATE INTERACTIONS *
496 **************************/
498 if (gmx_mm_any_lt(rsq10,rcutoff2))
501 r10 = _mm_mul_pd(rsq10,rinv10);
503 /* EWALD ELECTROSTATICS */
505 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
506 ewrt = _mm_mul_pd(r10,ewtabscale);
507 ewitab = _mm_cvttpd_epi32(ewrt);
508 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
509 ewitab = _mm_slli_epi32(ewitab,2);
510 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
511 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
512 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
513 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
514 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
515 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
516 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
517 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
518 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
519 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
521 d = _mm_sub_pd(r10,rswitch);
522 d = _mm_max_pd(d,_mm_setzero_pd());
523 d2 = _mm_mul_pd(d,d);
524 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
526 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
528 /* Evaluate switch function */
529 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
530 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
531 velec = _mm_mul_pd(velec,sw);
532 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 velec = _mm_and_pd(velec,cutoff_mask);
536 velecsum = _mm_add_pd(velecsum,velec);
540 fscal = _mm_and_pd(fscal,cutoff_mask);
542 /* Calculate temporary vectorial force */
543 tx = _mm_mul_pd(fscal,dx10);
544 ty = _mm_mul_pd(fscal,dy10);
545 tz = _mm_mul_pd(fscal,dz10);
547 /* Update vectorial force */
548 fix1 = _mm_add_pd(fix1,tx);
549 fiy1 = _mm_add_pd(fiy1,ty);
550 fiz1 = _mm_add_pd(fiz1,tz);
552 fjx0 = _mm_add_pd(fjx0,tx);
553 fjy0 = _mm_add_pd(fjy0,ty);
554 fjz0 = _mm_add_pd(fjz0,tz);
558 /**************************
559 * CALCULATE INTERACTIONS *
560 **************************/
562 if (gmx_mm_any_lt(rsq11,rcutoff2))
565 r11 = _mm_mul_pd(rsq11,rinv11);
567 /* EWALD ELECTROSTATICS */
569 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
570 ewrt = _mm_mul_pd(r11,ewtabscale);
571 ewitab = _mm_cvttpd_epi32(ewrt);
572 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
573 ewitab = _mm_slli_epi32(ewitab,2);
574 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
575 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
576 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
577 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
578 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
579 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
580 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
581 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
582 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
583 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
585 d = _mm_sub_pd(r11,rswitch);
586 d = _mm_max_pd(d,_mm_setzero_pd());
587 d2 = _mm_mul_pd(d,d);
588 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
590 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
592 /* Evaluate switch function */
593 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
594 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
595 velec = _mm_mul_pd(velec,sw);
596 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
598 /* Update potential sum for this i atom from the interaction with this j atom. */
599 velec = _mm_and_pd(velec,cutoff_mask);
600 velecsum = _mm_add_pd(velecsum,velec);
604 fscal = _mm_and_pd(fscal,cutoff_mask);
606 /* Calculate temporary vectorial force */
607 tx = _mm_mul_pd(fscal,dx11);
608 ty = _mm_mul_pd(fscal,dy11);
609 tz = _mm_mul_pd(fscal,dz11);
611 /* Update vectorial force */
612 fix1 = _mm_add_pd(fix1,tx);
613 fiy1 = _mm_add_pd(fiy1,ty);
614 fiz1 = _mm_add_pd(fiz1,tz);
616 fjx1 = _mm_add_pd(fjx1,tx);
617 fjy1 = _mm_add_pd(fjy1,ty);
618 fjz1 = _mm_add_pd(fjz1,tz);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 if (gmx_mm_any_lt(rsq12,rcutoff2))
629 r12 = _mm_mul_pd(rsq12,rinv12);
631 /* EWALD ELECTROSTATICS */
633 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
634 ewrt = _mm_mul_pd(r12,ewtabscale);
635 ewitab = _mm_cvttpd_epi32(ewrt);
636 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
637 ewitab = _mm_slli_epi32(ewitab,2);
638 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
639 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
640 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
641 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
642 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
643 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
644 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
645 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
646 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
647 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
649 d = _mm_sub_pd(r12,rswitch);
650 d = _mm_max_pd(d,_mm_setzero_pd());
651 d2 = _mm_mul_pd(d,d);
652 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
654 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
656 /* Evaluate switch function */
657 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
658 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
659 velec = _mm_mul_pd(velec,sw);
660 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
662 /* Update potential sum for this i atom from the interaction with this j atom. */
663 velec = _mm_and_pd(velec,cutoff_mask);
664 velecsum = _mm_add_pd(velecsum,velec);
668 fscal = _mm_and_pd(fscal,cutoff_mask);
670 /* Calculate temporary vectorial force */
671 tx = _mm_mul_pd(fscal,dx12);
672 ty = _mm_mul_pd(fscal,dy12);
673 tz = _mm_mul_pd(fscal,dz12);
675 /* Update vectorial force */
676 fix1 = _mm_add_pd(fix1,tx);
677 fiy1 = _mm_add_pd(fiy1,ty);
678 fiz1 = _mm_add_pd(fiz1,tz);
680 fjx2 = _mm_add_pd(fjx2,tx);
681 fjy2 = _mm_add_pd(fjy2,ty);
682 fjz2 = _mm_add_pd(fjz2,tz);
686 /**************************
687 * CALCULATE INTERACTIONS *
688 **************************/
690 if (gmx_mm_any_lt(rsq20,rcutoff2))
693 r20 = _mm_mul_pd(rsq20,rinv20);
695 /* EWALD ELECTROSTATICS */
697 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
698 ewrt = _mm_mul_pd(r20,ewtabscale);
699 ewitab = _mm_cvttpd_epi32(ewrt);
700 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
701 ewitab = _mm_slli_epi32(ewitab,2);
702 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
703 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
704 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
705 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
706 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
707 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
708 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
709 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
710 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
711 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
713 d = _mm_sub_pd(r20,rswitch);
714 d = _mm_max_pd(d,_mm_setzero_pd());
715 d2 = _mm_mul_pd(d,d);
716 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
718 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
720 /* Evaluate switch function */
721 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
722 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
723 velec = _mm_mul_pd(velec,sw);
724 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
726 /* Update potential sum for this i atom from the interaction with this j atom. */
727 velec = _mm_and_pd(velec,cutoff_mask);
728 velecsum = _mm_add_pd(velecsum,velec);
732 fscal = _mm_and_pd(fscal,cutoff_mask);
734 /* Calculate temporary vectorial force */
735 tx = _mm_mul_pd(fscal,dx20);
736 ty = _mm_mul_pd(fscal,dy20);
737 tz = _mm_mul_pd(fscal,dz20);
739 /* Update vectorial force */
740 fix2 = _mm_add_pd(fix2,tx);
741 fiy2 = _mm_add_pd(fiy2,ty);
742 fiz2 = _mm_add_pd(fiz2,tz);
744 fjx0 = _mm_add_pd(fjx0,tx);
745 fjy0 = _mm_add_pd(fjy0,ty);
746 fjz0 = _mm_add_pd(fjz0,tz);
750 /**************************
751 * CALCULATE INTERACTIONS *
752 **************************/
754 if (gmx_mm_any_lt(rsq21,rcutoff2))
757 r21 = _mm_mul_pd(rsq21,rinv21);
759 /* EWALD ELECTROSTATICS */
761 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
762 ewrt = _mm_mul_pd(r21,ewtabscale);
763 ewitab = _mm_cvttpd_epi32(ewrt);
764 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
765 ewitab = _mm_slli_epi32(ewitab,2);
766 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
767 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
768 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
769 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
770 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
771 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
772 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
773 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
774 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
775 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
777 d = _mm_sub_pd(r21,rswitch);
778 d = _mm_max_pd(d,_mm_setzero_pd());
779 d2 = _mm_mul_pd(d,d);
780 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
782 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
784 /* Evaluate switch function */
785 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
786 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
787 velec = _mm_mul_pd(velec,sw);
788 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
790 /* Update potential sum for this i atom from the interaction with this j atom. */
791 velec = _mm_and_pd(velec,cutoff_mask);
792 velecsum = _mm_add_pd(velecsum,velec);
796 fscal = _mm_and_pd(fscal,cutoff_mask);
798 /* Calculate temporary vectorial force */
799 tx = _mm_mul_pd(fscal,dx21);
800 ty = _mm_mul_pd(fscal,dy21);
801 tz = _mm_mul_pd(fscal,dz21);
803 /* Update vectorial force */
804 fix2 = _mm_add_pd(fix2,tx);
805 fiy2 = _mm_add_pd(fiy2,ty);
806 fiz2 = _mm_add_pd(fiz2,tz);
808 fjx1 = _mm_add_pd(fjx1,tx);
809 fjy1 = _mm_add_pd(fjy1,ty);
810 fjz1 = _mm_add_pd(fjz1,tz);
814 /**************************
815 * CALCULATE INTERACTIONS *
816 **************************/
818 if (gmx_mm_any_lt(rsq22,rcutoff2))
821 r22 = _mm_mul_pd(rsq22,rinv22);
823 /* EWALD ELECTROSTATICS */
825 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
826 ewrt = _mm_mul_pd(r22,ewtabscale);
827 ewitab = _mm_cvttpd_epi32(ewrt);
828 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
829 ewitab = _mm_slli_epi32(ewitab,2);
830 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
831 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
832 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
833 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
834 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
835 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
836 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
837 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
838 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
839 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
841 d = _mm_sub_pd(r22,rswitch);
842 d = _mm_max_pd(d,_mm_setzero_pd());
843 d2 = _mm_mul_pd(d,d);
844 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
846 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
848 /* Evaluate switch function */
849 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
850 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
851 velec = _mm_mul_pd(velec,sw);
852 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
854 /* Update potential sum for this i atom from the interaction with this j atom. */
855 velec = _mm_and_pd(velec,cutoff_mask);
856 velecsum = _mm_add_pd(velecsum,velec);
860 fscal = _mm_and_pd(fscal,cutoff_mask);
862 /* Calculate temporary vectorial force */
863 tx = _mm_mul_pd(fscal,dx22);
864 ty = _mm_mul_pd(fscal,dy22);
865 tz = _mm_mul_pd(fscal,dz22);
867 /* Update vectorial force */
868 fix2 = _mm_add_pd(fix2,tx);
869 fiy2 = _mm_add_pd(fiy2,ty);
870 fiz2 = _mm_add_pd(fiz2,tz);
872 fjx2 = _mm_add_pd(fjx2,tx);
873 fjy2 = _mm_add_pd(fjy2,ty);
874 fjz2 = _mm_add_pd(fjz2,tz);
878 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
880 /* Inner loop uses 603 flops */
887 j_coord_offsetA = DIM*jnrA;
889 /* load j atom coordinates */
890 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
891 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
893 /* Calculate displacement vector */
894 dx00 = _mm_sub_pd(ix0,jx0);
895 dy00 = _mm_sub_pd(iy0,jy0);
896 dz00 = _mm_sub_pd(iz0,jz0);
897 dx01 = _mm_sub_pd(ix0,jx1);
898 dy01 = _mm_sub_pd(iy0,jy1);
899 dz01 = _mm_sub_pd(iz0,jz1);
900 dx02 = _mm_sub_pd(ix0,jx2);
901 dy02 = _mm_sub_pd(iy0,jy2);
902 dz02 = _mm_sub_pd(iz0,jz2);
903 dx10 = _mm_sub_pd(ix1,jx0);
904 dy10 = _mm_sub_pd(iy1,jy0);
905 dz10 = _mm_sub_pd(iz1,jz0);
906 dx11 = _mm_sub_pd(ix1,jx1);
907 dy11 = _mm_sub_pd(iy1,jy1);
908 dz11 = _mm_sub_pd(iz1,jz1);
909 dx12 = _mm_sub_pd(ix1,jx2);
910 dy12 = _mm_sub_pd(iy1,jy2);
911 dz12 = _mm_sub_pd(iz1,jz2);
912 dx20 = _mm_sub_pd(ix2,jx0);
913 dy20 = _mm_sub_pd(iy2,jy0);
914 dz20 = _mm_sub_pd(iz2,jz0);
915 dx21 = _mm_sub_pd(ix2,jx1);
916 dy21 = _mm_sub_pd(iy2,jy1);
917 dz21 = _mm_sub_pd(iz2,jz1);
918 dx22 = _mm_sub_pd(ix2,jx2);
919 dy22 = _mm_sub_pd(iy2,jy2);
920 dz22 = _mm_sub_pd(iz2,jz2);
922 /* Calculate squared distance and things based on it */
923 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
924 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
925 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
926 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
927 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
928 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
929 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
930 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
931 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
933 rinv00 = gmx_mm_invsqrt_pd(rsq00);
934 rinv01 = gmx_mm_invsqrt_pd(rsq01);
935 rinv02 = gmx_mm_invsqrt_pd(rsq02);
936 rinv10 = gmx_mm_invsqrt_pd(rsq10);
937 rinv11 = gmx_mm_invsqrt_pd(rsq11);
938 rinv12 = gmx_mm_invsqrt_pd(rsq12);
939 rinv20 = gmx_mm_invsqrt_pd(rsq20);
940 rinv21 = gmx_mm_invsqrt_pd(rsq21);
941 rinv22 = gmx_mm_invsqrt_pd(rsq22);
943 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
944 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
945 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
946 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
947 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
948 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
949 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
950 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
951 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
953 fjx0 = _mm_setzero_pd();
954 fjy0 = _mm_setzero_pd();
955 fjz0 = _mm_setzero_pd();
956 fjx1 = _mm_setzero_pd();
957 fjy1 = _mm_setzero_pd();
958 fjz1 = _mm_setzero_pd();
959 fjx2 = _mm_setzero_pd();
960 fjy2 = _mm_setzero_pd();
961 fjz2 = _mm_setzero_pd();
963 /**************************
964 * CALCULATE INTERACTIONS *
965 **************************/
967 if (gmx_mm_any_lt(rsq00,rcutoff2))
970 r00 = _mm_mul_pd(rsq00,rinv00);
972 /* EWALD ELECTROSTATICS */
974 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
975 ewrt = _mm_mul_pd(r00,ewtabscale);
976 ewitab = _mm_cvttpd_epi32(ewrt);
977 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
978 ewitab = _mm_slli_epi32(ewitab,2);
979 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
980 ewtabD = _mm_setzero_pd();
981 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
982 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
983 ewtabFn = _mm_setzero_pd();
984 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
985 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
986 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
987 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
988 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
990 /* LENNARD-JONES DISPERSION/REPULSION */
992 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
993 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
994 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
995 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
996 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
998 d = _mm_sub_pd(r00,rswitch);
999 d = _mm_max_pd(d,_mm_setzero_pd());
1000 d2 = _mm_mul_pd(d,d);
1001 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1003 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1005 /* Evaluate switch function */
1006 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1007 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1008 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1009 velec = _mm_mul_pd(velec,sw);
1010 vvdw = _mm_mul_pd(vvdw,sw);
1011 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1013 /* Update potential sum for this i atom from the interaction with this j atom. */
1014 velec = _mm_and_pd(velec,cutoff_mask);
1015 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1016 velecsum = _mm_add_pd(velecsum,velec);
1017 vvdw = _mm_and_pd(vvdw,cutoff_mask);
1018 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1019 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
1021 fscal = _mm_add_pd(felec,fvdw);
1023 fscal = _mm_and_pd(fscal,cutoff_mask);
1025 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1027 /* Calculate temporary vectorial force */
1028 tx = _mm_mul_pd(fscal,dx00);
1029 ty = _mm_mul_pd(fscal,dy00);
1030 tz = _mm_mul_pd(fscal,dz00);
1032 /* Update vectorial force */
1033 fix0 = _mm_add_pd(fix0,tx);
1034 fiy0 = _mm_add_pd(fiy0,ty);
1035 fiz0 = _mm_add_pd(fiz0,tz);
1037 fjx0 = _mm_add_pd(fjx0,tx);
1038 fjy0 = _mm_add_pd(fjy0,ty);
1039 fjz0 = _mm_add_pd(fjz0,tz);
1043 /**************************
1044 * CALCULATE INTERACTIONS *
1045 **************************/
1047 if (gmx_mm_any_lt(rsq01,rcutoff2))
1050 r01 = _mm_mul_pd(rsq01,rinv01);
1052 /* EWALD ELECTROSTATICS */
1054 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1055 ewrt = _mm_mul_pd(r01,ewtabscale);
1056 ewitab = _mm_cvttpd_epi32(ewrt);
1057 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1058 ewitab = _mm_slli_epi32(ewitab,2);
1059 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1060 ewtabD = _mm_setzero_pd();
1061 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1062 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1063 ewtabFn = _mm_setzero_pd();
1064 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1065 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1066 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1067 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1068 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1070 d = _mm_sub_pd(r01,rswitch);
1071 d = _mm_max_pd(d,_mm_setzero_pd());
1072 d2 = _mm_mul_pd(d,d);
1073 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1075 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1077 /* Evaluate switch function */
1078 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1079 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1080 velec = _mm_mul_pd(velec,sw);
1081 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1083 /* Update potential sum for this i atom from the interaction with this j atom. */
1084 velec = _mm_and_pd(velec,cutoff_mask);
1085 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1086 velecsum = _mm_add_pd(velecsum,velec);
1090 fscal = _mm_and_pd(fscal,cutoff_mask);
1092 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1094 /* Calculate temporary vectorial force */
1095 tx = _mm_mul_pd(fscal,dx01);
1096 ty = _mm_mul_pd(fscal,dy01);
1097 tz = _mm_mul_pd(fscal,dz01);
1099 /* Update vectorial force */
1100 fix0 = _mm_add_pd(fix0,tx);
1101 fiy0 = _mm_add_pd(fiy0,ty);
1102 fiz0 = _mm_add_pd(fiz0,tz);
1104 fjx1 = _mm_add_pd(fjx1,tx);
1105 fjy1 = _mm_add_pd(fjy1,ty);
1106 fjz1 = _mm_add_pd(fjz1,tz);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 if (gmx_mm_any_lt(rsq02,rcutoff2))
1117 r02 = _mm_mul_pd(rsq02,rinv02);
1119 /* EWALD ELECTROSTATICS */
1121 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1122 ewrt = _mm_mul_pd(r02,ewtabscale);
1123 ewitab = _mm_cvttpd_epi32(ewrt);
1124 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1125 ewitab = _mm_slli_epi32(ewitab,2);
1126 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1127 ewtabD = _mm_setzero_pd();
1128 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1129 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1130 ewtabFn = _mm_setzero_pd();
1131 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1132 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1133 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1134 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
1135 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1137 d = _mm_sub_pd(r02,rswitch);
1138 d = _mm_max_pd(d,_mm_setzero_pd());
1139 d2 = _mm_mul_pd(d,d);
1140 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1142 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1144 /* Evaluate switch function */
1145 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1146 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
1147 velec = _mm_mul_pd(velec,sw);
1148 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1150 /* Update potential sum for this i atom from the interaction with this j atom. */
1151 velec = _mm_and_pd(velec,cutoff_mask);
1152 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1153 velecsum = _mm_add_pd(velecsum,velec);
1157 fscal = _mm_and_pd(fscal,cutoff_mask);
1159 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1161 /* Calculate temporary vectorial force */
1162 tx = _mm_mul_pd(fscal,dx02);
1163 ty = _mm_mul_pd(fscal,dy02);
1164 tz = _mm_mul_pd(fscal,dz02);
1166 /* Update vectorial force */
1167 fix0 = _mm_add_pd(fix0,tx);
1168 fiy0 = _mm_add_pd(fiy0,ty);
1169 fiz0 = _mm_add_pd(fiz0,tz);
1171 fjx2 = _mm_add_pd(fjx2,tx);
1172 fjy2 = _mm_add_pd(fjy2,ty);
1173 fjz2 = _mm_add_pd(fjz2,tz);
1177 /**************************
1178 * CALCULATE INTERACTIONS *
1179 **************************/
1181 if (gmx_mm_any_lt(rsq10,rcutoff2))
1184 r10 = _mm_mul_pd(rsq10,rinv10);
1186 /* EWALD ELECTROSTATICS */
1188 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1189 ewrt = _mm_mul_pd(r10,ewtabscale);
1190 ewitab = _mm_cvttpd_epi32(ewrt);
1191 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1192 ewitab = _mm_slli_epi32(ewitab,2);
1193 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1194 ewtabD = _mm_setzero_pd();
1195 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1196 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1197 ewtabFn = _mm_setzero_pd();
1198 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1199 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1200 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1201 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1202 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1204 d = _mm_sub_pd(r10,rswitch);
1205 d = _mm_max_pd(d,_mm_setzero_pd());
1206 d2 = _mm_mul_pd(d,d);
1207 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1209 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1211 /* Evaluate switch function */
1212 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1213 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1214 velec = _mm_mul_pd(velec,sw);
1215 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1217 /* Update potential sum for this i atom from the interaction with this j atom. */
1218 velec = _mm_and_pd(velec,cutoff_mask);
1219 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1220 velecsum = _mm_add_pd(velecsum,velec);
1224 fscal = _mm_and_pd(fscal,cutoff_mask);
1226 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1228 /* Calculate temporary vectorial force */
1229 tx = _mm_mul_pd(fscal,dx10);
1230 ty = _mm_mul_pd(fscal,dy10);
1231 tz = _mm_mul_pd(fscal,dz10);
1233 /* Update vectorial force */
1234 fix1 = _mm_add_pd(fix1,tx);
1235 fiy1 = _mm_add_pd(fiy1,ty);
1236 fiz1 = _mm_add_pd(fiz1,tz);
1238 fjx0 = _mm_add_pd(fjx0,tx);
1239 fjy0 = _mm_add_pd(fjy0,ty);
1240 fjz0 = _mm_add_pd(fjz0,tz);
1244 /**************************
1245 * CALCULATE INTERACTIONS *
1246 **************************/
1248 if (gmx_mm_any_lt(rsq11,rcutoff2))
1251 r11 = _mm_mul_pd(rsq11,rinv11);
1253 /* EWALD ELECTROSTATICS */
1255 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1256 ewrt = _mm_mul_pd(r11,ewtabscale);
1257 ewitab = _mm_cvttpd_epi32(ewrt);
1258 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1259 ewitab = _mm_slli_epi32(ewitab,2);
1260 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1261 ewtabD = _mm_setzero_pd();
1262 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1263 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1264 ewtabFn = _mm_setzero_pd();
1265 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1266 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1267 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1268 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1269 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1271 d = _mm_sub_pd(r11,rswitch);
1272 d = _mm_max_pd(d,_mm_setzero_pd());
1273 d2 = _mm_mul_pd(d,d);
1274 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1276 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1278 /* Evaluate switch function */
1279 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1280 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1281 velec = _mm_mul_pd(velec,sw);
1282 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1284 /* Update potential sum for this i atom from the interaction with this j atom. */
1285 velec = _mm_and_pd(velec,cutoff_mask);
1286 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1287 velecsum = _mm_add_pd(velecsum,velec);
1291 fscal = _mm_and_pd(fscal,cutoff_mask);
1293 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1295 /* Calculate temporary vectorial force */
1296 tx = _mm_mul_pd(fscal,dx11);
1297 ty = _mm_mul_pd(fscal,dy11);
1298 tz = _mm_mul_pd(fscal,dz11);
1300 /* Update vectorial force */
1301 fix1 = _mm_add_pd(fix1,tx);
1302 fiy1 = _mm_add_pd(fiy1,ty);
1303 fiz1 = _mm_add_pd(fiz1,tz);
1305 fjx1 = _mm_add_pd(fjx1,tx);
1306 fjy1 = _mm_add_pd(fjy1,ty);
1307 fjz1 = _mm_add_pd(fjz1,tz);
1311 /**************************
1312 * CALCULATE INTERACTIONS *
1313 **************************/
1315 if (gmx_mm_any_lt(rsq12,rcutoff2))
1318 r12 = _mm_mul_pd(rsq12,rinv12);
1320 /* EWALD ELECTROSTATICS */
1322 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1323 ewrt = _mm_mul_pd(r12,ewtabscale);
1324 ewitab = _mm_cvttpd_epi32(ewrt);
1325 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1326 ewitab = _mm_slli_epi32(ewitab,2);
1327 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1328 ewtabD = _mm_setzero_pd();
1329 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1330 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1331 ewtabFn = _mm_setzero_pd();
1332 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1333 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1334 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1335 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1336 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1338 d = _mm_sub_pd(r12,rswitch);
1339 d = _mm_max_pd(d,_mm_setzero_pd());
1340 d2 = _mm_mul_pd(d,d);
1341 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1343 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1345 /* Evaluate switch function */
1346 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1347 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1348 velec = _mm_mul_pd(velec,sw);
1349 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1351 /* Update potential sum for this i atom from the interaction with this j atom. */
1352 velec = _mm_and_pd(velec,cutoff_mask);
1353 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1354 velecsum = _mm_add_pd(velecsum,velec);
1358 fscal = _mm_and_pd(fscal,cutoff_mask);
1360 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1362 /* Calculate temporary vectorial force */
1363 tx = _mm_mul_pd(fscal,dx12);
1364 ty = _mm_mul_pd(fscal,dy12);
1365 tz = _mm_mul_pd(fscal,dz12);
1367 /* Update vectorial force */
1368 fix1 = _mm_add_pd(fix1,tx);
1369 fiy1 = _mm_add_pd(fiy1,ty);
1370 fiz1 = _mm_add_pd(fiz1,tz);
1372 fjx2 = _mm_add_pd(fjx2,tx);
1373 fjy2 = _mm_add_pd(fjy2,ty);
1374 fjz2 = _mm_add_pd(fjz2,tz);
1378 /**************************
1379 * CALCULATE INTERACTIONS *
1380 **************************/
1382 if (gmx_mm_any_lt(rsq20,rcutoff2))
1385 r20 = _mm_mul_pd(rsq20,rinv20);
1387 /* EWALD ELECTROSTATICS */
1389 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1390 ewrt = _mm_mul_pd(r20,ewtabscale);
1391 ewitab = _mm_cvttpd_epi32(ewrt);
1392 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1393 ewitab = _mm_slli_epi32(ewitab,2);
1394 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1395 ewtabD = _mm_setzero_pd();
1396 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1397 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1398 ewtabFn = _mm_setzero_pd();
1399 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1400 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1401 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1402 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1403 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1405 d = _mm_sub_pd(r20,rswitch);
1406 d = _mm_max_pd(d,_mm_setzero_pd());
1407 d2 = _mm_mul_pd(d,d);
1408 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1410 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1412 /* Evaluate switch function */
1413 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1414 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1415 velec = _mm_mul_pd(velec,sw);
1416 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1418 /* Update potential sum for this i atom from the interaction with this j atom. */
1419 velec = _mm_and_pd(velec,cutoff_mask);
1420 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1421 velecsum = _mm_add_pd(velecsum,velec);
1425 fscal = _mm_and_pd(fscal,cutoff_mask);
1427 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1429 /* Calculate temporary vectorial force */
1430 tx = _mm_mul_pd(fscal,dx20);
1431 ty = _mm_mul_pd(fscal,dy20);
1432 tz = _mm_mul_pd(fscal,dz20);
1434 /* Update vectorial force */
1435 fix2 = _mm_add_pd(fix2,tx);
1436 fiy2 = _mm_add_pd(fiy2,ty);
1437 fiz2 = _mm_add_pd(fiz2,tz);
1439 fjx0 = _mm_add_pd(fjx0,tx);
1440 fjy0 = _mm_add_pd(fjy0,ty);
1441 fjz0 = _mm_add_pd(fjz0,tz);
1445 /**************************
1446 * CALCULATE INTERACTIONS *
1447 **************************/
1449 if (gmx_mm_any_lt(rsq21,rcutoff2))
1452 r21 = _mm_mul_pd(rsq21,rinv21);
1454 /* EWALD ELECTROSTATICS */
1456 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1457 ewrt = _mm_mul_pd(r21,ewtabscale);
1458 ewitab = _mm_cvttpd_epi32(ewrt);
1459 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1460 ewitab = _mm_slli_epi32(ewitab,2);
1461 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1462 ewtabD = _mm_setzero_pd();
1463 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1464 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1465 ewtabFn = _mm_setzero_pd();
1466 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1467 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1468 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1469 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1470 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1472 d = _mm_sub_pd(r21,rswitch);
1473 d = _mm_max_pd(d,_mm_setzero_pd());
1474 d2 = _mm_mul_pd(d,d);
1475 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1477 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1479 /* Evaluate switch function */
1480 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1481 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1482 velec = _mm_mul_pd(velec,sw);
1483 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1485 /* Update potential sum for this i atom from the interaction with this j atom. */
1486 velec = _mm_and_pd(velec,cutoff_mask);
1487 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1488 velecsum = _mm_add_pd(velecsum,velec);
1492 fscal = _mm_and_pd(fscal,cutoff_mask);
1494 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1496 /* Calculate temporary vectorial force */
1497 tx = _mm_mul_pd(fscal,dx21);
1498 ty = _mm_mul_pd(fscal,dy21);
1499 tz = _mm_mul_pd(fscal,dz21);
1501 /* Update vectorial force */
1502 fix2 = _mm_add_pd(fix2,tx);
1503 fiy2 = _mm_add_pd(fiy2,ty);
1504 fiz2 = _mm_add_pd(fiz2,tz);
1506 fjx1 = _mm_add_pd(fjx1,tx);
1507 fjy1 = _mm_add_pd(fjy1,ty);
1508 fjz1 = _mm_add_pd(fjz1,tz);
1512 /**************************
1513 * CALCULATE INTERACTIONS *
1514 **************************/
1516 if (gmx_mm_any_lt(rsq22,rcutoff2))
1519 r22 = _mm_mul_pd(rsq22,rinv22);
1521 /* EWALD ELECTROSTATICS */
1523 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1524 ewrt = _mm_mul_pd(r22,ewtabscale);
1525 ewitab = _mm_cvttpd_epi32(ewrt);
1526 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1527 ewitab = _mm_slli_epi32(ewitab,2);
1528 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1529 ewtabD = _mm_setzero_pd();
1530 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1531 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1532 ewtabFn = _mm_setzero_pd();
1533 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1534 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1535 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1536 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1537 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1539 d = _mm_sub_pd(r22,rswitch);
1540 d = _mm_max_pd(d,_mm_setzero_pd());
1541 d2 = _mm_mul_pd(d,d);
1542 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1544 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1546 /* Evaluate switch function */
1547 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1548 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1549 velec = _mm_mul_pd(velec,sw);
1550 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1552 /* Update potential sum for this i atom from the interaction with this j atom. */
1553 velec = _mm_and_pd(velec,cutoff_mask);
1554 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1555 velecsum = _mm_add_pd(velecsum,velec);
1559 fscal = _mm_and_pd(fscal,cutoff_mask);
1561 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1563 /* Calculate temporary vectorial force */
1564 tx = _mm_mul_pd(fscal,dx22);
1565 ty = _mm_mul_pd(fscal,dy22);
1566 tz = _mm_mul_pd(fscal,dz22);
1568 /* Update vectorial force */
1569 fix2 = _mm_add_pd(fix2,tx);
1570 fiy2 = _mm_add_pd(fiy2,ty);
1571 fiz2 = _mm_add_pd(fiz2,tz);
1573 fjx2 = _mm_add_pd(fjx2,tx);
1574 fjy2 = _mm_add_pd(fjy2,ty);
1575 fjz2 = _mm_add_pd(fjz2,tz);
1579 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1581 /* Inner loop uses 603 flops */
1584 /* End of innermost loop */
1586 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1587 f+i_coord_offset,fshift+i_shift_offset);
1590 /* Update potential energies */
1591 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1592 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1594 /* Increment number of inner iterations */
1595 inneriter += j_index_end - j_index_start;
1597 /* Outer loop uses 20 flops */
1600 /* Increment number of outer iterations */
1603 /* Update outer/inner flops */
1605 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*603);
1608 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_double
1609 * Electrostatics interaction: Ewald
1610 * VdW interaction: LennardJones
1611 * Geometry: Water3-Water3
1612 * Calculate force/pot: Force
1615 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_double
1616 (t_nblist * gmx_restrict nlist,
1617 rvec * gmx_restrict xx,
1618 rvec * gmx_restrict ff,
1619 t_forcerec * gmx_restrict fr,
1620 t_mdatoms * gmx_restrict mdatoms,
1621 nb_kernel_data_t * gmx_restrict kernel_data,
1622 t_nrnb * gmx_restrict nrnb)
1624 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1625 * just 0 for non-waters.
1626 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1627 * jnr indices corresponding to data put in the four positions in the SIMD register.
1629 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1630 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1632 int j_coord_offsetA,j_coord_offsetB;
1633 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1634 real rcutoff_scalar;
1635 real *shiftvec,*fshift,*x,*f;
1636 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1638 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1640 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1642 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1643 int vdwjidx0A,vdwjidx0B;
1644 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1645 int vdwjidx1A,vdwjidx1B;
1646 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1647 int vdwjidx2A,vdwjidx2B;
1648 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1649 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1650 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1651 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1652 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1653 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1654 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1655 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1656 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1657 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1658 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1661 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1664 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1665 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1667 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1669 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1670 real rswitch_scalar,d_scalar;
1671 __m128d dummy_mask,cutoff_mask;
1672 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1673 __m128d one = _mm_set1_pd(1.0);
1674 __m128d two = _mm_set1_pd(2.0);
1680 jindex = nlist->jindex;
1682 shiftidx = nlist->shift;
1684 shiftvec = fr->shift_vec[0];
1685 fshift = fr->fshift[0];
1686 facel = _mm_set1_pd(fr->epsfac);
1687 charge = mdatoms->chargeA;
1688 nvdwtype = fr->ntype;
1689 vdwparam = fr->nbfp;
1690 vdwtype = mdatoms->typeA;
1692 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1693 ewtab = fr->ic->tabq_coul_FDV0;
1694 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1695 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1697 /* Setup water-specific parameters */
1698 inr = nlist->iinr[0];
1699 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1700 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1701 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1702 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1704 jq0 = _mm_set1_pd(charge[inr+0]);
1705 jq1 = _mm_set1_pd(charge[inr+1]);
1706 jq2 = _mm_set1_pd(charge[inr+2]);
1707 vdwjidx0A = 2*vdwtype[inr+0];
1708 qq00 = _mm_mul_pd(iq0,jq0);
1709 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1710 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1711 qq01 = _mm_mul_pd(iq0,jq1);
1712 qq02 = _mm_mul_pd(iq0,jq2);
1713 qq10 = _mm_mul_pd(iq1,jq0);
1714 qq11 = _mm_mul_pd(iq1,jq1);
1715 qq12 = _mm_mul_pd(iq1,jq2);
1716 qq20 = _mm_mul_pd(iq2,jq0);
1717 qq21 = _mm_mul_pd(iq2,jq1);
1718 qq22 = _mm_mul_pd(iq2,jq2);
1720 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1721 rcutoff_scalar = fr->rcoulomb;
1722 rcutoff = _mm_set1_pd(rcutoff_scalar);
1723 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1725 rswitch_scalar = fr->rcoulomb_switch;
1726 rswitch = _mm_set1_pd(rswitch_scalar);
1727 /* Setup switch parameters */
1728 d_scalar = rcutoff_scalar-rswitch_scalar;
1729 d = _mm_set1_pd(d_scalar);
1730 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1731 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1732 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1733 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1734 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1735 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1737 /* Avoid stupid compiler warnings */
1739 j_coord_offsetA = 0;
1740 j_coord_offsetB = 0;
1745 /* Start outer loop over neighborlists */
1746 for(iidx=0; iidx<nri; iidx++)
1748 /* Load shift vector for this list */
1749 i_shift_offset = DIM*shiftidx[iidx];
1751 /* Load limits for loop over neighbors */
1752 j_index_start = jindex[iidx];
1753 j_index_end = jindex[iidx+1];
1755 /* Get outer coordinate index */
1757 i_coord_offset = DIM*inr;
1759 /* Load i particle coords and add shift vector */
1760 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1761 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1763 fix0 = _mm_setzero_pd();
1764 fiy0 = _mm_setzero_pd();
1765 fiz0 = _mm_setzero_pd();
1766 fix1 = _mm_setzero_pd();
1767 fiy1 = _mm_setzero_pd();
1768 fiz1 = _mm_setzero_pd();
1769 fix2 = _mm_setzero_pd();
1770 fiy2 = _mm_setzero_pd();
1771 fiz2 = _mm_setzero_pd();
1773 /* Start inner kernel loop */
1774 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1777 /* Get j neighbor index, and coordinate index */
1779 jnrB = jjnr[jidx+1];
1780 j_coord_offsetA = DIM*jnrA;
1781 j_coord_offsetB = DIM*jnrB;
1783 /* load j atom coordinates */
1784 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1785 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1787 /* Calculate displacement vector */
1788 dx00 = _mm_sub_pd(ix0,jx0);
1789 dy00 = _mm_sub_pd(iy0,jy0);
1790 dz00 = _mm_sub_pd(iz0,jz0);
1791 dx01 = _mm_sub_pd(ix0,jx1);
1792 dy01 = _mm_sub_pd(iy0,jy1);
1793 dz01 = _mm_sub_pd(iz0,jz1);
1794 dx02 = _mm_sub_pd(ix0,jx2);
1795 dy02 = _mm_sub_pd(iy0,jy2);
1796 dz02 = _mm_sub_pd(iz0,jz2);
1797 dx10 = _mm_sub_pd(ix1,jx0);
1798 dy10 = _mm_sub_pd(iy1,jy0);
1799 dz10 = _mm_sub_pd(iz1,jz0);
1800 dx11 = _mm_sub_pd(ix1,jx1);
1801 dy11 = _mm_sub_pd(iy1,jy1);
1802 dz11 = _mm_sub_pd(iz1,jz1);
1803 dx12 = _mm_sub_pd(ix1,jx2);
1804 dy12 = _mm_sub_pd(iy1,jy2);
1805 dz12 = _mm_sub_pd(iz1,jz2);
1806 dx20 = _mm_sub_pd(ix2,jx0);
1807 dy20 = _mm_sub_pd(iy2,jy0);
1808 dz20 = _mm_sub_pd(iz2,jz0);
1809 dx21 = _mm_sub_pd(ix2,jx1);
1810 dy21 = _mm_sub_pd(iy2,jy1);
1811 dz21 = _mm_sub_pd(iz2,jz1);
1812 dx22 = _mm_sub_pd(ix2,jx2);
1813 dy22 = _mm_sub_pd(iy2,jy2);
1814 dz22 = _mm_sub_pd(iz2,jz2);
1816 /* Calculate squared distance and things based on it */
1817 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1818 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1819 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1820 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1821 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1822 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1823 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1824 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1825 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1827 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1828 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1829 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1830 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1831 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1832 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1833 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1834 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1835 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1837 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1838 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1839 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1840 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1841 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1842 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1843 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1844 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1845 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1847 fjx0 = _mm_setzero_pd();
1848 fjy0 = _mm_setzero_pd();
1849 fjz0 = _mm_setzero_pd();
1850 fjx1 = _mm_setzero_pd();
1851 fjy1 = _mm_setzero_pd();
1852 fjz1 = _mm_setzero_pd();
1853 fjx2 = _mm_setzero_pd();
1854 fjy2 = _mm_setzero_pd();
1855 fjz2 = _mm_setzero_pd();
1857 /**************************
1858 * CALCULATE INTERACTIONS *
1859 **************************/
1861 if (gmx_mm_any_lt(rsq00,rcutoff2))
1864 r00 = _mm_mul_pd(rsq00,rinv00);
1866 /* EWALD ELECTROSTATICS */
1868 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1869 ewrt = _mm_mul_pd(r00,ewtabscale);
1870 ewitab = _mm_cvttpd_epi32(ewrt);
1871 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1872 ewitab = _mm_slli_epi32(ewitab,2);
1873 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1874 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1875 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1876 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1877 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1878 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1879 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1880 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1881 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
1882 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1884 /* LENNARD-JONES DISPERSION/REPULSION */
1886 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1887 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1888 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1889 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1890 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1892 d = _mm_sub_pd(r00,rswitch);
1893 d = _mm_max_pd(d,_mm_setzero_pd());
1894 d2 = _mm_mul_pd(d,d);
1895 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1897 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1899 /* Evaluate switch function */
1900 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1901 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1902 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1903 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1905 fscal = _mm_add_pd(felec,fvdw);
1907 fscal = _mm_and_pd(fscal,cutoff_mask);
1909 /* Calculate temporary vectorial force */
1910 tx = _mm_mul_pd(fscal,dx00);
1911 ty = _mm_mul_pd(fscal,dy00);
1912 tz = _mm_mul_pd(fscal,dz00);
1914 /* Update vectorial force */
1915 fix0 = _mm_add_pd(fix0,tx);
1916 fiy0 = _mm_add_pd(fiy0,ty);
1917 fiz0 = _mm_add_pd(fiz0,tz);
1919 fjx0 = _mm_add_pd(fjx0,tx);
1920 fjy0 = _mm_add_pd(fjy0,ty);
1921 fjz0 = _mm_add_pd(fjz0,tz);
1925 /**************************
1926 * CALCULATE INTERACTIONS *
1927 **************************/
1929 if (gmx_mm_any_lt(rsq01,rcutoff2))
1932 r01 = _mm_mul_pd(rsq01,rinv01);
1934 /* EWALD ELECTROSTATICS */
1936 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1937 ewrt = _mm_mul_pd(r01,ewtabscale);
1938 ewitab = _mm_cvttpd_epi32(ewrt);
1939 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1940 ewitab = _mm_slli_epi32(ewitab,2);
1941 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1942 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1943 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1944 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1945 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1946 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1947 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1948 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1949 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1950 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1952 d = _mm_sub_pd(r01,rswitch);
1953 d = _mm_max_pd(d,_mm_setzero_pd());
1954 d2 = _mm_mul_pd(d,d);
1955 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1957 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1959 /* Evaluate switch function */
1960 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1961 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1962 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1966 fscal = _mm_and_pd(fscal,cutoff_mask);
1968 /* Calculate temporary vectorial force */
1969 tx = _mm_mul_pd(fscal,dx01);
1970 ty = _mm_mul_pd(fscal,dy01);
1971 tz = _mm_mul_pd(fscal,dz01);
1973 /* Update vectorial force */
1974 fix0 = _mm_add_pd(fix0,tx);
1975 fiy0 = _mm_add_pd(fiy0,ty);
1976 fiz0 = _mm_add_pd(fiz0,tz);
1978 fjx1 = _mm_add_pd(fjx1,tx);
1979 fjy1 = _mm_add_pd(fjy1,ty);
1980 fjz1 = _mm_add_pd(fjz1,tz);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 if (gmx_mm_any_lt(rsq02,rcutoff2))
1991 r02 = _mm_mul_pd(rsq02,rinv02);
1993 /* EWALD ELECTROSTATICS */
1995 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1996 ewrt = _mm_mul_pd(r02,ewtabscale);
1997 ewitab = _mm_cvttpd_epi32(ewrt);
1998 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1999 ewitab = _mm_slli_epi32(ewitab,2);
2000 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2001 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2002 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2003 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2004 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2005 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2006 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2007 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2008 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2009 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2011 d = _mm_sub_pd(r02,rswitch);
2012 d = _mm_max_pd(d,_mm_setzero_pd());
2013 d2 = _mm_mul_pd(d,d);
2014 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2016 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2018 /* Evaluate switch function */
2019 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2020 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2021 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2025 fscal = _mm_and_pd(fscal,cutoff_mask);
2027 /* Calculate temporary vectorial force */
2028 tx = _mm_mul_pd(fscal,dx02);
2029 ty = _mm_mul_pd(fscal,dy02);
2030 tz = _mm_mul_pd(fscal,dz02);
2032 /* Update vectorial force */
2033 fix0 = _mm_add_pd(fix0,tx);
2034 fiy0 = _mm_add_pd(fiy0,ty);
2035 fiz0 = _mm_add_pd(fiz0,tz);
2037 fjx2 = _mm_add_pd(fjx2,tx);
2038 fjy2 = _mm_add_pd(fjy2,ty);
2039 fjz2 = _mm_add_pd(fjz2,tz);
2043 /**************************
2044 * CALCULATE INTERACTIONS *
2045 **************************/
2047 if (gmx_mm_any_lt(rsq10,rcutoff2))
2050 r10 = _mm_mul_pd(rsq10,rinv10);
2052 /* EWALD ELECTROSTATICS */
2054 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2055 ewrt = _mm_mul_pd(r10,ewtabscale);
2056 ewitab = _mm_cvttpd_epi32(ewrt);
2057 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2058 ewitab = _mm_slli_epi32(ewitab,2);
2059 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2060 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2061 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2062 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2063 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2064 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2065 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2066 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2067 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2068 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2070 d = _mm_sub_pd(r10,rswitch);
2071 d = _mm_max_pd(d,_mm_setzero_pd());
2072 d2 = _mm_mul_pd(d,d);
2073 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2075 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2077 /* Evaluate switch function */
2078 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2079 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2080 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2084 fscal = _mm_and_pd(fscal,cutoff_mask);
2086 /* Calculate temporary vectorial force */
2087 tx = _mm_mul_pd(fscal,dx10);
2088 ty = _mm_mul_pd(fscal,dy10);
2089 tz = _mm_mul_pd(fscal,dz10);
2091 /* Update vectorial force */
2092 fix1 = _mm_add_pd(fix1,tx);
2093 fiy1 = _mm_add_pd(fiy1,ty);
2094 fiz1 = _mm_add_pd(fiz1,tz);
2096 fjx0 = _mm_add_pd(fjx0,tx);
2097 fjy0 = _mm_add_pd(fjy0,ty);
2098 fjz0 = _mm_add_pd(fjz0,tz);
2102 /**************************
2103 * CALCULATE INTERACTIONS *
2104 **************************/
2106 if (gmx_mm_any_lt(rsq11,rcutoff2))
2109 r11 = _mm_mul_pd(rsq11,rinv11);
2111 /* EWALD ELECTROSTATICS */
2113 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2114 ewrt = _mm_mul_pd(r11,ewtabscale);
2115 ewitab = _mm_cvttpd_epi32(ewrt);
2116 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2117 ewitab = _mm_slli_epi32(ewitab,2);
2118 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2119 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2120 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2121 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2122 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2123 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2124 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2125 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2126 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2127 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2129 d = _mm_sub_pd(r11,rswitch);
2130 d = _mm_max_pd(d,_mm_setzero_pd());
2131 d2 = _mm_mul_pd(d,d);
2132 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2134 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2136 /* Evaluate switch function */
2137 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2138 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2139 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2143 fscal = _mm_and_pd(fscal,cutoff_mask);
2145 /* Calculate temporary vectorial force */
2146 tx = _mm_mul_pd(fscal,dx11);
2147 ty = _mm_mul_pd(fscal,dy11);
2148 tz = _mm_mul_pd(fscal,dz11);
2150 /* Update vectorial force */
2151 fix1 = _mm_add_pd(fix1,tx);
2152 fiy1 = _mm_add_pd(fiy1,ty);
2153 fiz1 = _mm_add_pd(fiz1,tz);
2155 fjx1 = _mm_add_pd(fjx1,tx);
2156 fjy1 = _mm_add_pd(fjy1,ty);
2157 fjz1 = _mm_add_pd(fjz1,tz);
2161 /**************************
2162 * CALCULATE INTERACTIONS *
2163 **************************/
2165 if (gmx_mm_any_lt(rsq12,rcutoff2))
2168 r12 = _mm_mul_pd(rsq12,rinv12);
2170 /* EWALD ELECTROSTATICS */
2172 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2173 ewrt = _mm_mul_pd(r12,ewtabscale);
2174 ewitab = _mm_cvttpd_epi32(ewrt);
2175 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2176 ewitab = _mm_slli_epi32(ewitab,2);
2177 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2178 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2179 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2180 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2181 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2182 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2183 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2184 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2185 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2186 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2188 d = _mm_sub_pd(r12,rswitch);
2189 d = _mm_max_pd(d,_mm_setzero_pd());
2190 d2 = _mm_mul_pd(d,d);
2191 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2193 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2195 /* Evaluate switch function */
2196 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2197 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2198 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2202 fscal = _mm_and_pd(fscal,cutoff_mask);
2204 /* Calculate temporary vectorial force */
2205 tx = _mm_mul_pd(fscal,dx12);
2206 ty = _mm_mul_pd(fscal,dy12);
2207 tz = _mm_mul_pd(fscal,dz12);
2209 /* Update vectorial force */
2210 fix1 = _mm_add_pd(fix1,tx);
2211 fiy1 = _mm_add_pd(fiy1,ty);
2212 fiz1 = _mm_add_pd(fiz1,tz);
2214 fjx2 = _mm_add_pd(fjx2,tx);
2215 fjy2 = _mm_add_pd(fjy2,ty);
2216 fjz2 = _mm_add_pd(fjz2,tz);
2220 /**************************
2221 * CALCULATE INTERACTIONS *
2222 **************************/
2224 if (gmx_mm_any_lt(rsq20,rcutoff2))
2227 r20 = _mm_mul_pd(rsq20,rinv20);
2229 /* EWALD ELECTROSTATICS */
2231 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2232 ewrt = _mm_mul_pd(r20,ewtabscale);
2233 ewitab = _mm_cvttpd_epi32(ewrt);
2234 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2235 ewitab = _mm_slli_epi32(ewitab,2);
2236 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2237 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2238 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2239 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2240 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2241 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2242 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2243 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2244 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2245 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2247 d = _mm_sub_pd(r20,rswitch);
2248 d = _mm_max_pd(d,_mm_setzero_pd());
2249 d2 = _mm_mul_pd(d,d);
2250 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2252 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2254 /* Evaluate switch function */
2255 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2256 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2257 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2261 fscal = _mm_and_pd(fscal,cutoff_mask);
2263 /* Calculate temporary vectorial force */
2264 tx = _mm_mul_pd(fscal,dx20);
2265 ty = _mm_mul_pd(fscal,dy20);
2266 tz = _mm_mul_pd(fscal,dz20);
2268 /* Update vectorial force */
2269 fix2 = _mm_add_pd(fix2,tx);
2270 fiy2 = _mm_add_pd(fiy2,ty);
2271 fiz2 = _mm_add_pd(fiz2,tz);
2273 fjx0 = _mm_add_pd(fjx0,tx);
2274 fjy0 = _mm_add_pd(fjy0,ty);
2275 fjz0 = _mm_add_pd(fjz0,tz);
2279 /**************************
2280 * CALCULATE INTERACTIONS *
2281 **************************/
2283 if (gmx_mm_any_lt(rsq21,rcutoff2))
2286 r21 = _mm_mul_pd(rsq21,rinv21);
2288 /* EWALD ELECTROSTATICS */
2290 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2291 ewrt = _mm_mul_pd(r21,ewtabscale);
2292 ewitab = _mm_cvttpd_epi32(ewrt);
2293 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2294 ewitab = _mm_slli_epi32(ewitab,2);
2295 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2296 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2297 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2298 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2299 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2300 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2301 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2302 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2303 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2304 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2306 d = _mm_sub_pd(r21,rswitch);
2307 d = _mm_max_pd(d,_mm_setzero_pd());
2308 d2 = _mm_mul_pd(d,d);
2309 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2311 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2313 /* Evaluate switch function */
2314 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2315 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2316 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2320 fscal = _mm_and_pd(fscal,cutoff_mask);
2322 /* Calculate temporary vectorial force */
2323 tx = _mm_mul_pd(fscal,dx21);
2324 ty = _mm_mul_pd(fscal,dy21);
2325 tz = _mm_mul_pd(fscal,dz21);
2327 /* Update vectorial force */
2328 fix2 = _mm_add_pd(fix2,tx);
2329 fiy2 = _mm_add_pd(fiy2,ty);
2330 fiz2 = _mm_add_pd(fiz2,tz);
2332 fjx1 = _mm_add_pd(fjx1,tx);
2333 fjy1 = _mm_add_pd(fjy1,ty);
2334 fjz1 = _mm_add_pd(fjz1,tz);
2338 /**************************
2339 * CALCULATE INTERACTIONS *
2340 **************************/
2342 if (gmx_mm_any_lt(rsq22,rcutoff2))
2345 r22 = _mm_mul_pd(rsq22,rinv22);
2347 /* EWALD ELECTROSTATICS */
2349 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2350 ewrt = _mm_mul_pd(r22,ewtabscale);
2351 ewitab = _mm_cvttpd_epi32(ewrt);
2352 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2353 ewitab = _mm_slli_epi32(ewitab,2);
2354 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2355 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2356 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2357 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2358 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2359 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2360 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2361 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2362 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2363 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2365 d = _mm_sub_pd(r22,rswitch);
2366 d = _mm_max_pd(d,_mm_setzero_pd());
2367 d2 = _mm_mul_pd(d,d);
2368 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2370 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2372 /* Evaluate switch function */
2373 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2374 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2375 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2379 fscal = _mm_and_pd(fscal,cutoff_mask);
2381 /* Calculate temporary vectorial force */
2382 tx = _mm_mul_pd(fscal,dx22);
2383 ty = _mm_mul_pd(fscal,dy22);
2384 tz = _mm_mul_pd(fscal,dz22);
2386 /* Update vectorial force */
2387 fix2 = _mm_add_pd(fix2,tx);
2388 fiy2 = _mm_add_pd(fiy2,ty);
2389 fiz2 = _mm_add_pd(fiz2,tz);
2391 fjx2 = _mm_add_pd(fjx2,tx);
2392 fjy2 = _mm_add_pd(fjy2,ty);
2393 fjz2 = _mm_add_pd(fjz2,tz);
2397 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2399 /* Inner loop uses 573 flops */
2402 if(jidx<j_index_end)
2406 j_coord_offsetA = DIM*jnrA;
2408 /* load j atom coordinates */
2409 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2410 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2412 /* Calculate displacement vector */
2413 dx00 = _mm_sub_pd(ix0,jx0);
2414 dy00 = _mm_sub_pd(iy0,jy0);
2415 dz00 = _mm_sub_pd(iz0,jz0);
2416 dx01 = _mm_sub_pd(ix0,jx1);
2417 dy01 = _mm_sub_pd(iy0,jy1);
2418 dz01 = _mm_sub_pd(iz0,jz1);
2419 dx02 = _mm_sub_pd(ix0,jx2);
2420 dy02 = _mm_sub_pd(iy0,jy2);
2421 dz02 = _mm_sub_pd(iz0,jz2);
2422 dx10 = _mm_sub_pd(ix1,jx0);
2423 dy10 = _mm_sub_pd(iy1,jy0);
2424 dz10 = _mm_sub_pd(iz1,jz0);
2425 dx11 = _mm_sub_pd(ix1,jx1);
2426 dy11 = _mm_sub_pd(iy1,jy1);
2427 dz11 = _mm_sub_pd(iz1,jz1);
2428 dx12 = _mm_sub_pd(ix1,jx2);
2429 dy12 = _mm_sub_pd(iy1,jy2);
2430 dz12 = _mm_sub_pd(iz1,jz2);
2431 dx20 = _mm_sub_pd(ix2,jx0);
2432 dy20 = _mm_sub_pd(iy2,jy0);
2433 dz20 = _mm_sub_pd(iz2,jz0);
2434 dx21 = _mm_sub_pd(ix2,jx1);
2435 dy21 = _mm_sub_pd(iy2,jy1);
2436 dz21 = _mm_sub_pd(iz2,jz1);
2437 dx22 = _mm_sub_pd(ix2,jx2);
2438 dy22 = _mm_sub_pd(iy2,jy2);
2439 dz22 = _mm_sub_pd(iz2,jz2);
2441 /* Calculate squared distance and things based on it */
2442 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2443 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2444 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2445 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2446 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2447 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2448 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2449 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2450 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2452 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2453 rinv01 = gmx_mm_invsqrt_pd(rsq01);
2454 rinv02 = gmx_mm_invsqrt_pd(rsq02);
2455 rinv10 = gmx_mm_invsqrt_pd(rsq10);
2456 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2457 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2458 rinv20 = gmx_mm_invsqrt_pd(rsq20);
2459 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2460 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2462 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2463 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
2464 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
2465 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
2466 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2467 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2468 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
2469 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2470 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2472 fjx0 = _mm_setzero_pd();
2473 fjy0 = _mm_setzero_pd();
2474 fjz0 = _mm_setzero_pd();
2475 fjx1 = _mm_setzero_pd();
2476 fjy1 = _mm_setzero_pd();
2477 fjz1 = _mm_setzero_pd();
2478 fjx2 = _mm_setzero_pd();
2479 fjy2 = _mm_setzero_pd();
2480 fjz2 = _mm_setzero_pd();
2482 /**************************
2483 * CALCULATE INTERACTIONS *
2484 **************************/
2486 if (gmx_mm_any_lt(rsq00,rcutoff2))
2489 r00 = _mm_mul_pd(rsq00,rinv00);
2491 /* EWALD ELECTROSTATICS */
2493 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2494 ewrt = _mm_mul_pd(r00,ewtabscale);
2495 ewitab = _mm_cvttpd_epi32(ewrt);
2496 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2497 ewitab = _mm_slli_epi32(ewitab,2);
2498 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2499 ewtabD = _mm_setzero_pd();
2500 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2501 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2502 ewtabFn = _mm_setzero_pd();
2503 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2504 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2505 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2506 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
2507 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2509 /* LENNARD-JONES DISPERSION/REPULSION */
2511 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2512 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2513 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2514 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2515 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2517 d = _mm_sub_pd(r00,rswitch);
2518 d = _mm_max_pd(d,_mm_setzero_pd());
2519 d2 = _mm_mul_pd(d,d);
2520 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2522 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2524 /* Evaluate switch function */
2525 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2526 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
2527 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2528 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2530 fscal = _mm_add_pd(felec,fvdw);
2532 fscal = _mm_and_pd(fscal,cutoff_mask);
2534 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2536 /* Calculate temporary vectorial force */
2537 tx = _mm_mul_pd(fscal,dx00);
2538 ty = _mm_mul_pd(fscal,dy00);
2539 tz = _mm_mul_pd(fscal,dz00);
2541 /* Update vectorial force */
2542 fix0 = _mm_add_pd(fix0,tx);
2543 fiy0 = _mm_add_pd(fiy0,ty);
2544 fiz0 = _mm_add_pd(fiz0,tz);
2546 fjx0 = _mm_add_pd(fjx0,tx);
2547 fjy0 = _mm_add_pd(fjy0,ty);
2548 fjz0 = _mm_add_pd(fjz0,tz);
2552 /**************************
2553 * CALCULATE INTERACTIONS *
2554 **************************/
2556 if (gmx_mm_any_lt(rsq01,rcutoff2))
2559 r01 = _mm_mul_pd(rsq01,rinv01);
2561 /* EWALD ELECTROSTATICS */
2563 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2564 ewrt = _mm_mul_pd(r01,ewtabscale);
2565 ewitab = _mm_cvttpd_epi32(ewrt);
2566 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2567 ewitab = _mm_slli_epi32(ewitab,2);
2568 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2569 ewtabD = _mm_setzero_pd();
2570 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2571 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2572 ewtabFn = _mm_setzero_pd();
2573 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2574 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2575 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2576 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
2577 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2579 d = _mm_sub_pd(r01,rswitch);
2580 d = _mm_max_pd(d,_mm_setzero_pd());
2581 d2 = _mm_mul_pd(d,d);
2582 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2584 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2586 /* Evaluate switch function */
2587 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2588 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
2589 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
2593 fscal = _mm_and_pd(fscal,cutoff_mask);
2595 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2597 /* Calculate temporary vectorial force */
2598 tx = _mm_mul_pd(fscal,dx01);
2599 ty = _mm_mul_pd(fscal,dy01);
2600 tz = _mm_mul_pd(fscal,dz01);
2602 /* Update vectorial force */
2603 fix0 = _mm_add_pd(fix0,tx);
2604 fiy0 = _mm_add_pd(fiy0,ty);
2605 fiz0 = _mm_add_pd(fiz0,tz);
2607 fjx1 = _mm_add_pd(fjx1,tx);
2608 fjy1 = _mm_add_pd(fjy1,ty);
2609 fjz1 = _mm_add_pd(fjz1,tz);
2613 /**************************
2614 * CALCULATE INTERACTIONS *
2615 **************************/
2617 if (gmx_mm_any_lt(rsq02,rcutoff2))
2620 r02 = _mm_mul_pd(rsq02,rinv02);
2622 /* EWALD ELECTROSTATICS */
2624 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2625 ewrt = _mm_mul_pd(r02,ewtabscale);
2626 ewitab = _mm_cvttpd_epi32(ewrt);
2627 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2628 ewitab = _mm_slli_epi32(ewitab,2);
2629 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2630 ewtabD = _mm_setzero_pd();
2631 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2632 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2633 ewtabFn = _mm_setzero_pd();
2634 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2635 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2636 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2637 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2638 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2640 d = _mm_sub_pd(r02,rswitch);
2641 d = _mm_max_pd(d,_mm_setzero_pd());
2642 d2 = _mm_mul_pd(d,d);
2643 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2645 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2647 /* Evaluate switch function */
2648 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2649 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2650 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2654 fscal = _mm_and_pd(fscal,cutoff_mask);
2656 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2658 /* Calculate temporary vectorial force */
2659 tx = _mm_mul_pd(fscal,dx02);
2660 ty = _mm_mul_pd(fscal,dy02);
2661 tz = _mm_mul_pd(fscal,dz02);
2663 /* Update vectorial force */
2664 fix0 = _mm_add_pd(fix0,tx);
2665 fiy0 = _mm_add_pd(fiy0,ty);
2666 fiz0 = _mm_add_pd(fiz0,tz);
2668 fjx2 = _mm_add_pd(fjx2,tx);
2669 fjy2 = _mm_add_pd(fjy2,ty);
2670 fjz2 = _mm_add_pd(fjz2,tz);
2674 /**************************
2675 * CALCULATE INTERACTIONS *
2676 **************************/
2678 if (gmx_mm_any_lt(rsq10,rcutoff2))
2681 r10 = _mm_mul_pd(rsq10,rinv10);
2683 /* EWALD ELECTROSTATICS */
2685 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2686 ewrt = _mm_mul_pd(r10,ewtabscale);
2687 ewitab = _mm_cvttpd_epi32(ewrt);
2688 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2689 ewitab = _mm_slli_epi32(ewitab,2);
2690 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2691 ewtabD = _mm_setzero_pd();
2692 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2693 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2694 ewtabFn = _mm_setzero_pd();
2695 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2696 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2697 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2698 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2699 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2701 d = _mm_sub_pd(r10,rswitch);
2702 d = _mm_max_pd(d,_mm_setzero_pd());
2703 d2 = _mm_mul_pd(d,d);
2704 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2706 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2708 /* Evaluate switch function */
2709 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2710 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2711 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2715 fscal = _mm_and_pd(fscal,cutoff_mask);
2717 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2719 /* Calculate temporary vectorial force */
2720 tx = _mm_mul_pd(fscal,dx10);
2721 ty = _mm_mul_pd(fscal,dy10);
2722 tz = _mm_mul_pd(fscal,dz10);
2724 /* Update vectorial force */
2725 fix1 = _mm_add_pd(fix1,tx);
2726 fiy1 = _mm_add_pd(fiy1,ty);
2727 fiz1 = _mm_add_pd(fiz1,tz);
2729 fjx0 = _mm_add_pd(fjx0,tx);
2730 fjy0 = _mm_add_pd(fjy0,ty);
2731 fjz0 = _mm_add_pd(fjz0,tz);
2735 /**************************
2736 * CALCULATE INTERACTIONS *
2737 **************************/
2739 if (gmx_mm_any_lt(rsq11,rcutoff2))
2742 r11 = _mm_mul_pd(rsq11,rinv11);
2744 /* EWALD ELECTROSTATICS */
2746 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2747 ewrt = _mm_mul_pd(r11,ewtabscale);
2748 ewitab = _mm_cvttpd_epi32(ewrt);
2749 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2750 ewitab = _mm_slli_epi32(ewitab,2);
2751 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2752 ewtabD = _mm_setzero_pd();
2753 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2754 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2755 ewtabFn = _mm_setzero_pd();
2756 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2757 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2758 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2759 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2760 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2762 d = _mm_sub_pd(r11,rswitch);
2763 d = _mm_max_pd(d,_mm_setzero_pd());
2764 d2 = _mm_mul_pd(d,d);
2765 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2767 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2769 /* Evaluate switch function */
2770 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2771 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2772 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2776 fscal = _mm_and_pd(fscal,cutoff_mask);
2778 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2780 /* Calculate temporary vectorial force */
2781 tx = _mm_mul_pd(fscal,dx11);
2782 ty = _mm_mul_pd(fscal,dy11);
2783 tz = _mm_mul_pd(fscal,dz11);
2785 /* Update vectorial force */
2786 fix1 = _mm_add_pd(fix1,tx);
2787 fiy1 = _mm_add_pd(fiy1,ty);
2788 fiz1 = _mm_add_pd(fiz1,tz);
2790 fjx1 = _mm_add_pd(fjx1,tx);
2791 fjy1 = _mm_add_pd(fjy1,ty);
2792 fjz1 = _mm_add_pd(fjz1,tz);
2796 /**************************
2797 * CALCULATE INTERACTIONS *
2798 **************************/
2800 if (gmx_mm_any_lt(rsq12,rcutoff2))
2803 r12 = _mm_mul_pd(rsq12,rinv12);
2805 /* EWALD ELECTROSTATICS */
2807 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2808 ewrt = _mm_mul_pd(r12,ewtabscale);
2809 ewitab = _mm_cvttpd_epi32(ewrt);
2810 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2811 ewitab = _mm_slli_epi32(ewitab,2);
2812 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2813 ewtabD = _mm_setzero_pd();
2814 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2815 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2816 ewtabFn = _mm_setzero_pd();
2817 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2818 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2819 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2820 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2821 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2823 d = _mm_sub_pd(r12,rswitch);
2824 d = _mm_max_pd(d,_mm_setzero_pd());
2825 d2 = _mm_mul_pd(d,d);
2826 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2828 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2830 /* Evaluate switch function */
2831 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2832 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2833 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2837 fscal = _mm_and_pd(fscal,cutoff_mask);
2839 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2841 /* Calculate temporary vectorial force */
2842 tx = _mm_mul_pd(fscal,dx12);
2843 ty = _mm_mul_pd(fscal,dy12);
2844 tz = _mm_mul_pd(fscal,dz12);
2846 /* Update vectorial force */
2847 fix1 = _mm_add_pd(fix1,tx);
2848 fiy1 = _mm_add_pd(fiy1,ty);
2849 fiz1 = _mm_add_pd(fiz1,tz);
2851 fjx2 = _mm_add_pd(fjx2,tx);
2852 fjy2 = _mm_add_pd(fjy2,ty);
2853 fjz2 = _mm_add_pd(fjz2,tz);
2857 /**************************
2858 * CALCULATE INTERACTIONS *
2859 **************************/
2861 if (gmx_mm_any_lt(rsq20,rcutoff2))
2864 r20 = _mm_mul_pd(rsq20,rinv20);
2866 /* EWALD ELECTROSTATICS */
2868 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2869 ewrt = _mm_mul_pd(r20,ewtabscale);
2870 ewitab = _mm_cvttpd_epi32(ewrt);
2871 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2872 ewitab = _mm_slli_epi32(ewitab,2);
2873 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2874 ewtabD = _mm_setzero_pd();
2875 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2876 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2877 ewtabFn = _mm_setzero_pd();
2878 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2879 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2880 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2881 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2882 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2884 d = _mm_sub_pd(r20,rswitch);
2885 d = _mm_max_pd(d,_mm_setzero_pd());
2886 d2 = _mm_mul_pd(d,d);
2887 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2889 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2891 /* Evaluate switch function */
2892 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2893 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2894 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2898 fscal = _mm_and_pd(fscal,cutoff_mask);
2900 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2902 /* Calculate temporary vectorial force */
2903 tx = _mm_mul_pd(fscal,dx20);
2904 ty = _mm_mul_pd(fscal,dy20);
2905 tz = _mm_mul_pd(fscal,dz20);
2907 /* Update vectorial force */
2908 fix2 = _mm_add_pd(fix2,tx);
2909 fiy2 = _mm_add_pd(fiy2,ty);
2910 fiz2 = _mm_add_pd(fiz2,tz);
2912 fjx0 = _mm_add_pd(fjx0,tx);
2913 fjy0 = _mm_add_pd(fjy0,ty);
2914 fjz0 = _mm_add_pd(fjz0,tz);
2918 /**************************
2919 * CALCULATE INTERACTIONS *
2920 **************************/
2922 if (gmx_mm_any_lt(rsq21,rcutoff2))
2925 r21 = _mm_mul_pd(rsq21,rinv21);
2927 /* EWALD ELECTROSTATICS */
2929 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2930 ewrt = _mm_mul_pd(r21,ewtabscale);
2931 ewitab = _mm_cvttpd_epi32(ewrt);
2932 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2933 ewitab = _mm_slli_epi32(ewitab,2);
2934 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2935 ewtabD = _mm_setzero_pd();
2936 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2937 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2938 ewtabFn = _mm_setzero_pd();
2939 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2940 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2941 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2942 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2943 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2945 d = _mm_sub_pd(r21,rswitch);
2946 d = _mm_max_pd(d,_mm_setzero_pd());
2947 d2 = _mm_mul_pd(d,d);
2948 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2950 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2952 /* Evaluate switch function */
2953 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2954 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2955 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2959 fscal = _mm_and_pd(fscal,cutoff_mask);
2961 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2963 /* Calculate temporary vectorial force */
2964 tx = _mm_mul_pd(fscal,dx21);
2965 ty = _mm_mul_pd(fscal,dy21);
2966 tz = _mm_mul_pd(fscal,dz21);
2968 /* Update vectorial force */
2969 fix2 = _mm_add_pd(fix2,tx);
2970 fiy2 = _mm_add_pd(fiy2,ty);
2971 fiz2 = _mm_add_pd(fiz2,tz);
2973 fjx1 = _mm_add_pd(fjx1,tx);
2974 fjy1 = _mm_add_pd(fjy1,ty);
2975 fjz1 = _mm_add_pd(fjz1,tz);
2979 /**************************
2980 * CALCULATE INTERACTIONS *
2981 **************************/
2983 if (gmx_mm_any_lt(rsq22,rcutoff2))
2986 r22 = _mm_mul_pd(rsq22,rinv22);
2988 /* EWALD ELECTROSTATICS */
2990 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2991 ewrt = _mm_mul_pd(r22,ewtabscale);
2992 ewitab = _mm_cvttpd_epi32(ewrt);
2993 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2994 ewitab = _mm_slli_epi32(ewitab,2);
2995 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2996 ewtabD = _mm_setzero_pd();
2997 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2998 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2999 ewtabFn = _mm_setzero_pd();
3000 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3001 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3002 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3003 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
3004 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
3006 d = _mm_sub_pd(r22,rswitch);
3007 d = _mm_max_pd(d,_mm_setzero_pd());
3008 d2 = _mm_mul_pd(d,d);
3009 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3011 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3013 /* Evaluate switch function */
3014 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3015 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
3016 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
3020 fscal = _mm_and_pd(fscal,cutoff_mask);
3022 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3024 /* Calculate temporary vectorial force */
3025 tx = _mm_mul_pd(fscal,dx22);
3026 ty = _mm_mul_pd(fscal,dy22);
3027 tz = _mm_mul_pd(fscal,dz22);
3029 /* Update vectorial force */
3030 fix2 = _mm_add_pd(fix2,tx);
3031 fiy2 = _mm_add_pd(fiy2,ty);
3032 fiz2 = _mm_add_pd(fiz2,tz);
3034 fjx2 = _mm_add_pd(fjx2,tx);
3035 fjy2 = _mm_add_pd(fjy2,ty);
3036 fjz2 = _mm_add_pd(fjz2,tz);
3040 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3042 /* Inner loop uses 573 flops */
3045 /* End of innermost loop */
3047 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3048 f+i_coord_offset,fshift+i_shift_offset);
3050 /* Increment number of inner iterations */
3051 inneriter += j_index_end - j_index_start;
3053 /* Outer loop uses 18 flops */
3056 /* Increment number of outer iterations */
3059 /* Update outer/inner flops */
3061 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*573);