2 * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_double.h"
34 #include "kernelutil_x86_sse4_1_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
69 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
71 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
72 int vdwjidx1A,vdwjidx1B;
73 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
74 int vdwjidx2A,vdwjidx2B;
75 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
76 int vdwjidx3A,vdwjidx3B;
77 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
78 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
79 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
80 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
81 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
82 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
83 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
84 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
85 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
86 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
92 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
93 real rswitch_scalar,d_scalar;
94 __m128d dummy_mask,cutoff_mask;
95 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
96 __m128d one = _mm_set1_pd(1.0);
97 __m128d two = _mm_set1_pd(2.0);
103 jindex = nlist->jindex;
105 shiftidx = nlist->shift;
107 shiftvec = fr->shift_vec[0];
108 fshift = fr->fshift[0];
109 facel = _mm_set1_pd(fr->epsfac);
110 charge = mdatoms->chargeA;
112 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
113 ewtab = fr->ic->tabq_coul_FDV0;
114 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
115 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
117 /* Setup water-specific parameters */
118 inr = nlist->iinr[0];
119 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
120 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
121 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
123 jq1 = _mm_set1_pd(charge[inr+1]);
124 jq2 = _mm_set1_pd(charge[inr+2]);
125 jq3 = _mm_set1_pd(charge[inr+3]);
126 qq11 = _mm_mul_pd(iq1,jq1);
127 qq12 = _mm_mul_pd(iq1,jq2);
128 qq13 = _mm_mul_pd(iq1,jq3);
129 qq21 = _mm_mul_pd(iq2,jq1);
130 qq22 = _mm_mul_pd(iq2,jq2);
131 qq23 = _mm_mul_pd(iq2,jq3);
132 qq31 = _mm_mul_pd(iq3,jq1);
133 qq32 = _mm_mul_pd(iq3,jq2);
134 qq33 = _mm_mul_pd(iq3,jq3);
136 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
137 rcutoff_scalar = fr->rcoulomb;
138 rcutoff = _mm_set1_pd(rcutoff_scalar);
139 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
141 rswitch_scalar = fr->rcoulomb_switch;
142 rswitch = _mm_set1_pd(rswitch_scalar);
143 /* Setup switch parameters */
144 d_scalar = rcutoff_scalar-rswitch_scalar;
145 d = _mm_set1_pd(d_scalar);
146 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
147 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
148 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
149 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
150 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
151 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
153 /* Avoid stupid compiler warnings */
161 /* Start outer loop over neighborlists */
162 for(iidx=0; iidx<nri; iidx++)
164 /* Load shift vector for this list */
165 i_shift_offset = DIM*shiftidx[iidx];
167 /* Load limits for loop over neighbors */
168 j_index_start = jindex[iidx];
169 j_index_end = jindex[iidx+1];
171 /* Get outer coordinate index */
173 i_coord_offset = DIM*inr;
175 /* Load i particle coords and add shift vector */
176 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
177 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
179 fix1 = _mm_setzero_pd();
180 fiy1 = _mm_setzero_pd();
181 fiz1 = _mm_setzero_pd();
182 fix2 = _mm_setzero_pd();
183 fiy2 = _mm_setzero_pd();
184 fiz2 = _mm_setzero_pd();
185 fix3 = _mm_setzero_pd();
186 fiy3 = _mm_setzero_pd();
187 fiz3 = _mm_setzero_pd();
189 /* Reset potential sums */
190 velecsum = _mm_setzero_pd();
192 /* Start inner kernel loop */
193 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
196 /* Get j neighbor index, and coordinate index */
199 j_coord_offsetA = DIM*jnrA;
200 j_coord_offsetB = DIM*jnrB;
202 /* load j atom coordinates */
203 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
204 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
206 /* Calculate displacement vector */
207 dx11 = _mm_sub_pd(ix1,jx1);
208 dy11 = _mm_sub_pd(iy1,jy1);
209 dz11 = _mm_sub_pd(iz1,jz1);
210 dx12 = _mm_sub_pd(ix1,jx2);
211 dy12 = _mm_sub_pd(iy1,jy2);
212 dz12 = _mm_sub_pd(iz1,jz2);
213 dx13 = _mm_sub_pd(ix1,jx3);
214 dy13 = _mm_sub_pd(iy1,jy3);
215 dz13 = _mm_sub_pd(iz1,jz3);
216 dx21 = _mm_sub_pd(ix2,jx1);
217 dy21 = _mm_sub_pd(iy2,jy1);
218 dz21 = _mm_sub_pd(iz2,jz1);
219 dx22 = _mm_sub_pd(ix2,jx2);
220 dy22 = _mm_sub_pd(iy2,jy2);
221 dz22 = _mm_sub_pd(iz2,jz2);
222 dx23 = _mm_sub_pd(ix2,jx3);
223 dy23 = _mm_sub_pd(iy2,jy3);
224 dz23 = _mm_sub_pd(iz2,jz3);
225 dx31 = _mm_sub_pd(ix3,jx1);
226 dy31 = _mm_sub_pd(iy3,jy1);
227 dz31 = _mm_sub_pd(iz3,jz1);
228 dx32 = _mm_sub_pd(ix3,jx2);
229 dy32 = _mm_sub_pd(iy3,jy2);
230 dz32 = _mm_sub_pd(iz3,jz2);
231 dx33 = _mm_sub_pd(ix3,jx3);
232 dy33 = _mm_sub_pd(iy3,jy3);
233 dz33 = _mm_sub_pd(iz3,jz3);
235 /* Calculate squared distance and things based on it */
236 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
237 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
238 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
239 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
240 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
241 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
242 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
243 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
244 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
246 rinv11 = gmx_mm_invsqrt_pd(rsq11);
247 rinv12 = gmx_mm_invsqrt_pd(rsq12);
248 rinv13 = gmx_mm_invsqrt_pd(rsq13);
249 rinv21 = gmx_mm_invsqrt_pd(rsq21);
250 rinv22 = gmx_mm_invsqrt_pd(rsq22);
251 rinv23 = gmx_mm_invsqrt_pd(rsq23);
252 rinv31 = gmx_mm_invsqrt_pd(rsq31);
253 rinv32 = gmx_mm_invsqrt_pd(rsq32);
254 rinv33 = gmx_mm_invsqrt_pd(rsq33);
256 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
257 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
258 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
259 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
260 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
261 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
262 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
263 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
264 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
266 fjx1 = _mm_setzero_pd();
267 fjy1 = _mm_setzero_pd();
268 fjz1 = _mm_setzero_pd();
269 fjx2 = _mm_setzero_pd();
270 fjy2 = _mm_setzero_pd();
271 fjz2 = _mm_setzero_pd();
272 fjx3 = _mm_setzero_pd();
273 fjy3 = _mm_setzero_pd();
274 fjz3 = _mm_setzero_pd();
276 /**************************
277 * CALCULATE INTERACTIONS *
278 **************************/
280 if (gmx_mm_any_lt(rsq11,rcutoff2))
283 r11 = _mm_mul_pd(rsq11,rinv11);
285 /* EWALD ELECTROSTATICS */
287 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
288 ewrt = _mm_mul_pd(r11,ewtabscale);
289 ewitab = _mm_cvttpd_epi32(ewrt);
290 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
291 ewitab = _mm_slli_epi32(ewitab,2);
292 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
293 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
294 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
295 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
296 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
297 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
298 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
299 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
300 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
301 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
303 d = _mm_sub_pd(r11,rswitch);
304 d = _mm_max_pd(d,_mm_setzero_pd());
305 d2 = _mm_mul_pd(d,d);
306 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
308 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
310 /* Evaluate switch function */
311 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
312 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
313 velec = _mm_mul_pd(velec,sw);
314 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
316 /* Update potential sum for this i atom from the interaction with this j atom. */
317 velec = _mm_and_pd(velec,cutoff_mask);
318 velecsum = _mm_add_pd(velecsum,velec);
322 fscal = _mm_and_pd(fscal,cutoff_mask);
324 /* Calculate temporary vectorial force */
325 tx = _mm_mul_pd(fscal,dx11);
326 ty = _mm_mul_pd(fscal,dy11);
327 tz = _mm_mul_pd(fscal,dz11);
329 /* Update vectorial force */
330 fix1 = _mm_add_pd(fix1,tx);
331 fiy1 = _mm_add_pd(fiy1,ty);
332 fiz1 = _mm_add_pd(fiz1,tz);
334 fjx1 = _mm_add_pd(fjx1,tx);
335 fjy1 = _mm_add_pd(fjy1,ty);
336 fjz1 = _mm_add_pd(fjz1,tz);
340 /**************************
341 * CALCULATE INTERACTIONS *
342 **************************/
344 if (gmx_mm_any_lt(rsq12,rcutoff2))
347 r12 = _mm_mul_pd(rsq12,rinv12);
349 /* EWALD ELECTROSTATICS */
351 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
352 ewrt = _mm_mul_pd(r12,ewtabscale);
353 ewitab = _mm_cvttpd_epi32(ewrt);
354 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
355 ewitab = _mm_slli_epi32(ewitab,2);
356 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
357 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
358 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
359 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
360 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
361 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
362 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
363 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
364 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
365 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
367 d = _mm_sub_pd(r12,rswitch);
368 d = _mm_max_pd(d,_mm_setzero_pd());
369 d2 = _mm_mul_pd(d,d);
370 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
372 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
374 /* Evaluate switch function */
375 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
376 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
377 velec = _mm_mul_pd(velec,sw);
378 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
380 /* Update potential sum for this i atom from the interaction with this j atom. */
381 velec = _mm_and_pd(velec,cutoff_mask);
382 velecsum = _mm_add_pd(velecsum,velec);
386 fscal = _mm_and_pd(fscal,cutoff_mask);
388 /* Calculate temporary vectorial force */
389 tx = _mm_mul_pd(fscal,dx12);
390 ty = _mm_mul_pd(fscal,dy12);
391 tz = _mm_mul_pd(fscal,dz12);
393 /* Update vectorial force */
394 fix1 = _mm_add_pd(fix1,tx);
395 fiy1 = _mm_add_pd(fiy1,ty);
396 fiz1 = _mm_add_pd(fiz1,tz);
398 fjx2 = _mm_add_pd(fjx2,tx);
399 fjy2 = _mm_add_pd(fjy2,ty);
400 fjz2 = _mm_add_pd(fjz2,tz);
404 /**************************
405 * CALCULATE INTERACTIONS *
406 **************************/
408 if (gmx_mm_any_lt(rsq13,rcutoff2))
411 r13 = _mm_mul_pd(rsq13,rinv13);
413 /* EWALD ELECTROSTATICS */
415 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
416 ewrt = _mm_mul_pd(r13,ewtabscale);
417 ewitab = _mm_cvttpd_epi32(ewrt);
418 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
419 ewitab = _mm_slli_epi32(ewitab,2);
420 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
421 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
422 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
423 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
424 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
425 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
426 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
427 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
428 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
429 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
431 d = _mm_sub_pd(r13,rswitch);
432 d = _mm_max_pd(d,_mm_setzero_pd());
433 d2 = _mm_mul_pd(d,d);
434 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
436 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
438 /* Evaluate switch function */
439 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
440 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
441 velec = _mm_mul_pd(velec,sw);
442 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
444 /* Update potential sum for this i atom from the interaction with this j atom. */
445 velec = _mm_and_pd(velec,cutoff_mask);
446 velecsum = _mm_add_pd(velecsum,velec);
450 fscal = _mm_and_pd(fscal,cutoff_mask);
452 /* Calculate temporary vectorial force */
453 tx = _mm_mul_pd(fscal,dx13);
454 ty = _mm_mul_pd(fscal,dy13);
455 tz = _mm_mul_pd(fscal,dz13);
457 /* Update vectorial force */
458 fix1 = _mm_add_pd(fix1,tx);
459 fiy1 = _mm_add_pd(fiy1,ty);
460 fiz1 = _mm_add_pd(fiz1,tz);
462 fjx3 = _mm_add_pd(fjx3,tx);
463 fjy3 = _mm_add_pd(fjy3,ty);
464 fjz3 = _mm_add_pd(fjz3,tz);
468 /**************************
469 * CALCULATE INTERACTIONS *
470 **************************/
472 if (gmx_mm_any_lt(rsq21,rcutoff2))
475 r21 = _mm_mul_pd(rsq21,rinv21);
477 /* EWALD ELECTROSTATICS */
479 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
480 ewrt = _mm_mul_pd(r21,ewtabscale);
481 ewitab = _mm_cvttpd_epi32(ewrt);
482 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
483 ewitab = _mm_slli_epi32(ewitab,2);
484 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
485 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
486 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
487 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
488 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
489 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
490 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
491 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
492 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
493 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
495 d = _mm_sub_pd(r21,rswitch);
496 d = _mm_max_pd(d,_mm_setzero_pd());
497 d2 = _mm_mul_pd(d,d);
498 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
500 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
502 /* Evaluate switch function */
503 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
504 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
505 velec = _mm_mul_pd(velec,sw);
506 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
508 /* Update potential sum for this i atom from the interaction with this j atom. */
509 velec = _mm_and_pd(velec,cutoff_mask);
510 velecsum = _mm_add_pd(velecsum,velec);
514 fscal = _mm_and_pd(fscal,cutoff_mask);
516 /* Calculate temporary vectorial force */
517 tx = _mm_mul_pd(fscal,dx21);
518 ty = _mm_mul_pd(fscal,dy21);
519 tz = _mm_mul_pd(fscal,dz21);
521 /* Update vectorial force */
522 fix2 = _mm_add_pd(fix2,tx);
523 fiy2 = _mm_add_pd(fiy2,ty);
524 fiz2 = _mm_add_pd(fiz2,tz);
526 fjx1 = _mm_add_pd(fjx1,tx);
527 fjy1 = _mm_add_pd(fjy1,ty);
528 fjz1 = _mm_add_pd(fjz1,tz);
532 /**************************
533 * CALCULATE INTERACTIONS *
534 **************************/
536 if (gmx_mm_any_lt(rsq22,rcutoff2))
539 r22 = _mm_mul_pd(rsq22,rinv22);
541 /* EWALD ELECTROSTATICS */
543 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
544 ewrt = _mm_mul_pd(r22,ewtabscale);
545 ewitab = _mm_cvttpd_epi32(ewrt);
546 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
547 ewitab = _mm_slli_epi32(ewitab,2);
548 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
549 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
550 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
551 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
552 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
553 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
554 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
555 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
556 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
557 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
559 d = _mm_sub_pd(r22,rswitch);
560 d = _mm_max_pd(d,_mm_setzero_pd());
561 d2 = _mm_mul_pd(d,d);
562 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
564 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
566 /* Evaluate switch function */
567 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
568 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
569 velec = _mm_mul_pd(velec,sw);
570 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
572 /* Update potential sum for this i atom from the interaction with this j atom. */
573 velec = _mm_and_pd(velec,cutoff_mask);
574 velecsum = _mm_add_pd(velecsum,velec);
578 fscal = _mm_and_pd(fscal,cutoff_mask);
580 /* Calculate temporary vectorial force */
581 tx = _mm_mul_pd(fscal,dx22);
582 ty = _mm_mul_pd(fscal,dy22);
583 tz = _mm_mul_pd(fscal,dz22);
585 /* Update vectorial force */
586 fix2 = _mm_add_pd(fix2,tx);
587 fiy2 = _mm_add_pd(fiy2,ty);
588 fiz2 = _mm_add_pd(fiz2,tz);
590 fjx2 = _mm_add_pd(fjx2,tx);
591 fjy2 = _mm_add_pd(fjy2,ty);
592 fjz2 = _mm_add_pd(fjz2,tz);
596 /**************************
597 * CALCULATE INTERACTIONS *
598 **************************/
600 if (gmx_mm_any_lt(rsq23,rcutoff2))
603 r23 = _mm_mul_pd(rsq23,rinv23);
605 /* EWALD ELECTROSTATICS */
607 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
608 ewrt = _mm_mul_pd(r23,ewtabscale);
609 ewitab = _mm_cvttpd_epi32(ewrt);
610 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
611 ewitab = _mm_slli_epi32(ewitab,2);
612 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
613 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
614 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
615 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
616 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
617 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
618 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
619 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
620 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
621 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
623 d = _mm_sub_pd(r23,rswitch);
624 d = _mm_max_pd(d,_mm_setzero_pd());
625 d2 = _mm_mul_pd(d,d);
626 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
628 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
630 /* Evaluate switch function */
631 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
632 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
633 velec = _mm_mul_pd(velec,sw);
634 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
636 /* Update potential sum for this i atom from the interaction with this j atom. */
637 velec = _mm_and_pd(velec,cutoff_mask);
638 velecsum = _mm_add_pd(velecsum,velec);
642 fscal = _mm_and_pd(fscal,cutoff_mask);
644 /* Calculate temporary vectorial force */
645 tx = _mm_mul_pd(fscal,dx23);
646 ty = _mm_mul_pd(fscal,dy23);
647 tz = _mm_mul_pd(fscal,dz23);
649 /* Update vectorial force */
650 fix2 = _mm_add_pd(fix2,tx);
651 fiy2 = _mm_add_pd(fiy2,ty);
652 fiz2 = _mm_add_pd(fiz2,tz);
654 fjx3 = _mm_add_pd(fjx3,tx);
655 fjy3 = _mm_add_pd(fjy3,ty);
656 fjz3 = _mm_add_pd(fjz3,tz);
660 /**************************
661 * CALCULATE INTERACTIONS *
662 **************************/
664 if (gmx_mm_any_lt(rsq31,rcutoff2))
667 r31 = _mm_mul_pd(rsq31,rinv31);
669 /* EWALD ELECTROSTATICS */
671 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
672 ewrt = _mm_mul_pd(r31,ewtabscale);
673 ewitab = _mm_cvttpd_epi32(ewrt);
674 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
675 ewitab = _mm_slli_epi32(ewitab,2);
676 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
677 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
678 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
679 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
680 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
681 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
682 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
683 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
684 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
685 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
687 d = _mm_sub_pd(r31,rswitch);
688 d = _mm_max_pd(d,_mm_setzero_pd());
689 d2 = _mm_mul_pd(d,d);
690 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
692 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
694 /* Evaluate switch function */
695 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
696 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
697 velec = _mm_mul_pd(velec,sw);
698 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
700 /* Update potential sum for this i atom from the interaction with this j atom. */
701 velec = _mm_and_pd(velec,cutoff_mask);
702 velecsum = _mm_add_pd(velecsum,velec);
706 fscal = _mm_and_pd(fscal,cutoff_mask);
708 /* Calculate temporary vectorial force */
709 tx = _mm_mul_pd(fscal,dx31);
710 ty = _mm_mul_pd(fscal,dy31);
711 tz = _mm_mul_pd(fscal,dz31);
713 /* Update vectorial force */
714 fix3 = _mm_add_pd(fix3,tx);
715 fiy3 = _mm_add_pd(fiy3,ty);
716 fiz3 = _mm_add_pd(fiz3,tz);
718 fjx1 = _mm_add_pd(fjx1,tx);
719 fjy1 = _mm_add_pd(fjy1,ty);
720 fjz1 = _mm_add_pd(fjz1,tz);
724 /**************************
725 * CALCULATE INTERACTIONS *
726 **************************/
728 if (gmx_mm_any_lt(rsq32,rcutoff2))
731 r32 = _mm_mul_pd(rsq32,rinv32);
733 /* EWALD ELECTROSTATICS */
735 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
736 ewrt = _mm_mul_pd(r32,ewtabscale);
737 ewitab = _mm_cvttpd_epi32(ewrt);
738 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
739 ewitab = _mm_slli_epi32(ewitab,2);
740 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
741 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
742 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
743 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
744 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
745 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
746 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
747 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
748 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
749 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
751 d = _mm_sub_pd(r32,rswitch);
752 d = _mm_max_pd(d,_mm_setzero_pd());
753 d2 = _mm_mul_pd(d,d);
754 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
756 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
758 /* Evaluate switch function */
759 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
760 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
761 velec = _mm_mul_pd(velec,sw);
762 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
764 /* Update potential sum for this i atom from the interaction with this j atom. */
765 velec = _mm_and_pd(velec,cutoff_mask);
766 velecsum = _mm_add_pd(velecsum,velec);
770 fscal = _mm_and_pd(fscal,cutoff_mask);
772 /* Calculate temporary vectorial force */
773 tx = _mm_mul_pd(fscal,dx32);
774 ty = _mm_mul_pd(fscal,dy32);
775 tz = _mm_mul_pd(fscal,dz32);
777 /* Update vectorial force */
778 fix3 = _mm_add_pd(fix3,tx);
779 fiy3 = _mm_add_pd(fiy3,ty);
780 fiz3 = _mm_add_pd(fiz3,tz);
782 fjx2 = _mm_add_pd(fjx2,tx);
783 fjy2 = _mm_add_pd(fjy2,ty);
784 fjz2 = _mm_add_pd(fjz2,tz);
788 /**************************
789 * CALCULATE INTERACTIONS *
790 **************************/
792 if (gmx_mm_any_lt(rsq33,rcutoff2))
795 r33 = _mm_mul_pd(rsq33,rinv33);
797 /* EWALD ELECTROSTATICS */
799 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
800 ewrt = _mm_mul_pd(r33,ewtabscale);
801 ewitab = _mm_cvttpd_epi32(ewrt);
802 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
803 ewitab = _mm_slli_epi32(ewitab,2);
804 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
805 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
806 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
807 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
808 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
809 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
810 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
811 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
812 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
813 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
815 d = _mm_sub_pd(r33,rswitch);
816 d = _mm_max_pd(d,_mm_setzero_pd());
817 d2 = _mm_mul_pd(d,d);
818 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
820 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
822 /* Evaluate switch function */
823 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
824 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
825 velec = _mm_mul_pd(velec,sw);
826 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
828 /* Update potential sum for this i atom from the interaction with this j atom. */
829 velec = _mm_and_pd(velec,cutoff_mask);
830 velecsum = _mm_add_pd(velecsum,velec);
834 fscal = _mm_and_pd(fscal,cutoff_mask);
836 /* Calculate temporary vectorial force */
837 tx = _mm_mul_pd(fscal,dx33);
838 ty = _mm_mul_pd(fscal,dy33);
839 tz = _mm_mul_pd(fscal,dz33);
841 /* Update vectorial force */
842 fix3 = _mm_add_pd(fix3,tx);
843 fiy3 = _mm_add_pd(fiy3,ty);
844 fiz3 = _mm_add_pd(fiz3,tz);
846 fjx3 = _mm_add_pd(fjx3,tx);
847 fjy3 = _mm_add_pd(fjy3,ty);
848 fjz3 = _mm_add_pd(fjz3,tz);
852 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
854 /* Inner loop uses 585 flops */
861 j_coord_offsetA = DIM*jnrA;
863 /* load j atom coordinates */
864 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
865 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
867 /* Calculate displacement vector */
868 dx11 = _mm_sub_pd(ix1,jx1);
869 dy11 = _mm_sub_pd(iy1,jy1);
870 dz11 = _mm_sub_pd(iz1,jz1);
871 dx12 = _mm_sub_pd(ix1,jx2);
872 dy12 = _mm_sub_pd(iy1,jy2);
873 dz12 = _mm_sub_pd(iz1,jz2);
874 dx13 = _mm_sub_pd(ix1,jx3);
875 dy13 = _mm_sub_pd(iy1,jy3);
876 dz13 = _mm_sub_pd(iz1,jz3);
877 dx21 = _mm_sub_pd(ix2,jx1);
878 dy21 = _mm_sub_pd(iy2,jy1);
879 dz21 = _mm_sub_pd(iz2,jz1);
880 dx22 = _mm_sub_pd(ix2,jx2);
881 dy22 = _mm_sub_pd(iy2,jy2);
882 dz22 = _mm_sub_pd(iz2,jz2);
883 dx23 = _mm_sub_pd(ix2,jx3);
884 dy23 = _mm_sub_pd(iy2,jy3);
885 dz23 = _mm_sub_pd(iz2,jz3);
886 dx31 = _mm_sub_pd(ix3,jx1);
887 dy31 = _mm_sub_pd(iy3,jy1);
888 dz31 = _mm_sub_pd(iz3,jz1);
889 dx32 = _mm_sub_pd(ix3,jx2);
890 dy32 = _mm_sub_pd(iy3,jy2);
891 dz32 = _mm_sub_pd(iz3,jz2);
892 dx33 = _mm_sub_pd(ix3,jx3);
893 dy33 = _mm_sub_pd(iy3,jy3);
894 dz33 = _mm_sub_pd(iz3,jz3);
896 /* Calculate squared distance and things based on it */
897 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
898 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
899 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
900 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
901 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
902 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
903 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
904 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
905 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
907 rinv11 = gmx_mm_invsqrt_pd(rsq11);
908 rinv12 = gmx_mm_invsqrt_pd(rsq12);
909 rinv13 = gmx_mm_invsqrt_pd(rsq13);
910 rinv21 = gmx_mm_invsqrt_pd(rsq21);
911 rinv22 = gmx_mm_invsqrt_pd(rsq22);
912 rinv23 = gmx_mm_invsqrt_pd(rsq23);
913 rinv31 = gmx_mm_invsqrt_pd(rsq31);
914 rinv32 = gmx_mm_invsqrt_pd(rsq32);
915 rinv33 = gmx_mm_invsqrt_pd(rsq33);
917 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
918 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
919 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
920 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
921 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
922 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
923 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
924 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
925 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
927 fjx1 = _mm_setzero_pd();
928 fjy1 = _mm_setzero_pd();
929 fjz1 = _mm_setzero_pd();
930 fjx2 = _mm_setzero_pd();
931 fjy2 = _mm_setzero_pd();
932 fjz2 = _mm_setzero_pd();
933 fjx3 = _mm_setzero_pd();
934 fjy3 = _mm_setzero_pd();
935 fjz3 = _mm_setzero_pd();
937 /**************************
938 * CALCULATE INTERACTIONS *
939 **************************/
941 if (gmx_mm_any_lt(rsq11,rcutoff2))
944 r11 = _mm_mul_pd(rsq11,rinv11);
946 /* EWALD ELECTROSTATICS */
948 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
949 ewrt = _mm_mul_pd(r11,ewtabscale);
950 ewitab = _mm_cvttpd_epi32(ewrt);
951 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
952 ewitab = _mm_slli_epi32(ewitab,2);
953 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
954 ewtabD = _mm_setzero_pd();
955 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
956 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
957 ewtabFn = _mm_setzero_pd();
958 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
959 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
960 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
961 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
962 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
964 d = _mm_sub_pd(r11,rswitch);
965 d = _mm_max_pd(d,_mm_setzero_pd());
966 d2 = _mm_mul_pd(d,d);
967 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
969 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
971 /* Evaluate switch function */
972 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
973 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
974 velec = _mm_mul_pd(velec,sw);
975 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
977 /* Update potential sum for this i atom from the interaction with this j atom. */
978 velec = _mm_and_pd(velec,cutoff_mask);
979 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
980 velecsum = _mm_add_pd(velecsum,velec);
984 fscal = _mm_and_pd(fscal,cutoff_mask);
986 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
988 /* Calculate temporary vectorial force */
989 tx = _mm_mul_pd(fscal,dx11);
990 ty = _mm_mul_pd(fscal,dy11);
991 tz = _mm_mul_pd(fscal,dz11);
993 /* Update vectorial force */
994 fix1 = _mm_add_pd(fix1,tx);
995 fiy1 = _mm_add_pd(fiy1,ty);
996 fiz1 = _mm_add_pd(fiz1,tz);
998 fjx1 = _mm_add_pd(fjx1,tx);
999 fjy1 = _mm_add_pd(fjy1,ty);
1000 fjz1 = _mm_add_pd(fjz1,tz);
1004 /**************************
1005 * CALCULATE INTERACTIONS *
1006 **************************/
1008 if (gmx_mm_any_lt(rsq12,rcutoff2))
1011 r12 = _mm_mul_pd(rsq12,rinv12);
1013 /* EWALD ELECTROSTATICS */
1015 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1016 ewrt = _mm_mul_pd(r12,ewtabscale);
1017 ewitab = _mm_cvttpd_epi32(ewrt);
1018 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1019 ewitab = _mm_slli_epi32(ewitab,2);
1020 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1021 ewtabD = _mm_setzero_pd();
1022 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1023 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1024 ewtabFn = _mm_setzero_pd();
1025 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1026 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1027 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1028 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1029 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1031 d = _mm_sub_pd(r12,rswitch);
1032 d = _mm_max_pd(d,_mm_setzero_pd());
1033 d2 = _mm_mul_pd(d,d);
1034 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1036 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1038 /* Evaluate switch function */
1039 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1040 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1041 velec = _mm_mul_pd(velec,sw);
1042 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1044 /* Update potential sum for this i atom from the interaction with this j atom. */
1045 velec = _mm_and_pd(velec,cutoff_mask);
1046 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1047 velecsum = _mm_add_pd(velecsum,velec);
1051 fscal = _mm_and_pd(fscal,cutoff_mask);
1053 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1055 /* Calculate temporary vectorial force */
1056 tx = _mm_mul_pd(fscal,dx12);
1057 ty = _mm_mul_pd(fscal,dy12);
1058 tz = _mm_mul_pd(fscal,dz12);
1060 /* Update vectorial force */
1061 fix1 = _mm_add_pd(fix1,tx);
1062 fiy1 = _mm_add_pd(fiy1,ty);
1063 fiz1 = _mm_add_pd(fiz1,tz);
1065 fjx2 = _mm_add_pd(fjx2,tx);
1066 fjy2 = _mm_add_pd(fjy2,ty);
1067 fjz2 = _mm_add_pd(fjz2,tz);
1071 /**************************
1072 * CALCULATE INTERACTIONS *
1073 **************************/
1075 if (gmx_mm_any_lt(rsq13,rcutoff2))
1078 r13 = _mm_mul_pd(rsq13,rinv13);
1080 /* EWALD ELECTROSTATICS */
1082 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1083 ewrt = _mm_mul_pd(r13,ewtabscale);
1084 ewitab = _mm_cvttpd_epi32(ewrt);
1085 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1086 ewitab = _mm_slli_epi32(ewitab,2);
1087 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1088 ewtabD = _mm_setzero_pd();
1089 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1090 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1091 ewtabFn = _mm_setzero_pd();
1092 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1093 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1094 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1095 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1096 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1098 d = _mm_sub_pd(r13,rswitch);
1099 d = _mm_max_pd(d,_mm_setzero_pd());
1100 d2 = _mm_mul_pd(d,d);
1101 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1103 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1105 /* Evaluate switch function */
1106 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1107 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1108 velec = _mm_mul_pd(velec,sw);
1109 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1111 /* Update potential sum for this i atom from the interaction with this j atom. */
1112 velec = _mm_and_pd(velec,cutoff_mask);
1113 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1114 velecsum = _mm_add_pd(velecsum,velec);
1118 fscal = _mm_and_pd(fscal,cutoff_mask);
1120 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1122 /* Calculate temporary vectorial force */
1123 tx = _mm_mul_pd(fscal,dx13);
1124 ty = _mm_mul_pd(fscal,dy13);
1125 tz = _mm_mul_pd(fscal,dz13);
1127 /* Update vectorial force */
1128 fix1 = _mm_add_pd(fix1,tx);
1129 fiy1 = _mm_add_pd(fiy1,ty);
1130 fiz1 = _mm_add_pd(fiz1,tz);
1132 fjx3 = _mm_add_pd(fjx3,tx);
1133 fjy3 = _mm_add_pd(fjy3,ty);
1134 fjz3 = _mm_add_pd(fjz3,tz);
1138 /**************************
1139 * CALCULATE INTERACTIONS *
1140 **************************/
1142 if (gmx_mm_any_lt(rsq21,rcutoff2))
1145 r21 = _mm_mul_pd(rsq21,rinv21);
1147 /* EWALD ELECTROSTATICS */
1149 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1150 ewrt = _mm_mul_pd(r21,ewtabscale);
1151 ewitab = _mm_cvttpd_epi32(ewrt);
1152 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1153 ewitab = _mm_slli_epi32(ewitab,2);
1154 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1155 ewtabD = _mm_setzero_pd();
1156 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1157 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1158 ewtabFn = _mm_setzero_pd();
1159 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1160 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1161 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1162 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1163 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1165 d = _mm_sub_pd(r21,rswitch);
1166 d = _mm_max_pd(d,_mm_setzero_pd());
1167 d2 = _mm_mul_pd(d,d);
1168 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1170 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1172 /* Evaluate switch function */
1173 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1174 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1175 velec = _mm_mul_pd(velec,sw);
1176 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1178 /* Update potential sum for this i atom from the interaction with this j atom. */
1179 velec = _mm_and_pd(velec,cutoff_mask);
1180 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1181 velecsum = _mm_add_pd(velecsum,velec);
1185 fscal = _mm_and_pd(fscal,cutoff_mask);
1187 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1189 /* Calculate temporary vectorial force */
1190 tx = _mm_mul_pd(fscal,dx21);
1191 ty = _mm_mul_pd(fscal,dy21);
1192 tz = _mm_mul_pd(fscal,dz21);
1194 /* Update vectorial force */
1195 fix2 = _mm_add_pd(fix2,tx);
1196 fiy2 = _mm_add_pd(fiy2,ty);
1197 fiz2 = _mm_add_pd(fiz2,tz);
1199 fjx1 = _mm_add_pd(fjx1,tx);
1200 fjy1 = _mm_add_pd(fjy1,ty);
1201 fjz1 = _mm_add_pd(fjz1,tz);
1205 /**************************
1206 * CALCULATE INTERACTIONS *
1207 **************************/
1209 if (gmx_mm_any_lt(rsq22,rcutoff2))
1212 r22 = _mm_mul_pd(rsq22,rinv22);
1214 /* EWALD ELECTROSTATICS */
1216 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1217 ewrt = _mm_mul_pd(r22,ewtabscale);
1218 ewitab = _mm_cvttpd_epi32(ewrt);
1219 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1220 ewitab = _mm_slli_epi32(ewitab,2);
1221 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1222 ewtabD = _mm_setzero_pd();
1223 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1224 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1225 ewtabFn = _mm_setzero_pd();
1226 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1227 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1228 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1229 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1230 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1232 d = _mm_sub_pd(r22,rswitch);
1233 d = _mm_max_pd(d,_mm_setzero_pd());
1234 d2 = _mm_mul_pd(d,d);
1235 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1237 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1239 /* Evaluate switch function */
1240 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1241 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1242 velec = _mm_mul_pd(velec,sw);
1243 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1245 /* Update potential sum for this i atom from the interaction with this j atom. */
1246 velec = _mm_and_pd(velec,cutoff_mask);
1247 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1248 velecsum = _mm_add_pd(velecsum,velec);
1252 fscal = _mm_and_pd(fscal,cutoff_mask);
1254 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1256 /* Calculate temporary vectorial force */
1257 tx = _mm_mul_pd(fscal,dx22);
1258 ty = _mm_mul_pd(fscal,dy22);
1259 tz = _mm_mul_pd(fscal,dz22);
1261 /* Update vectorial force */
1262 fix2 = _mm_add_pd(fix2,tx);
1263 fiy2 = _mm_add_pd(fiy2,ty);
1264 fiz2 = _mm_add_pd(fiz2,tz);
1266 fjx2 = _mm_add_pd(fjx2,tx);
1267 fjy2 = _mm_add_pd(fjy2,ty);
1268 fjz2 = _mm_add_pd(fjz2,tz);
1272 /**************************
1273 * CALCULATE INTERACTIONS *
1274 **************************/
1276 if (gmx_mm_any_lt(rsq23,rcutoff2))
1279 r23 = _mm_mul_pd(rsq23,rinv23);
1281 /* EWALD ELECTROSTATICS */
1283 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1284 ewrt = _mm_mul_pd(r23,ewtabscale);
1285 ewitab = _mm_cvttpd_epi32(ewrt);
1286 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1287 ewitab = _mm_slli_epi32(ewitab,2);
1288 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1289 ewtabD = _mm_setzero_pd();
1290 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1291 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1292 ewtabFn = _mm_setzero_pd();
1293 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1294 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1295 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1296 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1297 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1299 d = _mm_sub_pd(r23,rswitch);
1300 d = _mm_max_pd(d,_mm_setzero_pd());
1301 d2 = _mm_mul_pd(d,d);
1302 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1304 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1306 /* Evaluate switch function */
1307 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1308 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
1309 velec = _mm_mul_pd(velec,sw);
1310 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1312 /* Update potential sum for this i atom from the interaction with this j atom. */
1313 velec = _mm_and_pd(velec,cutoff_mask);
1314 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1315 velecsum = _mm_add_pd(velecsum,velec);
1319 fscal = _mm_and_pd(fscal,cutoff_mask);
1321 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1323 /* Calculate temporary vectorial force */
1324 tx = _mm_mul_pd(fscal,dx23);
1325 ty = _mm_mul_pd(fscal,dy23);
1326 tz = _mm_mul_pd(fscal,dz23);
1328 /* Update vectorial force */
1329 fix2 = _mm_add_pd(fix2,tx);
1330 fiy2 = _mm_add_pd(fiy2,ty);
1331 fiz2 = _mm_add_pd(fiz2,tz);
1333 fjx3 = _mm_add_pd(fjx3,tx);
1334 fjy3 = _mm_add_pd(fjy3,ty);
1335 fjz3 = _mm_add_pd(fjz3,tz);
1339 /**************************
1340 * CALCULATE INTERACTIONS *
1341 **************************/
1343 if (gmx_mm_any_lt(rsq31,rcutoff2))
1346 r31 = _mm_mul_pd(rsq31,rinv31);
1348 /* EWALD ELECTROSTATICS */
1350 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1351 ewrt = _mm_mul_pd(r31,ewtabscale);
1352 ewitab = _mm_cvttpd_epi32(ewrt);
1353 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1354 ewitab = _mm_slli_epi32(ewitab,2);
1355 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1356 ewtabD = _mm_setzero_pd();
1357 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1358 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1359 ewtabFn = _mm_setzero_pd();
1360 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1361 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1362 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1363 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1364 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1366 d = _mm_sub_pd(r31,rswitch);
1367 d = _mm_max_pd(d,_mm_setzero_pd());
1368 d2 = _mm_mul_pd(d,d);
1369 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1371 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1373 /* Evaluate switch function */
1374 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1375 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
1376 velec = _mm_mul_pd(velec,sw);
1377 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1379 /* Update potential sum for this i atom from the interaction with this j atom. */
1380 velec = _mm_and_pd(velec,cutoff_mask);
1381 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1382 velecsum = _mm_add_pd(velecsum,velec);
1386 fscal = _mm_and_pd(fscal,cutoff_mask);
1388 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1390 /* Calculate temporary vectorial force */
1391 tx = _mm_mul_pd(fscal,dx31);
1392 ty = _mm_mul_pd(fscal,dy31);
1393 tz = _mm_mul_pd(fscal,dz31);
1395 /* Update vectorial force */
1396 fix3 = _mm_add_pd(fix3,tx);
1397 fiy3 = _mm_add_pd(fiy3,ty);
1398 fiz3 = _mm_add_pd(fiz3,tz);
1400 fjx1 = _mm_add_pd(fjx1,tx);
1401 fjy1 = _mm_add_pd(fjy1,ty);
1402 fjz1 = _mm_add_pd(fjz1,tz);
1406 /**************************
1407 * CALCULATE INTERACTIONS *
1408 **************************/
1410 if (gmx_mm_any_lt(rsq32,rcutoff2))
1413 r32 = _mm_mul_pd(rsq32,rinv32);
1415 /* EWALD ELECTROSTATICS */
1417 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1418 ewrt = _mm_mul_pd(r32,ewtabscale);
1419 ewitab = _mm_cvttpd_epi32(ewrt);
1420 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1421 ewitab = _mm_slli_epi32(ewitab,2);
1422 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1423 ewtabD = _mm_setzero_pd();
1424 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1425 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1426 ewtabFn = _mm_setzero_pd();
1427 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1428 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1429 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1430 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1431 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1433 d = _mm_sub_pd(r32,rswitch);
1434 d = _mm_max_pd(d,_mm_setzero_pd());
1435 d2 = _mm_mul_pd(d,d);
1436 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1438 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1440 /* Evaluate switch function */
1441 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1442 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
1443 velec = _mm_mul_pd(velec,sw);
1444 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1446 /* Update potential sum for this i atom from the interaction with this j atom. */
1447 velec = _mm_and_pd(velec,cutoff_mask);
1448 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1449 velecsum = _mm_add_pd(velecsum,velec);
1453 fscal = _mm_and_pd(fscal,cutoff_mask);
1455 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1457 /* Calculate temporary vectorial force */
1458 tx = _mm_mul_pd(fscal,dx32);
1459 ty = _mm_mul_pd(fscal,dy32);
1460 tz = _mm_mul_pd(fscal,dz32);
1462 /* Update vectorial force */
1463 fix3 = _mm_add_pd(fix3,tx);
1464 fiy3 = _mm_add_pd(fiy3,ty);
1465 fiz3 = _mm_add_pd(fiz3,tz);
1467 fjx2 = _mm_add_pd(fjx2,tx);
1468 fjy2 = _mm_add_pd(fjy2,ty);
1469 fjz2 = _mm_add_pd(fjz2,tz);
1473 /**************************
1474 * CALCULATE INTERACTIONS *
1475 **************************/
1477 if (gmx_mm_any_lt(rsq33,rcutoff2))
1480 r33 = _mm_mul_pd(rsq33,rinv33);
1482 /* EWALD ELECTROSTATICS */
1484 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1485 ewrt = _mm_mul_pd(r33,ewtabscale);
1486 ewitab = _mm_cvttpd_epi32(ewrt);
1487 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1488 ewitab = _mm_slli_epi32(ewitab,2);
1489 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1490 ewtabD = _mm_setzero_pd();
1491 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1492 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1493 ewtabFn = _mm_setzero_pd();
1494 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1495 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1496 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1497 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1498 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1500 d = _mm_sub_pd(r33,rswitch);
1501 d = _mm_max_pd(d,_mm_setzero_pd());
1502 d2 = _mm_mul_pd(d,d);
1503 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1505 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1507 /* Evaluate switch function */
1508 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1509 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
1510 velec = _mm_mul_pd(velec,sw);
1511 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1513 /* Update potential sum for this i atom from the interaction with this j atom. */
1514 velec = _mm_and_pd(velec,cutoff_mask);
1515 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1516 velecsum = _mm_add_pd(velecsum,velec);
1520 fscal = _mm_and_pd(fscal,cutoff_mask);
1522 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1524 /* Calculate temporary vectorial force */
1525 tx = _mm_mul_pd(fscal,dx33);
1526 ty = _mm_mul_pd(fscal,dy33);
1527 tz = _mm_mul_pd(fscal,dz33);
1529 /* Update vectorial force */
1530 fix3 = _mm_add_pd(fix3,tx);
1531 fiy3 = _mm_add_pd(fiy3,ty);
1532 fiz3 = _mm_add_pd(fiz3,tz);
1534 fjx3 = _mm_add_pd(fjx3,tx);
1535 fjy3 = _mm_add_pd(fjy3,ty);
1536 fjz3 = _mm_add_pd(fjz3,tz);
1540 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1542 /* Inner loop uses 585 flops */
1545 /* End of innermost loop */
1547 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1548 f+i_coord_offset+DIM,fshift+i_shift_offset);
1551 /* Update potential energies */
1552 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1554 /* Increment number of inner iterations */
1555 inneriter += j_index_end - j_index_start;
1557 /* Outer loop uses 19 flops */
1560 /* Increment number of outer iterations */
1563 /* Update outer/inner flops */
1565 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*585);
1568 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_double
1569 * Electrostatics interaction: Ewald
1570 * VdW interaction: None
1571 * Geometry: Water4-Water4
1572 * Calculate force/pot: Force
1575 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_double
1576 (t_nblist * gmx_restrict nlist,
1577 rvec * gmx_restrict xx,
1578 rvec * gmx_restrict ff,
1579 t_forcerec * gmx_restrict fr,
1580 t_mdatoms * gmx_restrict mdatoms,
1581 nb_kernel_data_t * gmx_restrict kernel_data,
1582 t_nrnb * gmx_restrict nrnb)
1584 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1585 * just 0 for non-waters.
1586 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1587 * jnr indices corresponding to data put in the four positions in the SIMD register.
1589 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1590 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1592 int j_coord_offsetA,j_coord_offsetB;
1593 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1594 real rcutoff_scalar;
1595 real *shiftvec,*fshift,*x,*f;
1596 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1598 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1600 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1602 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1603 int vdwjidx1A,vdwjidx1B;
1604 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1605 int vdwjidx2A,vdwjidx2B;
1606 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1607 int vdwjidx3A,vdwjidx3B;
1608 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1609 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1610 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1611 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1612 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1613 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1614 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1615 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1616 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1617 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1618 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1621 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1623 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1624 real rswitch_scalar,d_scalar;
1625 __m128d dummy_mask,cutoff_mask;
1626 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1627 __m128d one = _mm_set1_pd(1.0);
1628 __m128d two = _mm_set1_pd(2.0);
1634 jindex = nlist->jindex;
1636 shiftidx = nlist->shift;
1638 shiftvec = fr->shift_vec[0];
1639 fshift = fr->fshift[0];
1640 facel = _mm_set1_pd(fr->epsfac);
1641 charge = mdatoms->chargeA;
1643 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1644 ewtab = fr->ic->tabq_coul_FDV0;
1645 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1646 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1648 /* Setup water-specific parameters */
1649 inr = nlist->iinr[0];
1650 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1651 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1652 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1654 jq1 = _mm_set1_pd(charge[inr+1]);
1655 jq2 = _mm_set1_pd(charge[inr+2]);
1656 jq3 = _mm_set1_pd(charge[inr+3]);
1657 qq11 = _mm_mul_pd(iq1,jq1);
1658 qq12 = _mm_mul_pd(iq1,jq2);
1659 qq13 = _mm_mul_pd(iq1,jq3);
1660 qq21 = _mm_mul_pd(iq2,jq1);
1661 qq22 = _mm_mul_pd(iq2,jq2);
1662 qq23 = _mm_mul_pd(iq2,jq3);
1663 qq31 = _mm_mul_pd(iq3,jq1);
1664 qq32 = _mm_mul_pd(iq3,jq2);
1665 qq33 = _mm_mul_pd(iq3,jq3);
1667 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1668 rcutoff_scalar = fr->rcoulomb;
1669 rcutoff = _mm_set1_pd(rcutoff_scalar);
1670 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1672 rswitch_scalar = fr->rcoulomb_switch;
1673 rswitch = _mm_set1_pd(rswitch_scalar);
1674 /* Setup switch parameters */
1675 d_scalar = rcutoff_scalar-rswitch_scalar;
1676 d = _mm_set1_pd(d_scalar);
1677 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1678 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1679 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1680 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1681 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1682 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1684 /* Avoid stupid compiler warnings */
1686 j_coord_offsetA = 0;
1687 j_coord_offsetB = 0;
1692 /* Start outer loop over neighborlists */
1693 for(iidx=0; iidx<nri; iidx++)
1695 /* Load shift vector for this list */
1696 i_shift_offset = DIM*shiftidx[iidx];
1698 /* Load limits for loop over neighbors */
1699 j_index_start = jindex[iidx];
1700 j_index_end = jindex[iidx+1];
1702 /* Get outer coordinate index */
1704 i_coord_offset = DIM*inr;
1706 /* Load i particle coords and add shift vector */
1707 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1708 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1710 fix1 = _mm_setzero_pd();
1711 fiy1 = _mm_setzero_pd();
1712 fiz1 = _mm_setzero_pd();
1713 fix2 = _mm_setzero_pd();
1714 fiy2 = _mm_setzero_pd();
1715 fiz2 = _mm_setzero_pd();
1716 fix3 = _mm_setzero_pd();
1717 fiy3 = _mm_setzero_pd();
1718 fiz3 = _mm_setzero_pd();
1720 /* Start inner kernel loop */
1721 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1724 /* Get j neighbor index, and coordinate index */
1726 jnrB = jjnr[jidx+1];
1727 j_coord_offsetA = DIM*jnrA;
1728 j_coord_offsetB = DIM*jnrB;
1730 /* load j atom coordinates */
1731 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1732 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1734 /* Calculate displacement vector */
1735 dx11 = _mm_sub_pd(ix1,jx1);
1736 dy11 = _mm_sub_pd(iy1,jy1);
1737 dz11 = _mm_sub_pd(iz1,jz1);
1738 dx12 = _mm_sub_pd(ix1,jx2);
1739 dy12 = _mm_sub_pd(iy1,jy2);
1740 dz12 = _mm_sub_pd(iz1,jz2);
1741 dx13 = _mm_sub_pd(ix1,jx3);
1742 dy13 = _mm_sub_pd(iy1,jy3);
1743 dz13 = _mm_sub_pd(iz1,jz3);
1744 dx21 = _mm_sub_pd(ix2,jx1);
1745 dy21 = _mm_sub_pd(iy2,jy1);
1746 dz21 = _mm_sub_pd(iz2,jz1);
1747 dx22 = _mm_sub_pd(ix2,jx2);
1748 dy22 = _mm_sub_pd(iy2,jy2);
1749 dz22 = _mm_sub_pd(iz2,jz2);
1750 dx23 = _mm_sub_pd(ix2,jx3);
1751 dy23 = _mm_sub_pd(iy2,jy3);
1752 dz23 = _mm_sub_pd(iz2,jz3);
1753 dx31 = _mm_sub_pd(ix3,jx1);
1754 dy31 = _mm_sub_pd(iy3,jy1);
1755 dz31 = _mm_sub_pd(iz3,jz1);
1756 dx32 = _mm_sub_pd(ix3,jx2);
1757 dy32 = _mm_sub_pd(iy3,jy2);
1758 dz32 = _mm_sub_pd(iz3,jz2);
1759 dx33 = _mm_sub_pd(ix3,jx3);
1760 dy33 = _mm_sub_pd(iy3,jy3);
1761 dz33 = _mm_sub_pd(iz3,jz3);
1763 /* Calculate squared distance and things based on it */
1764 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1765 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1766 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1767 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1768 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1769 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1770 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1771 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1772 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1774 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1775 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1776 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1777 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1778 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1779 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1780 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1781 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1782 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1784 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1785 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1786 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1787 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1788 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1789 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1790 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1791 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1792 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1794 fjx1 = _mm_setzero_pd();
1795 fjy1 = _mm_setzero_pd();
1796 fjz1 = _mm_setzero_pd();
1797 fjx2 = _mm_setzero_pd();
1798 fjy2 = _mm_setzero_pd();
1799 fjz2 = _mm_setzero_pd();
1800 fjx3 = _mm_setzero_pd();
1801 fjy3 = _mm_setzero_pd();
1802 fjz3 = _mm_setzero_pd();
1804 /**************************
1805 * CALCULATE INTERACTIONS *
1806 **************************/
1808 if (gmx_mm_any_lt(rsq11,rcutoff2))
1811 r11 = _mm_mul_pd(rsq11,rinv11);
1813 /* EWALD ELECTROSTATICS */
1815 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1816 ewrt = _mm_mul_pd(r11,ewtabscale);
1817 ewitab = _mm_cvttpd_epi32(ewrt);
1818 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1819 ewitab = _mm_slli_epi32(ewitab,2);
1820 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1821 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1822 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1823 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1824 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1825 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1826 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1827 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1828 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1829 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1831 d = _mm_sub_pd(r11,rswitch);
1832 d = _mm_max_pd(d,_mm_setzero_pd());
1833 d2 = _mm_mul_pd(d,d);
1834 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1836 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1838 /* Evaluate switch function */
1839 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1840 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1841 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1845 fscal = _mm_and_pd(fscal,cutoff_mask);
1847 /* Calculate temporary vectorial force */
1848 tx = _mm_mul_pd(fscal,dx11);
1849 ty = _mm_mul_pd(fscal,dy11);
1850 tz = _mm_mul_pd(fscal,dz11);
1852 /* Update vectorial force */
1853 fix1 = _mm_add_pd(fix1,tx);
1854 fiy1 = _mm_add_pd(fiy1,ty);
1855 fiz1 = _mm_add_pd(fiz1,tz);
1857 fjx1 = _mm_add_pd(fjx1,tx);
1858 fjy1 = _mm_add_pd(fjy1,ty);
1859 fjz1 = _mm_add_pd(fjz1,tz);
1863 /**************************
1864 * CALCULATE INTERACTIONS *
1865 **************************/
1867 if (gmx_mm_any_lt(rsq12,rcutoff2))
1870 r12 = _mm_mul_pd(rsq12,rinv12);
1872 /* EWALD ELECTROSTATICS */
1874 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1875 ewrt = _mm_mul_pd(r12,ewtabscale);
1876 ewitab = _mm_cvttpd_epi32(ewrt);
1877 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1878 ewitab = _mm_slli_epi32(ewitab,2);
1879 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1880 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1881 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1882 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1883 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1884 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1885 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1886 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1887 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1888 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1890 d = _mm_sub_pd(r12,rswitch);
1891 d = _mm_max_pd(d,_mm_setzero_pd());
1892 d2 = _mm_mul_pd(d,d);
1893 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1895 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1897 /* Evaluate switch function */
1898 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1899 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1900 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1904 fscal = _mm_and_pd(fscal,cutoff_mask);
1906 /* Calculate temporary vectorial force */
1907 tx = _mm_mul_pd(fscal,dx12);
1908 ty = _mm_mul_pd(fscal,dy12);
1909 tz = _mm_mul_pd(fscal,dz12);
1911 /* Update vectorial force */
1912 fix1 = _mm_add_pd(fix1,tx);
1913 fiy1 = _mm_add_pd(fiy1,ty);
1914 fiz1 = _mm_add_pd(fiz1,tz);
1916 fjx2 = _mm_add_pd(fjx2,tx);
1917 fjy2 = _mm_add_pd(fjy2,ty);
1918 fjz2 = _mm_add_pd(fjz2,tz);
1922 /**************************
1923 * CALCULATE INTERACTIONS *
1924 **************************/
1926 if (gmx_mm_any_lt(rsq13,rcutoff2))
1929 r13 = _mm_mul_pd(rsq13,rinv13);
1931 /* EWALD ELECTROSTATICS */
1933 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1934 ewrt = _mm_mul_pd(r13,ewtabscale);
1935 ewitab = _mm_cvttpd_epi32(ewrt);
1936 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1937 ewitab = _mm_slli_epi32(ewitab,2);
1938 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1939 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1940 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1941 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1942 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1943 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1944 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1945 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1946 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1947 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1949 d = _mm_sub_pd(r13,rswitch);
1950 d = _mm_max_pd(d,_mm_setzero_pd());
1951 d2 = _mm_mul_pd(d,d);
1952 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1954 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1956 /* Evaluate switch function */
1957 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1958 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1959 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1963 fscal = _mm_and_pd(fscal,cutoff_mask);
1965 /* Calculate temporary vectorial force */
1966 tx = _mm_mul_pd(fscal,dx13);
1967 ty = _mm_mul_pd(fscal,dy13);
1968 tz = _mm_mul_pd(fscal,dz13);
1970 /* Update vectorial force */
1971 fix1 = _mm_add_pd(fix1,tx);
1972 fiy1 = _mm_add_pd(fiy1,ty);
1973 fiz1 = _mm_add_pd(fiz1,tz);
1975 fjx3 = _mm_add_pd(fjx3,tx);
1976 fjy3 = _mm_add_pd(fjy3,ty);
1977 fjz3 = _mm_add_pd(fjz3,tz);
1981 /**************************
1982 * CALCULATE INTERACTIONS *
1983 **************************/
1985 if (gmx_mm_any_lt(rsq21,rcutoff2))
1988 r21 = _mm_mul_pd(rsq21,rinv21);
1990 /* EWALD ELECTROSTATICS */
1992 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1993 ewrt = _mm_mul_pd(r21,ewtabscale);
1994 ewitab = _mm_cvttpd_epi32(ewrt);
1995 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1996 ewitab = _mm_slli_epi32(ewitab,2);
1997 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1998 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1999 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2000 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2001 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2002 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2003 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2004 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2005 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2006 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2008 d = _mm_sub_pd(r21,rswitch);
2009 d = _mm_max_pd(d,_mm_setzero_pd());
2010 d2 = _mm_mul_pd(d,d);
2011 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2013 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2015 /* Evaluate switch function */
2016 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2017 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2018 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2022 fscal = _mm_and_pd(fscal,cutoff_mask);
2024 /* Calculate temporary vectorial force */
2025 tx = _mm_mul_pd(fscal,dx21);
2026 ty = _mm_mul_pd(fscal,dy21);
2027 tz = _mm_mul_pd(fscal,dz21);
2029 /* Update vectorial force */
2030 fix2 = _mm_add_pd(fix2,tx);
2031 fiy2 = _mm_add_pd(fiy2,ty);
2032 fiz2 = _mm_add_pd(fiz2,tz);
2034 fjx1 = _mm_add_pd(fjx1,tx);
2035 fjy1 = _mm_add_pd(fjy1,ty);
2036 fjz1 = _mm_add_pd(fjz1,tz);
2040 /**************************
2041 * CALCULATE INTERACTIONS *
2042 **************************/
2044 if (gmx_mm_any_lt(rsq22,rcutoff2))
2047 r22 = _mm_mul_pd(rsq22,rinv22);
2049 /* EWALD ELECTROSTATICS */
2051 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2052 ewrt = _mm_mul_pd(r22,ewtabscale);
2053 ewitab = _mm_cvttpd_epi32(ewrt);
2054 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2055 ewitab = _mm_slli_epi32(ewitab,2);
2056 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2057 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2058 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2059 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2060 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2061 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2062 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2063 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2064 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2065 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2067 d = _mm_sub_pd(r22,rswitch);
2068 d = _mm_max_pd(d,_mm_setzero_pd());
2069 d2 = _mm_mul_pd(d,d);
2070 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2072 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2074 /* Evaluate switch function */
2075 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2076 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2077 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2081 fscal = _mm_and_pd(fscal,cutoff_mask);
2083 /* Calculate temporary vectorial force */
2084 tx = _mm_mul_pd(fscal,dx22);
2085 ty = _mm_mul_pd(fscal,dy22);
2086 tz = _mm_mul_pd(fscal,dz22);
2088 /* Update vectorial force */
2089 fix2 = _mm_add_pd(fix2,tx);
2090 fiy2 = _mm_add_pd(fiy2,ty);
2091 fiz2 = _mm_add_pd(fiz2,tz);
2093 fjx2 = _mm_add_pd(fjx2,tx);
2094 fjy2 = _mm_add_pd(fjy2,ty);
2095 fjz2 = _mm_add_pd(fjz2,tz);
2099 /**************************
2100 * CALCULATE INTERACTIONS *
2101 **************************/
2103 if (gmx_mm_any_lt(rsq23,rcutoff2))
2106 r23 = _mm_mul_pd(rsq23,rinv23);
2108 /* EWALD ELECTROSTATICS */
2110 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2111 ewrt = _mm_mul_pd(r23,ewtabscale);
2112 ewitab = _mm_cvttpd_epi32(ewrt);
2113 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2114 ewitab = _mm_slli_epi32(ewitab,2);
2115 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2116 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2117 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2118 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2119 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2120 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2121 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2122 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2123 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2124 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2126 d = _mm_sub_pd(r23,rswitch);
2127 d = _mm_max_pd(d,_mm_setzero_pd());
2128 d2 = _mm_mul_pd(d,d);
2129 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2131 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2133 /* Evaluate switch function */
2134 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2135 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2136 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2140 fscal = _mm_and_pd(fscal,cutoff_mask);
2142 /* Calculate temporary vectorial force */
2143 tx = _mm_mul_pd(fscal,dx23);
2144 ty = _mm_mul_pd(fscal,dy23);
2145 tz = _mm_mul_pd(fscal,dz23);
2147 /* Update vectorial force */
2148 fix2 = _mm_add_pd(fix2,tx);
2149 fiy2 = _mm_add_pd(fiy2,ty);
2150 fiz2 = _mm_add_pd(fiz2,tz);
2152 fjx3 = _mm_add_pd(fjx3,tx);
2153 fjy3 = _mm_add_pd(fjy3,ty);
2154 fjz3 = _mm_add_pd(fjz3,tz);
2158 /**************************
2159 * CALCULATE INTERACTIONS *
2160 **************************/
2162 if (gmx_mm_any_lt(rsq31,rcutoff2))
2165 r31 = _mm_mul_pd(rsq31,rinv31);
2167 /* EWALD ELECTROSTATICS */
2169 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2170 ewrt = _mm_mul_pd(r31,ewtabscale);
2171 ewitab = _mm_cvttpd_epi32(ewrt);
2172 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2173 ewitab = _mm_slli_epi32(ewitab,2);
2174 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2175 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2176 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2177 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2178 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2179 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2180 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2181 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2182 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2183 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2185 d = _mm_sub_pd(r31,rswitch);
2186 d = _mm_max_pd(d,_mm_setzero_pd());
2187 d2 = _mm_mul_pd(d,d);
2188 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2190 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2192 /* Evaluate switch function */
2193 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2194 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2195 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2199 fscal = _mm_and_pd(fscal,cutoff_mask);
2201 /* Calculate temporary vectorial force */
2202 tx = _mm_mul_pd(fscal,dx31);
2203 ty = _mm_mul_pd(fscal,dy31);
2204 tz = _mm_mul_pd(fscal,dz31);
2206 /* Update vectorial force */
2207 fix3 = _mm_add_pd(fix3,tx);
2208 fiy3 = _mm_add_pd(fiy3,ty);
2209 fiz3 = _mm_add_pd(fiz3,tz);
2211 fjx1 = _mm_add_pd(fjx1,tx);
2212 fjy1 = _mm_add_pd(fjy1,ty);
2213 fjz1 = _mm_add_pd(fjz1,tz);
2217 /**************************
2218 * CALCULATE INTERACTIONS *
2219 **************************/
2221 if (gmx_mm_any_lt(rsq32,rcutoff2))
2224 r32 = _mm_mul_pd(rsq32,rinv32);
2226 /* EWALD ELECTROSTATICS */
2228 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2229 ewrt = _mm_mul_pd(r32,ewtabscale);
2230 ewitab = _mm_cvttpd_epi32(ewrt);
2231 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2232 ewitab = _mm_slli_epi32(ewitab,2);
2233 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2234 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2235 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2236 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2237 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2238 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2239 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2240 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2241 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2242 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2244 d = _mm_sub_pd(r32,rswitch);
2245 d = _mm_max_pd(d,_mm_setzero_pd());
2246 d2 = _mm_mul_pd(d,d);
2247 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2249 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2251 /* Evaluate switch function */
2252 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2253 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2254 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2258 fscal = _mm_and_pd(fscal,cutoff_mask);
2260 /* Calculate temporary vectorial force */
2261 tx = _mm_mul_pd(fscal,dx32);
2262 ty = _mm_mul_pd(fscal,dy32);
2263 tz = _mm_mul_pd(fscal,dz32);
2265 /* Update vectorial force */
2266 fix3 = _mm_add_pd(fix3,tx);
2267 fiy3 = _mm_add_pd(fiy3,ty);
2268 fiz3 = _mm_add_pd(fiz3,tz);
2270 fjx2 = _mm_add_pd(fjx2,tx);
2271 fjy2 = _mm_add_pd(fjy2,ty);
2272 fjz2 = _mm_add_pd(fjz2,tz);
2276 /**************************
2277 * CALCULATE INTERACTIONS *
2278 **************************/
2280 if (gmx_mm_any_lt(rsq33,rcutoff2))
2283 r33 = _mm_mul_pd(rsq33,rinv33);
2285 /* EWALD ELECTROSTATICS */
2287 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2288 ewrt = _mm_mul_pd(r33,ewtabscale);
2289 ewitab = _mm_cvttpd_epi32(ewrt);
2290 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2291 ewitab = _mm_slli_epi32(ewitab,2);
2292 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2293 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2294 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2295 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2296 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2297 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2298 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2299 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2300 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2301 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2303 d = _mm_sub_pd(r33,rswitch);
2304 d = _mm_max_pd(d,_mm_setzero_pd());
2305 d2 = _mm_mul_pd(d,d);
2306 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2308 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2310 /* Evaluate switch function */
2311 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2312 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2313 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2317 fscal = _mm_and_pd(fscal,cutoff_mask);
2319 /* Calculate temporary vectorial force */
2320 tx = _mm_mul_pd(fscal,dx33);
2321 ty = _mm_mul_pd(fscal,dy33);
2322 tz = _mm_mul_pd(fscal,dz33);
2324 /* Update vectorial force */
2325 fix3 = _mm_add_pd(fix3,tx);
2326 fiy3 = _mm_add_pd(fiy3,ty);
2327 fiz3 = _mm_add_pd(fiz3,tz);
2329 fjx3 = _mm_add_pd(fjx3,tx);
2330 fjy3 = _mm_add_pd(fjy3,ty);
2331 fjz3 = _mm_add_pd(fjz3,tz);
2335 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2337 /* Inner loop uses 558 flops */
2340 if(jidx<j_index_end)
2344 j_coord_offsetA = DIM*jnrA;
2346 /* load j atom coordinates */
2347 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
2348 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2350 /* Calculate displacement vector */
2351 dx11 = _mm_sub_pd(ix1,jx1);
2352 dy11 = _mm_sub_pd(iy1,jy1);
2353 dz11 = _mm_sub_pd(iz1,jz1);
2354 dx12 = _mm_sub_pd(ix1,jx2);
2355 dy12 = _mm_sub_pd(iy1,jy2);
2356 dz12 = _mm_sub_pd(iz1,jz2);
2357 dx13 = _mm_sub_pd(ix1,jx3);
2358 dy13 = _mm_sub_pd(iy1,jy3);
2359 dz13 = _mm_sub_pd(iz1,jz3);
2360 dx21 = _mm_sub_pd(ix2,jx1);
2361 dy21 = _mm_sub_pd(iy2,jy1);
2362 dz21 = _mm_sub_pd(iz2,jz1);
2363 dx22 = _mm_sub_pd(ix2,jx2);
2364 dy22 = _mm_sub_pd(iy2,jy2);
2365 dz22 = _mm_sub_pd(iz2,jz2);
2366 dx23 = _mm_sub_pd(ix2,jx3);
2367 dy23 = _mm_sub_pd(iy2,jy3);
2368 dz23 = _mm_sub_pd(iz2,jz3);
2369 dx31 = _mm_sub_pd(ix3,jx1);
2370 dy31 = _mm_sub_pd(iy3,jy1);
2371 dz31 = _mm_sub_pd(iz3,jz1);
2372 dx32 = _mm_sub_pd(ix3,jx2);
2373 dy32 = _mm_sub_pd(iy3,jy2);
2374 dz32 = _mm_sub_pd(iz3,jz2);
2375 dx33 = _mm_sub_pd(ix3,jx3);
2376 dy33 = _mm_sub_pd(iy3,jy3);
2377 dz33 = _mm_sub_pd(iz3,jz3);
2379 /* Calculate squared distance and things based on it */
2380 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2381 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2382 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2383 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2384 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2385 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2386 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2387 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2388 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2390 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2391 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2392 rinv13 = gmx_mm_invsqrt_pd(rsq13);
2393 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2394 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2395 rinv23 = gmx_mm_invsqrt_pd(rsq23);
2396 rinv31 = gmx_mm_invsqrt_pd(rsq31);
2397 rinv32 = gmx_mm_invsqrt_pd(rsq32);
2398 rinv33 = gmx_mm_invsqrt_pd(rsq33);
2400 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2401 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2402 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
2403 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2404 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2405 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
2406 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
2407 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
2408 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
2410 fjx1 = _mm_setzero_pd();
2411 fjy1 = _mm_setzero_pd();
2412 fjz1 = _mm_setzero_pd();
2413 fjx2 = _mm_setzero_pd();
2414 fjy2 = _mm_setzero_pd();
2415 fjz2 = _mm_setzero_pd();
2416 fjx3 = _mm_setzero_pd();
2417 fjy3 = _mm_setzero_pd();
2418 fjz3 = _mm_setzero_pd();
2420 /**************************
2421 * CALCULATE INTERACTIONS *
2422 **************************/
2424 if (gmx_mm_any_lt(rsq11,rcutoff2))
2427 r11 = _mm_mul_pd(rsq11,rinv11);
2429 /* EWALD ELECTROSTATICS */
2431 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2432 ewrt = _mm_mul_pd(r11,ewtabscale);
2433 ewitab = _mm_cvttpd_epi32(ewrt);
2434 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2435 ewitab = _mm_slli_epi32(ewitab,2);
2436 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2437 ewtabD = _mm_setzero_pd();
2438 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2439 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2440 ewtabFn = _mm_setzero_pd();
2441 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2442 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2443 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2444 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2445 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2447 d = _mm_sub_pd(r11,rswitch);
2448 d = _mm_max_pd(d,_mm_setzero_pd());
2449 d2 = _mm_mul_pd(d,d);
2450 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2452 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2454 /* Evaluate switch function */
2455 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2456 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2457 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2461 fscal = _mm_and_pd(fscal,cutoff_mask);
2463 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2465 /* Calculate temporary vectorial force */
2466 tx = _mm_mul_pd(fscal,dx11);
2467 ty = _mm_mul_pd(fscal,dy11);
2468 tz = _mm_mul_pd(fscal,dz11);
2470 /* Update vectorial force */
2471 fix1 = _mm_add_pd(fix1,tx);
2472 fiy1 = _mm_add_pd(fiy1,ty);
2473 fiz1 = _mm_add_pd(fiz1,tz);
2475 fjx1 = _mm_add_pd(fjx1,tx);
2476 fjy1 = _mm_add_pd(fjy1,ty);
2477 fjz1 = _mm_add_pd(fjz1,tz);
2481 /**************************
2482 * CALCULATE INTERACTIONS *
2483 **************************/
2485 if (gmx_mm_any_lt(rsq12,rcutoff2))
2488 r12 = _mm_mul_pd(rsq12,rinv12);
2490 /* EWALD ELECTROSTATICS */
2492 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2493 ewrt = _mm_mul_pd(r12,ewtabscale);
2494 ewitab = _mm_cvttpd_epi32(ewrt);
2495 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2496 ewitab = _mm_slli_epi32(ewitab,2);
2497 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2498 ewtabD = _mm_setzero_pd();
2499 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2500 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2501 ewtabFn = _mm_setzero_pd();
2502 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2503 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2504 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2505 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2506 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2508 d = _mm_sub_pd(r12,rswitch);
2509 d = _mm_max_pd(d,_mm_setzero_pd());
2510 d2 = _mm_mul_pd(d,d);
2511 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2513 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2515 /* Evaluate switch function */
2516 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2517 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2518 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2522 fscal = _mm_and_pd(fscal,cutoff_mask);
2524 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2526 /* Calculate temporary vectorial force */
2527 tx = _mm_mul_pd(fscal,dx12);
2528 ty = _mm_mul_pd(fscal,dy12);
2529 tz = _mm_mul_pd(fscal,dz12);
2531 /* Update vectorial force */
2532 fix1 = _mm_add_pd(fix1,tx);
2533 fiy1 = _mm_add_pd(fiy1,ty);
2534 fiz1 = _mm_add_pd(fiz1,tz);
2536 fjx2 = _mm_add_pd(fjx2,tx);
2537 fjy2 = _mm_add_pd(fjy2,ty);
2538 fjz2 = _mm_add_pd(fjz2,tz);
2542 /**************************
2543 * CALCULATE INTERACTIONS *
2544 **************************/
2546 if (gmx_mm_any_lt(rsq13,rcutoff2))
2549 r13 = _mm_mul_pd(rsq13,rinv13);
2551 /* EWALD ELECTROSTATICS */
2553 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2554 ewrt = _mm_mul_pd(r13,ewtabscale);
2555 ewitab = _mm_cvttpd_epi32(ewrt);
2556 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2557 ewitab = _mm_slli_epi32(ewitab,2);
2558 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2559 ewtabD = _mm_setzero_pd();
2560 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2561 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2562 ewtabFn = _mm_setzero_pd();
2563 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2564 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2565 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2566 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2567 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2569 d = _mm_sub_pd(r13,rswitch);
2570 d = _mm_max_pd(d,_mm_setzero_pd());
2571 d2 = _mm_mul_pd(d,d);
2572 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2574 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2576 /* Evaluate switch function */
2577 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2578 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2579 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2583 fscal = _mm_and_pd(fscal,cutoff_mask);
2585 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2587 /* Calculate temporary vectorial force */
2588 tx = _mm_mul_pd(fscal,dx13);
2589 ty = _mm_mul_pd(fscal,dy13);
2590 tz = _mm_mul_pd(fscal,dz13);
2592 /* Update vectorial force */
2593 fix1 = _mm_add_pd(fix1,tx);
2594 fiy1 = _mm_add_pd(fiy1,ty);
2595 fiz1 = _mm_add_pd(fiz1,tz);
2597 fjx3 = _mm_add_pd(fjx3,tx);
2598 fjy3 = _mm_add_pd(fjy3,ty);
2599 fjz3 = _mm_add_pd(fjz3,tz);
2603 /**************************
2604 * CALCULATE INTERACTIONS *
2605 **************************/
2607 if (gmx_mm_any_lt(rsq21,rcutoff2))
2610 r21 = _mm_mul_pd(rsq21,rinv21);
2612 /* EWALD ELECTROSTATICS */
2614 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2615 ewrt = _mm_mul_pd(r21,ewtabscale);
2616 ewitab = _mm_cvttpd_epi32(ewrt);
2617 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2618 ewitab = _mm_slli_epi32(ewitab,2);
2619 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2620 ewtabD = _mm_setzero_pd();
2621 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2622 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2623 ewtabFn = _mm_setzero_pd();
2624 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2625 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2626 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2627 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2628 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2630 d = _mm_sub_pd(r21,rswitch);
2631 d = _mm_max_pd(d,_mm_setzero_pd());
2632 d2 = _mm_mul_pd(d,d);
2633 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2635 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2637 /* Evaluate switch function */
2638 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2639 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2640 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2644 fscal = _mm_and_pd(fscal,cutoff_mask);
2646 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2648 /* Calculate temporary vectorial force */
2649 tx = _mm_mul_pd(fscal,dx21);
2650 ty = _mm_mul_pd(fscal,dy21);
2651 tz = _mm_mul_pd(fscal,dz21);
2653 /* Update vectorial force */
2654 fix2 = _mm_add_pd(fix2,tx);
2655 fiy2 = _mm_add_pd(fiy2,ty);
2656 fiz2 = _mm_add_pd(fiz2,tz);
2658 fjx1 = _mm_add_pd(fjx1,tx);
2659 fjy1 = _mm_add_pd(fjy1,ty);
2660 fjz1 = _mm_add_pd(fjz1,tz);
2664 /**************************
2665 * CALCULATE INTERACTIONS *
2666 **************************/
2668 if (gmx_mm_any_lt(rsq22,rcutoff2))
2671 r22 = _mm_mul_pd(rsq22,rinv22);
2673 /* EWALD ELECTROSTATICS */
2675 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2676 ewrt = _mm_mul_pd(r22,ewtabscale);
2677 ewitab = _mm_cvttpd_epi32(ewrt);
2678 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2679 ewitab = _mm_slli_epi32(ewitab,2);
2680 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2681 ewtabD = _mm_setzero_pd();
2682 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2683 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2684 ewtabFn = _mm_setzero_pd();
2685 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2686 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2687 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2688 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2689 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2691 d = _mm_sub_pd(r22,rswitch);
2692 d = _mm_max_pd(d,_mm_setzero_pd());
2693 d2 = _mm_mul_pd(d,d);
2694 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2696 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2698 /* Evaluate switch function */
2699 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2700 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2701 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2705 fscal = _mm_and_pd(fscal,cutoff_mask);
2707 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2709 /* Calculate temporary vectorial force */
2710 tx = _mm_mul_pd(fscal,dx22);
2711 ty = _mm_mul_pd(fscal,dy22);
2712 tz = _mm_mul_pd(fscal,dz22);
2714 /* Update vectorial force */
2715 fix2 = _mm_add_pd(fix2,tx);
2716 fiy2 = _mm_add_pd(fiy2,ty);
2717 fiz2 = _mm_add_pd(fiz2,tz);
2719 fjx2 = _mm_add_pd(fjx2,tx);
2720 fjy2 = _mm_add_pd(fjy2,ty);
2721 fjz2 = _mm_add_pd(fjz2,tz);
2725 /**************************
2726 * CALCULATE INTERACTIONS *
2727 **************************/
2729 if (gmx_mm_any_lt(rsq23,rcutoff2))
2732 r23 = _mm_mul_pd(rsq23,rinv23);
2734 /* EWALD ELECTROSTATICS */
2736 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2737 ewrt = _mm_mul_pd(r23,ewtabscale);
2738 ewitab = _mm_cvttpd_epi32(ewrt);
2739 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2740 ewitab = _mm_slli_epi32(ewitab,2);
2741 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2742 ewtabD = _mm_setzero_pd();
2743 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2744 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2745 ewtabFn = _mm_setzero_pd();
2746 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2747 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2748 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2749 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2750 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2752 d = _mm_sub_pd(r23,rswitch);
2753 d = _mm_max_pd(d,_mm_setzero_pd());
2754 d2 = _mm_mul_pd(d,d);
2755 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2757 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2759 /* Evaluate switch function */
2760 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2761 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2762 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2766 fscal = _mm_and_pd(fscal,cutoff_mask);
2768 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2770 /* Calculate temporary vectorial force */
2771 tx = _mm_mul_pd(fscal,dx23);
2772 ty = _mm_mul_pd(fscal,dy23);
2773 tz = _mm_mul_pd(fscal,dz23);
2775 /* Update vectorial force */
2776 fix2 = _mm_add_pd(fix2,tx);
2777 fiy2 = _mm_add_pd(fiy2,ty);
2778 fiz2 = _mm_add_pd(fiz2,tz);
2780 fjx3 = _mm_add_pd(fjx3,tx);
2781 fjy3 = _mm_add_pd(fjy3,ty);
2782 fjz3 = _mm_add_pd(fjz3,tz);
2786 /**************************
2787 * CALCULATE INTERACTIONS *
2788 **************************/
2790 if (gmx_mm_any_lt(rsq31,rcutoff2))
2793 r31 = _mm_mul_pd(rsq31,rinv31);
2795 /* EWALD ELECTROSTATICS */
2797 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2798 ewrt = _mm_mul_pd(r31,ewtabscale);
2799 ewitab = _mm_cvttpd_epi32(ewrt);
2800 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2801 ewitab = _mm_slli_epi32(ewitab,2);
2802 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2803 ewtabD = _mm_setzero_pd();
2804 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2805 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2806 ewtabFn = _mm_setzero_pd();
2807 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2808 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2809 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2810 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2811 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2813 d = _mm_sub_pd(r31,rswitch);
2814 d = _mm_max_pd(d,_mm_setzero_pd());
2815 d2 = _mm_mul_pd(d,d);
2816 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2818 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2820 /* Evaluate switch function */
2821 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2822 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2823 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2827 fscal = _mm_and_pd(fscal,cutoff_mask);
2829 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2831 /* Calculate temporary vectorial force */
2832 tx = _mm_mul_pd(fscal,dx31);
2833 ty = _mm_mul_pd(fscal,dy31);
2834 tz = _mm_mul_pd(fscal,dz31);
2836 /* Update vectorial force */
2837 fix3 = _mm_add_pd(fix3,tx);
2838 fiy3 = _mm_add_pd(fiy3,ty);
2839 fiz3 = _mm_add_pd(fiz3,tz);
2841 fjx1 = _mm_add_pd(fjx1,tx);
2842 fjy1 = _mm_add_pd(fjy1,ty);
2843 fjz1 = _mm_add_pd(fjz1,tz);
2847 /**************************
2848 * CALCULATE INTERACTIONS *
2849 **************************/
2851 if (gmx_mm_any_lt(rsq32,rcutoff2))
2854 r32 = _mm_mul_pd(rsq32,rinv32);
2856 /* EWALD ELECTROSTATICS */
2858 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2859 ewrt = _mm_mul_pd(r32,ewtabscale);
2860 ewitab = _mm_cvttpd_epi32(ewrt);
2861 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2862 ewitab = _mm_slli_epi32(ewitab,2);
2863 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2864 ewtabD = _mm_setzero_pd();
2865 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2866 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2867 ewtabFn = _mm_setzero_pd();
2868 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2869 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2870 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2871 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2872 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2874 d = _mm_sub_pd(r32,rswitch);
2875 d = _mm_max_pd(d,_mm_setzero_pd());
2876 d2 = _mm_mul_pd(d,d);
2877 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2879 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2881 /* Evaluate switch function */
2882 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2883 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2884 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2888 fscal = _mm_and_pd(fscal,cutoff_mask);
2890 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2892 /* Calculate temporary vectorial force */
2893 tx = _mm_mul_pd(fscal,dx32);
2894 ty = _mm_mul_pd(fscal,dy32);
2895 tz = _mm_mul_pd(fscal,dz32);
2897 /* Update vectorial force */
2898 fix3 = _mm_add_pd(fix3,tx);
2899 fiy3 = _mm_add_pd(fiy3,ty);
2900 fiz3 = _mm_add_pd(fiz3,tz);
2902 fjx2 = _mm_add_pd(fjx2,tx);
2903 fjy2 = _mm_add_pd(fjy2,ty);
2904 fjz2 = _mm_add_pd(fjz2,tz);
2908 /**************************
2909 * CALCULATE INTERACTIONS *
2910 **************************/
2912 if (gmx_mm_any_lt(rsq33,rcutoff2))
2915 r33 = _mm_mul_pd(rsq33,rinv33);
2917 /* EWALD ELECTROSTATICS */
2919 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2920 ewrt = _mm_mul_pd(r33,ewtabscale);
2921 ewitab = _mm_cvttpd_epi32(ewrt);
2922 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2923 ewitab = _mm_slli_epi32(ewitab,2);
2924 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2925 ewtabD = _mm_setzero_pd();
2926 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2927 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2928 ewtabFn = _mm_setzero_pd();
2929 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2930 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2931 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2932 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2933 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2935 d = _mm_sub_pd(r33,rswitch);
2936 d = _mm_max_pd(d,_mm_setzero_pd());
2937 d2 = _mm_mul_pd(d,d);
2938 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2940 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2942 /* Evaluate switch function */
2943 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2944 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2945 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2949 fscal = _mm_and_pd(fscal,cutoff_mask);
2951 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2953 /* Calculate temporary vectorial force */
2954 tx = _mm_mul_pd(fscal,dx33);
2955 ty = _mm_mul_pd(fscal,dy33);
2956 tz = _mm_mul_pd(fscal,dz33);
2958 /* Update vectorial force */
2959 fix3 = _mm_add_pd(fix3,tx);
2960 fiy3 = _mm_add_pd(fiy3,ty);
2961 fiz3 = _mm_add_pd(fiz3,tz);
2963 fjx3 = _mm_add_pd(fjx3,tx);
2964 fjy3 = _mm_add_pd(fjy3,ty);
2965 fjz3 = _mm_add_pd(fjz3,tz);
2969 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2971 /* Inner loop uses 558 flops */
2974 /* End of innermost loop */
2976 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2977 f+i_coord_offset+DIM,fshift+i_shift_offset);
2979 /* Increment number of inner iterations */
2980 inneriter += j_index_end - j_index_start;
2982 /* Outer loop uses 18 flops */
2985 /* Increment number of outer iterations */
2988 /* Update outer/inner flops */
2990 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*558);