2 * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
92 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
93 real rswitch_scalar,d_scalar;
94 __m128d dummy_mask,cutoff_mask;
95 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
96 __m128d one = _mm_set1_pd(1.0);
97 __m128d two = _mm_set1_pd(2.0);
103 jindex = nlist->jindex;
105 shiftidx = nlist->shift;
107 shiftvec = fr->shift_vec[0];
108 fshift = fr->fshift[0];
109 facel = _mm_set1_pd(fr->epsfac);
110 charge = mdatoms->chargeA;
112 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
113 ewtab = fr->ic->tabq_coul_FDV0;
114 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
115 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
117 /* Setup water-specific parameters */
118 inr = nlist->iinr[0];
119 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
120 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
121 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
123 jq0 = _mm_set1_pd(charge[inr+0]);
124 jq1 = _mm_set1_pd(charge[inr+1]);
125 jq2 = _mm_set1_pd(charge[inr+2]);
126 qq00 = _mm_mul_pd(iq0,jq0);
127 qq01 = _mm_mul_pd(iq0,jq1);
128 qq02 = _mm_mul_pd(iq0,jq2);
129 qq10 = _mm_mul_pd(iq1,jq0);
130 qq11 = _mm_mul_pd(iq1,jq1);
131 qq12 = _mm_mul_pd(iq1,jq2);
132 qq20 = _mm_mul_pd(iq2,jq0);
133 qq21 = _mm_mul_pd(iq2,jq1);
134 qq22 = _mm_mul_pd(iq2,jq2);
136 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
137 rcutoff_scalar = fr->rcoulomb;
138 rcutoff = _mm_set1_pd(rcutoff_scalar);
139 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
141 rswitch_scalar = fr->rcoulomb_switch;
142 rswitch = _mm_set1_pd(rswitch_scalar);
143 /* Setup switch parameters */
144 d_scalar = rcutoff_scalar-rswitch_scalar;
145 d = _mm_set1_pd(d_scalar);
146 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
147 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
148 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
149 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
150 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
151 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
153 /* Avoid stupid compiler warnings */
161 /* Start outer loop over neighborlists */
162 for(iidx=0; iidx<nri; iidx++)
164 /* Load shift vector for this list */
165 i_shift_offset = DIM*shiftidx[iidx];
167 /* Load limits for loop over neighbors */
168 j_index_start = jindex[iidx];
169 j_index_end = jindex[iidx+1];
171 /* Get outer coordinate index */
173 i_coord_offset = DIM*inr;
175 /* Load i particle coords and add shift vector */
176 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
177 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
179 fix0 = _mm_setzero_pd();
180 fiy0 = _mm_setzero_pd();
181 fiz0 = _mm_setzero_pd();
182 fix1 = _mm_setzero_pd();
183 fiy1 = _mm_setzero_pd();
184 fiz1 = _mm_setzero_pd();
185 fix2 = _mm_setzero_pd();
186 fiy2 = _mm_setzero_pd();
187 fiz2 = _mm_setzero_pd();
189 /* Reset potential sums */
190 velecsum = _mm_setzero_pd();
192 /* Start inner kernel loop */
193 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
196 /* Get j neighbor index, and coordinate index */
199 j_coord_offsetA = DIM*jnrA;
200 j_coord_offsetB = DIM*jnrB;
202 /* load j atom coordinates */
203 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
204 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
206 /* Calculate displacement vector */
207 dx00 = _mm_sub_pd(ix0,jx0);
208 dy00 = _mm_sub_pd(iy0,jy0);
209 dz00 = _mm_sub_pd(iz0,jz0);
210 dx01 = _mm_sub_pd(ix0,jx1);
211 dy01 = _mm_sub_pd(iy0,jy1);
212 dz01 = _mm_sub_pd(iz0,jz1);
213 dx02 = _mm_sub_pd(ix0,jx2);
214 dy02 = _mm_sub_pd(iy0,jy2);
215 dz02 = _mm_sub_pd(iz0,jz2);
216 dx10 = _mm_sub_pd(ix1,jx0);
217 dy10 = _mm_sub_pd(iy1,jy0);
218 dz10 = _mm_sub_pd(iz1,jz0);
219 dx11 = _mm_sub_pd(ix1,jx1);
220 dy11 = _mm_sub_pd(iy1,jy1);
221 dz11 = _mm_sub_pd(iz1,jz1);
222 dx12 = _mm_sub_pd(ix1,jx2);
223 dy12 = _mm_sub_pd(iy1,jy2);
224 dz12 = _mm_sub_pd(iz1,jz2);
225 dx20 = _mm_sub_pd(ix2,jx0);
226 dy20 = _mm_sub_pd(iy2,jy0);
227 dz20 = _mm_sub_pd(iz2,jz0);
228 dx21 = _mm_sub_pd(ix2,jx1);
229 dy21 = _mm_sub_pd(iy2,jy1);
230 dz21 = _mm_sub_pd(iz2,jz1);
231 dx22 = _mm_sub_pd(ix2,jx2);
232 dy22 = _mm_sub_pd(iy2,jy2);
233 dz22 = _mm_sub_pd(iz2,jz2);
235 /* Calculate squared distance and things based on it */
236 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
237 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
238 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
239 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
240 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
241 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
242 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
243 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
244 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
246 rinv00 = gmx_mm_invsqrt_pd(rsq00);
247 rinv01 = gmx_mm_invsqrt_pd(rsq01);
248 rinv02 = gmx_mm_invsqrt_pd(rsq02);
249 rinv10 = gmx_mm_invsqrt_pd(rsq10);
250 rinv11 = gmx_mm_invsqrt_pd(rsq11);
251 rinv12 = gmx_mm_invsqrt_pd(rsq12);
252 rinv20 = gmx_mm_invsqrt_pd(rsq20);
253 rinv21 = gmx_mm_invsqrt_pd(rsq21);
254 rinv22 = gmx_mm_invsqrt_pd(rsq22);
256 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
257 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
258 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
259 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
260 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
261 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
262 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
263 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
264 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
266 fjx0 = _mm_setzero_pd();
267 fjy0 = _mm_setzero_pd();
268 fjz0 = _mm_setzero_pd();
269 fjx1 = _mm_setzero_pd();
270 fjy1 = _mm_setzero_pd();
271 fjz1 = _mm_setzero_pd();
272 fjx2 = _mm_setzero_pd();
273 fjy2 = _mm_setzero_pd();
274 fjz2 = _mm_setzero_pd();
276 /**************************
277 * CALCULATE INTERACTIONS *
278 **************************/
280 if (gmx_mm_any_lt(rsq00,rcutoff2))
283 r00 = _mm_mul_pd(rsq00,rinv00);
285 /* EWALD ELECTROSTATICS */
287 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
288 ewrt = _mm_mul_pd(r00,ewtabscale);
289 ewitab = _mm_cvttpd_epi32(ewrt);
291 eweps = _mm_frcz_pd(ewrt);
293 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
295 twoeweps = _mm_add_pd(eweps,eweps);
296 ewitab = _mm_slli_epi32(ewitab,2);
297 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
298 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
299 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
300 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
301 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
302 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
303 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
304 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
305 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
306 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
308 d = _mm_sub_pd(r00,rswitch);
309 d = _mm_max_pd(d,_mm_setzero_pd());
310 d2 = _mm_mul_pd(d,d);
311 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
313 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
315 /* Evaluate switch function */
316 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
317 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
318 velec = _mm_mul_pd(velec,sw);
319 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
321 /* Update potential sum for this i atom from the interaction with this j atom. */
322 velec = _mm_and_pd(velec,cutoff_mask);
323 velecsum = _mm_add_pd(velecsum,velec);
327 fscal = _mm_and_pd(fscal,cutoff_mask);
329 /* Update vectorial force */
330 fix0 = _mm_macc_pd(dx00,fscal,fix0);
331 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
332 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
334 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
335 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
336 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
340 /**************************
341 * CALCULATE INTERACTIONS *
342 **************************/
344 if (gmx_mm_any_lt(rsq01,rcutoff2))
347 r01 = _mm_mul_pd(rsq01,rinv01);
349 /* EWALD ELECTROSTATICS */
351 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
352 ewrt = _mm_mul_pd(r01,ewtabscale);
353 ewitab = _mm_cvttpd_epi32(ewrt);
355 eweps = _mm_frcz_pd(ewrt);
357 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
359 twoeweps = _mm_add_pd(eweps,eweps);
360 ewitab = _mm_slli_epi32(ewitab,2);
361 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
362 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
363 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
364 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
365 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
366 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
367 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
368 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
369 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
370 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
372 d = _mm_sub_pd(r01,rswitch);
373 d = _mm_max_pd(d,_mm_setzero_pd());
374 d2 = _mm_mul_pd(d,d);
375 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
377 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
379 /* Evaluate switch function */
380 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
381 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
382 velec = _mm_mul_pd(velec,sw);
383 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
385 /* Update potential sum for this i atom from the interaction with this j atom. */
386 velec = _mm_and_pd(velec,cutoff_mask);
387 velecsum = _mm_add_pd(velecsum,velec);
391 fscal = _mm_and_pd(fscal,cutoff_mask);
393 /* Update vectorial force */
394 fix0 = _mm_macc_pd(dx01,fscal,fix0);
395 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
396 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
398 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
399 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
400 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
404 /**************************
405 * CALCULATE INTERACTIONS *
406 **************************/
408 if (gmx_mm_any_lt(rsq02,rcutoff2))
411 r02 = _mm_mul_pd(rsq02,rinv02);
413 /* EWALD ELECTROSTATICS */
415 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
416 ewrt = _mm_mul_pd(r02,ewtabscale);
417 ewitab = _mm_cvttpd_epi32(ewrt);
419 eweps = _mm_frcz_pd(ewrt);
421 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
423 twoeweps = _mm_add_pd(eweps,eweps);
424 ewitab = _mm_slli_epi32(ewitab,2);
425 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
426 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
427 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
428 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
429 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
430 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
431 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
432 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
433 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
434 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
436 d = _mm_sub_pd(r02,rswitch);
437 d = _mm_max_pd(d,_mm_setzero_pd());
438 d2 = _mm_mul_pd(d,d);
439 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
441 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
443 /* Evaluate switch function */
444 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
445 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
446 velec = _mm_mul_pd(velec,sw);
447 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
449 /* Update potential sum for this i atom from the interaction with this j atom. */
450 velec = _mm_and_pd(velec,cutoff_mask);
451 velecsum = _mm_add_pd(velecsum,velec);
455 fscal = _mm_and_pd(fscal,cutoff_mask);
457 /* Update vectorial force */
458 fix0 = _mm_macc_pd(dx02,fscal,fix0);
459 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
460 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
462 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
463 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
464 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
468 /**************************
469 * CALCULATE INTERACTIONS *
470 **************************/
472 if (gmx_mm_any_lt(rsq10,rcutoff2))
475 r10 = _mm_mul_pd(rsq10,rinv10);
477 /* EWALD ELECTROSTATICS */
479 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
480 ewrt = _mm_mul_pd(r10,ewtabscale);
481 ewitab = _mm_cvttpd_epi32(ewrt);
483 eweps = _mm_frcz_pd(ewrt);
485 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
487 twoeweps = _mm_add_pd(eweps,eweps);
488 ewitab = _mm_slli_epi32(ewitab,2);
489 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
490 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
491 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
492 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
493 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
494 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
495 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
496 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
497 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
498 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
500 d = _mm_sub_pd(r10,rswitch);
501 d = _mm_max_pd(d,_mm_setzero_pd());
502 d2 = _mm_mul_pd(d,d);
503 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
505 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
507 /* Evaluate switch function */
508 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
509 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
510 velec = _mm_mul_pd(velec,sw);
511 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
513 /* Update potential sum for this i atom from the interaction with this j atom. */
514 velec = _mm_and_pd(velec,cutoff_mask);
515 velecsum = _mm_add_pd(velecsum,velec);
519 fscal = _mm_and_pd(fscal,cutoff_mask);
521 /* Update vectorial force */
522 fix1 = _mm_macc_pd(dx10,fscal,fix1);
523 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
524 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
526 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
527 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
528 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
532 /**************************
533 * CALCULATE INTERACTIONS *
534 **************************/
536 if (gmx_mm_any_lt(rsq11,rcutoff2))
539 r11 = _mm_mul_pd(rsq11,rinv11);
541 /* EWALD ELECTROSTATICS */
543 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
544 ewrt = _mm_mul_pd(r11,ewtabscale);
545 ewitab = _mm_cvttpd_epi32(ewrt);
547 eweps = _mm_frcz_pd(ewrt);
549 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
551 twoeweps = _mm_add_pd(eweps,eweps);
552 ewitab = _mm_slli_epi32(ewitab,2);
553 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
554 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
555 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
556 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
557 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
558 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
559 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
560 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
561 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
562 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
564 d = _mm_sub_pd(r11,rswitch);
565 d = _mm_max_pd(d,_mm_setzero_pd());
566 d2 = _mm_mul_pd(d,d);
567 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
569 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
571 /* Evaluate switch function */
572 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
573 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
574 velec = _mm_mul_pd(velec,sw);
575 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
577 /* Update potential sum for this i atom from the interaction with this j atom. */
578 velec = _mm_and_pd(velec,cutoff_mask);
579 velecsum = _mm_add_pd(velecsum,velec);
583 fscal = _mm_and_pd(fscal,cutoff_mask);
585 /* Update vectorial force */
586 fix1 = _mm_macc_pd(dx11,fscal,fix1);
587 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
588 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
590 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
591 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
592 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
596 /**************************
597 * CALCULATE INTERACTIONS *
598 **************************/
600 if (gmx_mm_any_lt(rsq12,rcutoff2))
603 r12 = _mm_mul_pd(rsq12,rinv12);
605 /* EWALD ELECTROSTATICS */
607 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
608 ewrt = _mm_mul_pd(r12,ewtabscale);
609 ewitab = _mm_cvttpd_epi32(ewrt);
611 eweps = _mm_frcz_pd(ewrt);
613 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
615 twoeweps = _mm_add_pd(eweps,eweps);
616 ewitab = _mm_slli_epi32(ewitab,2);
617 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
618 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
619 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
620 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
621 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
622 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
623 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
624 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
625 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
626 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
628 d = _mm_sub_pd(r12,rswitch);
629 d = _mm_max_pd(d,_mm_setzero_pd());
630 d2 = _mm_mul_pd(d,d);
631 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
633 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
635 /* Evaluate switch function */
636 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
637 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
638 velec = _mm_mul_pd(velec,sw);
639 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
641 /* Update potential sum for this i atom from the interaction with this j atom. */
642 velec = _mm_and_pd(velec,cutoff_mask);
643 velecsum = _mm_add_pd(velecsum,velec);
647 fscal = _mm_and_pd(fscal,cutoff_mask);
649 /* Update vectorial force */
650 fix1 = _mm_macc_pd(dx12,fscal,fix1);
651 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
652 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
654 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
655 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
656 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
660 /**************************
661 * CALCULATE INTERACTIONS *
662 **************************/
664 if (gmx_mm_any_lt(rsq20,rcutoff2))
667 r20 = _mm_mul_pd(rsq20,rinv20);
669 /* EWALD ELECTROSTATICS */
671 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
672 ewrt = _mm_mul_pd(r20,ewtabscale);
673 ewitab = _mm_cvttpd_epi32(ewrt);
675 eweps = _mm_frcz_pd(ewrt);
677 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
679 twoeweps = _mm_add_pd(eweps,eweps);
680 ewitab = _mm_slli_epi32(ewitab,2);
681 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
682 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
683 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
684 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
685 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
686 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
687 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
688 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
689 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
690 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
692 d = _mm_sub_pd(r20,rswitch);
693 d = _mm_max_pd(d,_mm_setzero_pd());
694 d2 = _mm_mul_pd(d,d);
695 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
697 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
699 /* Evaluate switch function */
700 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
701 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
702 velec = _mm_mul_pd(velec,sw);
703 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
705 /* Update potential sum for this i atom from the interaction with this j atom. */
706 velec = _mm_and_pd(velec,cutoff_mask);
707 velecsum = _mm_add_pd(velecsum,velec);
711 fscal = _mm_and_pd(fscal,cutoff_mask);
713 /* Update vectorial force */
714 fix2 = _mm_macc_pd(dx20,fscal,fix2);
715 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
716 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
718 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
719 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
720 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
724 /**************************
725 * CALCULATE INTERACTIONS *
726 **************************/
728 if (gmx_mm_any_lt(rsq21,rcutoff2))
731 r21 = _mm_mul_pd(rsq21,rinv21);
733 /* EWALD ELECTROSTATICS */
735 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
736 ewrt = _mm_mul_pd(r21,ewtabscale);
737 ewitab = _mm_cvttpd_epi32(ewrt);
739 eweps = _mm_frcz_pd(ewrt);
741 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
743 twoeweps = _mm_add_pd(eweps,eweps);
744 ewitab = _mm_slli_epi32(ewitab,2);
745 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
746 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
747 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
748 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
749 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
750 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
751 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
752 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
753 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
754 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
756 d = _mm_sub_pd(r21,rswitch);
757 d = _mm_max_pd(d,_mm_setzero_pd());
758 d2 = _mm_mul_pd(d,d);
759 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
761 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
763 /* Evaluate switch function */
764 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
765 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
766 velec = _mm_mul_pd(velec,sw);
767 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
769 /* Update potential sum for this i atom from the interaction with this j atom. */
770 velec = _mm_and_pd(velec,cutoff_mask);
771 velecsum = _mm_add_pd(velecsum,velec);
775 fscal = _mm_and_pd(fscal,cutoff_mask);
777 /* Update vectorial force */
778 fix2 = _mm_macc_pd(dx21,fscal,fix2);
779 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
780 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
782 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
783 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
784 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
788 /**************************
789 * CALCULATE INTERACTIONS *
790 **************************/
792 if (gmx_mm_any_lt(rsq22,rcutoff2))
795 r22 = _mm_mul_pd(rsq22,rinv22);
797 /* EWALD ELECTROSTATICS */
799 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
800 ewrt = _mm_mul_pd(r22,ewtabscale);
801 ewitab = _mm_cvttpd_epi32(ewrt);
803 eweps = _mm_frcz_pd(ewrt);
805 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
807 twoeweps = _mm_add_pd(eweps,eweps);
808 ewitab = _mm_slli_epi32(ewitab,2);
809 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
810 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
811 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
812 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
813 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
814 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
815 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
816 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
817 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
818 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
820 d = _mm_sub_pd(r22,rswitch);
821 d = _mm_max_pd(d,_mm_setzero_pd());
822 d2 = _mm_mul_pd(d,d);
823 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
825 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
827 /* Evaluate switch function */
828 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
829 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
830 velec = _mm_mul_pd(velec,sw);
831 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
833 /* Update potential sum for this i atom from the interaction with this j atom. */
834 velec = _mm_and_pd(velec,cutoff_mask);
835 velecsum = _mm_add_pd(velecsum,velec);
839 fscal = _mm_and_pd(fscal,cutoff_mask);
841 /* Update vectorial force */
842 fix2 = _mm_macc_pd(dx22,fscal,fix2);
843 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
844 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
846 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
847 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
848 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
852 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
854 /* Inner loop uses 612 flops */
861 j_coord_offsetA = DIM*jnrA;
863 /* load j atom coordinates */
864 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
865 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
867 /* Calculate displacement vector */
868 dx00 = _mm_sub_pd(ix0,jx0);
869 dy00 = _mm_sub_pd(iy0,jy0);
870 dz00 = _mm_sub_pd(iz0,jz0);
871 dx01 = _mm_sub_pd(ix0,jx1);
872 dy01 = _mm_sub_pd(iy0,jy1);
873 dz01 = _mm_sub_pd(iz0,jz1);
874 dx02 = _mm_sub_pd(ix0,jx2);
875 dy02 = _mm_sub_pd(iy0,jy2);
876 dz02 = _mm_sub_pd(iz0,jz2);
877 dx10 = _mm_sub_pd(ix1,jx0);
878 dy10 = _mm_sub_pd(iy1,jy0);
879 dz10 = _mm_sub_pd(iz1,jz0);
880 dx11 = _mm_sub_pd(ix1,jx1);
881 dy11 = _mm_sub_pd(iy1,jy1);
882 dz11 = _mm_sub_pd(iz1,jz1);
883 dx12 = _mm_sub_pd(ix1,jx2);
884 dy12 = _mm_sub_pd(iy1,jy2);
885 dz12 = _mm_sub_pd(iz1,jz2);
886 dx20 = _mm_sub_pd(ix2,jx0);
887 dy20 = _mm_sub_pd(iy2,jy0);
888 dz20 = _mm_sub_pd(iz2,jz0);
889 dx21 = _mm_sub_pd(ix2,jx1);
890 dy21 = _mm_sub_pd(iy2,jy1);
891 dz21 = _mm_sub_pd(iz2,jz1);
892 dx22 = _mm_sub_pd(ix2,jx2);
893 dy22 = _mm_sub_pd(iy2,jy2);
894 dz22 = _mm_sub_pd(iz2,jz2);
896 /* Calculate squared distance and things based on it */
897 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
898 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
899 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
900 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
901 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
902 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
903 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
904 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
905 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
907 rinv00 = gmx_mm_invsqrt_pd(rsq00);
908 rinv01 = gmx_mm_invsqrt_pd(rsq01);
909 rinv02 = gmx_mm_invsqrt_pd(rsq02);
910 rinv10 = gmx_mm_invsqrt_pd(rsq10);
911 rinv11 = gmx_mm_invsqrt_pd(rsq11);
912 rinv12 = gmx_mm_invsqrt_pd(rsq12);
913 rinv20 = gmx_mm_invsqrt_pd(rsq20);
914 rinv21 = gmx_mm_invsqrt_pd(rsq21);
915 rinv22 = gmx_mm_invsqrt_pd(rsq22);
917 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
918 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
919 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
920 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
921 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
922 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
923 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
924 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
925 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
927 fjx0 = _mm_setzero_pd();
928 fjy0 = _mm_setzero_pd();
929 fjz0 = _mm_setzero_pd();
930 fjx1 = _mm_setzero_pd();
931 fjy1 = _mm_setzero_pd();
932 fjz1 = _mm_setzero_pd();
933 fjx2 = _mm_setzero_pd();
934 fjy2 = _mm_setzero_pd();
935 fjz2 = _mm_setzero_pd();
937 /**************************
938 * CALCULATE INTERACTIONS *
939 **************************/
941 if (gmx_mm_any_lt(rsq00,rcutoff2))
944 r00 = _mm_mul_pd(rsq00,rinv00);
946 /* EWALD ELECTROSTATICS */
948 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
949 ewrt = _mm_mul_pd(r00,ewtabscale);
950 ewitab = _mm_cvttpd_epi32(ewrt);
952 eweps = _mm_frcz_pd(ewrt);
954 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
956 twoeweps = _mm_add_pd(eweps,eweps);
957 ewitab = _mm_slli_epi32(ewitab,2);
958 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
959 ewtabD = _mm_setzero_pd();
960 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
961 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
962 ewtabFn = _mm_setzero_pd();
963 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
964 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
965 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
966 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
967 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
969 d = _mm_sub_pd(r00,rswitch);
970 d = _mm_max_pd(d,_mm_setzero_pd());
971 d2 = _mm_mul_pd(d,d);
972 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
974 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
976 /* Evaluate switch function */
977 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
978 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
979 velec = _mm_mul_pd(velec,sw);
980 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
982 /* Update potential sum for this i atom from the interaction with this j atom. */
983 velec = _mm_and_pd(velec,cutoff_mask);
984 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
985 velecsum = _mm_add_pd(velecsum,velec);
989 fscal = _mm_and_pd(fscal,cutoff_mask);
991 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
993 /* Update vectorial force */
994 fix0 = _mm_macc_pd(dx00,fscal,fix0);
995 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
996 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
998 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
999 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1000 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1004 /**************************
1005 * CALCULATE INTERACTIONS *
1006 **************************/
1008 if (gmx_mm_any_lt(rsq01,rcutoff2))
1011 r01 = _mm_mul_pd(rsq01,rinv01);
1013 /* EWALD ELECTROSTATICS */
1015 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1016 ewrt = _mm_mul_pd(r01,ewtabscale);
1017 ewitab = _mm_cvttpd_epi32(ewrt);
1019 eweps = _mm_frcz_pd(ewrt);
1021 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1023 twoeweps = _mm_add_pd(eweps,eweps);
1024 ewitab = _mm_slli_epi32(ewitab,2);
1025 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1026 ewtabD = _mm_setzero_pd();
1027 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1028 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1029 ewtabFn = _mm_setzero_pd();
1030 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1031 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1032 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1033 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1034 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1036 d = _mm_sub_pd(r01,rswitch);
1037 d = _mm_max_pd(d,_mm_setzero_pd());
1038 d2 = _mm_mul_pd(d,d);
1039 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1041 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1043 /* Evaluate switch function */
1044 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1045 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1046 velec = _mm_mul_pd(velec,sw);
1047 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1049 /* Update potential sum for this i atom from the interaction with this j atom. */
1050 velec = _mm_and_pd(velec,cutoff_mask);
1051 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1052 velecsum = _mm_add_pd(velecsum,velec);
1056 fscal = _mm_and_pd(fscal,cutoff_mask);
1058 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1060 /* Update vectorial force */
1061 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1062 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1063 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1065 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1066 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1067 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1071 /**************************
1072 * CALCULATE INTERACTIONS *
1073 **************************/
1075 if (gmx_mm_any_lt(rsq02,rcutoff2))
1078 r02 = _mm_mul_pd(rsq02,rinv02);
1080 /* EWALD ELECTROSTATICS */
1082 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1083 ewrt = _mm_mul_pd(r02,ewtabscale);
1084 ewitab = _mm_cvttpd_epi32(ewrt);
1086 eweps = _mm_frcz_pd(ewrt);
1088 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1090 twoeweps = _mm_add_pd(eweps,eweps);
1091 ewitab = _mm_slli_epi32(ewitab,2);
1092 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1093 ewtabD = _mm_setzero_pd();
1094 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1095 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1096 ewtabFn = _mm_setzero_pd();
1097 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1098 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1099 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1100 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
1101 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1103 d = _mm_sub_pd(r02,rswitch);
1104 d = _mm_max_pd(d,_mm_setzero_pd());
1105 d2 = _mm_mul_pd(d,d);
1106 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1108 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1110 /* Evaluate switch function */
1111 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1112 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
1113 velec = _mm_mul_pd(velec,sw);
1114 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1116 /* Update potential sum for this i atom from the interaction with this j atom. */
1117 velec = _mm_and_pd(velec,cutoff_mask);
1118 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1119 velecsum = _mm_add_pd(velecsum,velec);
1123 fscal = _mm_and_pd(fscal,cutoff_mask);
1125 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1127 /* Update vectorial force */
1128 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1129 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1130 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1132 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1133 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1134 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1138 /**************************
1139 * CALCULATE INTERACTIONS *
1140 **************************/
1142 if (gmx_mm_any_lt(rsq10,rcutoff2))
1145 r10 = _mm_mul_pd(rsq10,rinv10);
1147 /* EWALD ELECTROSTATICS */
1149 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1150 ewrt = _mm_mul_pd(r10,ewtabscale);
1151 ewitab = _mm_cvttpd_epi32(ewrt);
1153 eweps = _mm_frcz_pd(ewrt);
1155 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1157 twoeweps = _mm_add_pd(eweps,eweps);
1158 ewitab = _mm_slli_epi32(ewitab,2);
1159 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1160 ewtabD = _mm_setzero_pd();
1161 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1162 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1163 ewtabFn = _mm_setzero_pd();
1164 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1165 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1166 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1167 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1168 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1170 d = _mm_sub_pd(r10,rswitch);
1171 d = _mm_max_pd(d,_mm_setzero_pd());
1172 d2 = _mm_mul_pd(d,d);
1173 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1175 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1177 /* Evaluate switch function */
1178 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1179 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1180 velec = _mm_mul_pd(velec,sw);
1181 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1183 /* Update potential sum for this i atom from the interaction with this j atom. */
1184 velec = _mm_and_pd(velec,cutoff_mask);
1185 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1186 velecsum = _mm_add_pd(velecsum,velec);
1190 fscal = _mm_and_pd(fscal,cutoff_mask);
1192 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1194 /* Update vectorial force */
1195 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1196 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1197 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1199 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1200 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1201 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1205 /**************************
1206 * CALCULATE INTERACTIONS *
1207 **************************/
1209 if (gmx_mm_any_lt(rsq11,rcutoff2))
1212 r11 = _mm_mul_pd(rsq11,rinv11);
1214 /* EWALD ELECTROSTATICS */
1216 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1217 ewrt = _mm_mul_pd(r11,ewtabscale);
1218 ewitab = _mm_cvttpd_epi32(ewrt);
1220 eweps = _mm_frcz_pd(ewrt);
1222 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1224 twoeweps = _mm_add_pd(eweps,eweps);
1225 ewitab = _mm_slli_epi32(ewitab,2);
1226 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1227 ewtabD = _mm_setzero_pd();
1228 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1229 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1230 ewtabFn = _mm_setzero_pd();
1231 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1232 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1233 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1234 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1235 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1237 d = _mm_sub_pd(r11,rswitch);
1238 d = _mm_max_pd(d,_mm_setzero_pd());
1239 d2 = _mm_mul_pd(d,d);
1240 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1242 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1244 /* Evaluate switch function */
1245 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1246 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1247 velec = _mm_mul_pd(velec,sw);
1248 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1250 /* Update potential sum for this i atom from the interaction with this j atom. */
1251 velec = _mm_and_pd(velec,cutoff_mask);
1252 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1253 velecsum = _mm_add_pd(velecsum,velec);
1257 fscal = _mm_and_pd(fscal,cutoff_mask);
1259 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1261 /* Update vectorial force */
1262 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1263 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1264 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1266 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1267 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1268 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1272 /**************************
1273 * CALCULATE INTERACTIONS *
1274 **************************/
1276 if (gmx_mm_any_lt(rsq12,rcutoff2))
1279 r12 = _mm_mul_pd(rsq12,rinv12);
1281 /* EWALD ELECTROSTATICS */
1283 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1284 ewrt = _mm_mul_pd(r12,ewtabscale);
1285 ewitab = _mm_cvttpd_epi32(ewrt);
1287 eweps = _mm_frcz_pd(ewrt);
1289 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1291 twoeweps = _mm_add_pd(eweps,eweps);
1292 ewitab = _mm_slli_epi32(ewitab,2);
1293 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1294 ewtabD = _mm_setzero_pd();
1295 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1296 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1297 ewtabFn = _mm_setzero_pd();
1298 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1299 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1300 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1301 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1302 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1304 d = _mm_sub_pd(r12,rswitch);
1305 d = _mm_max_pd(d,_mm_setzero_pd());
1306 d2 = _mm_mul_pd(d,d);
1307 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1309 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1311 /* Evaluate switch function */
1312 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1313 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1314 velec = _mm_mul_pd(velec,sw);
1315 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1317 /* Update potential sum for this i atom from the interaction with this j atom. */
1318 velec = _mm_and_pd(velec,cutoff_mask);
1319 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1320 velecsum = _mm_add_pd(velecsum,velec);
1324 fscal = _mm_and_pd(fscal,cutoff_mask);
1326 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1328 /* Update vectorial force */
1329 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1330 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1331 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1333 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1334 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1335 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1339 /**************************
1340 * CALCULATE INTERACTIONS *
1341 **************************/
1343 if (gmx_mm_any_lt(rsq20,rcutoff2))
1346 r20 = _mm_mul_pd(rsq20,rinv20);
1348 /* EWALD ELECTROSTATICS */
1350 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1351 ewrt = _mm_mul_pd(r20,ewtabscale);
1352 ewitab = _mm_cvttpd_epi32(ewrt);
1354 eweps = _mm_frcz_pd(ewrt);
1356 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1358 twoeweps = _mm_add_pd(eweps,eweps);
1359 ewitab = _mm_slli_epi32(ewitab,2);
1360 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1361 ewtabD = _mm_setzero_pd();
1362 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1363 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1364 ewtabFn = _mm_setzero_pd();
1365 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1366 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1367 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1368 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1369 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1371 d = _mm_sub_pd(r20,rswitch);
1372 d = _mm_max_pd(d,_mm_setzero_pd());
1373 d2 = _mm_mul_pd(d,d);
1374 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1376 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1378 /* Evaluate switch function */
1379 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1380 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1381 velec = _mm_mul_pd(velec,sw);
1382 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1384 /* Update potential sum for this i atom from the interaction with this j atom. */
1385 velec = _mm_and_pd(velec,cutoff_mask);
1386 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1387 velecsum = _mm_add_pd(velecsum,velec);
1391 fscal = _mm_and_pd(fscal,cutoff_mask);
1393 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1395 /* Update vectorial force */
1396 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1397 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1398 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1400 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1401 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1402 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1406 /**************************
1407 * CALCULATE INTERACTIONS *
1408 **************************/
1410 if (gmx_mm_any_lt(rsq21,rcutoff2))
1413 r21 = _mm_mul_pd(rsq21,rinv21);
1415 /* EWALD ELECTROSTATICS */
1417 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1418 ewrt = _mm_mul_pd(r21,ewtabscale);
1419 ewitab = _mm_cvttpd_epi32(ewrt);
1421 eweps = _mm_frcz_pd(ewrt);
1423 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1425 twoeweps = _mm_add_pd(eweps,eweps);
1426 ewitab = _mm_slli_epi32(ewitab,2);
1427 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1428 ewtabD = _mm_setzero_pd();
1429 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1430 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1431 ewtabFn = _mm_setzero_pd();
1432 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1433 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1434 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1435 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1436 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1438 d = _mm_sub_pd(r21,rswitch);
1439 d = _mm_max_pd(d,_mm_setzero_pd());
1440 d2 = _mm_mul_pd(d,d);
1441 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1443 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1445 /* Evaluate switch function */
1446 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1447 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1448 velec = _mm_mul_pd(velec,sw);
1449 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1451 /* Update potential sum for this i atom from the interaction with this j atom. */
1452 velec = _mm_and_pd(velec,cutoff_mask);
1453 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1454 velecsum = _mm_add_pd(velecsum,velec);
1458 fscal = _mm_and_pd(fscal,cutoff_mask);
1460 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1462 /* Update vectorial force */
1463 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1464 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1465 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1467 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1468 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1469 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1473 /**************************
1474 * CALCULATE INTERACTIONS *
1475 **************************/
1477 if (gmx_mm_any_lt(rsq22,rcutoff2))
1480 r22 = _mm_mul_pd(rsq22,rinv22);
1482 /* EWALD ELECTROSTATICS */
1484 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1485 ewrt = _mm_mul_pd(r22,ewtabscale);
1486 ewitab = _mm_cvttpd_epi32(ewrt);
1488 eweps = _mm_frcz_pd(ewrt);
1490 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1492 twoeweps = _mm_add_pd(eweps,eweps);
1493 ewitab = _mm_slli_epi32(ewitab,2);
1494 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1495 ewtabD = _mm_setzero_pd();
1496 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1497 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1498 ewtabFn = _mm_setzero_pd();
1499 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1500 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1501 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1502 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1503 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1505 d = _mm_sub_pd(r22,rswitch);
1506 d = _mm_max_pd(d,_mm_setzero_pd());
1507 d2 = _mm_mul_pd(d,d);
1508 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1510 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1512 /* Evaluate switch function */
1513 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1514 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1515 velec = _mm_mul_pd(velec,sw);
1516 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1518 /* Update potential sum for this i atom from the interaction with this j atom. */
1519 velec = _mm_and_pd(velec,cutoff_mask);
1520 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1521 velecsum = _mm_add_pd(velecsum,velec);
1525 fscal = _mm_and_pd(fscal,cutoff_mask);
1527 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1529 /* Update vectorial force */
1530 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1531 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1532 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1534 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1535 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1536 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1540 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1542 /* Inner loop uses 612 flops */
1545 /* End of innermost loop */
1547 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1548 f+i_coord_offset,fshift+i_shift_offset);
1551 /* Update potential energies */
1552 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1554 /* Increment number of inner iterations */
1555 inneriter += j_index_end - j_index_start;
1557 /* Outer loop uses 19 flops */
1560 /* Increment number of outer iterations */
1563 /* Update outer/inner flops */
1565 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*612);
1568 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_double
1569 * Electrostatics interaction: Ewald
1570 * VdW interaction: None
1571 * Geometry: Water3-Water3
1572 * Calculate force/pot: Force
1575 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_double
1576 (t_nblist * gmx_restrict nlist,
1577 rvec * gmx_restrict xx,
1578 rvec * gmx_restrict ff,
1579 t_forcerec * gmx_restrict fr,
1580 t_mdatoms * gmx_restrict mdatoms,
1581 nb_kernel_data_t * gmx_restrict kernel_data,
1582 t_nrnb * gmx_restrict nrnb)
1584 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1585 * just 0 for non-waters.
1586 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1587 * jnr indices corresponding to data put in the four positions in the SIMD register.
1589 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1590 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1592 int j_coord_offsetA,j_coord_offsetB;
1593 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1594 real rcutoff_scalar;
1595 real *shiftvec,*fshift,*x,*f;
1596 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1598 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1600 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1602 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1603 int vdwjidx0A,vdwjidx0B;
1604 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1605 int vdwjidx1A,vdwjidx1B;
1606 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1607 int vdwjidx2A,vdwjidx2B;
1608 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1609 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1610 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1611 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1612 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1613 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1614 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1615 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1616 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1617 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1618 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1621 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1623 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1624 real rswitch_scalar,d_scalar;
1625 __m128d dummy_mask,cutoff_mask;
1626 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1627 __m128d one = _mm_set1_pd(1.0);
1628 __m128d two = _mm_set1_pd(2.0);
1634 jindex = nlist->jindex;
1636 shiftidx = nlist->shift;
1638 shiftvec = fr->shift_vec[0];
1639 fshift = fr->fshift[0];
1640 facel = _mm_set1_pd(fr->epsfac);
1641 charge = mdatoms->chargeA;
1643 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1644 ewtab = fr->ic->tabq_coul_FDV0;
1645 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1646 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1648 /* Setup water-specific parameters */
1649 inr = nlist->iinr[0];
1650 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1651 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1652 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1654 jq0 = _mm_set1_pd(charge[inr+0]);
1655 jq1 = _mm_set1_pd(charge[inr+1]);
1656 jq2 = _mm_set1_pd(charge[inr+2]);
1657 qq00 = _mm_mul_pd(iq0,jq0);
1658 qq01 = _mm_mul_pd(iq0,jq1);
1659 qq02 = _mm_mul_pd(iq0,jq2);
1660 qq10 = _mm_mul_pd(iq1,jq0);
1661 qq11 = _mm_mul_pd(iq1,jq1);
1662 qq12 = _mm_mul_pd(iq1,jq2);
1663 qq20 = _mm_mul_pd(iq2,jq0);
1664 qq21 = _mm_mul_pd(iq2,jq1);
1665 qq22 = _mm_mul_pd(iq2,jq2);
1667 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1668 rcutoff_scalar = fr->rcoulomb;
1669 rcutoff = _mm_set1_pd(rcutoff_scalar);
1670 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1672 rswitch_scalar = fr->rcoulomb_switch;
1673 rswitch = _mm_set1_pd(rswitch_scalar);
1674 /* Setup switch parameters */
1675 d_scalar = rcutoff_scalar-rswitch_scalar;
1676 d = _mm_set1_pd(d_scalar);
1677 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1678 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1679 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1680 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1681 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1682 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1684 /* Avoid stupid compiler warnings */
1686 j_coord_offsetA = 0;
1687 j_coord_offsetB = 0;
1692 /* Start outer loop over neighborlists */
1693 for(iidx=0; iidx<nri; iidx++)
1695 /* Load shift vector for this list */
1696 i_shift_offset = DIM*shiftidx[iidx];
1698 /* Load limits for loop over neighbors */
1699 j_index_start = jindex[iidx];
1700 j_index_end = jindex[iidx+1];
1702 /* Get outer coordinate index */
1704 i_coord_offset = DIM*inr;
1706 /* Load i particle coords and add shift vector */
1707 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1708 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1710 fix0 = _mm_setzero_pd();
1711 fiy0 = _mm_setzero_pd();
1712 fiz0 = _mm_setzero_pd();
1713 fix1 = _mm_setzero_pd();
1714 fiy1 = _mm_setzero_pd();
1715 fiz1 = _mm_setzero_pd();
1716 fix2 = _mm_setzero_pd();
1717 fiy2 = _mm_setzero_pd();
1718 fiz2 = _mm_setzero_pd();
1720 /* Start inner kernel loop */
1721 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1724 /* Get j neighbor index, and coordinate index */
1726 jnrB = jjnr[jidx+1];
1727 j_coord_offsetA = DIM*jnrA;
1728 j_coord_offsetB = DIM*jnrB;
1730 /* load j atom coordinates */
1731 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1732 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1734 /* Calculate displacement vector */
1735 dx00 = _mm_sub_pd(ix0,jx0);
1736 dy00 = _mm_sub_pd(iy0,jy0);
1737 dz00 = _mm_sub_pd(iz0,jz0);
1738 dx01 = _mm_sub_pd(ix0,jx1);
1739 dy01 = _mm_sub_pd(iy0,jy1);
1740 dz01 = _mm_sub_pd(iz0,jz1);
1741 dx02 = _mm_sub_pd(ix0,jx2);
1742 dy02 = _mm_sub_pd(iy0,jy2);
1743 dz02 = _mm_sub_pd(iz0,jz2);
1744 dx10 = _mm_sub_pd(ix1,jx0);
1745 dy10 = _mm_sub_pd(iy1,jy0);
1746 dz10 = _mm_sub_pd(iz1,jz0);
1747 dx11 = _mm_sub_pd(ix1,jx1);
1748 dy11 = _mm_sub_pd(iy1,jy1);
1749 dz11 = _mm_sub_pd(iz1,jz1);
1750 dx12 = _mm_sub_pd(ix1,jx2);
1751 dy12 = _mm_sub_pd(iy1,jy2);
1752 dz12 = _mm_sub_pd(iz1,jz2);
1753 dx20 = _mm_sub_pd(ix2,jx0);
1754 dy20 = _mm_sub_pd(iy2,jy0);
1755 dz20 = _mm_sub_pd(iz2,jz0);
1756 dx21 = _mm_sub_pd(ix2,jx1);
1757 dy21 = _mm_sub_pd(iy2,jy1);
1758 dz21 = _mm_sub_pd(iz2,jz1);
1759 dx22 = _mm_sub_pd(ix2,jx2);
1760 dy22 = _mm_sub_pd(iy2,jy2);
1761 dz22 = _mm_sub_pd(iz2,jz2);
1763 /* Calculate squared distance and things based on it */
1764 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1765 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1766 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1767 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1768 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1769 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1770 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1771 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1772 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1774 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1775 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1776 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1777 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1778 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1779 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1780 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1781 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1782 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1784 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1785 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1786 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1787 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1788 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1789 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1790 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1791 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1792 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1794 fjx0 = _mm_setzero_pd();
1795 fjy0 = _mm_setzero_pd();
1796 fjz0 = _mm_setzero_pd();
1797 fjx1 = _mm_setzero_pd();
1798 fjy1 = _mm_setzero_pd();
1799 fjz1 = _mm_setzero_pd();
1800 fjx2 = _mm_setzero_pd();
1801 fjy2 = _mm_setzero_pd();
1802 fjz2 = _mm_setzero_pd();
1804 /**************************
1805 * CALCULATE INTERACTIONS *
1806 **************************/
1808 if (gmx_mm_any_lt(rsq00,rcutoff2))
1811 r00 = _mm_mul_pd(rsq00,rinv00);
1813 /* EWALD ELECTROSTATICS */
1815 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1816 ewrt = _mm_mul_pd(r00,ewtabscale);
1817 ewitab = _mm_cvttpd_epi32(ewrt);
1819 eweps = _mm_frcz_pd(ewrt);
1821 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1823 twoeweps = _mm_add_pd(eweps,eweps);
1824 ewitab = _mm_slli_epi32(ewitab,2);
1825 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1826 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1827 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1828 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1829 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1830 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1831 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1832 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1833 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
1834 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1836 d = _mm_sub_pd(r00,rswitch);
1837 d = _mm_max_pd(d,_mm_setzero_pd());
1838 d2 = _mm_mul_pd(d,d);
1839 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1841 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1843 /* Evaluate switch function */
1844 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1845 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1846 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1850 fscal = _mm_and_pd(fscal,cutoff_mask);
1852 /* Update vectorial force */
1853 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1854 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1855 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1857 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1858 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1859 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1863 /**************************
1864 * CALCULATE INTERACTIONS *
1865 **************************/
1867 if (gmx_mm_any_lt(rsq01,rcutoff2))
1870 r01 = _mm_mul_pd(rsq01,rinv01);
1872 /* EWALD ELECTROSTATICS */
1874 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1875 ewrt = _mm_mul_pd(r01,ewtabscale);
1876 ewitab = _mm_cvttpd_epi32(ewrt);
1878 eweps = _mm_frcz_pd(ewrt);
1880 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1882 twoeweps = _mm_add_pd(eweps,eweps);
1883 ewitab = _mm_slli_epi32(ewitab,2);
1884 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1885 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1886 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1887 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1888 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1889 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1890 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1891 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1892 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1893 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1895 d = _mm_sub_pd(r01,rswitch);
1896 d = _mm_max_pd(d,_mm_setzero_pd());
1897 d2 = _mm_mul_pd(d,d);
1898 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1900 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1902 /* Evaluate switch function */
1903 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1904 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1905 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1909 fscal = _mm_and_pd(fscal,cutoff_mask);
1911 /* Update vectorial force */
1912 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1913 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1914 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1916 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1917 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1918 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1922 /**************************
1923 * CALCULATE INTERACTIONS *
1924 **************************/
1926 if (gmx_mm_any_lt(rsq02,rcutoff2))
1929 r02 = _mm_mul_pd(rsq02,rinv02);
1931 /* EWALD ELECTROSTATICS */
1933 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1934 ewrt = _mm_mul_pd(r02,ewtabscale);
1935 ewitab = _mm_cvttpd_epi32(ewrt);
1937 eweps = _mm_frcz_pd(ewrt);
1939 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1941 twoeweps = _mm_add_pd(eweps,eweps);
1942 ewitab = _mm_slli_epi32(ewitab,2);
1943 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1944 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1945 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1946 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1947 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1948 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1949 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1950 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1951 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
1952 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1954 d = _mm_sub_pd(r02,rswitch);
1955 d = _mm_max_pd(d,_mm_setzero_pd());
1956 d2 = _mm_mul_pd(d,d);
1957 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1959 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1961 /* Evaluate switch function */
1962 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1963 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
1964 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1968 fscal = _mm_and_pd(fscal,cutoff_mask);
1970 /* Update vectorial force */
1971 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1972 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1973 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1975 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1976 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1977 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1981 /**************************
1982 * CALCULATE INTERACTIONS *
1983 **************************/
1985 if (gmx_mm_any_lt(rsq10,rcutoff2))
1988 r10 = _mm_mul_pd(rsq10,rinv10);
1990 /* EWALD ELECTROSTATICS */
1992 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1993 ewrt = _mm_mul_pd(r10,ewtabscale);
1994 ewitab = _mm_cvttpd_epi32(ewrt);
1996 eweps = _mm_frcz_pd(ewrt);
1998 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2000 twoeweps = _mm_add_pd(eweps,eweps);
2001 ewitab = _mm_slli_epi32(ewitab,2);
2002 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2003 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2004 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2005 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2006 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2007 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2008 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2009 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2010 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2011 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2013 d = _mm_sub_pd(r10,rswitch);
2014 d = _mm_max_pd(d,_mm_setzero_pd());
2015 d2 = _mm_mul_pd(d,d);
2016 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2018 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2020 /* Evaluate switch function */
2021 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2022 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2023 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2027 fscal = _mm_and_pd(fscal,cutoff_mask);
2029 /* Update vectorial force */
2030 fix1 = _mm_macc_pd(dx10,fscal,fix1);
2031 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
2032 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
2034 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
2035 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
2036 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
2040 /**************************
2041 * CALCULATE INTERACTIONS *
2042 **************************/
2044 if (gmx_mm_any_lt(rsq11,rcutoff2))
2047 r11 = _mm_mul_pd(rsq11,rinv11);
2049 /* EWALD ELECTROSTATICS */
2051 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2052 ewrt = _mm_mul_pd(r11,ewtabscale);
2053 ewitab = _mm_cvttpd_epi32(ewrt);
2055 eweps = _mm_frcz_pd(ewrt);
2057 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2059 twoeweps = _mm_add_pd(eweps,eweps);
2060 ewitab = _mm_slli_epi32(ewitab,2);
2061 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2062 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2063 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2064 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2065 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2066 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2067 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2068 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2069 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2070 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2072 d = _mm_sub_pd(r11,rswitch);
2073 d = _mm_max_pd(d,_mm_setzero_pd());
2074 d2 = _mm_mul_pd(d,d);
2075 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2077 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2079 /* Evaluate switch function */
2080 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2081 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2082 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2086 fscal = _mm_and_pd(fscal,cutoff_mask);
2088 /* Update vectorial force */
2089 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2090 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2091 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2093 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2094 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2095 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2099 /**************************
2100 * CALCULATE INTERACTIONS *
2101 **************************/
2103 if (gmx_mm_any_lt(rsq12,rcutoff2))
2106 r12 = _mm_mul_pd(rsq12,rinv12);
2108 /* EWALD ELECTROSTATICS */
2110 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2111 ewrt = _mm_mul_pd(r12,ewtabscale);
2112 ewitab = _mm_cvttpd_epi32(ewrt);
2114 eweps = _mm_frcz_pd(ewrt);
2116 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2118 twoeweps = _mm_add_pd(eweps,eweps);
2119 ewitab = _mm_slli_epi32(ewitab,2);
2120 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2121 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2122 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2123 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2124 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2125 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2126 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2127 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2128 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2129 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2131 d = _mm_sub_pd(r12,rswitch);
2132 d = _mm_max_pd(d,_mm_setzero_pd());
2133 d2 = _mm_mul_pd(d,d);
2134 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2136 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2138 /* Evaluate switch function */
2139 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2140 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2141 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2145 fscal = _mm_and_pd(fscal,cutoff_mask);
2147 /* Update vectorial force */
2148 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2149 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2150 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2152 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2153 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2154 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2158 /**************************
2159 * CALCULATE INTERACTIONS *
2160 **************************/
2162 if (gmx_mm_any_lt(rsq20,rcutoff2))
2165 r20 = _mm_mul_pd(rsq20,rinv20);
2167 /* EWALD ELECTROSTATICS */
2169 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2170 ewrt = _mm_mul_pd(r20,ewtabscale);
2171 ewitab = _mm_cvttpd_epi32(ewrt);
2173 eweps = _mm_frcz_pd(ewrt);
2175 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2177 twoeweps = _mm_add_pd(eweps,eweps);
2178 ewitab = _mm_slli_epi32(ewitab,2);
2179 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2180 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2181 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2182 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2183 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2184 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2185 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2186 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2187 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2188 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2190 d = _mm_sub_pd(r20,rswitch);
2191 d = _mm_max_pd(d,_mm_setzero_pd());
2192 d2 = _mm_mul_pd(d,d);
2193 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2195 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2197 /* Evaluate switch function */
2198 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2199 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2200 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2204 fscal = _mm_and_pd(fscal,cutoff_mask);
2206 /* Update vectorial force */
2207 fix2 = _mm_macc_pd(dx20,fscal,fix2);
2208 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
2209 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
2211 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
2212 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
2213 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
2217 /**************************
2218 * CALCULATE INTERACTIONS *
2219 **************************/
2221 if (gmx_mm_any_lt(rsq21,rcutoff2))
2224 r21 = _mm_mul_pd(rsq21,rinv21);
2226 /* EWALD ELECTROSTATICS */
2228 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2229 ewrt = _mm_mul_pd(r21,ewtabscale);
2230 ewitab = _mm_cvttpd_epi32(ewrt);
2232 eweps = _mm_frcz_pd(ewrt);
2234 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2236 twoeweps = _mm_add_pd(eweps,eweps);
2237 ewitab = _mm_slli_epi32(ewitab,2);
2238 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2239 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2240 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2241 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2242 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2243 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2244 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2245 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2246 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2247 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2249 d = _mm_sub_pd(r21,rswitch);
2250 d = _mm_max_pd(d,_mm_setzero_pd());
2251 d2 = _mm_mul_pd(d,d);
2252 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2254 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2256 /* Evaluate switch function */
2257 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2258 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2259 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2263 fscal = _mm_and_pd(fscal,cutoff_mask);
2265 /* Update vectorial force */
2266 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2267 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2268 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2270 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2271 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2272 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2276 /**************************
2277 * CALCULATE INTERACTIONS *
2278 **************************/
2280 if (gmx_mm_any_lt(rsq22,rcutoff2))
2283 r22 = _mm_mul_pd(rsq22,rinv22);
2285 /* EWALD ELECTROSTATICS */
2287 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2288 ewrt = _mm_mul_pd(r22,ewtabscale);
2289 ewitab = _mm_cvttpd_epi32(ewrt);
2291 eweps = _mm_frcz_pd(ewrt);
2293 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2295 twoeweps = _mm_add_pd(eweps,eweps);
2296 ewitab = _mm_slli_epi32(ewitab,2);
2297 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2298 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2299 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2300 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2301 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2302 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2303 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2304 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2305 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2306 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2308 d = _mm_sub_pd(r22,rswitch);
2309 d = _mm_max_pd(d,_mm_setzero_pd());
2310 d2 = _mm_mul_pd(d,d);
2311 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2313 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2315 /* Evaluate switch function */
2316 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2317 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2318 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2322 fscal = _mm_and_pd(fscal,cutoff_mask);
2324 /* Update vectorial force */
2325 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2326 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2327 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2329 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2330 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2331 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2335 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2337 /* Inner loop uses 585 flops */
2340 if(jidx<j_index_end)
2344 j_coord_offsetA = DIM*jnrA;
2346 /* load j atom coordinates */
2347 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2348 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2350 /* Calculate displacement vector */
2351 dx00 = _mm_sub_pd(ix0,jx0);
2352 dy00 = _mm_sub_pd(iy0,jy0);
2353 dz00 = _mm_sub_pd(iz0,jz0);
2354 dx01 = _mm_sub_pd(ix0,jx1);
2355 dy01 = _mm_sub_pd(iy0,jy1);
2356 dz01 = _mm_sub_pd(iz0,jz1);
2357 dx02 = _mm_sub_pd(ix0,jx2);
2358 dy02 = _mm_sub_pd(iy0,jy2);
2359 dz02 = _mm_sub_pd(iz0,jz2);
2360 dx10 = _mm_sub_pd(ix1,jx0);
2361 dy10 = _mm_sub_pd(iy1,jy0);
2362 dz10 = _mm_sub_pd(iz1,jz0);
2363 dx11 = _mm_sub_pd(ix1,jx1);
2364 dy11 = _mm_sub_pd(iy1,jy1);
2365 dz11 = _mm_sub_pd(iz1,jz1);
2366 dx12 = _mm_sub_pd(ix1,jx2);
2367 dy12 = _mm_sub_pd(iy1,jy2);
2368 dz12 = _mm_sub_pd(iz1,jz2);
2369 dx20 = _mm_sub_pd(ix2,jx0);
2370 dy20 = _mm_sub_pd(iy2,jy0);
2371 dz20 = _mm_sub_pd(iz2,jz0);
2372 dx21 = _mm_sub_pd(ix2,jx1);
2373 dy21 = _mm_sub_pd(iy2,jy1);
2374 dz21 = _mm_sub_pd(iz2,jz1);
2375 dx22 = _mm_sub_pd(ix2,jx2);
2376 dy22 = _mm_sub_pd(iy2,jy2);
2377 dz22 = _mm_sub_pd(iz2,jz2);
2379 /* Calculate squared distance and things based on it */
2380 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2381 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2382 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2383 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2384 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2385 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2386 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2387 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2388 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2390 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2391 rinv01 = gmx_mm_invsqrt_pd(rsq01);
2392 rinv02 = gmx_mm_invsqrt_pd(rsq02);
2393 rinv10 = gmx_mm_invsqrt_pd(rsq10);
2394 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2395 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2396 rinv20 = gmx_mm_invsqrt_pd(rsq20);
2397 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2398 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2400 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2401 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
2402 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
2403 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
2404 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2405 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2406 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
2407 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2408 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2410 fjx0 = _mm_setzero_pd();
2411 fjy0 = _mm_setzero_pd();
2412 fjz0 = _mm_setzero_pd();
2413 fjx1 = _mm_setzero_pd();
2414 fjy1 = _mm_setzero_pd();
2415 fjz1 = _mm_setzero_pd();
2416 fjx2 = _mm_setzero_pd();
2417 fjy2 = _mm_setzero_pd();
2418 fjz2 = _mm_setzero_pd();
2420 /**************************
2421 * CALCULATE INTERACTIONS *
2422 **************************/
2424 if (gmx_mm_any_lt(rsq00,rcutoff2))
2427 r00 = _mm_mul_pd(rsq00,rinv00);
2429 /* EWALD ELECTROSTATICS */
2431 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2432 ewrt = _mm_mul_pd(r00,ewtabscale);
2433 ewitab = _mm_cvttpd_epi32(ewrt);
2435 eweps = _mm_frcz_pd(ewrt);
2437 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2439 twoeweps = _mm_add_pd(eweps,eweps);
2440 ewitab = _mm_slli_epi32(ewitab,2);
2441 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2442 ewtabD = _mm_setzero_pd();
2443 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2444 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2445 ewtabFn = _mm_setzero_pd();
2446 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2447 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2448 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2449 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
2450 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2452 d = _mm_sub_pd(r00,rswitch);
2453 d = _mm_max_pd(d,_mm_setzero_pd());
2454 d2 = _mm_mul_pd(d,d);
2455 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2457 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2459 /* Evaluate switch function */
2460 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2461 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
2462 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2466 fscal = _mm_and_pd(fscal,cutoff_mask);
2468 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2470 /* Update vectorial force */
2471 fix0 = _mm_macc_pd(dx00,fscal,fix0);
2472 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
2473 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
2475 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
2476 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
2477 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
2481 /**************************
2482 * CALCULATE INTERACTIONS *
2483 **************************/
2485 if (gmx_mm_any_lt(rsq01,rcutoff2))
2488 r01 = _mm_mul_pd(rsq01,rinv01);
2490 /* EWALD ELECTROSTATICS */
2492 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2493 ewrt = _mm_mul_pd(r01,ewtabscale);
2494 ewitab = _mm_cvttpd_epi32(ewrt);
2496 eweps = _mm_frcz_pd(ewrt);
2498 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2500 twoeweps = _mm_add_pd(eweps,eweps);
2501 ewitab = _mm_slli_epi32(ewitab,2);
2502 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2503 ewtabD = _mm_setzero_pd();
2504 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2505 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2506 ewtabFn = _mm_setzero_pd();
2507 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2508 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2509 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2510 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
2511 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2513 d = _mm_sub_pd(r01,rswitch);
2514 d = _mm_max_pd(d,_mm_setzero_pd());
2515 d2 = _mm_mul_pd(d,d);
2516 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2518 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2520 /* Evaluate switch function */
2521 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2522 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
2523 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
2527 fscal = _mm_and_pd(fscal,cutoff_mask);
2529 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2531 /* Update vectorial force */
2532 fix0 = _mm_macc_pd(dx01,fscal,fix0);
2533 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
2534 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
2536 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
2537 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
2538 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
2542 /**************************
2543 * CALCULATE INTERACTIONS *
2544 **************************/
2546 if (gmx_mm_any_lt(rsq02,rcutoff2))
2549 r02 = _mm_mul_pd(rsq02,rinv02);
2551 /* EWALD ELECTROSTATICS */
2553 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2554 ewrt = _mm_mul_pd(r02,ewtabscale);
2555 ewitab = _mm_cvttpd_epi32(ewrt);
2557 eweps = _mm_frcz_pd(ewrt);
2559 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2561 twoeweps = _mm_add_pd(eweps,eweps);
2562 ewitab = _mm_slli_epi32(ewitab,2);
2563 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2564 ewtabD = _mm_setzero_pd();
2565 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2566 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2567 ewtabFn = _mm_setzero_pd();
2568 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2569 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2570 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2571 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2572 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2574 d = _mm_sub_pd(r02,rswitch);
2575 d = _mm_max_pd(d,_mm_setzero_pd());
2576 d2 = _mm_mul_pd(d,d);
2577 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2579 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2581 /* Evaluate switch function */
2582 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2583 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2584 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2588 fscal = _mm_and_pd(fscal,cutoff_mask);
2590 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2592 /* Update vectorial force */
2593 fix0 = _mm_macc_pd(dx02,fscal,fix0);
2594 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
2595 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
2597 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
2598 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
2599 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
2603 /**************************
2604 * CALCULATE INTERACTIONS *
2605 **************************/
2607 if (gmx_mm_any_lt(rsq10,rcutoff2))
2610 r10 = _mm_mul_pd(rsq10,rinv10);
2612 /* EWALD ELECTROSTATICS */
2614 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2615 ewrt = _mm_mul_pd(r10,ewtabscale);
2616 ewitab = _mm_cvttpd_epi32(ewrt);
2618 eweps = _mm_frcz_pd(ewrt);
2620 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2622 twoeweps = _mm_add_pd(eweps,eweps);
2623 ewitab = _mm_slli_epi32(ewitab,2);
2624 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2625 ewtabD = _mm_setzero_pd();
2626 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2627 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2628 ewtabFn = _mm_setzero_pd();
2629 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2630 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2631 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2632 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2633 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2635 d = _mm_sub_pd(r10,rswitch);
2636 d = _mm_max_pd(d,_mm_setzero_pd());
2637 d2 = _mm_mul_pd(d,d);
2638 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2640 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2642 /* Evaluate switch function */
2643 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2644 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2645 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2649 fscal = _mm_and_pd(fscal,cutoff_mask);
2651 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2653 /* Update vectorial force */
2654 fix1 = _mm_macc_pd(dx10,fscal,fix1);
2655 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
2656 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
2658 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
2659 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
2660 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
2664 /**************************
2665 * CALCULATE INTERACTIONS *
2666 **************************/
2668 if (gmx_mm_any_lt(rsq11,rcutoff2))
2671 r11 = _mm_mul_pd(rsq11,rinv11);
2673 /* EWALD ELECTROSTATICS */
2675 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2676 ewrt = _mm_mul_pd(r11,ewtabscale);
2677 ewitab = _mm_cvttpd_epi32(ewrt);
2679 eweps = _mm_frcz_pd(ewrt);
2681 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2683 twoeweps = _mm_add_pd(eweps,eweps);
2684 ewitab = _mm_slli_epi32(ewitab,2);
2685 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2686 ewtabD = _mm_setzero_pd();
2687 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2688 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2689 ewtabFn = _mm_setzero_pd();
2690 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2691 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2692 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2693 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2694 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2696 d = _mm_sub_pd(r11,rswitch);
2697 d = _mm_max_pd(d,_mm_setzero_pd());
2698 d2 = _mm_mul_pd(d,d);
2699 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2701 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2703 /* Evaluate switch function */
2704 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2705 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2706 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2710 fscal = _mm_and_pd(fscal,cutoff_mask);
2712 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2714 /* Update vectorial force */
2715 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2716 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2717 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2719 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2720 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2721 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2725 /**************************
2726 * CALCULATE INTERACTIONS *
2727 **************************/
2729 if (gmx_mm_any_lt(rsq12,rcutoff2))
2732 r12 = _mm_mul_pd(rsq12,rinv12);
2734 /* EWALD ELECTROSTATICS */
2736 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2737 ewrt = _mm_mul_pd(r12,ewtabscale);
2738 ewitab = _mm_cvttpd_epi32(ewrt);
2740 eweps = _mm_frcz_pd(ewrt);
2742 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2744 twoeweps = _mm_add_pd(eweps,eweps);
2745 ewitab = _mm_slli_epi32(ewitab,2);
2746 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2747 ewtabD = _mm_setzero_pd();
2748 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2749 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2750 ewtabFn = _mm_setzero_pd();
2751 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2752 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2753 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2754 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2755 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2757 d = _mm_sub_pd(r12,rswitch);
2758 d = _mm_max_pd(d,_mm_setzero_pd());
2759 d2 = _mm_mul_pd(d,d);
2760 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2762 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2764 /* Evaluate switch function */
2765 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2766 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2767 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2771 fscal = _mm_and_pd(fscal,cutoff_mask);
2773 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2775 /* Update vectorial force */
2776 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2777 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2778 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2780 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2781 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2782 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2786 /**************************
2787 * CALCULATE INTERACTIONS *
2788 **************************/
2790 if (gmx_mm_any_lt(rsq20,rcutoff2))
2793 r20 = _mm_mul_pd(rsq20,rinv20);
2795 /* EWALD ELECTROSTATICS */
2797 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2798 ewrt = _mm_mul_pd(r20,ewtabscale);
2799 ewitab = _mm_cvttpd_epi32(ewrt);
2801 eweps = _mm_frcz_pd(ewrt);
2803 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2805 twoeweps = _mm_add_pd(eweps,eweps);
2806 ewitab = _mm_slli_epi32(ewitab,2);
2807 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2808 ewtabD = _mm_setzero_pd();
2809 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2810 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2811 ewtabFn = _mm_setzero_pd();
2812 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2813 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2814 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2815 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2816 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2818 d = _mm_sub_pd(r20,rswitch);
2819 d = _mm_max_pd(d,_mm_setzero_pd());
2820 d2 = _mm_mul_pd(d,d);
2821 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2823 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2825 /* Evaluate switch function */
2826 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2827 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2828 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2832 fscal = _mm_and_pd(fscal,cutoff_mask);
2834 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2836 /* Update vectorial force */
2837 fix2 = _mm_macc_pd(dx20,fscal,fix2);
2838 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
2839 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
2841 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
2842 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
2843 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
2847 /**************************
2848 * CALCULATE INTERACTIONS *
2849 **************************/
2851 if (gmx_mm_any_lt(rsq21,rcutoff2))
2854 r21 = _mm_mul_pd(rsq21,rinv21);
2856 /* EWALD ELECTROSTATICS */
2858 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2859 ewrt = _mm_mul_pd(r21,ewtabscale);
2860 ewitab = _mm_cvttpd_epi32(ewrt);
2862 eweps = _mm_frcz_pd(ewrt);
2864 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2866 twoeweps = _mm_add_pd(eweps,eweps);
2867 ewitab = _mm_slli_epi32(ewitab,2);
2868 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2869 ewtabD = _mm_setzero_pd();
2870 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2871 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2872 ewtabFn = _mm_setzero_pd();
2873 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2874 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2875 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2876 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2877 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2879 d = _mm_sub_pd(r21,rswitch);
2880 d = _mm_max_pd(d,_mm_setzero_pd());
2881 d2 = _mm_mul_pd(d,d);
2882 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2884 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2886 /* Evaluate switch function */
2887 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2888 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2889 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2893 fscal = _mm_and_pd(fscal,cutoff_mask);
2895 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2897 /* Update vectorial force */
2898 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2899 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2900 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2902 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2903 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2904 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2908 /**************************
2909 * CALCULATE INTERACTIONS *
2910 **************************/
2912 if (gmx_mm_any_lt(rsq22,rcutoff2))
2915 r22 = _mm_mul_pd(rsq22,rinv22);
2917 /* EWALD ELECTROSTATICS */
2919 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2920 ewrt = _mm_mul_pd(r22,ewtabscale);
2921 ewitab = _mm_cvttpd_epi32(ewrt);
2923 eweps = _mm_frcz_pd(ewrt);
2925 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2927 twoeweps = _mm_add_pd(eweps,eweps);
2928 ewitab = _mm_slli_epi32(ewitab,2);
2929 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2930 ewtabD = _mm_setzero_pd();
2931 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2932 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2933 ewtabFn = _mm_setzero_pd();
2934 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2935 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2936 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2937 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2938 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2940 d = _mm_sub_pd(r22,rswitch);
2941 d = _mm_max_pd(d,_mm_setzero_pd());
2942 d2 = _mm_mul_pd(d,d);
2943 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2945 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2947 /* Evaluate switch function */
2948 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2949 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2950 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2954 fscal = _mm_and_pd(fscal,cutoff_mask);
2956 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2958 /* Update vectorial force */
2959 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2960 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2961 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2963 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2964 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2965 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2969 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2971 /* Inner loop uses 585 flops */
2974 /* End of innermost loop */
2976 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2977 f+i_coord_offset,fshift+i_shift_offset);
2979 /* Increment number of inner iterations */
2980 inneriter += j_index_end - j_index_start;
2982 /* Outer loop uses 18 flops */
2985 /* Increment number of outer iterations */
2988 /* Update outer/inner flops */
2990 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*585);