2 * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
96 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
98 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
99 real rswitch_scalar,d_scalar;
100 __m128d dummy_mask,cutoff_mask;
101 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
102 __m128d one = _mm_set1_pd(1.0);
103 __m128d two = _mm_set1_pd(2.0);
109 jindex = nlist->jindex;
111 shiftidx = nlist->shift;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm_set1_pd(fr->epsfac);
116 charge = mdatoms->chargeA;
117 nvdwtype = fr->ntype;
119 vdwtype = mdatoms->typeA;
121 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
122 ewtab = fr->ic->tabq_coul_FDV0;
123 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
124 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
126 /* Setup water-specific parameters */
127 inr = nlist->iinr[0];
128 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
129 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
130 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
131 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
133 jq0 = _mm_set1_pd(charge[inr+0]);
134 jq1 = _mm_set1_pd(charge[inr+1]);
135 jq2 = _mm_set1_pd(charge[inr+2]);
136 vdwjidx0A = 2*vdwtype[inr+0];
137 qq00 = _mm_mul_pd(iq0,jq0);
138 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
139 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
140 qq01 = _mm_mul_pd(iq0,jq1);
141 qq02 = _mm_mul_pd(iq0,jq2);
142 qq10 = _mm_mul_pd(iq1,jq0);
143 qq11 = _mm_mul_pd(iq1,jq1);
144 qq12 = _mm_mul_pd(iq1,jq2);
145 qq20 = _mm_mul_pd(iq2,jq0);
146 qq21 = _mm_mul_pd(iq2,jq1);
147 qq22 = _mm_mul_pd(iq2,jq2);
149 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150 rcutoff_scalar = fr->rcoulomb;
151 rcutoff = _mm_set1_pd(rcutoff_scalar);
152 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
154 rswitch_scalar = fr->rcoulomb_switch;
155 rswitch = _mm_set1_pd(rswitch_scalar);
156 /* Setup switch parameters */
157 d_scalar = rcutoff_scalar-rswitch_scalar;
158 d = _mm_set1_pd(d_scalar);
159 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
160 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
161 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
162 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
163 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
166 /* Avoid stupid compiler warnings */
174 /* Start outer loop over neighborlists */
175 for(iidx=0; iidx<nri; iidx++)
177 /* Load shift vector for this list */
178 i_shift_offset = DIM*shiftidx[iidx];
180 /* Load limits for loop over neighbors */
181 j_index_start = jindex[iidx];
182 j_index_end = jindex[iidx+1];
184 /* Get outer coordinate index */
186 i_coord_offset = DIM*inr;
188 /* Load i particle coords and add shift vector */
189 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
190 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
192 fix0 = _mm_setzero_pd();
193 fiy0 = _mm_setzero_pd();
194 fiz0 = _mm_setzero_pd();
195 fix1 = _mm_setzero_pd();
196 fiy1 = _mm_setzero_pd();
197 fiz1 = _mm_setzero_pd();
198 fix2 = _mm_setzero_pd();
199 fiy2 = _mm_setzero_pd();
200 fiz2 = _mm_setzero_pd();
202 /* Reset potential sums */
203 velecsum = _mm_setzero_pd();
204 vvdwsum = _mm_setzero_pd();
206 /* Start inner kernel loop */
207 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
210 /* Get j neighbor index, and coordinate index */
213 j_coord_offsetA = DIM*jnrA;
214 j_coord_offsetB = DIM*jnrB;
216 /* load j atom coordinates */
217 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
218 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
220 /* Calculate displacement vector */
221 dx00 = _mm_sub_pd(ix0,jx0);
222 dy00 = _mm_sub_pd(iy0,jy0);
223 dz00 = _mm_sub_pd(iz0,jz0);
224 dx01 = _mm_sub_pd(ix0,jx1);
225 dy01 = _mm_sub_pd(iy0,jy1);
226 dz01 = _mm_sub_pd(iz0,jz1);
227 dx02 = _mm_sub_pd(ix0,jx2);
228 dy02 = _mm_sub_pd(iy0,jy2);
229 dz02 = _mm_sub_pd(iz0,jz2);
230 dx10 = _mm_sub_pd(ix1,jx0);
231 dy10 = _mm_sub_pd(iy1,jy0);
232 dz10 = _mm_sub_pd(iz1,jz0);
233 dx11 = _mm_sub_pd(ix1,jx1);
234 dy11 = _mm_sub_pd(iy1,jy1);
235 dz11 = _mm_sub_pd(iz1,jz1);
236 dx12 = _mm_sub_pd(ix1,jx2);
237 dy12 = _mm_sub_pd(iy1,jy2);
238 dz12 = _mm_sub_pd(iz1,jz2);
239 dx20 = _mm_sub_pd(ix2,jx0);
240 dy20 = _mm_sub_pd(iy2,jy0);
241 dz20 = _mm_sub_pd(iz2,jz0);
242 dx21 = _mm_sub_pd(ix2,jx1);
243 dy21 = _mm_sub_pd(iy2,jy1);
244 dz21 = _mm_sub_pd(iz2,jz1);
245 dx22 = _mm_sub_pd(ix2,jx2);
246 dy22 = _mm_sub_pd(iy2,jy2);
247 dz22 = _mm_sub_pd(iz2,jz2);
249 /* Calculate squared distance and things based on it */
250 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
251 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
252 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
253 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
254 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
255 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
256 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
257 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
258 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
260 rinv00 = gmx_mm_invsqrt_pd(rsq00);
261 rinv01 = gmx_mm_invsqrt_pd(rsq01);
262 rinv02 = gmx_mm_invsqrt_pd(rsq02);
263 rinv10 = gmx_mm_invsqrt_pd(rsq10);
264 rinv11 = gmx_mm_invsqrt_pd(rsq11);
265 rinv12 = gmx_mm_invsqrt_pd(rsq12);
266 rinv20 = gmx_mm_invsqrt_pd(rsq20);
267 rinv21 = gmx_mm_invsqrt_pd(rsq21);
268 rinv22 = gmx_mm_invsqrt_pd(rsq22);
270 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
271 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
272 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
273 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
274 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
275 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
276 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
277 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
278 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
280 fjx0 = _mm_setzero_pd();
281 fjy0 = _mm_setzero_pd();
282 fjz0 = _mm_setzero_pd();
283 fjx1 = _mm_setzero_pd();
284 fjy1 = _mm_setzero_pd();
285 fjz1 = _mm_setzero_pd();
286 fjx2 = _mm_setzero_pd();
287 fjy2 = _mm_setzero_pd();
288 fjz2 = _mm_setzero_pd();
290 /**************************
291 * CALCULATE INTERACTIONS *
292 **************************/
294 if (gmx_mm_any_lt(rsq00,rcutoff2))
297 r00 = _mm_mul_pd(rsq00,rinv00);
299 /* EWALD ELECTROSTATICS */
301 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
302 ewrt = _mm_mul_pd(r00,ewtabscale);
303 ewitab = _mm_cvttpd_epi32(ewrt);
305 eweps = _mm_frcz_pd(ewrt);
307 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
309 twoeweps = _mm_add_pd(eweps,eweps);
310 ewitab = _mm_slli_epi32(ewitab,2);
311 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
312 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
313 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
314 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
315 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
316 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
317 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
318 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
319 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
320 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
322 /* LENNARD-JONES DISPERSION/REPULSION */
324 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
325 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
326 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
327 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
328 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
330 d = _mm_sub_pd(r00,rswitch);
331 d = _mm_max_pd(d,_mm_setzero_pd());
332 d2 = _mm_mul_pd(d,d);
333 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
335 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
337 /* Evaluate switch function */
338 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
339 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
340 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
341 velec = _mm_mul_pd(velec,sw);
342 vvdw = _mm_mul_pd(vvdw,sw);
343 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
345 /* Update potential sum for this i atom from the interaction with this j atom. */
346 velec = _mm_and_pd(velec,cutoff_mask);
347 velecsum = _mm_add_pd(velecsum,velec);
348 vvdw = _mm_and_pd(vvdw,cutoff_mask);
349 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
351 fscal = _mm_add_pd(felec,fvdw);
353 fscal = _mm_and_pd(fscal,cutoff_mask);
355 /* Update vectorial force */
356 fix0 = _mm_macc_pd(dx00,fscal,fix0);
357 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
358 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
360 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
361 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
362 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
366 /**************************
367 * CALCULATE INTERACTIONS *
368 **************************/
370 if (gmx_mm_any_lt(rsq01,rcutoff2))
373 r01 = _mm_mul_pd(rsq01,rinv01);
375 /* EWALD ELECTROSTATICS */
377 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
378 ewrt = _mm_mul_pd(r01,ewtabscale);
379 ewitab = _mm_cvttpd_epi32(ewrt);
381 eweps = _mm_frcz_pd(ewrt);
383 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
385 twoeweps = _mm_add_pd(eweps,eweps);
386 ewitab = _mm_slli_epi32(ewitab,2);
387 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
388 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
389 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
390 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
391 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
392 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
393 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
394 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
395 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
396 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
398 d = _mm_sub_pd(r01,rswitch);
399 d = _mm_max_pd(d,_mm_setzero_pd());
400 d2 = _mm_mul_pd(d,d);
401 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
403 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
405 /* Evaluate switch function */
406 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
407 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
408 velec = _mm_mul_pd(velec,sw);
409 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
411 /* Update potential sum for this i atom from the interaction with this j atom. */
412 velec = _mm_and_pd(velec,cutoff_mask);
413 velecsum = _mm_add_pd(velecsum,velec);
417 fscal = _mm_and_pd(fscal,cutoff_mask);
419 /* Update vectorial force */
420 fix0 = _mm_macc_pd(dx01,fscal,fix0);
421 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
422 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
424 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
425 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
426 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
430 /**************************
431 * CALCULATE INTERACTIONS *
432 **************************/
434 if (gmx_mm_any_lt(rsq02,rcutoff2))
437 r02 = _mm_mul_pd(rsq02,rinv02);
439 /* EWALD ELECTROSTATICS */
441 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
442 ewrt = _mm_mul_pd(r02,ewtabscale);
443 ewitab = _mm_cvttpd_epi32(ewrt);
445 eweps = _mm_frcz_pd(ewrt);
447 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
449 twoeweps = _mm_add_pd(eweps,eweps);
450 ewitab = _mm_slli_epi32(ewitab,2);
451 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
452 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
453 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
454 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
455 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
456 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
457 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
458 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
459 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
460 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
462 d = _mm_sub_pd(r02,rswitch);
463 d = _mm_max_pd(d,_mm_setzero_pd());
464 d2 = _mm_mul_pd(d,d);
465 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
467 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
469 /* Evaluate switch function */
470 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
471 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
472 velec = _mm_mul_pd(velec,sw);
473 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
475 /* Update potential sum for this i atom from the interaction with this j atom. */
476 velec = _mm_and_pd(velec,cutoff_mask);
477 velecsum = _mm_add_pd(velecsum,velec);
481 fscal = _mm_and_pd(fscal,cutoff_mask);
483 /* Update vectorial force */
484 fix0 = _mm_macc_pd(dx02,fscal,fix0);
485 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
486 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
488 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
489 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
490 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
494 /**************************
495 * CALCULATE INTERACTIONS *
496 **************************/
498 if (gmx_mm_any_lt(rsq10,rcutoff2))
501 r10 = _mm_mul_pd(rsq10,rinv10);
503 /* EWALD ELECTROSTATICS */
505 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
506 ewrt = _mm_mul_pd(r10,ewtabscale);
507 ewitab = _mm_cvttpd_epi32(ewrt);
509 eweps = _mm_frcz_pd(ewrt);
511 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
513 twoeweps = _mm_add_pd(eweps,eweps);
514 ewitab = _mm_slli_epi32(ewitab,2);
515 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
516 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
517 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
518 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
519 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
520 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
521 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
522 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
523 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
524 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
526 d = _mm_sub_pd(r10,rswitch);
527 d = _mm_max_pd(d,_mm_setzero_pd());
528 d2 = _mm_mul_pd(d,d);
529 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
531 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
533 /* Evaluate switch function */
534 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
535 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
536 velec = _mm_mul_pd(velec,sw);
537 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
539 /* Update potential sum for this i atom from the interaction with this j atom. */
540 velec = _mm_and_pd(velec,cutoff_mask);
541 velecsum = _mm_add_pd(velecsum,velec);
545 fscal = _mm_and_pd(fscal,cutoff_mask);
547 /* Update vectorial force */
548 fix1 = _mm_macc_pd(dx10,fscal,fix1);
549 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
550 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
552 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
553 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
554 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
558 /**************************
559 * CALCULATE INTERACTIONS *
560 **************************/
562 if (gmx_mm_any_lt(rsq11,rcutoff2))
565 r11 = _mm_mul_pd(rsq11,rinv11);
567 /* EWALD ELECTROSTATICS */
569 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
570 ewrt = _mm_mul_pd(r11,ewtabscale);
571 ewitab = _mm_cvttpd_epi32(ewrt);
573 eweps = _mm_frcz_pd(ewrt);
575 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
577 twoeweps = _mm_add_pd(eweps,eweps);
578 ewitab = _mm_slli_epi32(ewitab,2);
579 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
580 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
581 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
582 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
583 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
584 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
585 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
586 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
587 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
588 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
590 d = _mm_sub_pd(r11,rswitch);
591 d = _mm_max_pd(d,_mm_setzero_pd());
592 d2 = _mm_mul_pd(d,d);
593 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
595 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
597 /* Evaluate switch function */
598 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
599 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
600 velec = _mm_mul_pd(velec,sw);
601 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
603 /* Update potential sum for this i atom from the interaction with this j atom. */
604 velec = _mm_and_pd(velec,cutoff_mask);
605 velecsum = _mm_add_pd(velecsum,velec);
609 fscal = _mm_and_pd(fscal,cutoff_mask);
611 /* Update vectorial force */
612 fix1 = _mm_macc_pd(dx11,fscal,fix1);
613 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
614 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
616 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
617 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
618 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 if (gmx_mm_any_lt(rsq12,rcutoff2))
629 r12 = _mm_mul_pd(rsq12,rinv12);
631 /* EWALD ELECTROSTATICS */
633 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
634 ewrt = _mm_mul_pd(r12,ewtabscale);
635 ewitab = _mm_cvttpd_epi32(ewrt);
637 eweps = _mm_frcz_pd(ewrt);
639 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
641 twoeweps = _mm_add_pd(eweps,eweps);
642 ewitab = _mm_slli_epi32(ewitab,2);
643 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
644 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
645 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
646 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
647 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
648 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
649 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
650 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
651 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
652 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
654 d = _mm_sub_pd(r12,rswitch);
655 d = _mm_max_pd(d,_mm_setzero_pd());
656 d2 = _mm_mul_pd(d,d);
657 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
659 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
661 /* Evaluate switch function */
662 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
663 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
664 velec = _mm_mul_pd(velec,sw);
665 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
667 /* Update potential sum for this i atom from the interaction with this j atom. */
668 velec = _mm_and_pd(velec,cutoff_mask);
669 velecsum = _mm_add_pd(velecsum,velec);
673 fscal = _mm_and_pd(fscal,cutoff_mask);
675 /* Update vectorial force */
676 fix1 = _mm_macc_pd(dx12,fscal,fix1);
677 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
678 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
680 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
681 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
682 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
686 /**************************
687 * CALCULATE INTERACTIONS *
688 **************************/
690 if (gmx_mm_any_lt(rsq20,rcutoff2))
693 r20 = _mm_mul_pd(rsq20,rinv20);
695 /* EWALD ELECTROSTATICS */
697 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
698 ewrt = _mm_mul_pd(r20,ewtabscale);
699 ewitab = _mm_cvttpd_epi32(ewrt);
701 eweps = _mm_frcz_pd(ewrt);
703 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
705 twoeweps = _mm_add_pd(eweps,eweps);
706 ewitab = _mm_slli_epi32(ewitab,2);
707 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
708 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
709 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
710 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
711 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
712 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
713 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
714 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
715 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
716 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
718 d = _mm_sub_pd(r20,rswitch);
719 d = _mm_max_pd(d,_mm_setzero_pd());
720 d2 = _mm_mul_pd(d,d);
721 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
723 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
725 /* Evaluate switch function */
726 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
727 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
728 velec = _mm_mul_pd(velec,sw);
729 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
731 /* Update potential sum for this i atom from the interaction with this j atom. */
732 velec = _mm_and_pd(velec,cutoff_mask);
733 velecsum = _mm_add_pd(velecsum,velec);
737 fscal = _mm_and_pd(fscal,cutoff_mask);
739 /* Update vectorial force */
740 fix2 = _mm_macc_pd(dx20,fscal,fix2);
741 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
742 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
744 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
745 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
746 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
750 /**************************
751 * CALCULATE INTERACTIONS *
752 **************************/
754 if (gmx_mm_any_lt(rsq21,rcutoff2))
757 r21 = _mm_mul_pd(rsq21,rinv21);
759 /* EWALD ELECTROSTATICS */
761 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
762 ewrt = _mm_mul_pd(r21,ewtabscale);
763 ewitab = _mm_cvttpd_epi32(ewrt);
765 eweps = _mm_frcz_pd(ewrt);
767 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
769 twoeweps = _mm_add_pd(eweps,eweps);
770 ewitab = _mm_slli_epi32(ewitab,2);
771 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
772 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
773 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
774 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
775 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
776 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
777 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
778 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
779 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
780 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
782 d = _mm_sub_pd(r21,rswitch);
783 d = _mm_max_pd(d,_mm_setzero_pd());
784 d2 = _mm_mul_pd(d,d);
785 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
787 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
789 /* Evaluate switch function */
790 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
791 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
792 velec = _mm_mul_pd(velec,sw);
793 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
795 /* Update potential sum for this i atom from the interaction with this j atom. */
796 velec = _mm_and_pd(velec,cutoff_mask);
797 velecsum = _mm_add_pd(velecsum,velec);
801 fscal = _mm_and_pd(fscal,cutoff_mask);
803 /* Update vectorial force */
804 fix2 = _mm_macc_pd(dx21,fscal,fix2);
805 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
806 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
808 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
809 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
810 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
814 /**************************
815 * CALCULATE INTERACTIONS *
816 **************************/
818 if (gmx_mm_any_lt(rsq22,rcutoff2))
821 r22 = _mm_mul_pd(rsq22,rinv22);
823 /* EWALD ELECTROSTATICS */
825 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
826 ewrt = _mm_mul_pd(r22,ewtabscale);
827 ewitab = _mm_cvttpd_epi32(ewrt);
829 eweps = _mm_frcz_pd(ewrt);
831 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
833 twoeweps = _mm_add_pd(eweps,eweps);
834 ewitab = _mm_slli_epi32(ewitab,2);
835 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
836 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
837 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
838 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
839 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
840 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
841 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
842 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
843 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
844 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
846 d = _mm_sub_pd(r22,rswitch);
847 d = _mm_max_pd(d,_mm_setzero_pd());
848 d2 = _mm_mul_pd(d,d);
849 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
851 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
853 /* Evaluate switch function */
854 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
855 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
856 velec = _mm_mul_pd(velec,sw);
857 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
859 /* Update potential sum for this i atom from the interaction with this j atom. */
860 velec = _mm_and_pd(velec,cutoff_mask);
861 velecsum = _mm_add_pd(velecsum,velec);
865 fscal = _mm_and_pd(fscal,cutoff_mask);
867 /* Update vectorial force */
868 fix2 = _mm_macc_pd(dx22,fscal,fix2);
869 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
870 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
872 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
873 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
874 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
878 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
880 /* Inner loop uses 630 flops */
887 j_coord_offsetA = DIM*jnrA;
889 /* load j atom coordinates */
890 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
891 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
893 /* Calculate displacement vector */
894 dx00 = _mm_sub_pd(ix0,jx0);
895 dy00 = _mm_sub_pd(iy0,jy0);
896 dz00 = _mm_sub_pd(iz0,jz0);
897 dx01 = _mm_sub_pd(ix0,jx1);
898 dy01 = _mm_sub_pd(iy0,jy1);
899 dz01 = _mm_sub_pd(iz0,jz1);
900 dx02 = _mm_sub_pd(ix0,jx2);
901 dy02 = _mm_sub_pd(iy0,jy2);
902 dz02 = _mm_sub_pd(iz0,jz2);
903 dx10 = _mm_sub_pd(ix1,jx0);
904 dy10 = _mm_sub_pd(iy1,jy0);
905 dz10 = _mm_sub_pd(iz1,jz0);
906 dx11 = _mm_sub_pd(ix1,jx1);
907 dy11 = _mm_sub_pd(iy1,jy1);
908 dz11 = _mm_sub_pd(iz1,jz1);
909 dx12 = _mm_sub_pd(ix1,jx2);
910 dy12 = _mm_sub_pd(iy1,jy2);
911 dz12 = _mm_sub_pd(iz1,jz2);
912 dx20 = _mm_sub_pd(ix2,jx0);
913 dy20 = _mm_sub_pd(iy2,jy0);
914 dz20 = _mm_sub_pd(iz2,jz0);
915 dx21 = _mm_sub_pd(ix2,jx1);
916 dy21 = _mm_sub_pd(iy2,jy1);
917 dz21 = _mm_sub_pd(iz2,jz1);
918 dx22 = _mm_sub_pd(ix2,jx2);
919 dy22 = _mm_sub_pd(iy2,jy2);
920 dz22 = _mm_sub_pd(iz2,jz2);
922 /* Calculate squared distance and things based on it */
923 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
924 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
925 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
926 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
927 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
928 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
929 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
930 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
931 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
933 rinv00 = gmx_mm_invsqrt_pd(rsq00);
934 rinv01 = gmx_mm_invsqrt_pd(rsq01);
935 rinv02 = gmx_mm_invsqrt_pd(rsq02);
936 rinv10 = gmx_mm_invsqrt_pd(rsq10);
937 rinv11 = gmx_mm_invsqrt_pd(rsq11);
938 rinv12 = gmx_mm_invsqrt_pd(rsq12);
939 rinv20 = gmx_mm_invsqrt_pd(rsq20);
940 rinv21 = gmx_mm_invsqrt_pd(rsq21);
941 rinv22 = gmx_mm_invsqrt_pd(rsq22);
943 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
944 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
945 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
946 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
947 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
948 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
949 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
950 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
951 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
953 fjx0 = _mm_setzero_pd();
954 fjy0 = _mm_setzero_pd();
955 fjz0 = _mm_setzero_pd();
956 fjx1 = _mm_setzero_pd();
957 fjy1 = _mm_setzero_pd();
958 fjz1 = _mm_setzero_pd();
959 fjx2 = _mm_setzero_pd();
960 fjy2 = _mm_setzero_pd();
961 fjz2 = _mm_setzero_pd();
963 /**************************
964 * CALCULATE INTERACTIONS *
965 **************************/
967 if (gmx_mm_any_lt(rsq00,rcutoff2))
970 r00 = _mm_mul_pd(rsq00,rinv00);
972 /* EWALD ELECTROSTATICS */
974 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
975 ewrt = _mm_mul_pd(r00,ewtabscale);
976 ewitab = _mm_cvttpd_epi32(ewrt);
978 eweps = _mm_frcz_pd(ewrt);
980 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
982 twoeweps = _mm_add_pd(eweps,eweps);
983 ewitab = _mm_slli_epi32(ewitab,2);
984 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
985 ewtabD = _mm_setzero_pd();
986 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
987 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
988 ewtabFn = _mm_setzero_pd();
989 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
990 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
991 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
992 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
993 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
995 /* LENNARD-JONES DISPERSION/REPULSION */
997 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
998 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
999 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1000 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
1001 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1003 d = _mm_sub_pd(r00,rswitch);
1004 d = _mm_max_pd(d,_mm_setzero_pd());
1005 d2 = _mm_mul_pd(d,d);
1006 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1008 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1010 /* Evaluate switch function */
1011 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1012 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1013 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1014 velec = _mm_mul_pd(velec,sw);
1015 vvdw = _mm_mul_pd(vvdw,sw);
1016 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1018 /* Update potential sum for this i atom from the interaction with this j atom. */
1019 velec = _mm_and_pd(velec,cutoff_mask);
1020 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1021 velecsum = _mm_add_pd(velecsum,velec);
1022 vvdw = _mm_and_pd(vvdw,cutoff_mask);
1023 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1024 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
1026 fscal = _mm_add_pd(felec,fvdw);
1028 fscal = _mm_and_pd(fscal,cutoff_mask);
1030 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1032 /* Update vectorial force */
1033 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1034 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1035 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1037 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1038 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1039 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1043 /**************************
1044 * CALCULATE INTERACTIONS *
1045 **************************/
1047 if (gmx_mm_any_lt(rsq01,rcutoff2))
1050 r01 = _mm_mul_pd(rsq01,rinv01);
1052 /* EWALD ELECTROSTATICS */
1054 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1055 ewrt = _mm_mul_pd(r01,ewtabscale);
1056 ewitab = _mm_cvttpd_epi32(ewrt);
1058 eweps = _mm_frcz_pd(ewrt);
1060 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1062 twoeweps = _mm_add_pd(eweps,eweps);
1063 ewitab = _mm_slli_epi32(ewitab,2);
1064 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1065 ewtabD = _mm_setzero_pd();
1066 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1067 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1068 ewtabFn = _mm_setzero_pd();
1069 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1070 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1071 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1072 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1073 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1075 d = _mm_sub_pd(r01,rswitch);
1076 d = _mm_max_pd(d,_mm_setzero_pd());
1077 d2 = _mm_mul_pd(d,d);
1078 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1080 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1082 /* Evaluate switch function */
1083 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1084 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1085 velec = _mm_mul_pd(velec,sw);
1086 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1088 /* Update potential sum for this i atom from the interaction with this j atom. */
1089 velec = _mm_and_pd(velec,cutoff_mask);
1090 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1091 velecsum = _mm_add_pd(velecsum,velec);
1095 fscal = _mm_and_pd(fscal,cutoff_mask);
1097 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1099 /* Update vectorial force */
1100 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1101 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1102 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1104 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1105 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1106 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 if (gmx_mm_any_lt(rsq02,rcutoff2))
1117 r02 = _mm_mul_pd(rsq02,rinv02);
1119 /* EWALD ELECTROSTATICS */
1121 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1122 ewrt = _mm_mul_pd(r02,ewtabscale);
1123 ewitab = _mm_cvttpd_epi32(ewrt);
1125 eweps = _mm_frcz_pd(ewrt);
1127 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1129 twoeweps = _mm_add_pd(eweps,eweps);
1130 ewitab = _mm_slli_epi32(ewitab,2);
1131 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1132 ewtabD = _mm_setzero_pd();
1133 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1134 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1135 ewtabFn = _mm_setzero_pd();
1136 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1137 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1138 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1139 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
1140 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1142 d = _mm_sub_pd(r02,rswitch);
1143 d = _mm_max_pd(d,_mm_setzero_pd());
1144 d2 = _mm_mul_pd(d,d);
1145 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1147 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1149 /* Evaluate switch function */
1150 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1151 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
1152 velec = _mm_mul_pd(velec,sw);
1153 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1155 /* Update potential sum for this i atom from the interaction with this j atom. */
1156 velec = _mm_and_pd(velec,cutoff_mask);
1157 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1158 velecsum = _mm_add_pd(velecsum,velec);
1162 fscal = _mm_and_pd(fscal,cutoff_mask);
1164 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1166 /* Update vectorial force */
1167 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1168 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1169 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1171 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1172 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1173 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1177 /**************************
1178 * CALCULATE INTERACTIONS *
1179 **************************/
1181 if (gmx_mm_any_lt(rsq10,rcutoff2))
1184 r10 = _mm_mul_pd(rsq10,rinv10);
1186 /* EWALD ELECTROSTATICS */
1188 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1189 ewrt = _mm_mul_pd(r10,ewtabscale);
1190 ewitab = _mm_cvttpd_epi32(ewrt);
1192 eweps = _mm_frcz_pd(ewrt);
1194 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1196 twoeweps = _mm_add_pd(eweps,eweps);
1197 ewitab = _mm_slli_epi32(ewitab,2);
1198 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1199 ewtabD = _mm_setzero_pd();
1200 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1201 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1202 ewtabFn = _mm_setzero_pd();
1203 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1204 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1205 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1206 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1207 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1209 d = _mm_sub_pd(r10,rswitch);
1210 d = _mm_max_pd(d,_mm_setzero_pd());
1211 d2 = _mm_mul_pd(d,d);
1212 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1214 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1216 /* Evaluate switch function */
1217 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1218 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1219 velec = _mm_mul_pd(velec,sw);
1220 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1222 /* Update potential sum for this i atom from the interaction with this j atom. */
1223 velec = _mm_and_pd(velec,cutoff_mask);
1224 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1225 velecsum = _mm_add_pd(velecsum,velec);
1229 fscal = _mm_and_pd(fscal,cutoff_mask);
1231 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1233 /* Update vectorial force */
1234 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1235 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1236 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1238 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1239 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1240 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1244 /**************************
1245 * CALCULATE INTERACTIONS *
1246 **************************/
1248 if (gmx_mm_any_lt(rsq11,rcutoff2))
1251 r11 = _mm_mul_pd(rsq11,rinv11);
1253 /* EWALD ELECTROSTATICS */
1255 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1256 ewrt = _mm_mul_pd(r11,ewtabscale);
1257 ewitab = _mm_cvttpd_epi32(ewrt);
1259 eweps = _mm_frcz_pd(ewrt);
1261 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1263 twoeweps = _mm_add_pd(eweps,eweps);
1264 ewitab = _mm_slli_epi32(ewitab,2);
1265 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1266 ewtabD = _mm_setzero_pd();
1267 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1268 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1269 ewtabFn = _mm_setzero_pd();
1270 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1271 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1272 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1273 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1274 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1276 d = _mm_sub_pd(r11,rswitch);
1277 d = _mm_max_pd(d,_mm_setzero_pd());
1278 d2 = _mm_mul_pd(d,d);
1279 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1281 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1283 /* Evaluate switch function */
1284 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1285 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1286 velec = _mm_mul_pd(velec,sw);
1287 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1289 /* Update potential sum for this i atom from the interaction with this j atom. */
1290 velec = _mm_and_pd(velec,cutoff_mask);
1291 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1292 velecsum = _mm_add_pd(velecsum,velec);
1296 fscal = _mm_and_pd(fscal,cutoff_mask);
1298 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1300 /* Update vectorial force */
1301 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1302 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1303 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1305 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1306 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1307 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1311 /**************************
1312 * CALCULATE INTERACTIONS *
1313 **************************/
1315 if (gmx_mm_any_lt(rsq12,rcutoff2))
1318 r12 = _mm_mul_pd(rsq12,rinv12);
1320 /* EWALD ELECTROSTATICS */
1322 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1323 ewrt = _mm_mul_pd(r12,ewtabscale);
1324 ewitab = _mm_cvttpd_epi32(ewrt);
1326 eweps = _mm_frcz_pd(ewrt);
1328 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1330 twoeweps = _mm_add_pd(eweps,eweps);
1331 ewitab = _mm_slli_epi32(ewitab,2);
1332 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1333 ewtabD = _mm_setzero_pd();
1334 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1335 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1336 ewtabFn = _mm_setzero_pd();
1337 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1338 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1339 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1340 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1341 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1343 d = _mm_sub_pd(r12,rswitch);
1344 d = _mm_max_pd(d,_mm_setzero_pd());
1345 d2 = _mm_mul_pd(d,d);
1346 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1348 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1350 /* Evaluate switch function */
1351 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1352 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1353 velec = _mm_mul_pd(velec,sw);
1354 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1356 /* Update potential sum for this i atom from the interaction with this j atom. */
1357 velec = _mm_and_pd(velec,cutoff_mask);
1358 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1359 velecsum = _mm_add_pd(velecsum,velec);
1363 fscal = _mm_and_pd(fscal,cutoff_mask);
1365 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1367 /* Update vectorial force */
1368 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1369 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1370 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1372 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1373 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1374 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1378 /**************************
1379 * CALCULATE INTERACTIONS *
1380 **************************/
1382 if (gmx_mm_any_lt(rsq20,rcutoff2))
1385 r20 = _mm_mul_pd(rsq20,rinv20);
1387 /* EWALD ELECTROSTATICS */
1389 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1390 ewrt = _mm_mul_pd(r20,ewtabscale);
1391 ewitab = _mm_cvttpd_epi32(ewrt);
1393 eweps = _mm_frcz_pd(ewrt);
1395 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1397 twoeweps = _mm_add_pd(eweps,eweps);
1398 ewitab = _mm_slli_epi32(ewitab,2);
1399 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1400 ewtabD = _mm_setzero_pd();
1401 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1402 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1403 ewtabFn = _mm_setzero_pd();
1404 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1405 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1406 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1407 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1408 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1410 d = _mm_sub_pd(r20,rswitch);
1411 d = _mm_max_pd(d,_mm_setzero_pd());
1412 d2 = _mm_mul_pd(d,d);
1413 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1415 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1417 /* Evaluate switch function */
1418 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1419 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1420 velec = _mm_mul_pd(velec,sw);
1421 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1423 /* Update potential sum for this i atom from the interaction with this j atom. */
1424 velec = _mm_and_pd(velec,cutoff_mask);
1425 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1426 velecsum = _mm_add_pd(velecsum,velec);
1430 fscal = _mm_and_pd(fscal,cutoff_mask);
1432 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1434 /* Update vectorial force */
1435 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1436 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1437 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1439 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1440 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1441 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1445 /**************************
1446 * CALCULATE INTERACTIONS *
1447 **************************/
1449 if (gmx_mm_any_lt(rsq21,rcutoff2))
1452 r21 = _mm_mul_pd(rsq21,rinv21);
1454 /* EWALD ELECTROSTATICS */
1456 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1457 ewrt = _mm_mul_pd(r21,ewtabscale);
1458 ewitab = _mm_cvttpd_epi32(ewrt);
1460 eweps = _mm_frcz_pd(ewrt);
1462 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1464 twoeweps = _mm_add_pd(eweps,eweps);
1465 ewitab = _mm_slli_epi32(ewitab,2);
1466 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1467 ewtabD = _mm_setzero_pd();
1468 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1469 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1470 ewtabFn = _mm_setzero_pd();
1471 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1472 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1473 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1474 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1475 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1477 d = _mm_sub_pd(r21,rswitch);
1478 d = _mm_max_pd(d,_mm_setzero_pd());
1479 d2 = _mm_mul_pd(d,d);
1480 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1482 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1484 /* Evaluate switch function */
1485 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1486 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1487 velec = _mm_mul_pd(velec,sw);
1488 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1490 /* Update potential sum for this i atom from the interaction with this j atom. */
1491 velec = _mm_and_pd(velec,cutoff_mask);
1492 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1493 velecsum = _mm_add_pd(velecsum,velec);
1497 fscal = _mm_and_pd(fscal,cutoff_mask);
1499 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1501 /* Update vectorial force */
1502 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1503 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1504 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1506 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1507 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1508 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1512 /**************************
1513 * CALCULATE INTERACTIONS *
1514 **************************/
1516 if (gmx_mm_any_lt(rsq22,rcutoff2))
1519 r22 = _mm_mul_pd(rsq22,rinv22);
1521 /* EWALD ELECTROSTATICS */
1523 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1524 ewrt = _mm_mul_pd(r22,ewtabscale);
1525 ewitab = _mm_cvttpd_epi32(ewrt);
1527 eweps = _mm_frcz_pd(ewrt);
1529 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1531 twoeweps = _mm_add_pd(eweps,eweps);
1532 ewitab = _mm_slli_epi32(ewitab,2);
1533 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1534 ewtabD = _mm_setzero_pd();
1535 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1536 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1537 ewtabFn = _mm_setzero_pd();
1538 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1539 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1540 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1541 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1542 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1544 d = _mm_sub_pd(r22,rswitch);
1545 d = _mm_max_pd(d,_mm_setzero_pd());
1546 d2 = _mm_mul_pd(d,d);
1547 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1549 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1551 /* Evaluate switch function */
1552 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1553 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1554 velec = _mm_mul_pd(velec,sw);
1555 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1557 /* Update potential sum for this i atom from the interaction with this j atom. */
1558 velec = _mm_and_pd(velec,cutoff_mask);
1559 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1560 velecsum = _mm_add_pd(velecsum,velec);
1564 fscal = _mm_and_pd(fscal,cutoff_mask);
1566 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1568 /* Update vectorial force */
1569 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1570 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1571 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1573 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1574 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1575 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1579 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1581 /* Inner loop uses 630 flops */
1584 /* End of innermost loop */
1586 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1587 f+i_coord_offset,fshift+i_shift_offset);
1590 /* Update potential energies */
1591 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1592 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1594 /* Increment number of inner iterations */
1595 inneriter += j_index_end - j_index_start;
1597 /* Outer loop uses 20 flops */
1600 /* Increment number of outer iterations */
1603 /* Update outer/inner flops */
1605 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*630);
1608 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_double
1609 * Electrostatics interaction: Ewald
1610 * VdW interaction: LennardJones
1611 * Geometry: Water3-Water3
1612 * Calculate force/pot: Force
1615 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_double
1616 (t_nblist * gmx_restrict nlist,
1617 rvec * gmx_restrict xx,
1618 rvec * gmx_restrict ff,
1619 t_forcerec * gmx_restrict fr,
1620 t_mdatoms * gmx_restrict mdatoms,
1621 nb_kernel_data_t * gmx_restrict kernel_data,
1622 t_nrnb * gmx_restrict nrnb)
1624 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1625 * just 0 for non-waters.
1626 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1627 * jnr indices corresponding to data put in the four positions in the SIMD register.
1629 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1630 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1632 int j_coord_offsetA,j_coord_offsetB;
1633 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1634 real rcutoff_scalar;
1635 real *shiftvec,*fshift,*x,*f;
1636 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1638 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1640 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1642 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1643 int vdwjidx0A,vdwjidx0B;
1644 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1645 int vdwjidx1A,vdwjidx1B;
1646 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1647 int vdwjidx2A,vdwjidx2B;
1648 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1649 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1650 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1651 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1652 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1653 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1654 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1655 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1656 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1657 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1658 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1661 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1664 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1665 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1667 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1669 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1670 real rswitch_scalar,d_scalar;
1671 __m128d dummy_mask,cutoff_mask;
1672 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1673 __m128d one = _mm_set1_pd(1.0);
1674 __m128d two = _mm_set1_pd(2.0);
1680 jindex = nlist->jindex;
1682 shiftidx = nlist->shift;
1684 shiftvec = fr->shift_vec[0];
1685 fshift = fr->fshift[0];
1686 facel = _mm_set1_pd(fr->epsfac);
1687 charge = mdatoms->chargeA;
1688 nvdwtype = fr->ntype;
1689 vdwparam = fr->nbfp;
1690 vdwtype = mdatoms->typeA;
1692 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1693 ewtab = fr->ic->tabq_coul_FDV0;
1694 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1695 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1697 /* Setup water-specific parameters */
1698 inr = nlist->iinr[0];
1699 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1700 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1701 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1702 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1704 jq0 = _mm_set1_pd(charge[inr+0]);
1705 jq1 = _mm_set1_pd(charge[inr+1]);
1706 jq2 = _mm_set1_pd(charge[inr+2]);
1707 vdwjidx0A = 2*vdwtype[inr+0];
1708 qq00 = _mm_mul_pd(iq0,jq0);
1709 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1710 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1711 qq01 = _mm_mul_pd(iq0,jq1);
1712 qq02 = _mm_mul_pd(iq0,jq2);
1713 qq10 = _mm_mul_pd(iq1,jq0);
1714 qq11 = _mm_mul_pd(iq1,jq1);
1715 qq12 = _mm_mul_pd(iq1,jq2);
1716 qq20 = _mm_mul_pd(iq2,jq0);
1717 qq21 = _mm_mul_pd(iq2,jq1);
1718 qq22 = _mm_mul_pd(iq2,jq2);
1720 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1721 rcutoff_scalar = fr->rcoulomb;
1722 rcutoff = _mm_set1_pd(rcutoff_scalar);
1723 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1725 rswitch_scalar = fr->rcoulomb_switch;
1726 rswitch = _mm_set1_pd(rswitch_scalar);
1727 /* Setup switch parameters */
1728 d_scalar = rcutoff_scalar-rswitch_scalar;
1729 d = _mm_set1_pd(d_scalar);
1730 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1731 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1732 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1733 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1734 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1735 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1737 /* Avoid stupid compiler warnings */
1739 j_coord_offsetA = 0;
1740 j_coord_offsetB = 0;
1745 /* Start outer loop over neighborlists */
1746 for(iidx=0; iidx<nri; iidx++)
1748 /* Load shift vector for this list */
1749 i_shift_offset = DIM*shiftidx[iidx];
1751 /* Load limits for loop over neighbors */
1752 j_index_start = jindex[iidx];
1753 j_index_end = jindex[iidx+1];
1755 /* Get outer coordinate index */
1757 i_coord_offset = DIM*inr;
1759 /* Load i particle coords and add shift vector */
1760 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1761 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1763 fix0 = _mm_setzero_pd();
1764 fiy0 = _mm_setzero_pd();
1765 fiz0 = _mm_setzero_pd();
1766 fix1 = _mm_setzero_pd();
1767 fiy1 = _mm_setzero_pd();
1768 fiz1 = _mm_setzero_pd();
1769 fix2 = _mm_setzero_pd();
1770 fiy2 = _mm_setzero_pd();
1771 fiz2 = _mm_setzero_pd();
1773 /* Start inner kernel loop */
1774 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1777 /* Get j neighbor index, and coordinate index */
1779 jnrB = jjnr[jidx+1];
1780 j_coord_offsetA = DIM*jnrA;
1781 j_coord_offsetB = DIM*jnrB;
1783 /* load j atom coordinates */
1784 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1785 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1787 /* Calculate displacement vector */
1788 dx00 = _mm_sub_pd(ix0,jx0);
1789 dy00 = _mm_sub_pd(iy0,jy0);
1790 dz00 = _mm_sub_pd(iz0,jz0);
1791 dx01 = _mm_sub_pd(ix0,jx1);
1792 dy01 = _mm_sub_pd(iy0,jy1);
1793 dz01 = _mm_sub_pd(iz0,jz1);
1794 dx02 = _mm_sub_pd(ix0,jx2);
1795 dy02 = _mm_sub_pd(iy0,jy2);
1796 dz02 = _mm_sub_pd(iz0,jz2);
1797 dx10 = _mm_sub_pd(ix1,jx0);
1798 dy10 = _mm_sub_pd(iy1,jy0);
1799 dz10 = _mm_sub_pd(iz1,jz0);
1800 dx11 = _mm_sub_pd(ix1,jx1);
1801 dy11 = _mm_sub_pd(iy1,jy1);
1802 dz11 = _mm_sub_pd(iz1,jz1);
1803 dx12 = _mm_sub_pd(ix1,jx2);
1804 dy12 = _mm_sub_pd(iy1,jy2);
1805 dz12 = _mm_sub_pd(iz1,jz2);
1806 dx20 = _mm_sub_pd(ix2,jx0);
1807 dy20 = _mm_sub_pd(iy2,jy0);
1808 dz20 = _mm_sub_pd(iz2,jz0);
1809 dx21 = _mm_sub_pd(ix2,jx1);
1810 dy21 = _mm_sub_pd(iy2,jy1);
1811 dz21 = _mm_sub_pd(iz2,jz1);
1812 dx22 = _mm_sub_pd(ix2,jx2);
1813 dy22 = _mm_sub_pd(iy2,jy2);
1814 dz22 = _mm_sub_pd(iz2,jz2);
1816 /* Calculate squared distance and things based on it */
1817 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1818 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1819 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1820 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1821 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1822 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1823 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1824 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1825 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1827 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1828 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1829 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1830 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1831 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1832 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1833 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1834 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1835 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1837 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1838 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1839 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1840 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1841 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1842 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1843 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1844 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1845 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1847 fjx0 = _mm_setzero_pd();
1848 fjy0 = _mm_setzero_pd();
1849 fjz0 = _mm_setzero_pd();
1850 fjx1 = _mm_setzero_pd();
1851 fjy1 = _mm_setzero_pd();
1852 fjz1 = _mm_setzero_pd();
1853 fjx2 = _mm_setzero_pd();
1854 fjy2 = _mm_setzero_pd();
1855 fjz2 = _mm_setzero_pd();
1857 /**************************
1858 * CALCULATE INTERACTIONS *
1859 **************************/
1861 if (gmx_mm_any_lt(rsq00,rcutoff2))
1864 r00 = _mm_mul_pd(rsq00,rinv00);
1866 /* EWALD ELECTROSTATICS */
1868 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1869 ewrt = _mm_mul_pd(r00,ewtabscale);
1870 ewitab = _mm_cvttpd_epi32(ewrt);
1872 eweps = _mm_frcz_pd(ewrt);
1874 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1876 twoeweps = _mm_add_pd(eweps,eweps);
1877 ewitab = _mm_slli_epi32(ewitab,2);
1878 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1879 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1880 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1881 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1882 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1883 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1884 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1885 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1886 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
1887 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1889 /* LENNARD-JONES DISPERSION/REPULSION */
1891 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1892 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1893 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1894 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
1895 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1897 d = _mm_sub_pd(r00,rswitch);
1898 d = _mm_max_pd(d,_mm_setzero_pd());
1899 d2 = _mm_mul_pd(d,d);
1900 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1902 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1904 /* Evaluate switch function */
1905 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1906 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1907 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1908 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1910 fscal = _mm_add_pd(felec,fvdw);
1912 fscal = _mm_and_pd(fscal,cutoff_mask);
1914 /* Update vectorial force */
1915 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1916 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1917 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1919 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1920 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1921 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1925 /**************************
1926 * CALCULATE INTERACTIONS *
1927 **************************/
1929 if (gmx_mm_any_lt(rsq01,rcutoff2))
1932 r01 = _mm_mul_pd(rsq01,rinv01);
1934 /* EWALD ELECTROSTATICS */
1936 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1937 ewrt = _mm_mul_pd(r01,ewtabscale);
1938 ewitab = _mm_cvttpd_epi32(ewrt);
1940 eweps = _mm_frcz_pd(ewrt);
1942 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1944 twoeweps = _mm_add_pd(eweps,eweps);
1945 ewitab = _mm_slli_epi32(ewitab,2);
1946 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1947 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1948 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1949 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1950 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1951 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1952 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1953 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1954 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1955 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1957 d = _mm_sub_pd(r01,rswitch);
1958 d = _mm_max_pd(d,_mm_setzero_pd());
1959 d2 = _mm_mul_pd(d,d);
1960 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1962 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1964 /* Evaluate switch function */
1965 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1966 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1967 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1971 fscal = _mm_and_pd(fscal,cutoff_mask);
1973 /* Update vectorial force */
1974 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1975 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1976 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1978 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1979 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1980 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 if (gmx_mm_any_lt(rsq02,rcutoff2))
1991 r02 = _mm_mul_pd(rsq02,rinv02);
1993 /* EWALD ELECTROSTATICS */
1995 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1996 ewrt = _mm_mul_pd(r02,ewtabscale);
1997 ewitab = _mm_cvttpd_epi32(ewrt);
1999 eweps = _mm_frcz_pd(ewrt);
2001 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2003 twoeweps = _mm_add_pd(eweps,eweps);
2004 ewitab = _mm_slli_epi32(ewitab,2);
2005 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2006 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2007 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2008 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2009 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2010 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2011 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2012 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2013 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2014 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2016 d = _mm_sub_pd(r02,rswitch);
2017 d = _mm_max_pd(d,_mm_setzero_pd());
2018 d2 = _mm_mul_pd(d,d);
2019 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2021 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2023 /* Evaluate switch function */
2024 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2025 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2026 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2030 fscal = _mm_and_pd(fscal,cutoff_mask);
2032 /* Update vectorial force */
2033 fix0 = _mm_macc_pd(dx02,fscal,fix0);
2034 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
2035 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
2037 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
2038 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
2039 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
2043 /**************************
2044 * CALCULATE INTERACTIONS *
2045 **************************/
2047 if (gmx_mm_any_lt(rsq10,rcutoff2))
2050 r10 = _mm_mul_pd(rsq10,rinv10);
2052 /* EWALD ELECTROSTATICS */
2054 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2055 ewrt = _mm_mul_pd(r10,ewtabscale);
2056 ewitab = _mm_cvttpd_epi32(ewrt);
2058 eweps = _mm_frcz_pd(ewrt);
2060 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2062 twoeweps = _mm_add_pd(eweps,eweps);
2063 ewitab = _mm_slli_epi32(ewitab,2);
2064 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2065 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2066 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2067 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2068 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2069 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2070 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2071 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2072 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2073 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2075 d = _mm_sub_pd(r10,rswitch);
2076 d = _mm_max_pd(d,_mm_setzero_pd());
2077 d2 = _mm_mul_pd(d,d);
2078 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2080 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2082 /* Evaluate switch function */
2083 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2084 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2085 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2089 fscal = _mm_and_pd(fscal,cutoff_mask);
2091 /* Update vectorial force */
2092 fix1 = _mm_macc_pd(dx10,fscal,fix1);
2093 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
2094 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
2096 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
2097 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
2098 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
2102 /**************************
2103 * CALCULATE INTERACTIONS *
2104 **************************/
2106 if (gmx_mm_any_lt(rsq11,rcutoff2))
2109 r11 = _mm_mul_pd(rsq11,rinv11);
2111 /* EWALD ELECTROSTATICS */
2113 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2114 ewrt = _mm_mul_pd(r11,ewtabscale);
2115 ewitab = _mm_cvttpd_epi32(ewrt);
2117 eweps = _mm_frcz_pd(ewrt);
2119 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2121 twoeweps = _mm_add_pd(eweps,eweps);
2122 ewitab = _mm_slli_epi32(ewitab,2);
2123 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2124 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2125 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2126 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2127 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2128 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2129 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2130 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2131 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2132 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2134 d = _mm_sub_pd(r11,rswitch);
2135 d = _mm_max_pd(d,_mm_setzero_pd());
2136 d2 = _mm_mul_pd(d,d);
2137 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2139 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2141 /* Evaluate switch function */
2142 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2143 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2144 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2148 fscal = _mm_and_pd(fscal,cutoff_mask);
2150 /* Update vectorial force */
2151 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2152 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2153 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2155 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2156 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2157 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2161 /**************************
2162 * CALCULATE INTERACTIONS *
2163 **************************/
2165 if (gmx_mm_any_lt(rsq12,rcutoff2))
2168 r12 = _mm_mul_pd(rsq12,rinv12);
2170 /* EWALD ELECTROSTATICS */
2172 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2173 ewrt = _mm_mul_pd(r12,ewtabscale);
2174 ewitab = _mm_cvttpd_epi32(ewrt);
2176 eweps = _mm_frcz_pd(ewrt);
2178 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2180 twoeweps = _mm_add_pd(eweps,eweps);
2181 ewitab = _mm_slli_epi32(ewitab,2);
2182 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2183 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2184 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2185 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2186 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2187 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2188 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2189 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2190 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2191 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2193 d = _mm_sub_pd(r12,rswitch);
2194 d = _mm_max_pd(d,_mm_setzero_pd());
2195 d2 = _mm_mul_pd(d,d);
2196 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2198 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2200 /* Evaluate switch function */
2201 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2202 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2203 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2207 fscal = _mm_and_pd(fscal,cutoff_mask);
2209 /* Update vectorial force */
2210 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2211 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2212 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2214 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2215 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2216 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2220 /**************************
2221 * CALCULATE INTERACTIONS *
2222 **************************/
2224 if (gmx_mm_any_lt(rsq20,rcutoff2))
2227 r20 = _mm_mul_pd(rsq20,rinv20);
2229 /* EWALD ELECTROSTATICS */
2231 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2232 ewrt = _mm_mul_pd(r20,ewtabscale);
2233 ewitab = _mm_cvttpd_epi32(ewrt);
2235 eweps = _mm_frcz_pd(ewrt);
2237 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2239 twoeweps = _mm_add_pd(eweps,eweps);
2240 ewitab = _mm_slli_epi32(ewitab,2);
2241 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2242 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2243 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2244 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2245 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2246 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2247 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2248 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2249 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2250 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2252 d = _mm_sub_pd(r20,rswitch);
2253 d = _mm_max_pd(d,_mm_setzero_pd());
2254 d2 = _mm_mul_pd(d,d);
2255 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2257 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2259 /* Evaluate switch function */
2260 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2261 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2262 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2266 fscal = _mm_and_pd(fscal,cutoff_mask);
2268 /* Update vectorial force */
2269 fix2 = _mm_macc_pd(dx20,fscal,fix2);
2270 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
2271 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
2273 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
2274 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
2275 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
2279 /**************************
2280 * CALCULATE INTERACTIONS *
2281 **************************/
2283 if (gmx_mm_any_lt(rsq21,rcutoff2))
2286 r21 = _mm_mul_pd(rsq21,rinv21);
2288 /* EWALD ELECTROSTATICS */
2290 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2291 ewrt = _mm_mul_pd(r21,ewtabscale);
2292 ewitab = _mm_cvttpd_epi32(ewrt);
2294 eweps = _mm_frcz_pd(ewrt);
2296 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2298 twoeweps = _mm_add_pd(eweps,eweps);
2299 ewitab = _mm_slli_epi32(ewitab,2);
2300 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2301 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2302 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2303 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2304 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2305 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2306 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2307 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2308 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2309 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2311 d = _mm_sub_pd(r21,rswitch);
2312 d = _mm_max_pd(d,_mm_setzero_pd());
2313 d2 = _mm_mul_pd(d,d);
2314 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2316 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2318 /* Evaluate switch function */
2319 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2320 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2321 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2325 fscal = _mm_and_pd(fscal,cutoff_mask);
2327 /* Update vectorial force */
2328 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2329 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2330 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2332 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2333 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2334 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2338 /**************************
2339 * CALCULATE INTERACTIONS *
2340 **************************/
2342 if (gmx_mm_any_lt(rsq22,rcutoff2))
2345 r22 = _mm_mul_pd(rsq22,rinv22);
2347 /* EWALD ELECTROSTATICS */
2349 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2350 ewrt = _mm_mul_pd(r22,ewtabscale);
2351 ewitab = _mm_cvttpd_epi32(ewrt);
2353 eweps = _mm_frcz_pd(ewrt);
2355 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2357 twoeweps = _mm_add_pd(eweps,eweps);
2358 ewitab = _mm_slli_epi32(ewitab,2);
2359 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2360 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2361 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2362 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2363 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2364 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2365 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2366 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2367 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2368 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2370 d = _mm_sub_pd(r22,rswitch);
2371 d = _mm_max_pd(d,_mm_setzero_pd());
2372 d2 = _mm_mul_pd(d,d);
2373 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2375 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2377 /* Evaluate switch function */
2378 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2379 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2380 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2384 fscal = _mm_and_pd(fscal,cutoff_mask);
2386 /* Update vectorial force */
2387 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2388 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2389 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2391 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2392 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2393 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2397 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2399 /* Inner loop uses 600 flops */
2402 if(jidx<j_index_end)
2406 j_coord_offsetA = DIM*jnrA;
2408 /* load j atom coordinates */
2409 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2410 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2412 /* Calculate displacement vector */
2413 dx00 = _mm_sub_pd(ix0,jx0);
2414 dy00 = _mm_sub_pd(iy0,jy0);
2415 dz00 = _mm_sub_pd(iz0,jz0);
2416 dx01 = _mm_sub_pd(ix0,jx1);
2417 dy01 = _mm_sub_pd(iy0,jy1);
2418 dz01 = _mm_sub_pd(iz0,jz1);
2419 dx02 = _mm_sub_pd(ix0,jx2);
2420 dy02 = _mm_sub_pd(iy0,jy2);
2421 dz02 = _mm_sub_pd(iz0,jz2);
2422 dx10 = _mm_sub_pd(ix1,jx0);
2423 dy10 = _mm_sub_pd(iy1,jy0);
2424 dz10 = _mm_sub_pd(iz1,jz0);
2425 dx11 = _mm_sub_pd(ix1,jx1);
2426 dy11 = _mm_sub_pd(iy1,jy1);
2427 dz11 = _mm_sub_pd(iz1,jz1);
2428 dx12 = _mm_sub_pd(ix1,jx2);
2429 dy12 = _mm_sub_pd(iy1,jy2);
2430 dz12 = _mm_sub_pd(iz1,jz2);
2431 dx20 = _mm_sub_pd(ix2,jx0);
2432 dy20 = _mm_sub_pd(iy2,jy0);
2433 dz20 = _mm_sub_pd(iz2,jz0);
2434 dx21 = _mm_sub_pd(ix2,jx1);
2435 dy21 = _mm_sub_pd(iy2,jy1);
2436 dz21 = _mm_sub_pd(iz2,jz1);
2437 dx22 = _mm_sub_pd(ix2,jx2);
2438 dy22 = _mm_sub_pd(iy2,jy2);
2439 dz22 = _mm_sub_pd(iz2,jz2);
2441 /* Calculate squared distance and things based on it */
2442 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2443 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2444 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2445 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2446 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2447 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2448 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2449 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2450 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2452 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2453 rinv01 = gmx_mm_invsqrt_pd(rsq01);
2454 rinv02 = gmx_mm_invsqrt_pd(rsq02);
2455 rinv10 = gmx_mm_invsqrt_pd(rsq10);
2456 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2457 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2458 rinv20 = gmx_mm_invsqrt_pd(rsq20);
2459 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2460 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2462 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2463 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
2464 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
2465 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
2466 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2467 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2468 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
2469 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2470 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2472 fjx0 = _mm_setzero_pd();
2473 fjy0 = _mm_setzero_pd();
2474 fjz0 = _mm_setzero_pd();
2475 fjx1 = _mm_setzero_pd();
2476 fjy1 = _mm_setzero_pd();
2477 fjz1 = _mm_setzero_pd();
2478 fjx2 = _mm_setzero_pd();
2479 fjy2 = _mm_setzero_pd();
2480 fjz2 = _mm_setzero_pd();
2482 /**************************
2483 * CALCULATE INTERACTIONS *
2484 **************************/
2486 if (gmx_mm_any_lt(rsq00,rcutoff2))
2489 r00 = _mm_mul_pd(rsq00,rinv00);
2491 /* EWALD ELECTROSTATICS */
2493 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2494 ewrt = _mm_mul_pd(r00,ewtabscale);
2495 ewitab = _mm_cvttpd_epi32(ewrt);
2497 eweps = _mm_frcz_pd(ewrt);
2499 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2501 twoeweps = _mm_add_pd(eweps,eweps);
2502 ewitab = _mm_slli_epi32(ewitab,2);
2503 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2504 ewtabD = _mm_setzero_pd();
2505 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2506 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2507 ewtabFn = _mm_setzero_pd();
2508 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2509 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2510 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2511 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
2512 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2514 /* LENNARD-JONES DISPERSION/REPULSION */
2516 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2517 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2518 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2519 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
2520 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2522 d = _mm_sub_pd(r00,rswitch);
2523 d = _mm_max_pd(d,_mm_setzero_pd());
2524 d2 = _mm_mul_pd(d,d);
2525 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2527 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2529 /* Evaluate switch function */
2530 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2531 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
2532 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2533 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2535 fscal = _mm_add_pd(felec,fvdw);
2537 fscal = _mm_and_pd(fscal,cutoff_mask);
2539 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2541 /* Update vectorial force */
2542 fix0 = _mm_macc_pd(dx00,fscal,fix0);
2543 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
2544 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
2546 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
2547 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
2548 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
2552 /**************************
2553 * CALCULATE INTERACTIONS *
2554 **************************/
2556 if (gmx_mm_any_lt(rsq01,rcutoff2))
2559 r01 = _mm_mul_pd(rsq01,rinv01);
2561 /* EWALD ELECTROSTATICS */
2563 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2564 ewrt = _mm_mul_pd(r01,ewtabscale);
2565 ewitab = _mm_cvttpd_epi32(ewrt);
2567 eweps = _mm_frcz_pd(ewrt);
2569 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2571 twoeweps = _mm_add_pd(eweps,eweps);
2572 ewitab = _mm_slli_epi32(ewitab,2);
2573 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2574 ewtabD = _mm_setzero_pd();
2575 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2576 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2577 ewtabFn = _mm_setzero_pd();
2578 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2579 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2580 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2581 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
2582 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2584 d = _mm_sub_pd(r01,rswitch);
2585 d = _mm_max_pd(d,_mm_setzero_pd());
2586 d2 = _mm_mul_pd(d,d);
2587 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2589 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2591 /* Evaluate switch function */
2592 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2593 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
2594 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
2598 fscal = _mm_and_pd(fscal,cutoff_mask);
2600 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2602 /* Update vectorial force */
2603 fix0 = _mm_macc_pd(dx01,fscal,fix0);
2604 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
2605 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
2607 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
2608 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
2609 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
2613 /**************************
2614 * CALCULATE INTERACTIONS *
2615 **************************/
2617 if (gmx_mm_any_lt(rsq02,rcutoff2))
2620 r02 = _mm_mul_pd(rsq02,rinv02);
2622 /* EWALD ELECTROSTATICS */
2624 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2625 ewrt = _mm_mul_pd(r02,ewtabscale);
2626 ewitab = _mm_cvttpd_epi32(ewrt);
2628 eweps = _mm_frcz_pd(ewrt);
2630 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2632 twoeweps = _mm_add_pd(eweps,eweps);
2633 ewitab = _mm_slli_epi32(ewitab,2);
2634 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2635 ewtabD = _mm_setzero_pd();
2636 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2637 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2638 ewtabFn = _mm_setzero_pd();
2639 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2640 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2641 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2642 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2643 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2645 d = _mm_sub_pd(r02,rswitch);
2646 d = _mm_max_pd(d,_mm_setzero_pd());
2647 d2 = _mm_mul_pd(d,d);
2648 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2650 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2652 /* Evaluate switch function */
2653 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2654 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2655 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2659 fscal = _mm_and_pd(fscal,cutoff_mask);
2661 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2663 /* Update vectorial force */
2664 fix0 = _mm_macc_pd(dx02,fscal,fix0);
2665 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
2666 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
2668 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
2669 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
2670 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
2674 /**************************
2675 * CALCULATE INTERACTIONS *
2676 **************************/
2678 if (gmx_mm_any_lt(rsq10,rcutoff2))
2681 r10 = _mm_mul_pd(rsq10,rinv10);
2683 /* EWALD ELECTROSTATICS */
2685 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2686 ewrt = _mm_mul_pd(r10,ewtabscale);
2687 ewitab = _mm_cvttpd_epi32(ewrt);
2689 eweps = _mm_frcz_pd(ewrt);
2691 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2693 twoeweps = _mm_add_pd(eweps,eweps);
2694 ewitab = _mm_slli_epi32(ewitab,2);
2695 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2696 ewtabD = _mm_setzero_pd();
2697 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2698 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2699 ewtabFn = _mm_setzero_pd();
2700 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2701 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2702 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2703 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2704 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2706 d = _mm_sub_pd(r10,rswitch);
2707 d = _mm_max_pd(d,_mm_setzero_pd());
2708 d2 = _mm_mul_pd(d,d);
2709 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2711 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2713 /* Evaluate switch function */
2714 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2715 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2716 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2720 fscal = _mm_and_pd(fscal,cutoff_mask);
2722 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2724 /* Update vectorial force */
2725 fix1 = _mm_macc_pd(dx10,fscal,fix1);
2726 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
2727 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
2729 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
2730 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
2731 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
2735 /**************************
2736 * CALCULATE INTERACTIONS *
2737 **************************/
2739 if (gmx_mm_any_lt(rsq11,rcutoff2))
2742 r11 = _mm_mul_pd(rsq11,rinv11);
2744 /* EWALD ELECTROSTATICS */
2746 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2747 ewrt = _mm_mul_pd(r11,ewtabscale);
2748 ewitab = _mm_cvttpd_epi32(ewrt);
2750 eweps = _mm_frcz_pd(ewrt);
2752 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2754 twoeweps = _mm_add_pd(eweps,eweps);
2755 ewitab = _mm_slli_epi32(ewitab,2);
2756 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2757 ewtabD = _mm_setzero_pd();
2758 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2759 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2760 ewtabFn = _mm_setzero_pd();
2761 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2762 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2763 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2764 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2765 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2767 d = _mm_sub_pd(r11,rswitch);
2768 d = _mm_max_pd(d,_mm_setzero_pd());
2769 d2 = _mm_mul_pd(d,d);
2770 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2772 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2774 /* Evaluate switch function */
2775 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2776 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2777 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2781 fscal = _mm_and_pd(fscal,cutoff_mask);
2783 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2785 /* Update vectorial force */
2786 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2787 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2788 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2790 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2791 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2792 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2796 /**************************
2797 * CALCULATE INTERACTIONS *
2798 **************************/
2800 if (gmx_mm_any_lt(rsq12,rcutoff2))
2803 r12 = _mm_mul_pd(rsq12,rinv12);
2805 /* EWALD ELECTROSTATICS */
2807 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2808 ewrt = _mm_mul_pd(r12,ewtabscale);
2809 ewitab = _mm_cvttpd_epi32(ewrt);
2811 eweps = _mm_frcz_pd(ewrt);
2813 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2815 twoeweps = _mm_add_pd(eweps,eweps);
2816 ewitab = _mm_slli_epi32(ewitab,2);
2817 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2818 ewtabD = _mm_setzero_pd();
2819 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2820 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2821 ewtabFn = _mm_setzero_pd();
2822 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2823 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2824 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2825 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2826 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2828 d = _mm_sub_pd(r12,rswitch);
2829 d = _mm_max_pd(d,_mm_setzero_pd());
2830 d2 = _mm_mul_pd(d,d);
2831 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2833 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2835 /* Evaluate switch function */
2836 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2837 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2838 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2842 fscal = _mm_and_pd(fscal,cutoff_mask);
2844 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2846 /* Update vectorial force */
2847 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2848 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2849 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2851 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2852 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2853 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2857 /**************************
2858 * CALCULATE INTERACTIONS *
2859 **************************/
2861 if (gmx_mm_any_lt(rsq20,rcutoff2))
2864 r20 = _mm_mul_pd(rsq20,rinv20);
2866 /* EWALD ELECTROSTATICS */
2868 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2869 ewrt = _mm_mul_pd(r20,ewtabscale);
2870 ewitab = _mm_cvttpd_epi32(ewrt);
2872 eweps = _mm_frcz_pd(ewrt);
2874 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2876 twoeweps = _mm_add_pd(eweps,eweps);
2877 ewitab = _mm_slli_epi32(ewitab,2);
2878 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2879 ewtabD = _mm_setzero_pd();
2880 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2881 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2882 ewtabFn = _mm_setzero_pd();
2883 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2884 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2885 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2886 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2887 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2889 d = _mm_sub_pd(r20,rswitch);
2890 d = _mm_max_pd(d,_mm_setzero_pd());
2891 d2 = _mm_mul_pd(d,d);
2892 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2894 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2896 /* Evaluate switch function */
2897 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2898 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2899 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2903 fscal = _mm_and_pd(fscal,cutoff_mask);
2905 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2907 /* Update vectorial force */
2908 fix2 = _mm_macc_pd(dx20,fscal,fix2);
2909 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
2910 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
2912 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
2913 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
2914 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
2918 /**************************
2919 * CALCULATE INTERACTIONS *
2920 **************************/
2922 if (gmx_mm_any_lt(rsq21,rcutoff2))
2925 r21 = _mm_mul_pd(rsq21,rinv21);
2927 /* EWALD ELECTROSTATICS */
2929 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2930 ewrt = _mm_mul_pd(r21,ewtabscale);
2931 ewitab = _mm_cvttpd_epi32(ewrt);
2933 eweps = _mm_frcz_pd(ewrt);
2935 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2937 twoeweps = _mm_add_pd(eweps,eweps);
2938 ewitab = _mm_slli_epi32(ewitab,2);
2939 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2940 ewtabD = _mm_setzero_pd();
2941 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2942 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2943 ewtabFn = _mm_setzero_pd();
2944 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2945 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2946 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2947 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2948 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2950 d = _mm_sub_pd(r21,rswitch);
2951 d = _mm_max_pd(d,_mm_setzero_pd());
2952 d2 = _mm_mul_pd(d,d);
2953 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2955 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2957 /* Evaluate switch function */
2958 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2959 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2960 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2964 fscal = _mm_and_pd(fscal,cutoff_mask);
2966 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2968 /* Update vectorial force */
2969 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2970 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2971 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2973 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2974 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2975 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2979 /**************************
2980 * CALCULATE INTERACTIONS *
2981 **************************/
2983 if (gmx_mm_any_lt(rsq22,rcutoff2))
2986 r22 = _mm_mul_pd(rsq22,rinv22);
2988 /* EWALD ELECTROSTATICS */
2990 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2991 ewrt = _mm_mul_pd(r22,ewtabscale);
2992 ewitab = _mm_cvttpd_epi32(ewrt);
2994 eweps = _mm_frcz_pd(ewrt);
2996 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2998 twoeweps = _mm_add_pd(eweps,eweps);
2999 ewitab = _mm_slli_epi32(ewitab,2);
3000 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3001 ewtabD = _mm_setzero_pd();
3002 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3003 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3004 ewtabFn = _mm_setzero_pd();
3005 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3006 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
3007 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3008 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
3009 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
3011 d = _mm_sub_pd(r22,rswitch);
3012 d = _mm_max_pd(d,_mm_setzero_pd());
3013 d2 = _mm_mul_pd(d,d);
3014 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3016 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3018 /* Evaluate switch function */
3019 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3020 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
3021 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
3025 fscal = _mm_and_pd(fscal,cutoff_mask);
3027 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3029 /* Update vectorial force */
3030 fix2 = _mm_macc_pd(dx22,fscal,fix2);
3031 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
3032 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
3034 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
3035 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
3036 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
3040 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3042 /* Inner loop uses 600 flops */
3045 /* End of innermost loop */
3047 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3048 f+i_coord_offset,fshift+i_shift_offset);
3050 /* Increment number of inner iterations */
3051 inneriter += j_index_end - j_index_start;
3053 /* Outer loop uses 18 flops */
3056 /* Increment number of outer iterations */
3059 /* Update outer/inner flops */
3061 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*600);