2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
48 #include "kernelutil_x86_avx_128_fma_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 int vdwjidx3A,vdwjidx3B;
95 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
96 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
109 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
113 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
115 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
117 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
118 real rswitch_scalar,d_scalar;
119 __m128d dummy_mask,cutoff_mask;
120 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
121 __m128d one = _mm_set1_pd(1.0);
122 __m128d two = _mm_set1_pd(2.0);
128 jindex = nlist->jindex;
130 shiftidx = nlist->shift;
132 shiftvec = fr->shift_vec[0];
133 fshift = fr->fshift[0];
134 facel = _mm_set1_pd(fr->epsfac);
135 charge = mdatoms->chargeA;
136 nvdwtype = fr->ntype;
138 vdwtype = mdatoms->typeA;
140 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
141 ewtab = fr->ic->tabq_coul_FDV0;
142 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
143 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
145 /* Setup water-specific parameters */
146 inr = nlist->iinr[0];
147 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
148 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
149 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
150 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
152 jq1 = _mm_set1_pd(charge[inr+1]);
153 jq2 = _mm_set1_pd(charge[inr+2]);
154 jq3 = _mm_set1_pd(charge[inr+3]);
155 vdwjidx0A = 2*vdwtype[inr+0];
156 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
157 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
158 qq11 = _mm_mul_pd(iq1,jq1);
159 qq12 = _mm_mul_pd(iq1,jq2);
160 qq13 = _mm_mul_pd(iq1,jq3);
161 qq21 = _mm_mul_pd(iq2,jq1);
162 qq22 = _mm_mul_pd(iq2,jq2);
163 qq23 = _mm_mul_pd(iq2,jq3);
164 qq31 = _mm_mul_pd(iq3,jq1);
165 qq32 = _mm_mul_pd(iq3,jq2);
166 qq33 = _mm_mul_pd(iq3,jq3);
168 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169 rcutoff_scalar = fr->rcoulomb;
170 rcutoff = _mm_set1_pd(rcutoff_scalar);
171 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
173 rswitch_scalar = fr->rcoulomb_switch;
174 rswitch = _mm_set1_pd(rswitch_scalar);
175 /* Setup switch parameters */
176 d_scalar = rcutoff_scalar-rswitch_scalar;
177 d = _mm_set1_pd(d_scalar);
178 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
179 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
182 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
183 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
185 /* Avoid stupid compiler warnings */
193 /* Start outer loop over neighborlists */
194 for(iidx=0; iidx<nri; iidx++)
196 /* Load shift vector for this list */
197 i_shift_offset = DIM*shiftidx[iidx];
199 /* Load limits for loop over neighbors */
200 j_index_start = jindex[iidx];
201 j_index_end = jindex[iidx+1];
203 /* Get outer coordinate index */
205 i_coord_offset = DIM*inr;
207 /* Load i particle coords and add shift vector */
208 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
209 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
211 fix0 = _mm_setzero_pd();
212 fiy0 = _mm_setzero_pd();
213 fiz0 = _mm_setzero_pd();
214 fix1 = _mm_setzero_pd();
215 fiy1 = _mm_setzero_pd();
216 fiz1 = _mm_setzero_pd();
217 fix2 = _mm_setzero_pd();
218 fiy2 = _mm_setzero_pd();
219 fiz2 = _mm_setzero_pd();
220 fix3 = _mm_setzero_pd();
221 fiy3 = _mm_setzero_pd();
222 fiz3 = _mm_setzero_pd();
224 /* Reset potential sums */
225 velecsum = _mm_setzero_pd();
226 vvdwsum = _mm_setzero_pd();
228 /* Start inner kernel loop */
229 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
232 /* Get j neighbor index, and coordinate index */
235 j_coord_offsetA = DIM*jnrA;
236 j_coord_offsetB = DIM*jnrB;
238 /* load j atom coordinates */
239 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
240 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
241 &jy2,&jz2,&jx3,&jy3,&jz3);
243 /* Calculate displacement vector */
244 dx00 = _mm_sub_pd(ix0,jx0);
245 dy00 = _mm_sub_pd(iy0,jy0);
246 dz00 = _mm_sub_pd(iz0,jz0);
247 dx11 = _mm_sub_pd(ix1,jx1);
248 dy11 = _mm_sub_pd(iy1,jy1);
249 dz11 = _mm_sub_pd(iz1,jz1);
250 dx12 = _mm_sub_pd(ix1,jx2);
251 dy12 = _mm_sub_pd(iy1,jy2);
252 dz12 = _mm_sub_pd(iz1,jz2);
253 dx13 = _mm_sub_pd(ix1,jx3);
254 dy13 = _mm_sub_pd(iy1,jy3);
255 dz13 = _mm_sub_pd(iz1,jz3);
256 dx21 = _mm_sub_pd(ix2,jx1);
257 dy21 = _mm_sub_pd(iy2,jy1);
258 dz21 = _mm_sub_pd(iz2,jz1);
259 dx22 = _mm_sub_pd(ix2,jx2);
260 dy22 = _mm_sub_pd(iy2,jy2);
261 dz22 = _mm_sub_pd(iz2,jz2);
262 dx23 = _mm_sub_pd(ix2,jx3);
263 dy23 = _mm_sub_pd(iy2,jy3);
264 dz23 = _mm_sub_pd(iz2,jz3);
265 dx31 = _mm_sub_pd(ix3,jx1);
266 dy31 = _mm_sub_pd(iy3,jy1);
267 dz31 = _mm_sub_pd(iz3,jz1);
268 dx32 = _mm_sub_pd(ix3,jx2);
269 dy32 = _mm_sub_pd(iy3,jy2);
270 dz32 = _mm_sub_pd(iz3,jz2);
271 dx33 = _mm_sub_pd(ix3,jx3);
272 dy33 = _mm_sub_pd(iy3,jy3);
273 dz33 = _mm_sub_pd(iz3,jz3);
275 /* Calculate squared distance and things based on it */
276 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
277 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
278 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
279 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
280 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
281 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
282 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
283 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
284 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
285 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
287 rinv00 = gmx_mm_invsqrt_pd(rsq00);
288 rinv11 = gmx_mm_invsqrt_pd(rsq11);
289 rinv12 = gmx_mm_invsqrt_pd(rsq12);
290 rinv13 = gmx_mm_invsqrt_pd(rsq13);
291 rinv21 = gmx_mm_invsqrt_pd(rsq21);
292 rinv22 = gmx_mm_invsqrt_pd(rsq22);
293 rinv23 = gmx_mm_invsqrt_pd(rsq23);
294 rinv31 = gmx_mm_invsqrt_pd(rsq31);
295 rinv32 = gmx_mm_invsqrt_pd(rsq32);
296 rinv33 = gmx_mm_invsqrt_pd(rsq33);
298 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
299 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
300 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
301 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
302 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
303 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
304 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
305 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
306 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
307 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
309 fjx0 = _mm_setzero_pd();
310 fjy0 = _mm_setzero_pd();
311 fjz0 = _mm_setzero_pd();
312 fjx1 = _mm_setzero_pd();
313 fjy1 = _mm_setzero_pd();
314 fjz1 = _mm_setzero_pd();
315 fjx2 = _mm_setzero_pd();
316 fjy2 = _mm_setzero_pd();
317 fjz2 = _mm_setzero_pd();
318 fjx3 = _mm_setzero_pd();
319 fjy3 = _mm_setzero_pd();
320 fjz3 = _mm_setzero_pd();
322 /**************************
323 * CALCULATE INTERACTIONS *
324 **************************/
326 if (gmx_mm_any_lt(rsq00,rcutoff2))
329 r00 = _mm_mul_pd(rsq00,rinv00);
331 /* LENNARD-JONES DISPERSION/REPULSION */
333 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
334 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
335 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
336 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
337 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
339 d = _mm_sub_pd(r00,rswitch);
340 d = _mm_max_pd(d,_mm_setzero_pd());
341 d2 = _mm_mul_pd(d,d);
342 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
344 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
346 /* Evaluate switch function */
347 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
348 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
349 vvdw = _mm_mul_pd(vvdw,sw);
350 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 vvdw = _mm_and_pd(vvdw,cutoff_mask);
354 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
358 fscal = _mm_and_pd(fscal,cutoff_mask);
360 /* Update vectorial force */
361 fix0 = _mm_macc_pd(dx00,fscal,fix0);
362 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
363 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
365 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
366 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
367 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
371 /**************************
372 * CALCULATE INTERACTIONS *
373 **************************/
375 if (gmx_mm_any_lt(rsq11,rcutoff2))
378 r11 = _mm_mul_pd(rsq11,rinv11);
380 /* EWALD ELECTROSTATICS */
382 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
383 ewrt = _mm_mul_pd(r11,ewtabscale);
384 ewitab = _mm_cvttpd_epi32(ewrt);
386 eweps = _mm_frcz_pd(ewrt);
388 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
390 twoeweps = _mm_add_pd(eweps,eweps);
391 ewitab = _mm_slli_epi32(ewitab,2);
392 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
393 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
394 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
395 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
396 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
397 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
398 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
399 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
400 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
401 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
403 d = _mm_sub_pd(r11,rswitch);
404 d = _mm_max_pd(d,_mm_setzero_pd());
405 d2 = _mm_mul_pd(d,d);
406 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
408 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
410 /* Evaluate switch function */
411 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
412 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
413 velec = _mm_mul_pd(velec,sw);
414 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
416 /* Update potential sum for this i atom from the interaction with this j atom. */
417 velec = _mm_and_pd(velec,cutoff_mask);
418 velecsum = _mm_add_pd(velecsum,velec);
422 fscal = _mm_and_pd(fscal,cutoff_mask);
424 /* Update vectorial force */
425 fix1 = _mm_macc_pd(dx11,fscal,fix1);
426 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
427 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
429 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
430 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
431 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
435 /**************************
436 * CALCULATE INTERACTIONS *
437 **************************/
439 if (gmx_mm_any_lt(rsq12,rcutoff2))
442 r12 = _mm_mul_pd(rsq12,rinv12);
444 /* EWALD ELECTROSTATICS */
446 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
447 ewrt = _mm_mul_pd(r12,ewtabscale);
448 ewitab = _mm_cvttpd_epi32(ewrt);
450 eweps = _mm_frcz_pd(ewrt);
452 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
454 twoeweps = _mm_add_pd(eweps,eweps);
455 ewitab = _mm_slli_epi32(ewitab,2);
456 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
457 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
458 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
459 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
460 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
461 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
462 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
463 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
464 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
465 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
467 d = _mm_sub_pd(r12,rswitch);
468 d = _mm_max_pd(d,_mm_setzero_pd());
469 d2 = _mm_mul_pd(d,d);
470 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
472 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
474 /* Evaluate switch function */
475 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
476 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
477 velec = _mm_mul_pd(velec,sw);
478 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
480 /* Update potential sum for this i atom from the interaction with this j atom. */
481 velec = _mm_and_pd(velec,cutoff_mask);
482 velecsum = _mm_add_pd(velecsum,velec);
486 fscal = _mm_and_pd(fscal,cutoff_mask);
488 /* Update vectorial force */
489 fix1 = _mm_macc_pd(dx12,fscal,fix1);
490 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
491 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
493 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
494 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
495 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
499 /**************************
500 * CALCULATE INTERACTIONS *
501 **************************/
503 if (gmx_mm_any_lt(rsq13,rcutoff2))
506 r13 = _mm_mul_pd(rsq13,rinv13);
508 /* EWALD ELECTROSTATICS */
510 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511 ewrt = _mm_mul_pd(r13,ewtabscale);
512 ewitab = _mm_cvttpd_epi32(ewrt);
514 eweps = _mm_frcz_pd(ewrt);
516 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
518 twoeweps = _mm_add_pd(eweps,eweps);
519 ewitab = _mm_slli_epi32(ewitab,2);
520 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
521 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
522 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
523 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
524 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
525 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
526 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
527 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
528 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
529 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
531 d = _mm_sub_pd(r13,rswitch);
532 d = _mm_max_pd(d,_mm_setzero_pd());
533 d2 = _mm_mul_pd(d,d);
534 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
536 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
538 /* Evaluate switch function */
539 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
540 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
541 velec = _mm_mul_pd(velec,sw);
542 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
544 /* Update potential sum for this i atom from the interaction with this j atom. */
545 velec = _mm_and_pd(velec,cutoff_mask);
546 velecsum = _mm_add_pd(velecsum,velec);
550 fscal = _mm_and_pd(fscal,cutoff_mask);
552 /* Update vectorial force */
553 fix1 = _mm_macc_pd(dx13,fscal,fix1);
554 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
555 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
557 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
558 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
559 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
563 /**************************
564 * CALCULATE INTERACTIONS *
565 **************************/
567 if (gmx_mm_any_lt(rsq21,rcutoff2))
570 r21 = _mm_mul_pd(rsq21,rinv21);
572 /* EWALD ELECTROSTATICS */
574 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
575 ewrt = _mm_mul_pd(r21,ewtabscale);
576 ewitab = _mm_cvttpd_epi32(ewrt);
578 eweps = _mm_frcz_pd(ewrt);
580 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
582 twoeweps = _mm_add_pd(eweps,eweps);
583 ewitab = _mm_slli_epi32(ewitab,2);
584 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
585 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
586 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
587 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
588 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
589 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
590 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
591 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
592 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
593 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
595 d = _mm_sub_pd(r21,rswitch);
596 d = _mm_max_pd(d,_mm_setzero_pd());
597 d2 = _mm_mul_pd(d,d);
598 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
600 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
602 /* Evaluate switch function */
603 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
604 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
605 velec = _mm_mul_pd(velec,sw);
606 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
608 /* Update potential sum for this i atom from the interaction with this j atom. */
609 velec = _mm_and_pd(velec,cutoff_mask);
610 velecsum = _mm_add_pd(velecsum,velec);
614 fscal = _mm_and_pd(fscal,cutoff_mask);
616 /* Update vectorial force */
617 fix2 = _mm_macc_pd(dx21,fscal,fix2);
618 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
619 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
621 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
622 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
623 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
627 /**************************
628 * CALCULATE INTERACTIONS *
629 **************************/
631 if (gmx_mm_any_lt(rsq22,rcutoff2))
634 r22 = _mm_mul_pd(rsq22,rinv22);
636 /* EWALD ELECTROSTATICS */
638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
639 ewrt = _mm_mul_pd(r22,ewtabscale);
640 ewitab = _mm_cvttpd_epi32(ewrt);
642 eweps = _mm_frcz_pd(ewrt);
644 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
646 twoeweps = _mm_add_pd(eweps,eweps);
647 ewitab = _mm_slli_epi32(ewitab,2);
648 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
649 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
650 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
651 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
652 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
653 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
654 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
655 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
656 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
657 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
659 d = _mm_sub_pd(r22,rswitch);
660 d = _mm_max_pd(d,_mm_setzero_pd());
661 d2 = _mm_mul_pd(d,d);
662 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
664 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
666 /* Evaluate switch function */
667 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
668 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
669 velec = _mm_mul_pd(velec,sw);
670 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
672 /* Update potential sum for this i atom from the interaction with this j atom. */
673 velec = _mm_and_pd(velec,cutoff_mask);
674 velecsum = _mm_add_pd(velecsum,velec);
678 fscal = _mm_and_pd(fscal,cutoff_mask);
680 /* Update vectorial force */
681 fix2 = _mm_macc_pd(dx22,fscal,fix2);
682 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
683 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
685 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
686 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
687 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
691 /**************************
692 * CALCULATE INTERACTIONS *
693 **************************/
695 if (gmx_mm_any_lt(rsq23,rcutoff2))
698 r23 = _mm_mul_pd(rsq23,rinv23);
700 /* EWALD ELECTROSTATICS */
702 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
703 ewrt = _mm_mul_pd(r23,ewtabscale);
704 ewitab = _mm_cvttpd_epi32(ewrt);
706 eweps = _mm_frcz_pd(ewrt);
708 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
710 twoeweps = _mm_add_pd(eweps,eweps);
711 ewitab = _mm_slli_epi32(ewitab,2);
712 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
713 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
714 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
715 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
716 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
717 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
718 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
719 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
720 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
721 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
723 d = _mm_sub_pd(r23,rswitch);
724 d = _mm_max_pd(d,_mm_setzero_pd());
725 d2 = _mm_mul_pd(d,d);
726 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
728 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
730 /* Evaluate switch function */
731 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
732 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
733 velec = _mm_mul_pd(velec,sw);
734 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
736 /* Update potential sum for this i atom from the interaction with this j atom. */
737 velec = _mm_and_pd(velec,cutoff_mask);
738 velecsum = _mm_add_pd(velecsum,velec);
742 fscal = _mm_and_pd(fscal,cutoff_mask);
744 /* Update vectorial force */
745 fix2 = _mm_macc_pd(dx23,fscal,fix2);
746 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
747 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
749 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
750 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
751 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
755 /**************************
756 * CALCULATE INTERACTIONS *
757 **************************/
759 if (gmx_mm_any_lt(rsq31,rcutoff2))
762 r31 = _mm_mul_pd(rsq31,rinv31);
764 /* EWALD ELECTROSTATICS */
766 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
767 ewrt = _mm_mul_pd(r31,ewtabscale);
768 ewitab = _mm_cvttpd_epi32(ewrt);
770 eweps = _mm_frcz_pd(ewrt);
772 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
774 twoeweps = _mm_add_pd(eweps,eweps);
775 ewitab = _mm_slli_epi32(ewitab,2);
776 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
777 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
778 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
779 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
780 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
781 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
782 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
783 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
784 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
785 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
787 d = _mm_sub_pd(r31,rswitch);
788 d = _mm_max_pd(d,_mm_setzero_pd());
789 d2 = _mm_mul_pd(d,d);
790 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
792 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
794 /* Evaluate switch function */
795 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
796 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
797 velec = _mm_mul_pd(velec,sw);
798 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
800 /* Update potential sum for this i atom from the interaction with this j atom. */
801 velec = _mm_and_pd(velec,cutoff_mask);
802 velecsum = _mm_add_pd(velecsum,velec);
806 fscal = _mm_and_pd(fscal,cutoff_mask);
808 /* Update vectorial force */
809 fix3 = _mm_macc_pd(dx31,fscal,fix3);
810 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
811 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
813 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
814 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
815 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
819 /**************************
820 * CALCULATE INTERACTIONS *
821 **************************/
823 if (gmx_mm_any_lt(rsq32,rcutoff2))
826 r32 = _mm_mul_pd(rsq32,rinv32);
828 /* EWALD ELECTROSTATICS */
830 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
831 ewrt = _mm_mul_pd(r32,ewtabscale);
832 ewitab = _mm_cvttpd_epi32(ewrt);
834 eweps = _mm_frcz_pd(ewrt);
836 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
838 twoeweps = _mm_add_pd(eweps,eweps);
839 ewitab = _mm_slli_epi32(ewitab,2);
840 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
841 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
842 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
843 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
844 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
845 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
846 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
847 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
848 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
849 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
851 d = _mm_sub_pd(r32,rswitch);
852 d = _mm_max_pd(d,_mm_setzero_pd());
853 d2 = _mm_mul_pd(d,d);
854 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
856 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
858 /* Evaluate switch function */
859 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
860 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
861 velec = _mm_mul_pd(velec,sw);
862 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
864 /* Update potential sum for this i atom from the interaction with this j atom. */
865 velec = _mm_and_pd(velec,cutoff_mask);
866 velecsum = _mm_add_pd(velecsum,velec);
870 fscal = _mm_and_pd(fscal,cutoff_mask);
872 /* Update vectorial force */
873 fix3 = _mm_macc_pd(dx32,fscal,fix3);
874 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
875 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
877 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
878 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
879 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
883 /**************************
884 * CALCULATE INTERACTIONS *
885 **************************/
887 if (gmx_mm_any_lt(rsq33,rcutoff2))
890 r33 = _mm_mul_pd(rsq33,rinv33);
892 /* EWALD ELECTROSTATICS */
894 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
895 ewrt = _mm_mul_pd(r33,ewtabscale);
896 ewitab = _mm_cvttpd_epi32(ewrt);
898 eweps = _mm_frcz_pd(ewrt);
900 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
902 twoeweps = _mm_add_pd(eweps,eweps);
903 ewitab = _mm_slli_epi32(ewitab,2);
904 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
905 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
906 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
907 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
908 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
909 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
910 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
911 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
912 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
913 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
915 d = _mm_sub_pd(r33,rswitch);
916 d = _mm_max_pd(d,_mm_setzero_pd());
917 d2 = _mm_mul_pd(d,d);
918 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
920 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
922 /* Evaluate switch function */
923 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
924 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
925 velec = _mm_mul_pd(velec,sw);
926 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
928 /* Update potential sum for this i atom from the interaction with this j atom. */
929 velec = _mm_and_pd(velec,cutoff_mask);
930 velecsum = _mm_add_pd(velecsum,velec);
934 fscal = _mm_and_pd(fscal,cutoff_mask);
936 /* Update vectorial force */
937 fix3 = _mm_macc_pd(dx33,fscal,fix3);
938 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
939 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
941 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
942 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
943 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
947 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
949 /* Inner loop uses 677 flops */
956 j_coord_offsetA = DIM*jnrA;
958 /* load j atom coordinates */
959 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
960 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
961 &jy2,&jz2,&jx3,&jy3,&jz3);
963 /* Calculate displacement vector */
964 dx00 = _mm_sub_pd(ix0,jx0);
965 dy00 = _mm_sub_pd(iy0,jy0);
966 dz00 = _mm_sub_pd(iz0,jz0);
967 dx11 = _mm_sub_pd(ix1,jx1);
968 dy11 = _mm_sub_pd(iy1,jy1);
969 dz11 = _mm_sub_pd(iz1,jz1);
970 dx12 = _mm_sub_pd(ix1,jx2);
971 dy12 = _mm_sub_pd(iy1,jy2);
972 dz12 = _mm_sub_pd(iz1,jz2);
973 dx13 = _mm_sub_pd(ix1,jx3);
974 dy13 = _mm_sub_pd(iy1,jy3);
975 dz13 = _mm_sub_pd(iz1,jz3);
976 dx21 = _mm_sub_pd(ix2,jx1);
977 dy21 = _mm_sub_pd(iy2,jy1);
978 dz21 = _mm_sub_pd(iz2,jz1);
979 dx22 = _mm_sub_pd(ix2,jx2);
980 dy22 = _mm_sub_pd(iy2,jy2);
981 dz22 = _mm_sub_pd(iz2,jz2);
982 dx23 = _mm_sub_pd(ix2,jx3);
983 dy23 = _mm_sub_pd(iy2,jy3);
984 dz23 = _mm_sub_pd(iz2,jz3);
985 dx31 = _mm_sub_pd(ix3,jx1);
986 dy31 = _mm_sub_pd(iy3,jy1);
987 dz31 = _mm_sub_pd(iz3,jz1);
988 dx32 = _mm_sub_pd(ix3,jx2);
989 dy32 = _mm_sub_pd(iy3,jy2);
990 dz32 = _mm_sub_pd(iz3,jz2);
991 dx33 = _mm_sub_pd(ix3,jx3);
992 dy33 = _mm_sub_pd(iy3,jy3);
993 dz33 = _mm_sub_pd(iz3,jz3);
995 /* Calculate squared distance and things based on it */
996 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
997 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
998 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
999 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1000 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1001 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1002 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1003 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1004 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1005 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1007 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1008 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1009 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1010 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1011 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1012 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1013 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1014 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1015 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1016 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1018 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1019 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1020 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1021 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1022 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1023 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1024 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1025 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1026 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1027 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1029 fjx0 = _mm_setzero_pd();
1030 fjy0 = _mm_setzero_pd();
1031 fjz0 = _mm_setzero_pd();
1032 fjx1 = _mm_setzero_pd();
1033 fjy1 = _mm_setzero_pd();
1034 fjz1 = _mm_setzero_pd();
1035 fjx2 = _mm_setzero_pd();
1036 fjy2 = _mm_setzero_pd();
1037 fjz2 = _mm_setzero_pd();
1038 fjx3 = _mm_setzero_pd();
1039 fjy3 = _mm_setzero_pd();
1040 fjz3 = _mm_setzero_pd();
1042 /**************************
1043 * CALCULATE INTERACTIONS *
1044 **************************/
1046 if (gmx_mm_any_lt(rsq00,rcutoff2))
1049 r00 = _mm_mul_pd(rsq00,rinv00);
1051 /* LENNARD-JONES DISPERSION/REPULSION */
1053 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1054 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1055 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1056 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
1057 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1059 d = _mm_sub_pd(r00,rswitch);
1060 d = _mm_max_pd(d,_mm_setzero_pd());
1061 d2 = _mm_mul_pd(d,d);
1062 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1064 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1066 /* Evaluate switch function */
1067 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1068 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1069 vvdw = _mm_mul_pd(vvdw,sw);
1070 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1072 /* Update potential sum for this i atom from the interaction with this j atom. */
1073 vvdw = _mm_and_pd(vvdw,cutoff_mask);
1074 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1075 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
1079 fscal = _mm_and_pd(fscal,cutoff_mask);
1081 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1083 /* Update vectorial force */
1084 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1085 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1086 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1088 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1089 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1090 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1094 /**************************
1095 * CALCULATE INTERACTIONS *
1096 **************************/
1098 if (gmx_mm_any_lt(rsq11,rcutoff2))
1101 r11 = _mm_mul_pd(rsq11,rinv11);
1103 /* EWALD ELECTROSTATICS */
1105 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1106 ewrt = _mm_mul_pd(r11,ewtabscale);
1107 ewitab = _mm_cvttpd_epi32(ewrt);
1109 eweps = _mm_frcz_pd(ewrt);
1111 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1113 twoeweps = _mm_add_pd(eweps,eweps);
1114 ewitab = _mm_slli_epi32(ewitab,2);
1115 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1116 ewtabD = _mm_setzero_pd();
1117 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1118 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1119 ewtabFn = _mm_setzero_pd();
1120 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1121 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1122 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1123 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1124 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1126 d = _mm_sub_pd(r11,rswitch);
1127 d = _mm_max_pd(d,_mm_setzero_pd());
1128 d2 = _mm_mul_pd(d,d);
1129 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1131 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1133 /* Evaluate switch function */
1134 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1135 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1136 velec = _mm_mul_pd(velec,sw);
1137 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1139 /* Update potential sum for this i atom from the interaction with this j atom. */
1140 velec = _mm_and_pd(velec,cutoff_mask);
1141 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1142 velecsum = _mm_add_pd(velecsum,velec);
1146 fscal = _mm_and_pd(fscal,cutoff_mask);
1148 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1150 /* Update vectorial force */
1151 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1152 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1153 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1155 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1156 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1157 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1161 /**************************
1162 * CALCULATE INTERACTIONS *
1163 **************************/
1165 if (gmx_mm_any_lt(rsq12,rcutoff2))
1168 r12 = _mm_mul_pd(rsq12,rinv12);
1170 /* EWALD ELECTROSTATICS */
1172 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1173 ewrt = _mm_mul_pd(r12,ewtabscale);
1174 ewitab = _mm_cvttpd_epi32(ewrt);
1176 eweps = _mm_frcz_pd(ewrt);
1178 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1180 twoeweps = _mm_add_pd(eweps,eweps);
1181 ewitab = _mm_slli_epi32(ewitab,2);
1182 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1183 ewtabD = _mm_setzero_pd();
1184 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1185 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1186 ewtabFn = _mm_setzero_pd();
1187 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1188 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1189 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1190 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1191 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1193 d = _mm_sub_pd(r12,rswitch);
1194 d = _mm_max_pd(d,_mm_setzero_pd());
1195 d2 = _mm_mul_pd(d,d);
1196 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1198 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1200 /* Evaluate switch function */
1201 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1202 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1203 velec = _mm_mul_pd(velec,sw);
1204 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1206 /* Update potential sum for this i atom from the interaction with this j atom. */
1207 velec = _mm_and_pd(velec,cutoff_mask);
1208 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1209 velecsum = _mm_add_pd(velecsum,velec);
1213 fscal = _mm_and_pd(fscal,cutoff_mask);
1215 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1217 /* Update vectorial force */
1218 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1219 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1220 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1222 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1223 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1224 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1228 /**************************
1229 * CALCULATE INTERACTIONS *
1230 **************************/
1232 if (gmx_mm_any_lt(rsq13,rcutoff2))
1235 r13 = _mm_mul_pd(rsq13,rinv13);
1237 /* EWALD ELECTROSTATICS */
1239 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1240 ewrt = _mm_mul_pd(r13,ewtabscale);
1241 ewitab = _mm_cvttpd_epi32(ewrt);
1243 eweps = _mm_frcz_pd(ewrt);
1245 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1247 twoeweps = _mm_add_pd(eweps,eweps);
1248 ewitab = _mm_slli_epi32(ewitab,2);
1249 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1250 ewtabD = _mm_setzero_pd();
1251 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1252 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1253 ewtabFn = _mm_setzero_pd();
1254 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1255 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1256 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1257 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1258 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1260 d = _mm_sub_pd(r13,rswitch);
1261 d = _mm_max_pd(d,_mm_setzero_pd());
1262 d2 = _mm_mul_pd(d,d);
1263 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1265 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1267 /* Evaluate switch function */
1268 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1269 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1270 velec = _mm_mul_pd(velec,sw);
1271 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1273 /* Update potential sum for this i atom from the interaction with this j atom. */
1274 velec = _mm_and_pd(velec,cutoff_mask);
1275 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1276 velecsum = _mm_add_pd(velecsum,velec);
1280 fscal = _mm_and_pd(fscal,cutoff_mask);
1282 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1284 /* Update vectorial force */
1285 fix1 = _mm_macc_pd(dx13,fscal,fix1);
1286 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
1287 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
1289 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
1290 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
1291 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
1295 /**************************
1296 * CALCULATE INTERACTIONS *
1297 **************************/
1299 if (gmx_mm_any_lt(rsq21,rcutoff2))
1302 r21 = _mm_mul_pd(rsq21,rinv21);
1304 /* EWALD ELECTROSTATICS */
1306 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1307 ewrt = _mm_mul_pd(r21,ewtabscale);
1308 ewitab = _mm_cvttpd_epi32(ewrt);
1310 eweps = _mm_frcz_pd(ewrt);
1312 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1314 twoeweps = _mm_add_pd(eweps,eweps);
1315 ewitab = _mm_slli_epi32(ewitab,2);
1316 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1317 ewtabD = _mm_setzero_pd();
1318 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1319 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1320 ewtabFn = _mm_setzero_pd();
1321 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1322 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1323 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1324 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1325 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1327 d = _mm_sub_pd(r21,rswitch);
1328 d = _mm_max_pd(d,_mm_setzero_pd());
1329 d2 = _mm_mul_pd(d,d);
1330 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1332 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1334 /* Evaluate switch function */
1335 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1336 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1337 velec = _mm_mul_pd(velec,sw);
1338 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1340 /* Update potential sum for this i atom from the interaction with this j atom. */
1341 velec = _mm_and_pd(velec,cutoff_mask);
1342 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1343 velecsum = _mm_add_pd(velecsum,velec);
1347 fscal = _mm_and_pd(fscal,cutoff_mask);
1349 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1351 /* Update vectorial force */
1352 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1353 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1354 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1356 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1357 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1358 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1362 /**************************
1363 * CALCULATE INTERACTIONS *
1364 **************************/
1366 if (gmx_mm_any_lt(rsq22,rcutoff2))
1369 r22 = _mm_mul_pd(rsq22,rinv22);
1371 /* EWALD ELECTROSTATICS */
1373 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1374 ewrt = _mm_mul_pd(r22,ewtabscale);
1375 ewitab = _mm_cvttpd_epi32(ewrt);
1377 eweps = _mm_frcz_pd(ewrt);
1379 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1381 twoeweps = _mm_add_pd(eweps,eweps);
1382 ewitab = _mm_slli_epi32(ewitab,2);
1383 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1384 ewtabD = _mm_setzero_pd();
1385 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1386 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1387 ewtabFn = _mm_setzero_pd();
1388 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1389 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1390 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1391 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1392 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1394 d = _mm_sub_pd(r22,rswitch);
1395 d = _mm_max_pd(d,_mm_setzero_pd());
1396 d2 = _mm_mul_pd(d,d);
1397 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1399 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1401 /* Evaluate switch function */
1402 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1403 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1404 velec = _mm_mul_pd(velec,sw);
1405 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1407 /* Update potential sum for this i atom from the interaction with this j atom. */
1408 velec = _mm_and_pd(velec,cutoff_mask);
1409 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1410 velecsum = _mm_add_pd(velecsum,velec);
1414 fscal = _mm_and_pd(fscal,cutoff_mask);
1416 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1418 /* Update vectorial force */
1419 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1420 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1421 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1423 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1424 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1425 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1429 /**************************
1430 * CALCULATE INTERACTIONS *
1431 **************************/
1433 if (gmx_mm_any_lt(rsq23,rcutoff2))
1436 r23 = _mm_mul_pd(rsq23,rinv23);
1438 /* EWALD ELECTROSTATICS */
1440 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1441 ewrt = _mm_mul_pd(r23,ewtabscale);
1442 ewitab = _mm_cvttpd_epi32(ewrt);
1444 eweps = _mm_frcz_pd(ewrt);
1446 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1448 twoeweps = _mm_add_pd(eweps,eweps);
1449 ewitab = _mm_slli_epi32(ewitab,2);
1450 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1451 ewtabD = _mm_setzero_pd();
1452 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1453 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1454 ewtabFn = _mm_setzero_pd();
1455 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1456 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1457 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1458 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1459 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1461 d = _mm_sub_pd(r23,rswitch);
1462 d = _mm_max_pd(d,_mm_setzero_pd());
1463 d2 = _mm_mul_pd(d,d);
1464 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1466 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1468 /* Evaluate switch function */
1469 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1470 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
1471 velec = _mm_mul_pd(velec,sw);
1472 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1474 /* Update potential sum for this i atom from the interaction with this j atom. */
1475 velec = _mm_and_pd(velec,cutoff_mask);
1476 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1477 velecsum = _mm_add_pd(velecsum,velec);
1481 fscal = _mm_and_pd(fscal,cutoff_mask);
1483 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1485 /* Update vectorial force */
1486 fix2 = _mm_macc_pd(dx23,fscal,fix2);
1487 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
1488 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
1490 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
1491 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
1492 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
1496 /**************************
1497 * CALCULATE INTERACTIONS *
1498 **************************/
1500 if (gmx_mm_any_lt(rsq31,rcutoff2))
1503 r31 = _mm_mul_pd(rsq31,rinv31);
1505 /* EWALD ELECTROSTATICS */
1507 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1508 ewrt = _mm_mul_pd(r31,ewtabscale);
1509 ewitab = _mm_cvttpd_epi32(ewrt);
1511 eweps = _mm_frcz_pd(ewrt);
1513 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1515 twoeweps = _mm_add_pd(eweps,eweps);
1516 ewitab = _mm_slli_epi32(ewitab,2);
1517 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1518 ewtabD = _mm_setzero_pd();
1519 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1520 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1521 ewtabFn = _mm_setzero_pd();
1522 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1523 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1524 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1525 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1526 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1528 d = _mm_sub_pd(r31,rswitch);
1529 d = _mm_max_pd(d,_mm_setzero_pd());
1530 d2 = _mm_mul_pd(d,d);
1531 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1533 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1535 /* Evaluate switch function */
1536 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1537 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
1538 velec = _mm_mul_pd(velec,sw);
1539 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1541 /* Update potential sum for this i atom from the interaction with this j atom. */
1542 velec = _mm_and_pd(velec,cutoff_mask);
1543 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1544 velecsum = _mm_add_pd(velecsum,velec);
1548 fscal = _mm_and_pd(fscal,cutoff_mask);
1550 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1552 /* Update vectorial force */
1553 fix3 = _mm_macc_pd(dx31,fscal,fix3);
1554 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
1555 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
1557 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
1558 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
1559 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
1563 /**************************
1564 * CALCULATE INTERACTIONS *
1565 **************************/
1567 if (gmx_mm_any_lt(rsq32,rcutoff2))
1570 r32 = _mm_mul_pd(rsq32,rinv32);
1572 /* EWALD ELECTROSTATICS */
1574 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1575 ewrt = _mm_mul_pd(r32,ewtabscale);
1576 ewitab = _mm_cvttpd_epi32(ewrt);
1578 eweps = _mm_frcz_pd(ewrt);
1580 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1582 twoeweps = _mm_add_pd(eweps,eweps);
1583 ewitab = _mm_slli_epi32(ewitab,2);
1584 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1585 ewtabD = _mm_setzero_pd();
1586 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1587 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1588 ewtabFn = _mm_setzero_pd();
1589 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1590 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1591 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1592 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1593 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1595 d = _mm_sub_pd(r32,rswitch);
1596 d = _mm_max_pd(d,_mm_setzero_pd());
1597 d2 = _mm_mul_pd(d,d);
1598 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1600 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1602 /* Evaluate switch function */
1603 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1604 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
1605 velec = _mm_mul_pd(velec,sw);
1606 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1608 /* Update potential sum for this i atom from the interaction with this j atom. */
1609 velec = _mm_and_pd(velec,cutoff_mask);
1610 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1611 velecsum = _mm_add_pd(velecsum,velec);
1615 fscal = _mm_and_pd(fscal,cutoff_mask);
1617 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1619 /* Update vectorial force */
1620 fix3 = _mm_macc_pd(dx32,fscal,fix3);
1621 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
1622 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
1624 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
1625 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
1626 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
1630 /**************************
1631 * CALCULATE INTERACTIONS *
1632 **************************/
1634 if (gmx_mm_any_lt(rsq33,rcutoff2))
1637 r33 = _mm_mul_pd(rsq33,rinv33);
1639 /* EWALD ELECTROSTATICS */
1641 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1642 ewrt = _mm_mul_pd(r33,ewtabscale);
1643 ewitab = _mm_cvttpd_epi32(ewrt);
1645 eweps = _mm_frcz_pd(ewrt);
1647 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1649 twoeweps = _mm_add_pd(eweps,eweps);
1650 ewitab = _mm_slli_epi32(ewitab,2);
1651 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1652 ewtabD = _mm_setzero_pd();
1653 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1654 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1655 ewtabFn = _mm_setzero_pd();
1656 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1657 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1658 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1659 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1660 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1662 d = _mm_sub_pd(r33,rswitch);
1663 d = _mm_max_pd(d,_mm_setzero_pd());
1664 d2 = _mm_mul_pd(d,d);
1665 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1667 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1669 /* Evaluate switch function */
1670 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1671 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
1672 velec = _mm_mul_pd(velec,sw);
1673 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1675 /* Update potential sum for this i atom from the interaction with this j atom. */
1676 velec = _mm_and_pd(velec,cutoff_mask);
1677 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1678 velecsum = _mm_add_pd(velecsum,velec);
1682 fscal = _mm_and_pd(fscal,cutoff_mask);
1684 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1686 /* Update vectorial force */
1687 fix3 = _mm_macc_pd(dx33,fscal,fix3);
1688 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
1689 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
1691 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
1692 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
1693 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
1697 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1699 /* Inner loop uses 677 flops */
1702 /* End of innermost loop */
1704 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1705 f+i_coord_offset,fshift+i_shift_offset);
1708 /* Update potential energies */
1709 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1710 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1712 /* Increment number of inner iterations */
1713 inneriter += j_index_end - j_index_start;
1715 /* Outer loop uses 26 flops */
1718 /* Increment number of outer iterations */
1721 /* Update outer/inner flops */
1723 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*677);
1726 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_double
1727 * Electrostatics interaction: Ewald
1728 * VdW interaction: LennardJones
1729 * Geometry: Water4-Water4
1730 * Calculate force/pot: Force
1733 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_double
1734 (t_nblist * gmx_restrict nlist,
1735 rvec * gmx_restrict xx,
1736 rvec * gmx_restrict ff,
1737 t_forcerec * gmx_restrict fr,
1738 t_mdatoms * gmx_restrict mdatoms,
1739 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1740 t_nrnb * gmx_restrict nrnb)
1742 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1743 * just 0 for non-waters.
1744 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1745 * jnr indices corresponding to data put in the four positions in the SIMD register.
1747 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1748 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1750 int j_coord_offsetA,j_coord_offsetB;
1751 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1752 real rcutoff_scalar;
1753 real *shiftvec,*fshift,*x,*f;
1754 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1756 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1758 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1760 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1762 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1763 int vdwjidx0A,vdwjidx0B;
1764 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1765 int vdwjidx1A,vdwjidx1B;
1766 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1767 int vdwjidx2A,vdwjidx2B;
1768 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1769 int vdwjidx3A,vdwjidx3B;
1770 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1771 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1772 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1773 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1774 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1775 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1776 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1777 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1778 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1779 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1780 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1781 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1784 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1787 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1788 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1790 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1792 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1793 real rswitch_scalar,d_scalar;
1794 __m128d dummy_mask,cutoff_mask;
1795 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1796 __m128d one = _mm_set1_pd(1.0);
1797 __m128d two = _mm_set1_pd(2.0);
1803 jindex = nlist->jindex;
1805 shiftidx = nlist->shift;
1807 shiftvec = fr->shift_vec[0];
1808 fshift = fr->fshift[0];
1809 facel = _mm_set1_pd(fr->epsfac);
1810 charge = mdatoms->chargeA;
1811 nvdwtype = fr->ntype;
1812 vdwparam = fr->nbfp;
1813 vdwtype = mdatoms->typeA;
1815 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1816 ewtab = fr->ic->tabq_coul_FDV0;
1817 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1818 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1820 /* Setup water-specific parameters */
1821 inr = nlist->iinr[0];
1822 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1823 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1824 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1825 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1827 jq1 = _mm_set1_pd(charge[inr+1]);
1828 jq2 = _mm_set1_pd(charge[inr+2]);
1829 jq3 = _mm_set1_pd(charge[inr+3]);
1830 vdwjidx0A = 2*vdwtype[inr+0];
1831 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1832 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1833 qq11 = _mm_mul_pd(iq1,jq1);
1834 qq12 = _mm_mul_pd(iq1,jq2);
1835 qq13 = _mm_mul_pd(iq1,jq3);
1836 qq21 = _mm_mul_pd(iq2,jq1);
1837 qq22 = _mm_mul_pd(iq2,jq2);
1838 qq23 = _mm_mul_pd(iq2,jq3);
1839 qq31 = _mm_mul_pd(iq3,jq1);
1840 qq32 = _mm_mul_pd(iq3,jq2);
1841 qq33 = _mm_mul_pd(iq3,jq3);
1843 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1844 rcutoff_scalar = fr->rcoulomb;
1845 rcutoff = _mm_set1_pd(rcutoff_scalar);
1846 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1848 rswitch_scalar = fr->rcoulomb_switch;
1849 rswitch = _mm_set1_pd(rswitch_scalar);
1850 /* Setup switch parameters */
1851 d_scalar = rcutoff_scalar-rswitch_scalar;
1852 d = _mm_set1_pd(d_scalar);
1853 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1854 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1855 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1856 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1857 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1858 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1860 /* Avoid stupid compiler warnings */
1862 j_coord_offsetA = 0;
1863 j_coord_offsetB = 0;
1868 /* Start outer loop over neighborlists */
1869 for(iidx=0; iidx<nri; iidx++)
1871 /* Load shift vector for this list */
1872 i_shift_offset = DIM*shiftidx[iidx];
1874 /* Load limits for loop over neighbors */
1875 j_index_start = jindex[iidx];
1876 j_index_end = jindex[iidx+1];
1878 /* Get outer coordinate index */
1880 i_coord_offset = DIM*inr;
1882 /* Load i particle coords and add shift vector */
1883 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1884 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1886 fix0 = _mm_setzero_pd();
1887 fiy0 = _mm_setzero_pd();
1888 fiz0 = _mm_setzero_pd();
1889 fix1 = _mm_setzero_pd();
1890 fiy1 = _mm_setzero_pd();
1891 fiz1 = _mm_setzero_pd();
1892 fix2 = _mm_setzero_pd();
1893 fiy2 = _mm_setzero_pd();
1894 fiz2 = _mm_setzero_pd();
1895 fix3 = _mm_setzero_pd();
1896 fiy3 = _mm_setzero_pd();
1897 fiz3 = _mm_setzero_pd();
1899 /* Start inner kernel loop */
1900 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1903 /* Get j neighbor index, and coordinate index */
1905 jnrB = jjnr[jidx+1];
1906 j_coord_offsetA = DIM*jnrA;
1907 j_coord_offsetB = DIM*jnrB;
1909 /* load j atom coordinates */
1910 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1911 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1912 &jy2,&jz2,&jx3,&jy3,&jz3);
1914 /* Calculate displacement vector */
1915 dx00 = _mm_sub_pd(ix0,jx0);
1916 dy00 = _mm_sub_pd(iy0,jy0);
1917 dz00 = _mm_sub_pd(iz0,jz0);
1918 dx11 = _mm_sub_pd(ix1,jx1);
1919 dy11 = _mm_sub_pd(iy1,jy1);
1920 dz11 = _mm_sub_pd(iz1,jz1);
1921 dx12 = _mm_sub_pd(ix1,jx2);
1922 dy12 = _mm_sub_pd(iy1,jy2);
1923 dz12 = _mm_sub_pd(iz1,jz2);
1924 dx13 = _mm_sub_pd(ix1,jx3);
1925 dy13 = _mm_sub_pd(iy1,jy3);
1926 dz13 = _mm_sub_pd(iz1,jz3);
1927 dx21 = _mm_sub_pd(ix2,jx1);
1928 dy21 = _mm_sub_pd(iy2,jy1);
1929 dz21 = _mm_sub_pd(iz2,jz1);
1930 dx22 = _mm_sub_pd(ix2,jx2);
1931 dy22 = _mm_sub_pd(iy2,jy2);
1932 dz22 = _mm_sub_pd(iz2,jz2);
1933 dx23 = _mm_sub_pd(ix2,jx3);
1934 dy23 = _mm_sub_pd(iy2,jy3);
1935 dz23 = _mm_sub_pd(iz2,jz3);
1936 dx31 = _mm_sub_pd(ix3,jx1);
1937 dy31 = _mm_sub_pd(iy3,jy1);
1938 dz31 = _mm_sub_pd(iz3,jz1);
1939 dx32 = _mm_sub_pd(ix3,jx2);
1940 dy32 = _mm_sub_pd(iy3,jy2);
1941 dz32 = _mm_sub_pd(iz3,jz2);
1942 dx33 = _mm_sub_pd(ix3,jx3);
1943 dy33 = _mm_sub_pd(iy3,jy3);
1944 dz33 = _mm_sub_pd(iz3,jz3);
1946 /* Calculate squared distance and things based on it */
1947 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1948 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1949 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1950 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1951 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1952 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1953 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1954 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1955 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1956 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1958 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1959 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1960 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1961 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1962 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1963 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1964 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1965 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1966 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1967 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1969 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1970 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1971 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1972 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1973 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1974 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1975 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1976 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1977 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1978 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1980 fjx0 = _mm_setzero_pd();
1981 fjy0 = _mm_setzero_pd();
1982 fjz0 = _mm_setzero_pd();
1983 fjx1 = _mm_setzero_pd();
1984 fjy1 = _mm_setzero_pd();
1985 fjz1 = _mm_setzero_pd();
1986 fjx2 = _mm_setzero_pd();
1987 fjy2 = _mm_setzero_pd();
1988 fjz2 = _mm_setzero_pd();
1989 fjx3 = _mm_setzero_pd();
1990 fjy3 = _mm_setzero_pd();
1991 fjz3 = _mm_setzero_pd();
1993 /**************************
1994 * CALCULATE INTERACTIONS *
1995 **************************/
1997 if (gmx_mm_any_lt(rsq00,rcutoff2))
2000 r00 = _mm_mul_pd(rsq00,rinv00);
2002 /* LENNARD-JONES DISPERSION/REPULSION */
2004 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2005 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2006 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2007 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
2008 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2010 d = _mm_sub_pd(r00,rswitch);
2011 d = _mm_max_pd(d,_mm_setzero_pd());
2012 d2 = _mm_mul_pd(d,d);
2013 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2015 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2017 /* Evaluate switch function */
2018 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2019 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2020 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2024 fscal = _mm_and_pd(fscal,cutoff_mask);
2026 /* Update vectorial force */
2027 fix0 = _mm_macc_pd(dx00,fscal,fix0);
2028 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
2029 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
2031 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
2032 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
2033 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
2037 /**************************
2038 * CALCULATE INTERACTIONS *
2039 **************************/
2041 if (gmx_mm_any_lt(rsq11,rcutoff2))
2044 r11 = _mm_mul_pd(rsq11,rinv11);
2046 /* EWALD ELECTROSTATICS */
2048 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2049 ewrt = _mm_mul_pd(r11,ewtabscale);
2050 ewitab = _mm_cvttpd_epi32(ewrt);
2052 eweps = _mm_frcz_pd(ewrt);
2054 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2056 twoeweps = _mm_add_pd(eweps,eweps);
2057 ewitab = _mm_slli_epi32(ewitab,2);
2058 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2059 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2060 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2061 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2062 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2063 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2064 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2065 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2066 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2067 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2069 d = _mm_sub_pd(r11,rswitch);
2070 d = _mm_max_pd(d,_mm_setzero_pd());
2071 d2 = _mm_mul_pd(d,d);
2072 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2074 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2076 /* Evaluate switch function */
2077 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2078 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2079 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2083 fscal = _mm_and_pd(fscal,cutoff_mask);
2085 /* Update vectorial force */
2086 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2087 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2088 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2090 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2091 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2092 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2096 /**************************
2097 * CALCULATE INTERACTIONS *
2098 **************************/
2100 if (gmx_mm_any_lt(rsq12,rcutoff2))
2103 r12 = _mm_mul_pd(rsq12,rinv12);
2105 /* EWALD ELECTROSTATICS */
2107 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2108 ewrt = _mm_mul_pd(r12,ewtabscale);
2109 ewitab = _mm_cvttpd_epi32(ewrt);
2111 eweps = _mm_frcz_pd(ewrt);
2113 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2115 twoeweps = _mm_add_pd(eweps,eweps);
2116 ewitab = _mm_slli_epi32(ewitab,2);
2117 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2118 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2119 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2120 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2121 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2122 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2123 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2124 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2125 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2126 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2128 d = _mm_sub_pd(r12,rswitch);
2129 d = _mm_max_pd(d,_mm_setzero_pd());
2130 d2 = _mm_mul_pd(d,d);
2131 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2133 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2135 /* Evaluate switch function */
2136 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2137 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2138 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2142 fscal = _mm_and_pd(fscal,cutoff_mask);
2144 /* Update vectorial force */
2145 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2146 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2147 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2149 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2150 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2151 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2155 /**************************
2156 * CALCULATE INTERACTIONS *
2157 **************************/
2159 if (gmx_mm_any_lt(rsq13,rcutoff2))
2162 r13 = _mm_mul_pd(rsq13,rinv13);
2164 /* EWALD ELECTROSTATICS */
2166 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2167 ewrt = _mm_mul_pd(r13,ewtabscale);
2168 ewitab = _mm_cvttpd_epi32(ewrt);
2170 eweps = _mm_frcz_pd(ewrt);
2172 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2174 twoeweps = _mm_add_pd(eweps,eweps);
2175 ewitab = _mm_slli_epi32(ewitab,2);
2176 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2177 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2178 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2179 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2180 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2181 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2182 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2183 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2184 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2185 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2187 d = _mm_sub_pd(r13,rswitch);
2188 d = _mm_max_pd(d,_mm_setzero_pd());
2189 d2 = _mm_mul_pd(d,d);
2190 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2192 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2194 /* Evaluate switch function */
2195 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2196 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2197 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2201 fscal = _mm_and_pd(fscal,cutoff_mask);
2203 /* Update vectorial force */
2204 fix1 = _mm_macc_pd(dx13,fscal,fix1);
2205 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
2206 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
2208 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
2209 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
2210 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
2214 /**************************
2215 * CALCULATE INTERACTIONS *
2216 **************************/
2218 if (gmx_mm_any_lt(rsq21,rcutoff2))
2221 r21 = _mm_mul_pd(rsq21,rinv21);
2223 /* EWALD ELECTROSTATICS */
2225 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2226 ewrt = _mm_mul_pd(r21,ewtabscale);
2227 ewitab = _mm_cvttpd_epi32(ewrt);
2229 eweps = _mm_frcz_pd(ewrt);
2231 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2233 twoeweps = _mm_add_pd(eweps,eweps);
2234 ewitab = _mm_slli_epi32(ewitab,2);
2235 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2236 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2237 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2238 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2239 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2240 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2241 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2242 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2243 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2244 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2246 d = _mm_sub_pd(r21,rswitch);
2247 d = _mm_max_pd(d,_mm_setzero_pd());
2248 d2 = _mm_mul_pd(d,d);
2249 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2251 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2253 /* Evaluate switch function */
2254 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2255 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2256 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2260 fscal = _mm_and_pd(fscal,cutoff_mask);
2262 /* Update vectorial force */
2263 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2264 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2265 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2267 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2268 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2269 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2273 /**************************
2274 * CALCULATE INTERACTIONS *
2275 **************************/
2277 if (gmx_mm_any_lt(rsq22,rcutoff2))
2280 r22 = _mm_mul_pd(rsq22,rinv22);
2282 /* EWALD ELECTROSTATICS */
2284 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2285 ewrt = _mm_mul_pd(r22,ewtabscale);
2286 ewitab = _mm_cvttpd_epi32(ewrt);
2288 eweps = _mm_frcz_pd(ewrt);
2290 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2292 twoeweps = _mm_add_pd(eweps,eweps);
2293 ewitab = _mm_slli_epi32(ewitab,2);
2294 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2295 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2296 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2297 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2298 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2299 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2300 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2301 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2302 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2303 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2305 d = _mm_sub_pd(r22,rswitch);
2306 d = _mm_max_pd(d,_mm_setzero_pd());
2307 d2 = _mm_mul_pd(d,d);
2308 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2310 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2312 /* Evaluate switch function */
2313 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2314 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2315 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2319 fscal = _mm_and_pd(fscal,cutoff_mask);
2321 /* Update vectorial force */
2322 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2323 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2324 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2326 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2327 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2328 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2332 /**************************
2333 * CALCULATE INTERACTIONS *
2334 **************************/
2336 if (gmx_mm_any_lt(rsq23,rcutoff2))
2339 r23 = _mm_mul_pd(rsq23,rinv23);
2341 /* EWALD ELECTROSTATICS */
2343 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2344 ewrt = _mm_mul_pd(r23,ewtabscale);
2345 ewitab = _mm_cvttpd_epi32(ewrt);
2347 eweps = _mm_frcz_pd(ewrt);
2349 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2351 twoeweps = _mm_add_pd(eweps,eweps);
2352 ewitab = _mm_slli_epi32(ewitab,2);
2353 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2354 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2355 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2356 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2357 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2358 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2359 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2360 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2361 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2362 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2364 d = _mm_sub_pd(r23,rswitch);
2365 d = _mm_max_pd(d,_mm_setzero_pd());
2366 d2 = _mm_mul_pd(d,d);
2367 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2369 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2371 /* Evaluate switch function */
2372 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2373 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2374 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2378 fscal = _mm_and_pd(fscal,cutoff_mask);
2380 /* Update vectorial force */
2381 fix2 = _mm_macc_pd(dx23,fscal,fix2);
2382 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
2383 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
2385 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
2386 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
2387 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
2391 /**************************
2392 * CALCULATE INTERACTIONS *
2393 **************************/
2395 if (gmx_mm_any_lt(rsq31,rcutoff2))
2398 r31 = _mm_mul_pd(rsq31,rinv31);
2400 /* EWALD ELECTROSTATICS */
2402 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2403 ewrt = _mm_mul_pd(r31,ewtabscale);
2404 ewitab = _mm_cvttpd_epi32(ewrt);
2406 eweps = _mm_frcz_pd(ewrt);
2408 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2410 twoeweps = _mm_add_pd(eweps,eweps);
2411 ewitab = _mm_slli_epi32(ewitab,2);
2412 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2413 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2414 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2415 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2416 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2417 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2418 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2419 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2420 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2421 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2423 d = _mm_sub_pd(r31,rswitch);
2424 d = _mm_max_pd(d,_mm_setzero_pd());
2425 d2 = _mm_mul_pd(d,d);
2426 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2428 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2430 /* Evaluate switch function */
2431 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2432 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2433 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2437 fscal = _mm_and_pd(fscal,cutoff_mask);
2439 /* Update vectorial force */
2440 fix3 = _mm_macc_pd(dx31,fscal,fix3);
2441 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
2442 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
2444 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
2445 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
2446 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
2450 /**************************
2451 * CALCULATE INTERACTIONS *
2452 **************************/
2454 if (gmx_mm_any_lt(rsq32,rcutoff2))
2457 r32 = _mm_mul_pd(rsq32,rinv32);
2459 /* EWALD ELECTROSTATICS */
2461 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2462 ewrt = _mm_mul_pd(r32,ewtabscale);
2463 ewitab = _mm_cvttpd_epi32(ewrt);
2465 eweps = _mm_frcz_pd(ewrt);
2467 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2469 twoeweps = _mm_add_pd(eweps,eweps);
2470 ewitab = _mm_slli_epi32(ewitab,2);
2471 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2472 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2473 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2474 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2475 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2476 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2477 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2478 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2479 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2480 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2482 d = _mm_sub_pd(r32,rswitch);
2483 d = _mm_max_pd(d,_mm_setzero_pd());
2484 d2 = _mm_mul_pd(d,d);
2485 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2487 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2489 /* Evaluate switch function */
2490 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2491 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2492 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2496 fscal = _mm_and_pd(fscal,cutoff_mask);
2498 /* Update vectorial force */
2499 fix3 = _mm_macc_pd(dx32,fscal,fix3);
2500 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
2501 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
2503 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
2504 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
2505 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
2509 /**************************
2510 * CALCULATE INTERACTIONS *
2511 **************************/
2513 if (gmx_mm_any_lt(rsq33,rcutoff2))
2516 r33 = _mm_mul_pd(rsq33,rinv33);
2518 /* EWALD ELECTROSTATICS */
2520 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2521 ewrt = _mm_mul_pd(r33,ewtabscale);
2522 ewitab = _mm_cvttpd_epi32(ewrt);
2524 eweps = _mm_frcz_pd(ewrt);
2526 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2528 twoeweps = _mm_add_pd(eweps,eweps);
2529 ewitab = _mm_slli_epi32(ewitab,2);
2530 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2531 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2532 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2533 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2534 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2535 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2536 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2537 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2538 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2539 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2541 d = _mm_sub_pd(r33,rswitch);
2542 d = _mm_max_pd(d,_mm_setzero_pd());
2543 d2 = _mm_mul_pd(d,d);
2544 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2546 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2548 /* Evaluate switch function */
2549 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2550 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2551 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2555 fscal = _mm_and_pd(fscal,cutoff_mask);
2557 /* Update vectorial force */
2558 fix3 = _mm_macc_pd(dx33,fscal,fix3);
2559 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
2560 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
2562 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
2563 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
2564 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
2568 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2570 /* Inner loop uses 647 flops */
2573 if(jidx<j_index_end)
2577 j_coord_offsetA = DIM*jnrA;
2579 /* load j atom coordinates */
2580 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2581 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2582 &jy2,&jz2,&jx3,&jy3,&jz3);
2584 /* Calculate displacement vector */
2585 dx00 = _mm_sub_pd(ix0,jx0);
2586 dy00 = _mm_sub_pd(iy0,jy0);
2587 dz00 = _mm_sub_pd(iz0,jz0);
2588 dx11 = _mm_sub_pd(ix1,jx1);
2589 dy11 = _mm_sub_pd(iy1,jy1);
2590 dz11 = _mm_sub_pd(iz1,jz1);
2591 dx12 = _mm_sub_pd(ix1,jx2);
2592 dy12 = _mm_sub_pd(iy1,jy2);
2593 dz12 = _mm_sub_pd(iz1,jz2);
2594 dx13 = _mm_sub_pd(ix1,jx3);
2595 dy13 = _mm_sub_pd(iy1,jy3);
2596 dz13 = _mm_sub_pd(iz1,jz3);
2597 dx21 = _mm_sub_pd(ix2,jx1);
2598 dy21 = _mm_sub_pd(iy2,jy1);
2599 dz21 = _mm_sub_pd(iz2,jz1);
2600 dx22 = _mm_sub_pd(ix2,jx2);
2601 dy22 = _mm_sub_pd(iy2,jy2);
2602 dz22 = _mm_sub_pd(iz2,jz2);
2603 dx23 = _mm_sub_pd(ix2,jx3);
2604 dy23 = _mm_sub_pd(iy2,jy3);
2605 dz23 = _mm_sub_pd(iz2,jz3);
2606 dx31 = _mm_sub_pd(ix3,jx1);
2607 dy31 = _mm_sub_pd(iy3,jy1);
2608 dz31 = _mm_sub_pd(iz3,jz1);
2609 dx32 = _mm_sub_pd(ix3,jx2);
2610 dy32 = _mm_sub_pd(iy3,jy2);
2611 dz32 = _mm_sub_pd(iz3,jz2);
2612 dx33 = _mm_sub_pd(ix3,jx3);
2613 dy33 = _mm_sub_pd(iy3,jy3);
2614 dz33 = _mm_sub_pd(iz3,jz3);
2616 /* Calculate squared distance and things based on it */
2617 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2618 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2619 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2620 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2621 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2622 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2623 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2624 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2625 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2626 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2628 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2629 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2630 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2631 rinv13 = gmx_mm_invsqrt_pd(rsq13);
2632 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2633 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2634 rinv23 = gmx_mm_invsqrt_pd(rsq23);
2635 rinv31 = gmx_mm_invsqrt_pd(rsq31);
2636 rinv32 = gmx_mm_invsqrt_pd(rsq32);
2637 rinv33 = gmx_mm_invsqrt_pd(rsq33);
2639 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2640 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2641 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2642 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
2643 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2644 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2645 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
2646 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
2647 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
2648 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
2650 fjx0 = _mm_setzero_pd();
2651 fjy0 = _mm_setzero_pd();
2652 fjz0 = _mm_setzero_pd();
2653 fjx1 = _mm_setzero_pd();
2654 fjy1 = _mm_setzero_pd();
2655 fjz1 = _mm_setzero_pd();
2656 fjx2 = _mm_setzero_pd();
2657 fjy2 = _mm_setzero_pd();
2658 fjz2 = _mm_setzero_pd();
2659 fjx3 = _mm_setzero_pd();
2660 fjy3 = _mm_setzero_pd();
2661 fjz3 = _mm_setzero_pd();
2663 /**************************
2664 * CALCULATE INTERACTIONS *
2665 **************************/
2667 if (gmx_mm_any_lt(rsq00,rcutoff2))
2670 r00 = _mm_mul_pd(rsq00,rinv00);
2672 /* LENNARD-JONES DISPERSION/REPULSION */
2674 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2675 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2676 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2677 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
2678 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2680 d = _mm_sub_pd(r00,rswitch);
2681 d = _mm_max_pd(d,_mm_setzero_pd());
2682 d2 = _mm_mul_pd(d,d);
2683 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2685 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2687 /* Evaluate switch function */
2688 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2689 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2690 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2694 fscal = _mm_and_pd(fscal,cutoff_mask);
2696 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2698 /* Update vectorial force */
2699 fix0 = _mm_macc_pd(dx00,fscal,fix0);
2700 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
2701 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
2703 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
2704 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
2705 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
2709 /**************************
2710 * CALCULATE INTERACTIONS *
2711 **************************/
2713 if (gmx_mm_any_lt(rsq11,rcutoff2))
2716 r11 = _mm_mul_pd(rsq11,rinv11);
2718 /* EWALD ELECTROSTATICS */
2720 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2721 ewrt = _mm_mul_pd(r11,ewtabscale);
2722 ewitab = _mm_cvttpd_epi32(ewrt);
2724 eweps = _mm_frcz_pd(ewrt);
2726 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2728 twoeweps = _mm_add_pd(eweps,eweps);
2729 ewitab = _mm_slli_epi32(ewitab,2);
2730 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2731 ewtabD = _mm_setzero_pd();
2732 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2733 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2734 ewtabFn = _mm_setzero_pd();
2735 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2736 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2737 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2738 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2739 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2741 d = _mm_sub_pd(r11,rswitch);
2742 d = _mm_max_pd(d,_mm_setzero_pd());
2743 d2 = _mm_mul_pd(d,d);
2744 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2746 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2748 /* Evaluate switch function */
2749 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2750 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2751 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2755 fscal = _mm_and_pd(fscal,cutoff_mask);
2757 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2759 /* Update vectorial force */
2760 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2761 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2762 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2764 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2765 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2766 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2770 /**************************
2771 * CALCULATE INTERACTIONS *
2772 **************************/
2774 if (gmx_mm_any_lt(rsq12,rcutoff2))
2777 r12 = _mm_mul_pd(rsq12,rinv12);
2779 /* EWALD ELECTROSTATICS */
2781 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2782 ewrt = _mm_mul_pd(r12,ewtabscale);
2783 ewitab = _mm_cvttpd_epi32(ewrt);
2785 eweps = _mm_frcz_pd(ewrt);
2787 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2789 twoeweps = _mm_add_pd(eweps,eweps);
2790 ewitab = _mm_slli_epi32(ewitab,2);
2791 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2792 ewtabD = _mm_setzero_pd();
2793 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2794 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2795 ewtabFn = _mm_setzero_pd();
2796 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2797 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2798 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2799 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2800 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2802 d = _mm_sub_pd(r12,rswitch);
2803 d = _mm_max_pd(d,_mm_setzero_pd());
2804 d2 = _mm_mul_pd(d,d);
2805 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2807 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2809 /* Evaluate switch function */
2810 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2811 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2812 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2816 fscal = _mm_and_pd(fscal,cutoff_mask);
2818 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2820 /* Update vectorial force */
2821 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2822 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2823 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2825 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2826 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2827 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2831 /**************************
2832 * CALCULATE INTERACTIONS *
2833 **************************/
2835 if (gmx_mm_any_lt(rsq13,rcutoff2))
2838 r13 = _mm_mul_pd(rsq13,rinv13);
2840 /* EWALD ELECTROSTATICS */
2842 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2843 ewrt = _mm_mul_pd(r13,ewtabscale);
2844 ewitab = _mm_cvttpd_epi32(ewrt);
2846 eweps = _mm_frcz_pd(ewrt);
2848 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2850 twoeweps = _mm_add_pd(eweps,eweps);
2851 ewitab = _mm_slli_epi32(ewitab,2);
2852 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2853 ewtabD = _mm_setzero_pd();
2854 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2855 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2856 ewtabFn = _mm_setzero_pd();
2857 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2858 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2859 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2860 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2861 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2863 d = _mm_sub_pd(r13,rswitch);
2864 d = _mm_max_pd(d,_mm_setzero_pd());
2865 d2 = _mm_mul_pd(d,d);
2866 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2868 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2870 /* Evaluate switch function */
2871 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2872 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2873 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2877 fscal = _mm_and_pd(fscal,cutoff_mask);
2879 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2881 /* Update vectorial force */
2882 fix1 = _mm_macc_pd(dx13,fscal,fix1);
2883 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
2884 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
2886 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
2887 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
2888 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
2892 /**************************
2893 * CALCULATE INTERACTIONS *
2894 **************************/
2896 if (gmx_mm_any_lt(rsq21,rcutoff2))
2899 r21 = _mm_mul_pd(rsq21,rinv21);
2901 /* EWALD ELECTROSTATICS */
2903 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2904 ewrt = _mm_mul_pd(r21,ewtabscale);
2905 ewitab = _mm_cvttpd_epi32(ewrt);
2907 eweps = _mm_frcz_pd(ewrt);
2909 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2911 twoeweps = _mm_add_pd(eweps,eweps);
2912 ewitab = _mm_slli_epi32(ewitab,2);
2913 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2914 ewtabD = _mm_setzero_pd();
2915 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2916 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2917 ewtabFn = _mm_setzero_pd();
2918 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2919 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2920 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2921 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2922 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2924 d = _mm_sub_pd(r21,rswitch);
2925 d = _mm_max_pd(d,_mm_setzero_pd());
2926 d2 = _mm_mul_pd(d,d);
2927 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2929 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2931 /* Evaluate switch function */
2932 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2933 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2934 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2938 fscal = _mm_and_pd(fscal,cutoff_mask);
2940 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2942 /* Update vectorial force */
2943 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2944 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2945 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2947 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2948 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2949 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2953 /**************************
2954 * CALCULATE INTERACTIONS *
2955 **************************/
2957 if (gmx_mm_any_lt(rsq22,rcutoff2))
2960 r22 = _mm_mul_pd(rsq22,rinv22);
2962 /* EWALD ELECTROSTATICS */
2964 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2965 ewrt = _mm_mul_pd(r22,ewtabscale);
2966 ewitab = _mm_cvttpd_epi32(ewrt);
2968 eweps = _mm_frcz_pd(ewrt);
2970 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2972 twoeweps = _mm_add_pd(eweps,eweps);
2973 ewitab = _mm_slli_epi32(ewitab,2);
2974 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2975 ewtabD = _mm_setzero_pd();
2976 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2977 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2978 ewtabFn = _mm_setzero_pd();
2979 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2980 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2981 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2982 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2983 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2985 d = _mm_sub_pd(r22,rswitch);
2986 d = _mm_max_pd(d,_mm_setzero_pd());
2987 d2 = _mm_mul_pd(d,d);
2988 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2990 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2992 /* Evaluate switch function */
2993 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2994 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2995 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2999 fscal = _mm_and_pd(fscal,cutoff_mask);
3001 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3003 /* Update vectorial force */
3004 fix2 = _mm_macc_pd(dx22,fscal,fix2);
3005 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
3006 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
3008 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
3009 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
3010 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
3014 /**************************
3015 * CALCULATE INTERACTIONS *
3016 **************************/
3018 if (gmx_mm_any_lt(rsq23,rcutoff2))
3021 r23 = _mm_mul_pd(rsq23,rinv23);
3023 /* EWALD ELECTROSTATICS */
3025 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3026 ewrt = _mm_mul_pd(r23,ewtabscale);
3027 ewitab = _mm_cvttpd_epi32(ewrt);
3029 eweps = _mm_frcz_pd(ewrt);
3031 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3033 twoeweps = _mm_add_pd(eweps,eweps);
3034 ewitab = _mm_slli_epi32(ewitab,2);
3035 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3036 ewtabD = _mm_setzero_pd();
3037 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3038 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3039 ewtabFn = _mm_setzero_pd();
3040 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3041 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
3042 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3043 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
3044 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
3046 d = _mm_sub_pd(r23,rswitch);
3047 d = _mm_max_pd(d,_mm_setzero_pd());
3048 d2 = _mm_mul_pd(d,d);
3049 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3051 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3053 /* Evaluate switch function */
3054 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3055 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
3056 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
3060 fscal = _mm_and_pd(fscal,cutoff_mask);
3062 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3064 /* Update vectorial force */
3065 fix2 = _mm_macc_pd(dx23,fscal,fix2);
3066 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
3067 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
3069 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
3070 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
3071 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
3075 /**************************
3076 * CALCULATE INTERACTIONS *
3077 **************************/
3079 if (gmx_mm_any_lt(rsq31,rcutoff2))
3082 r31 = _mm_mul_pd(rsq31,rinv31);
3084 /* EWALD ELECTROSTATICS */
3086 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3087 ewrt = _mm_mul_pd(r31,ewtabscale);
3088 ewitab = _mm_cvttpd_epi32(ewrt);
3090 eweps = _mm_frcz_pd(ewrt);
3092 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3094 twoeweps = _mm_add_pd(eweps,eweps);
3095 ewitab = _mm_slli_epi32(ewitab,2);
3096 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3097 ewtabD = _mm_setzero_pd();
3098 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3099 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3100 ewtabFn = _mm_setzero_pd();
3101 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3102 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
3103 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3104 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
3105 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
3107 d = _mm_sub_pd(r31,rswitch);
3108 d = _mm_max_pd(d,_mm_setzero_pd());
3109 d2 = _mm_mul_pd(d,d);
3110 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3112 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3114 /* Evaluate switch function */
3115 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3116 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
3117 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
3121 fscal = _mm_and_pd(fscal,cutoff_mask);
3123 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3125 /* Update vectorial force */
3126 fix3 = _mm_macc_pd(dx31,fscal,fix3);
3127 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
3128 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
3130 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
3131 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
3132 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
3136 /**************************
3137 * CALCULATE INTERACTIONS *
3138 **************************/
3140 if (gmx_mm_any_lt(rsq32,rcutoff2))
3143 r32 = _mm_mul_pd(rsq32,rinv32);
3145 /* EWALD ELECTROSTATICS */
3147 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3148 ewrt = _mm_mul_pd(r32,ewtabscale);
3149 ewitab = _mm_cvttpd_epi32(ewrt);
3151 eweps = _mm_frcz_pd(ewrt);
3153 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3155 twoeweps = _mm_add_pd(eweps,eweps);
3156 ewitab = _mm_slli_epi32(ewitab,2);
3157 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3158 ewtabD = _mm_setzero_pd();
3159 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3160 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3161 ewtabFn = _mm_setzero_pd();
3162 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3163 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
3164 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3165 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
3166 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
3168 d = _mm_sub_pd(r32,rswitch);
3169 d = _mm_max_pd(d,_mm_setzero_pd());
3170 d2 = _mm_mul_pd(d,d);
3171 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3173 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3175 /* Evaluate switch function */
3176 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3177 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
3178 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
3182 fscal = _mm_and_pd(fscal,cutoff_mask);
3184 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3186 /* Update vectorial force */
3187 fix3 = _mm_macc_pd(dx32,fscal,fix3);
3188 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
3189 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
3191 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
3192 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
3193 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
3197 /**************************
3198 * CALCULATE INTERACTIONS *
3199 **************************/
3201 if (gmx_mm_any_lt(rsq33,rcutoff2))
3204 r33 = _mm_mul_pd(rsq33,rinv33);
3206 /* EWALD ELECTROSTATICS */
3208 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3209 ewrt = _mm_mul_pd(r33,ewtabscale);
3210 ewitab = _mm_cvttpd_epi32(ewrt);
3212 eweps = _mm_frcz_pd(ewrt);
3214 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3216 twoeweps = _mm_add_pd(eweps,eweps);
3217 ewitab = _mm_slli_epi32(ewitab,2);
3218 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3219 ewtabD = _mm_setzero_pd();
3220 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3221 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3222 ewtabFn = _mm_setzero_pd();
3223 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3224 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
3225 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3226 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
3227 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
3229 d = _mm_sub_pd(r33,rswitch);
3230 d = _mm_max_pd(d,_mm_setzero_pd());
3231 d2 = _mm_mul_pd(d,d);
3232 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3234 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3236 /* Evaluate switch function */
3237 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3238 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
3239 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
3243 fscal = _mm_and_pd(fscal,cutoff_mask);
3245 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3247 /* Update vectorial force */
3248 fix3 = _mm_macc_pd(dx33,fscal,fix3);
3249 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
3250 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
3252 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
3253 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
3254 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
3258 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3260 /* Inner loop uses 647 flops */
3263 /* End of innermost loop */
3265 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3266 f+i_coord_offset,fshift+i_shift_offset);
3268 /* Increment number of inner iterations */
3269 inneriter += j_index_end - j_index_start;
3271 /* Outer loop uses 24 flops */
3274 /* Increment number of outer iterations */
3277 /* Update outer/inner flops */
3279 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*647);