2 * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
99 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
100 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
102 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
103 real rswitch_scalar,d_scalar;
104 __m128 dummy_mask,cutoff_mask;
105 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
106 __m128 one = _mm_set1_ps(1.0);
107 __m128 two = _mm_set1_ps(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm_set1_ps(fr->epsfac);
120 charge = mdatoms->chargeA;
121 nvdwtype = fr->ntype;
123 vdwtype = mdatoms->typeA;
125 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
126 beta = _mm_set1_ps(fr->ic->ewaldcoeff);
127 beta2 = _mm_mul_ps(beta,beta);
128 beta3 = _mm_mul_ps(beta,beta2);
129 ewtab = fr->ic->tabq_coul_FDV0;
130 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
131 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
133 /* Setup water-specific parameters */
134 inr = nlist->iinr[0];
135 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
136 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
138 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
140 jq0 = _mm_set1_ps(charge[inr+0]);
141 jq1 = _mm_set1_ps(charge[inr+1]);
142 jq2 = _mm_set1_ps(charge[inr+2]);
143 vdwjidx0A = 2*vdwtype[inr+0];
144 qq00 = _mm_mul_ps(iq0,jq0);
145 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
146 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
147 qq01 = _mm_mul_ps(iq0,jq1);
148 qq02 = _mm_mul_ps(iq0,jq2);
149 qq10 = _mm_mul_ps(iq1,jq0);
150 qq11 = _mm_mul_ps(iq1,jq1);
151 qq12 = _mm_mul_ps(iq1,jq2);
152 qq20 = _mm_mul_ps(iq2,jq0);
153 qq21 = _mm_mul_ps(iq2,jq1);
154 qq22 = _mm_mul_ps(iq2,jq2);
156 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
157 rcutoff_scalar = fr->rcoulomb;
158 rcutoff = _mm_set1_ps(rcutoff_scalar);
159 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
161 rswitch_scalar = fr->rcoulomb_switch;
162 rswitch = _mm_set1_ps(rswitch_scalar);
163 /* Setup switch parameters */
164 d_scalar = rcutoff_scalar-rswitch_scalar;
165 d = _mm_set1_ps(d_scalar);
166 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
167 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
168 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
169 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
170 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
171 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
173 /* Avoid stupid compiler warnings */
174 jnrA = jnrB = jnrC = jnrD = 0;
183 for(iidx=0;iidx<4*DIM;iidx++)
188 /* Start outer loop over neighborlists */
189 for(iidx=0; iidx<nri; iidx++)
191 /* Load shift vector for this list */
192 i_shift_offset = DIM*shiftidx[iidx];
194 /* Load limits for loop over neighbors */
195 j_index_start = jindex[iidx];
196 j_index_end = jindex[iidx+1];
198 /* Get outer coordinate index */
200 i_coord_offset = DIM*inr;
202 /* Load i particle coords and add shift vector */
203 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
204 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
206 fix0 = _mm_setzero_ps();
207 fiy0 = _mm_setzero_ps();
208 fiz0 = _mm_setzero_ps();
209 fix1 = _mm_setzero_ps();
210 fiy1 = _mm_setzero_ps();
211 fiz1 = _mm_setzero_ps();
212 fix2 = _mm_setzero_ps();
213 fiy2 = _mm_setzero_ps();
214 fiz2 = _mm_setzero_ps();
216 /* Reset potential sums */
217 velecsum = _mm_setzero_ps();
218 vvdwsum = _mm_setzero_ps();
220 /* Start inner kernel loop */
221 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
224 /* Get j neighbor index, and coordinate index */
229 j_coord_offsetA = DIM*jnrA;
230 j_coord_offsetB = DIM*jnrB;
231 j_coord_offsetC = DIM*jnrC;
232 j_coord_offsetD = DIM*jnrD;
234 /* load j atom coordinates */
235 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
236 x+j_coord_offsetC,x+j_coord_offsetD,
237 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
239 /* Calculate displacement vector */
240 dx00 = _mm_sub_ps(ix0,jx0);
241 dy00 = _mm_sub_ps(iy0,jy0);
242 dz00 = _mm_sub_ps(iz0,jz0);
243 dx01 = _mm_sub_ps(ix0,jx1);
244 dy01 = _mm_sub_ps(iy0,jy1);
245 dz01 = _mm_sub_ps(iz0,jz1);
246 dx02 = _mm_sub_ps(ix0,jx2);
247 dy02 = _mm_sub_ps(iy0,jy2);
248 dz02 = _mm_sub_ps(iz0,jz2);
249 dx10 = _mm_sub_ps(ix1,jx0);
250 dy10 = _mm_sub_ps(iy1,jy0);
251 dz10 = _mm_sub_ps(iz1,jz0);
252 dx11 = _mm_sub_ps(ix1,jx1);
253 dy11 = _mm_sub_ps(iy1,jy1);
254 dz11 = _mm_sub_ps(iz1,jz1);
255 dx12 = _mm_sub_ps(ix1,jx2);
256 dy12 = _mm_sub_ps(iy1,jy2);
257 dz12 = _mm_sub_ps(iz1,jz2);
258 dx20 = _mm_sub_ps(ix2,jx0);
259 dy20 = _mm_sub_ps(iy2,jy0);
260 dz20 = _mm_sub_ps(iz2,jz0);
261 dx21 = _mm_sub_ps(ix2,jx1);
262 dy21 = _mm_sub_ps(iy2,jy1);
263 dz21 = _mm_sub_ps(iz2,jz1);
264 dx22 = _mm_sub_ps(ix2,jx2);
265 dy22 = _mm_sub_ps(iy2,jy2);
266 dz22 = _mm_sub_ps(iz2,jz2);
268 /* Calculate squared distance and things based on it */
269 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
270 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
271 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
272 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
273 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
274 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
275 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
276 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
277 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
279 rinv00 = gmx_mm_invsqrt_ps(rsq00);
280 rinv01 = gmx_mm_invsqrt_ps(rsq01);
281 rinv02 = gmx_mm_invsqrt_ps(rsq02);
282 rinv10 = gmx_mm_invsqrt_ps(rsq10);
283 rinv11 = gmx_mm_invsqrt_ps(rsq11);
284 rinv12 = gmx_mm_invsqrt_ps(rsq12);
285 rinv20 = gmx_mm_invsqrt_ps(rsq20);
286 rinv21 = gmx_mm_invsqrt_ps(rsq21);
287 rinv22 = gmx_mm_invsqrt_ps(rsq22);
289 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
290 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
291 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
292 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
293 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
294 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
295 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
296 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
297 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
299 fjx0 = _mm_setzero_ps();
300 fjy0 = _mm_setzero_ps();
301 fjz0 = _mm_setzero_ps();
302 fjx1 = _mm_setzero_ps();
303 fjy1 = _mm_setzero_ps();
304 fjz1 = _mm_setzero_ps();
305 fjx2 = _mm_setzero_ps();
306 fjy2 = _mm_setzero_ps();
307 fjz2 = _mm_setzero_ps();
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 if (gmx_mm_any_lt(rsq00,rcutoff2))
316 r00 = _mm_mul_ps(rsq00,rinv00);
318 /* EWALD ELECTROSTATICS */
320 /* Analytical PME correction */
321 zeta2 = _mm_mul_ps(beta2,rsq00);
322 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
323 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
324 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
325 felec = _mm_mul_ps(qq00,felec);
326 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
327 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
328 velec = _mm_mul_ps(qq00,velec);
330 /* LENNARD-JONES DISPERSION/REPULSION */
332 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
333 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
334 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
335 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
336 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
338 d = _mm_sub_ps(r00,rswitch);
339 d = _mm_max_ps(d,_mm_setzero_ps());
340 d2 = _mm_mul_ps(d,d);
341 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
343 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
345 /* Evaluate switch function */
346 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
347 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
348 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
349 velec = _mm_mul_ps(velec,sw);
350 vvdw = _mm_mul_ps(vvdw,sw);
351 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 velec = _mm_and_ps(velec,cutoff_mask);
355 velecsum = _mm_add_ps(velecsum,velec);
356 vvdw = _mm_and_ps(vvdw,cutoff_mask);
357 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
359 fscal = _mm_add_ps(felec,fvdw);
361 fscal = _mm_and_ps(fscal,cutoff_mask);
363 /* Update vectorial force */
364 fix0 = _mm_macc_ps(dx00,fscal,fix0);
365 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
366 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
368 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
369 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
370 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
374 /**************************
375 * CALCULATE INTERACTIONS *
376 **************************/
378 if (gmx_mm_any_lt(rsq01,rcutoff2))
381 r01 = _mm_mul_ps(rsq01,rinv01);
383 /* EWALD ELECTROSTATICS */
385 /* Analytical PME correction */
386 zeta2 = _mm_mul_ps(beta2,rsq01);
387 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
388 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
389 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
390 felec = _mm_mul_ps(qq01,felec);
391 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
392 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
393 velec = _mm_mul_ps(qq01,velec);
395 d = _mm_sub_ps(r01,rswitch);
396 d = _mm_max_ps(d,_mm_setzero_ps());
397 d2 = _mm_mul_ps(d,d);
398 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
400 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
402 /* Evaluate switch function */
403 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
404 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
405 velec = _mm_mul_ps(velec,sw);
406 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
408 /* Update potential sum for this i atom from the interaction with this j atom. */
409 velec = _mm_and_ps(velec,cutoff_mask);
410 velecsum = _mm_add_ps(velecsum,velec);
414 fscal = _mm_and_ps(fscal,cutoff_mask);
416 /* Update vectorial force */
417 fix0 = _mm_macc_ps(dx01,fscal,fix0);
418 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
419 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
421 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
422 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
423 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
427 /**************************
428 * CALCULATE INTERACTIONS *
429 **************************/
431 if (gmx_mm_any_lt(rsq02,rcutoff2))
434 r02 = _mm_mul_ps(rsq02,rinv02);
436 /* EWALD ELECTROSTATICS */
438 /* Analytical PME correction */
439 zeta2 = _mm_mul_ps(beta2,rsq02);
440 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
441 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
442 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
443 felec = _mm_mul_ps(qq02,felec);
444 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
445 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
446 velec = _mm_mul_ps(qq02,velec);
448 d = _mm_sub_ps(r02,rswitch);
449 d = _mm_max_ps(d,_mm_setzero_ps());
450 d2 = _mm_mul_ps(d,d);
451 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
453 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
455 /* Evaluate switch function */
456 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
457 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
458 velec = _mm_mul_ps(velec,sw);
459 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
461 /* Update potential sum for this i atom from the interaction with this j atom. */
462 velec = _mm_and_ps(velec,cutoff_mask);
463 velecsum = _mm_add_ps(velecsum,velec);
467 fscal = _mm_and_ps(fscal,cutoff_mask);
469 /* Update vectorial force */
470 fix0 = _mm_macc_ps(dx02,fscal,fix0);
471 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
472 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
474 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
475 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
476 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
480 /**************************
481 * CALCULATE INTERACTIONS *
482 **************************/
484 if (gmx_mm_any_lt(rsq10,rcutoff2))
487 r10 = _mm_mul_ps(rsq10,rinv10);
489 /* EWALD ELECTROSTATICS */
491 /* Analytical PME correction */
492 zeta2 = _mm_mul_ps(beta2,rsq10);
493 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
494 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
495 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
496 felec = _mm_mul_ps(qq10,felec);
497 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
498 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
499 velec = _mm_mul_ps(qq10,velec);
501 d = _mm_sub_ps(r10,rswitch);
502 d = _mm_max_ps(d,_mm_setzero_ps());
503 d2 = _mm_mul_ps(d,d);
504 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
506 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
508 /* Evaluate switch function */
509 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
510 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
511 velec = _mm_mul_ps(velec,sw);
512 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
514 /* Update potential sum for this i atom from the interaction with this j atom. */
515 velec = _mm_and_ps(velec,cutoff_mask);
516 velecsum = _mm_add_ps(velecsum,velec);
520 fscal = _mm_and_ps(fscal,cutoff_mask);
522 /* Update vectorial force */
523 fix1 = _mm_macc_ps(dx10,fscal,fix1);
524 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
525 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
527 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
528 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
529 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
533 /**************************
534 * CALCULATE INTERACTIONS *
535 **************************/
537 if (gmx_mm_any_lt(rsq11,rcutoff2))
540 r11 = _mm_mul_ps(rsq11,rinv11);
542 /* EWALD ELECTROSTATICS */
544 /* Analytical PME correction */
545 zeta2 = _mm_mul_ps(beta2,rsq11);
546 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
547 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
548 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
549 felec = _mm_mul_ps(qq11,felec);
550 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
551 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
552 velec = _mm_mul_ps(qq11,velec);
554 d = _mm_sub_ps(r11,rswitch);
555 d = _mm_max_ps(d,_mm_setzero_ps());
556 d2 = _mm_mul_ps(d,d);
557 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
559 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
561 /* Evaluate switch function */
562 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
563 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
564 velec = _mm_mul_ps(velec,sw);
565 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
567 /* Update potential sum for this i atom from the interaction with this j atom. */
568 velec = _mm_and_ps(velec,cutoff_mask);
569 velecsum = _mm_add_ps(velecsum,velec);
573 fscal = _mm_and_ps(fscal,cutoff_mask);
575 /* Update vectorial force */
576 fix1 = _mm_macc_ps(dx11,fscal,fix1);
577 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
578 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
580 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
581 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
582 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
586 /**************************
587 * CALCULATE INTERACTIONS *
588 **************************/
590 if (gmx_mm_any_lt(rsq12,rcutoff2))
593 r12 = _mm_mul_ps(rsq12,rinv12);
595 /* EWALD ELECTROSTATICS */
597 /* Analytical PME correction */
598 zeta2 = _mm_mul_ps(beta2,rsq12);
599 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
600 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
601 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
602 felec = _mm_mul_ps(qq12,felec);
603 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
604 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
605 velec = _mm_mul_ps(qq12,velec);
607 d = _mm_sub_ps(r12,rswitch);
608 d = _mm_max_ps(d,_mm_setzero_ps());
609 d2 = _mm_mul_ps(d,d);
610 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
612 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
614 /* Evaluate switch function */
615 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
616 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
617 velec = _mm_mul_ps(velec,sw);
618 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
620 /* Update potential sum for this i atom from the interaction with this j atom. */
621 velec = _mm_and_ps(velec,cutoff_mask);
622 velecsum = _mm_add_ps(velecsum,velec);
626 fscal = _mm_and_ps(fscal,cutoff_mask);
628 /* Update vectorial force */
629 fix1 = _mm_macc_ps(dx12,fscal,fix1);
630 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
631 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
633 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
634 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
635 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
639 /**************************
640 * CALCULATE INTERACTIONS *
641 **************************/
643 if (gmx_mm_any_lt(rsq20,rcutoff2))
646 r20 = _mm_mul_ps(rsq20,rinv20);
648 /* EWALD ELECTROSTATICS */
650 /* Analytical PME correction */
651 zeta2 = _mm_mul_ps(beta2,rsq20);
652 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
653 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
654 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
655 felec = _mm_mul_ps(qq20,felec);
656 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
657 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
658 velec = _mm_mul_ps(qq20,velec);
660 d = _mm_sub_ps(r20,rswitch);
661 d = _mm_max_ps(d,_mm_setzero_ps());
662 d2 = _mm_mul_ps(d,d);
663 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
665 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
667 /* Evaluate switch function */
668 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
669 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
670 velec = _mm_mul_ps(velec,sw);
671 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
673 /* Update potential sum for this i atom from the interaction with this j atom. */
674 velec = _mm_and_ps(velec,cutoff_mask);
675 velecsum = _mm_add_ps(velecsum,velec);
679 fscal = _mm_and_ps(fscal,cutoff_mask);
681 /* Update vectorial force */
682 fix2 = _mm_macc_ps(dx20,fscal,fix2);
683 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
684 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
686 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
687 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
688 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
692 /**************************
693 * CALCULATE INTERACTIONS *
694 **************************/
696 if (gmx_mm_any_lt(rsq21,rcutoff2))
699 r21 = _mm_mul_ps(rsq21,rinv21);
701 /* EWALD ELECTROSTATICS */
703 /* Analytical PME correction */
704 zeta2 = _mm_mul_ps(beta2,rsq21);
705 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
706 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
707 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
708 felec = _mm_mul_ps(qq21,felec);
709 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
710 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
711 velec = _mm_mul_ps(qq21,velec);
713 d = _mm_sub_ps(r21,rswitch);
714 d = _mm_max_ps(d,_mm_setzero_ps());
715 d2 = _mm_mul_ps(d,d);
716 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
718 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
720 /* Evaluate switch function */
721 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
722 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
723 velec = _mm_mul_ps(velec,sw);
724 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
726 /* Update potential sum for this i atom from the interaction with this j atom. */
727 velec = _mm_and_ps(velec,cutoff_mask);
728 velecsum = _mm_add_ps(velecsum,velec);
732 fscal = _mm_and_ps(fscal,cutoff_mask);
734 /* Update vectorial force */
735 fix2 = _mm_macc_ps(dx21,fscal,fix2);
736 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
737 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
739 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
740 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
741 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
745 /**************************
746 * CALCULATE INTERACTIONS *
747 **************************/
749 if (gmx_mm_any_lt(rsq22,rcutoff2))
752 r22 = _mm_mul_ps(rsq22,rinv22);
754 /* EWALD ELECTROSTATICS */
756 /* Analytical PME correction */
757 zeta2 = _mm_mul_ps(beta2,rsq22);
758 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
759 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
760 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
761 felec = _mm_mul_ps(qq22,felec);
762 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
763 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
764 velec = _mm_mul_ps(qq22,velec);
766 d = _mm_sub_ps(r22,rswitch);
767 d = _mm_max_ps(d,_mm_setzero_ps());
768 d2 = _mm_mul_ps(d,d);
769 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
771 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
773 /* Evaluate switch function */
774 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
775 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
776 velec = _mm_mul_ps(velec,sw);
777 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
779 /* Update potential sum for this i atom from the interaction with this j atom. */
780 velec = _mm_and_ps(velec,cutoff_mask);
781 velecsum = _mm_add_ps(velecsum,velec);
785 fscal = _mm_and_ps(fscal,cutoff_mask);
787 /* Update vectorial force */
788 fix2 = _mm_macc_ps(dx22,fscal,fix2);
789 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
790 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
792 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
793 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
794 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
798 fjptrA = f+j_coord_offsetA;
799 fjptrB = f+j_coord_offsetB;
800 fjptrC = f+j_coord_offsetC;
801 fjptrD = f+j_coord_offsetD;
803 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
804 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
806 /* Inner loop uses 495 flops */
812 /* Get j neighbor index, and coordinate index */
813 jnrlistA = jjnr[jidx];
814 jnrlistB = jjnr[jidx+1];
815 jnrlistC = jjnr[jidx+2];
816 jnrlistD = jjnr[jidx+3];
817 /* Sign of each element will be negative for non-real atoms.
818 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
819 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
821 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
822 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
823 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
824 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
825 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
826 j_coord_offsetA = DIM*jnrA;
827 j_coord_offsetB = DIM*jnrB;
828 j_coord_offsetC = DIM*jnrC;
829 j_coord_offsetD = DIM*jnrD;
831 /* load j atom coordinates */
832 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
833 x+j_coord_offsetC,x+j_coord_offsetD,
834 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
836 /* Calculate displacement vector */
837 dx00 = _mm_sub_ps(ix0,jx0);
838 dy00 = _mm_sub_ps(iy0,jy0);
839 dz00 = _mm_sub_ps(iz0,jz0);
840 dx01 = _mm_sub_ps(ix0,jx1);
841 dy01 = _mm_sub_ps(iy0,jy1);
842 dz01 = _mm_sub_ps(iz0,jz1);
843 dx02 = _mm_sub_ps(ix0,jx2);
844 dy02 = _mm_sub_ps(iy0,jy2);
845 dz02 = _mm_sub_ps(iz0,jz2);
846 dx10 = _mm_sub_ps(ix1,jx0);
847 dy10 = _mm_sub_ps(iy1,jy0);
848 dz10 = _mm_sub_ps(iz1,jz0);
849 dx11 = _mm_sub_ps(ix1,jx1);
850 dy11 = _mm_sub_ps(iy1,jy1);
851 dz11 = _mm_sub_ps(iz1,jz1);
852 dx12 = _mm_sub_ps(ix1,jx2);
853 dy12 = _mm_sub_ps(iy1,jy2);
854 dz12 = _mm_sub_ps(iz1,jz2);
855 dx20 = _mm_sub_ps(ix2,jx0);
856 dy20 = _mm_sub_ps(iy2,jy0);
857 dz20 = _mm_sub_ps(iz2,jz0);
858 dx21 = _mm_sub_ps(ix2,jx1);
859 dy21 = _mm_sub_ps(iy2,jy1);
860 dz21 = _mm_sub_ps(iz2,jz1);
861 dx22 = _mm_sub_ps(ix2,jx2);
862 dy22 = _mm_sub_ps(iy2,jy2);
863 dz22 = _mm_sub_ps(iz2,jz2);
865 /* Calculate squared distance and things based on it */
866 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
867 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
868 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
869 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
870 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
871 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
872 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
873 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
874 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
876 rinv00 = gmx_mm_invsqrt_ps(rsq00);
877 rinv01 = gmx_mm_invsqrt_ps(rsq01);
878 rinv02 = gmx_mm_invsqrt_ps(rsq02);
879 rinv10 = gmx_mm_invsqrt_ps(rsq10);
880 rinv11 = gmx_mm_invsqrt_ps(rsq11);
881 rinv12 = gmx_mm_invsqrt_ps(rsq12);
882 rinv20 = gmx_mm_invsqrt_ps(rsq20);
883 rinv21 = gmx_mm_invsqrt_ps(rsq21);
884 rinv22 = gmx_mm_invsqrt_ps(rsq22);
886 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
887 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
888 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
889 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
890 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
891 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
892 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
893 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
894 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
896 fjx0 = _mm_setzero_ps();
897 fjy0 = _mm_setzero_ps();
898 fjz0 = _mm_setzero_ps();
899 fjx1 = _mm_setzero_ps();
900 fjy1 = _mm_setzero_ps();
901 fjz1 = _mm_setzero_ps();
902 fjx2 = _mm_setzero_ps();
903 fjy2 = _mm_setzero_ps();
904 fjz2 = _mm_setzero_ps();
906 /**************************
907 * CALCULATE INTERACTIONS *
908 **************************/
910 if (gmx_mm_any_lt(rsq00,rcutoff2))
913 r00 = _mm_mul_ps(rsq00,rinv00);
914 r00 = _mm_andnot_ps(dummy_mask,r00);
916 /* EWALD ELECTROSTATICS */
918 /* Analytical PME correction */
919 zeta2 = _mm_mul_ps(beta2,rsq00);
920 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
921 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
922 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
923 felec = _mm_mul_ps(qq00,felec);
924 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
925 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
926 velec = _mm_mul_ps(qq00,velec);
928 /* LENNARD-JONES DISPERSION/REPULSION */
930 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
931 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
932 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
933 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
934 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
936 d = _mm_sub_ps(r00,rswitch);
937 d = _mm_max_ps(d,_mm_setzero_ps());
938 d2 = _mm_mul_ps(d,d);
939 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
941 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
943 /* Evaluate switch function */
944 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
945 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
946 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
947 velec = _mm_mul_ps(velec,sw);
948 vvdw = _mm_mul_ps(vvdw,sw);
949 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
951 /* Update potential sum for this i atom from the interaction with this j atom. */
952 velec = _mm_and_ps(velec,cutoff_mask);
953 velec = _mm_andnot_ps(dummy_mask,velec);
954 velecsum = _mm_add_ps(velecsum,velec);
955 vvdw = _mm_and_ps(vvdw,cutoff_mask);
956 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
957 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
959 fscal = _mm_add_ps(felec,fvdw);
961 fscal = _mm_and_ps(fscal,cutoff_mask);
963 fscal = _mm_andnot_ps(dummy_mask,fscal);
965 /* Update vectorial force */
966 fix0 = _mm_macc_ps(dx00,fscal,fix0);
967 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
968 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
970 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
971 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
972 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
976 /**************************
977 * CALCULATE INTERACTIONS *
978 **************************/
980 if (gmx_mm_any_lt(rsq01,rcutoff2))
983 r01 = _mm_mul_ps(rsq01,rinv01);
984 r01 = _mm_andnot_ps(dummy_mask,r01);
986 /* EWALD ELECTROSTATICS */
988 /* Analytical PME correction */
989 zeta2 = _mm_mul_ps(beta2,rsq01);
990 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
991 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
992 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
993 felec = _mm_mul_ps(qq01,felec);
994 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
995 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
996 velec = _mm_mul_ps(qq01,velec);
998 d = _mm_sub_ps(r01,rswitch);
999 d = _mm_max_ps(d,_mm_setzero_ps());
1000 d2 = _mm_mul_ps(d,d);
1001 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1003 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1005 /* Evaluate switch function */
1006 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1007 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1008 velec = _mm_mul_ps(velec,sw);
1009 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1011 /* Update potential sum for this i atom from the interaction with this j atom. */
1012 velec = _mm_and_ps(velec,cutoff_mask);
1013 velec = _mm_andnot_ps(dummy_mask,velec);
1014 velecsum = _mm_add_ps(velecsum,velec);
1018 fscal = _mm_and_ps(fscal,cutoff_mask);
1020 fscal = _mm_andnot_ps(dummy_mask,fscal);
1022 /* Update vectorial force */
1023 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1024 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1025 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1027 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1028 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1029 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1033 /**************************
1034 * CALCULATE INTERACTIONS *
1035 **************************/
1037 if (gmx_mm_any_lt(rsq02,rcutoff2))
1040 r02 = _mm_mul_ps(rsq02,rinv02);
1041 r02 = _mm_andnot_ps(dummy_mask,r02);
1043 /* EWALD ELECTROSTATICS */
1045 /* Analytical PME correction */
1046 zeta2 = _mm_mul_ps(beta2,rsq02);
1047 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1048 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1049 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1050 felec = _mm_mul_ps(qq02,felec);
1051 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1052 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1053 velec = _mm_mul_ps(qq02,velec);
1055 d = _mm_sub_ps(r02,rswitch);
1056 d = _mm_max_ps(d,_mm_setzero_ps());
1057 d2 = _mm_mul_ps(d,d);
1058 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1060 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1062 /* Evaluate switch function */
1063 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1064 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1065 velec = _mm_mul_ps(velec,sw);
1066 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1068 /* Update potential sum for this i atom from the interaction with this j atom. */
1069 velec = _mm_and_ps(velec,cutoff_mask);
1070 velec = _mm_andnot_ps(dummy_mask,velec);
1071 velecsum = _mm_add_ps(velecsum,velec);
1075 fscal = _mm_and_ps(fscal,cutoff_mask);
1077 fscal = _mm_andnot_ps(dummy_mask,fscal);
1079 /* Update vectorial force */
1080 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1081 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1082 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1084 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1085 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1086 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1090 /**************************
1091 * CALCULATE INTERACTIONS *
1092 **************************/
1094 if (gmx_mm_any_lt(rsq10,rcutoff2))
1097 r10 = _mm_mul_ps(rsq10,rinv10);
1098 r10 = _mm_andnot_ps(dummy_mask,r10);
1100 /* EWALD ELECTROSTATICS */
1102 /* Analytical PME correction */
1103 zeta2 = _mm_mul_ps(beta2,rsq10);
1104 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1105 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1106 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1107 felec = _mm_mul_ps(qq10,felec);
1108 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1109 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1110 velec = _mm_mul_ps(qq10,velec);
1112 d = _mm_sub_ps(r10,rswitch);
1113 d = _mm_max_ps(d,_mm_setzero_ps());
1114 d2 = _mm_mul_ps(d,d);
1115 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1117 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1119 /* Evaluate switch function */
1120 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1121 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1122 velec = _mm_mul_ps(velec,sw);
1123 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1125 /* Update potential sum for this i atom from the interaction with this j atom. */
1126 velec = _mm_and_ps(velec,cutoff_mask);
1127 velec = _mm_andnot_ps(dummy_mask,velec);
1128 velecsum = _mm_add_ps(velecsum,velec);
1132 fscal = _mm_and_ps(fscal,cutoff_mask);
1134 fscal = _mm_andnot_ps(dummy_mask,fscal);
1136 /* Update vectorial force */
1137 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1138 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1139 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1141 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1142 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1143 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1147 /**************************
1148 * CALCULATE INTERACTIONS *
1149 **************************/
1151 if (gmx_mm_any_lt(rsq11,rcutoff2))
1154 r11 = _mm_mul_ps(rsq11,rinv11);
1155 r11 = _mm_andnot_ps(dummy_mask,r11);
1157 /* EWALD ELECTROSTATICS */
1159 /* Analytical PME correction */
1160 zeta2 = _mm_mul_ps(beta2,rsq11);
1161 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1162 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1163 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1164 felec = _mm_mul_ps(qq11,felec);
1165 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1166 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1167 velec = _mm_mul_ps(qq11,velec);
1169 d = _mm_sub_ps(r11,rswitch);
1170 d = _mm_max_ps(d,_mm_setzero_ps());
1171 d2 = _mm_mul_ps(d,d);
1172 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1174 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1176 /* Evaluate switch function */
1177 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1178 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1179 velec = _mm_mul_ps(velec,sw);
1180 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1182 /* Update potential sum for this i atom from the interaction with this j atom. */
1183 velec = _mm_and_ps(velec,cutoff_mask);
1184 velec = _mm_andnot_ps(dummy_mask,velec);
1185 velecsum = _mm_add_ps(velecsum,velec);
1189 fscal = _mm_and_ps(fscal,cutoff_mask);
1191 fscal = _mm_andnot_ps(dummy_mask,fscal);
1193 /* Update vectorial force */
1194 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1195 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1196 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1198 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1199 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1200 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1204 /**************************
1205 * CALCULATE INTERACTIONS *
1206 **************************/
1208 if (gmx_mm_any_lt(rsq12,rcutoff2))
1211 r12 = _mm_mul_ps(rsq12,rinv12);
1212 r12 = _mm_andnot_ps(dummy_mask,r12);
1214 /* EWALD ELECTROSTATICS */
1216 /* Analytical PME correction */
1217 zeta2 = _mm_mul_ps(beta2,rsq12);
1218 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1219 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1220 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1221 felec = _mm_mul_ps(qq12,felec);
1222 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1223 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1224 velec = _mm_mul_ps(qq12,velec);
1226 d = _mm_sub_ps(r12,rswitch);
1227 d = _mm_max_ps(d,_mm_setzero_ps());
1228 d2 = _mm_mul_ps(d,d);
1229 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1231 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1233 /* Evaluate switch function */
1234 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1235 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1236 velec = _mm_mul_ps(velec,sw);
1237 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1239 /* Update potential sum for this i atom from the interaction with this j atom. */
1240 velec = _mm_and_ps(velec,cutoff_mask);
1241 velec = _mm_andnot_ps(dummy_mask,velec);
1242 velecsum = _mm_add_ps(velecsum,velec);
1246 fscal = _mm_and_ps(fscal,cutoff_mask);
1248 fscal = _mm_andnot_ps(dummy_mask,fscal);
1250 /* Update vectorial force */
1251 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1252 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1253 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1255 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1256 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1257 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1261 /**************************
1262 * CALCULATE INTERACTIONS *
1263 **************************/
1265 if (gmx_mm_any_lt(rsq20,rcutoff2))
1268 r20 = _mm_mul_ps(rsq20,rinv20);
1269 r20 = _mm_andnot_ps(dummy_mask,r20);
1271 /* EWALD ELECTROSTATICS */
1273 /* Analytical PME correction */
1274 zeta2 = _mm_mul_ps(beta2,rsq20);
1275 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1276 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1277 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1278 felec = _mm_mul_ps(qq20,felec);
1279 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1280 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
1281 velec = _mm_mul_ps(qq20,velec);
1283 d = _mm_sub_ps(r20,rswitch);
1284 d = _mm_max_ps(d,_mm_setzero_ps());
1285 d2 = _mm_mul_ps(d,d);
1286 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1288 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1290 /* Evaluate switch function */
1291 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1292 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1293 velec = _mm_mul_ps(velec,sw);
1294 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1296 /* Update potential sum for this i atom from the interaction with this j atom. */
1297 velec = _mm_and_ps(velec,cutoff_mask);
1298 velec = _mm_andnot_ps(dummy_mask,velec);
1299 velecsum = _mm_add_ps(velecsum,velec);
1303 fscal = _mm_and_ps(fscal,cutoff_mask);
1305 fscal = _mm_andnot_ps(dummy_mask,fscal);
1307 /* Update vectorial force */
1308 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1309 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1310 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1312 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1313 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1314 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1318 /**************************
1319 * CALCULATE INTERACTIONS *
1320 **************************/
1322 if (gmx_mm_any_lt(rsq21,rcutoff2))
1325 r21 = _mm_mul_ps(rsq21,rinv21);
1326 r21 = _mm_andnot_ps(dummy_mask,r21);
1328 /* EWALD ELECTROSTATICS */
1330 /* Analytical PME correction */
1331 zeta2 = _mm_mul_ps(beta2,rsq21);
1332 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1333 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1334 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1335 felec = _mm_mul_ps(qq21,felec);
1336 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1337 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1338 velec = _mm_mul_ps(qq21,velec);
1340 d = _mm_sub_ps(r21,rswitch);
1341 d = _mm_max_ps(d,_mm_setzero_ps());
1342 d2 = _mm_mul_ps(d,d);
1343 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1345 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1347 /* Evaluate switch function */
1348 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1349 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1350 velec = _mm_mul_ps(velec,sw);
1351 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1353 /* Update potential sum for this i atom from the interaction with this j atom. */
1354 velec = _mm_and_ps(velec,cutoff_mask);
1355 velec = _mm_andnot_ps(dummy_mask,velec);
1356 velecsum = _mm_add_ps(velecsum,velec);
1360 fscal = _mm_and_ps(fscal,cutoff_mask);
1362 fscal = _mm_andnot_ps(dummy_mask,fscal);
1364 /* Update vectorial force */
1365 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1366 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1367 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1369 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1370 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1371 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1375 /**************************
1376 * CALCULATE INTERACTIONS *
1377 **************************/
1379 if (gmx_mm_any_lt(rsq22,rcutoff2))
1382 r22 = _mm_mul_ps(rsq22,rinv22);
1383 r22 = _mm_andnot_ps(dummy_mask,r22);
1385 /* EWALD ELECTROSTATICS */
1387 /* Analytical PME correction */
1388 zeta2 = _mm_mul_ps(beta2,rsq22);
1389 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1390 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1391 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1392 felec = _mm_mul_ps(qq22,felec);
1393 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1394 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1395 velec = _mm_mul_ps(qq22,velec);
1397 d = _mm_sub_ps(r22,rswitch);
1398 d = _mm_max_ps(d,_mm_setzero_ps());
1399 d2 = _mm_mul_ps(d,d);
1400 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1402 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1404 /* Evaluate switch function */
1405 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1406 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1407 velec = _mm_mul_ps(velec,sw);
1408 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1410 /* Update potential sum for this i atom from the interaction with this j atom. */
1411 velec = _mm_and_ps(velec,cutoff_mask);
1412 velec = _mm_andnot_ps(dummy_mask,velec);
1413 velecsum = _mm_add_ps(velecsum,velec);
1417 fscal = _mm_and_ps(fscal,cutoff_mask);
1419 fscal = _mm_andnot_ps(dummy_mask,fscal);
1421 /* Update vectorial force */
1422 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1423 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1424 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1426 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1427 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1428 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1432 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1433 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1434 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1435 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1437 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1438 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1440 /* Inner loop uses 504 flops */
1443 /* End of innermost loop */
1445 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1446 f+i_coord_offset,fshift+i_shift_offset);
1449 /* Update potential energies */
1450 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1451 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1453 /* Increment number of inner iterations */
1454 inneriter += j_index_end - j_index_start;
1456 /* Outer loop uses 20 flops */
1459 /* Increment number of outer iterations */
1462 /* Update outer/inner flops */
1464 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*504);
1467 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1468 * Electrostatics interaction: Ewald
1469 * VdW interaction: LennardJones
1470 * Geometry: Water3-Water3
1471 * Calculate force/pot: Force
1474 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1475 (t_nblist * gmx_restrict nlist,
1476 rvec * gmx_restrict xx,
1477 rvec * gmx_restrict ff,
1478 t_forcerec * gmx_restrict fr,
1479 t_mdatoms * gmx_restrict mdatoms,
1480 nb_kernel_data_t * gmx_restrict kernel_data,
1481 t_nrnb * gmx_restrict nrnb)
1483 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1484 * just 0 for non-waters.
1485 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1486 * jnr indices corresponding to data put in the four positions in the SIMD register.
1488 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1489 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1490 int jnrA,jnrB,jnrC,jnrD;
1491 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1492 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1493 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1494 real rcutoff_scalar;
1495 real *shiftvec,*fshift,*x,*f;
1496 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1497 real scratch[4*DIM];
1498 __m128 fscal,rcutoff,rcutoff2,jidxall;
1500 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1502 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1504 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1505 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1506 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1507 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1508 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1509 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1510 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1511 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1512 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1513 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1514 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1515 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1516 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1517 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1518 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1519 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1520 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1523 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1526 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1527 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1529 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1530 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1532 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1533 real rswitch_scalar,d_scalar;
1534 __m128 dummy_mask,cutoff_mask;
1535 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1536 __m128 one = _mm_set1_ps(1.0);
1537 __m128 two = _mm_set1_ps(2.0);
1543 jindex = nlist->jindex;
1545 shiftidx = nlist->shift;
1547 shiftvec = fr->shift_vec[0];
1548 fshift = fr->fshift[0];
1549 facel = _mm_set1_ps(fr->epsfac);
1550 charge = mdatoms->chargeA;
1551 nvdwtype = fr->ntype;
1552 vdwparam = fr->nbfp;
1553 vdwtype = mdatoms->typeA;
1555 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1556 beta = _mm_set1_ps(fr->ic->ewaldcoeff);
1557 beta2 = _mm_mul_ps(beta,beta);
1558 beta3 = _mm_mul_ps(beta,beta2);
1559 ewtab = fr->ic->tabq_coul_FDV0;
1560 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1561 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1563 /* Setup water-specific parameters */
1564 inr = nlist->iinr[0];
1565 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1566 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1567 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1568 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1570 jq0 = _mm_set1_ps(charge[inr+0]);
1571 jq1 = _mm_set1_ps(charge[inr+1]);
1572 jq2 = _mm_set1_ps(charge[inr+2]);
1573 vdwjidx0A = 2*vdwtype[inr+0];
1574 qq00 = _mm_mul_ps(iq0,jq0);
1575 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1576 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1577 qq01 = _mm_mul_ps(iq0,jq1);
1578 qq02 = _mm_mul_ps(iq0,jq2);
1579 qq10 = _mm_mul_ps(iq1,jq0);
1580 qq11 = _mm_mul_ps(iq1,jq1);
1581 qq12 = _mm_mul_ps(iq1,jq2);
1582 qq20 = _mm_mul_ps(iq2,jq0);
1583 qq21 = _mm_mul_ps(iq2,jq1);
1584 qq22 = _mm_mul_ps(iq2,jq2);
1586 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1587 rcutoff_scalar = fr->rcoulomb;
1588 rcutoff = _mm_set1_ps(rcutoff_scalar);
1589 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1591 rswitch_scalar = fr->rcoulomb_switch;
1592 rswitch = _mm_set1_ps(rswitch_scalar);
1593 /* Setup switch parameters */
1594 d_scalar = rcutoff_scalar-rswitch_scalar;
1595 d = _mm_set1_ps(d_scalar);
1596 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1597 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1598 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1599 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1600 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1601 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1603 /* Avoid stupid compiler warnings */
1604 jnrA = jnrB = jnrC = jnrD = 0;
1605 j_coord_offsetA = 0;
1606 j_coord_offsetB = 0;
1607 j_coord_offsetC = 0;
1608 j_coord_offsetD = 0;
1613 for(iidx=0;iidx<4*DIM;iidx++)
1615 scratch[iidx] = 0.0;
1618 /* Start outer loop over neighborlists */
1619 for(iidx=0; iidx<nri; iidx++)
1621 /* Load shift vector for this list */
1622 i_shift_offset = DIM*shiftidx[iidx];
1624 /* Load limits for loop over neighbors */
1625 j_index_start = jindex[iidx];
1626 j_index_end = jindex[iidx+1];
1628 /* Get outer coordinate index */
1630 i_coord_offset = DIM*inr;
1632 /* Load i particle coords and add shift vector */
1633 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1634 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1636 fix0 = _mm_setzero_ps();
1637 fiy0 = _mm_setzero_ps();
1638 fiz0 = _mm_setzero_ps();
1639 fix1 = _mm_setzero_ps();
1640 fiy1 = _mm_setzero_ps();
1641 fiz1 = _mm_setzero_ps();
1642 fix2 = _mm_setzero_ps();
1643 fiy2 = _mm_setzero_ps();
1644 fiz2 = _mm_setzero_ps();
1646 /* Start inner kernel loop */
1647 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1650 /* Get j neighbor index, and coordinate index */
1652 jnrB = jjnr[jidx+1];
1653 jnrC = jjnr[jidx+2];
1654 jnrD = jjnr[jidx+3];
1655 j_coord_offsetA = DIM*jnrA;
1656 j_coord_offsetB = DIM*jnrB;
1657 j_coord_offsetC = DIM*jnrC;
1658 j_coord_offsetD = DIM*jnrD;
1660 /* load j atom coordinates */
1661 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1662 x+j_coord_offsetC,x+j_coord_offsetD,
1663 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1665 /* Calculate displacement vector */
1666 dx00 = _mm_sub_ps(ix0,jx0);
1667 dy00 = _mm_sub_ps(iy0,jy0);
1668 dz00 = _mm_sub_ps(iz0,jz0);
1669 dx01 = _mm_sub_ps(ix0,jx1);
1670 dy01 = _mm_sub_ps(iy0,jy1);
1671 dz01 = _mm_sub_ps(iz0,jz1);
1672 dx02 = _mm_sub_ps(ix0,jx2);
1673 dy02 = _mm_sub_ps(iy0,jy2);
1674 dz02 = _mm_sub_ps(iz0,jz2);
1675 dx10 = _mm_sub_ps(ix1,jx0);
1676 dy10 = _mm_sub_ps(iy1,jy0);
1677 dz10 = _mm_sub_ps(iz1,jz0);
1678 dx11 = _mm_sub_ps(ix1,jx1);
1679 dy11 = _mm_sub_ps(iy1,jy1);
1680 dz11 = _mm_sub_ps(iz1,jz1);
1681 dx12 = _mm_sub_ps(ix1,jx2);
1682 dy12 = _mm_sub_ps(iy1,jy2);
1683 dz12 = _mm_sub_ps(iz1,jz2);
1684 dx20 = _mm_sub_ps(ix2,jx0);
1685 dy20 = _mm_sub_ps(iy2,jy0);
1686 dz20 = _mm_sub_ps(iz2,jz0);
1687 dx21 = _mm_sub_ps(ix2,jx1);
1688 dy21 = _mm_sub_ps(iy2,jy1);
1689 dz21 = _mm_sub_ps(iz2,jz1);
1690 dx22 = _mm_sub_ps(ix2,jx2);
1691 dy22 = _mm_sub_ps(iy2,jy2);
1692 dz22 = _mm_sub_ps(iz2,jz2);
1694 /* Calculate squared distance and things based on it */
1695 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1696 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1697 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1698 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1699 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1700 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1701 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1702 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1703 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1705 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1706 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1707 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1708 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1709 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1710 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1711 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1712 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1713 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1715 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1716 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1717 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1718 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1719 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1720 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1721 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1722 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1723 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1725 fjx0 = _mm_setzero_ps();
1726 fjy0 = _mm_setzero_ps();
1727 fjz0 = _mm_setzero_ps();
1728 fjx1 = _mm_setzero_ps();
1729 fjy1 = _mm_setzero_ps();
1730 fjz1 = _mm_setzero_ps();
1731 fjx2 = _mm_setzero_ps();
1732 fjy2 = _mm_setzero_ps();
1733 fjz2 = _mm_setzero_ps();
1735 /**************************
1736 * CALCULATE INTERACTIONS *
1737 **************************/
1739 if (gmx_mm_any_lt(rsq00,rcutoff2))
1742 r00 = _mm_mul_ps(rsq00,rinv00);
1744 /* EWALD ELECTROSTATICS */
1746 /* Analytical PME correction */
1747 zeta2 = _mm_mul_ps(beta2,rsq00);
1748 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1749 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1750 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1751 felec = _mm_mul_ps(qq00,felec);
1752 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1753 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
1754 velec = _mm_mul_ps(qq00,velec);
1756 /* LENNARD-JONES DISPERSION/REPULSION */
1758 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1759 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1760 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1761 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1762 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1764 d = _mm_sub_ps(r00,rswitch);
1765 d = _mm_max_ps(d,_mm_setzero_ps());
1766 d2 = _mm_mul_ps(d,d);
1767 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1769 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1771 /* Evaluate switch function */
1772 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1773 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1774 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1775 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1777 fscal = _mm_add_ps(felec,fvdw);
1779 fscal = _mm_and_ps(fscal,cutoff_mask);
1781 /* Update vectorial force */
1782 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1783 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1784 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1786 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1787 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1788 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1792 /**************************
1793 * CALCULATE INTERACTIONS *
1794 **************************/
1796 if (gmx_mm_any_lt(rsq01,rcutoff2))
1799 r01 = _mm_mul_ps(rsq01,rinv01);
1801 /* EWALD ELECTROSTATICS */
1803 /* Analytical PME correction */
1804 zeta2 = _mm_mul_ps(beta2,rsq01);
1805 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1806 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1807 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1808 felec = _mm_mul_ps(qq01,felec);
1809 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1810 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
1811 velec = _mm_mul_ps(qq01,velec);
1813 d = _mm_sub_ps(r01,rswitch);
1814 d = _mm_max_ps(d,_mm_setzero_ps());
1815 d2 = _mm_mul_ps(d,d);
1816 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1818 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1820 /* Evaluate switch function */
1821 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1822 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1823 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1827 fscal = _mm_and_ps(fscal,cutoff_mask);
1829 /* Update vectorial force */
1830 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1831 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1832 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1834 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1835 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1836 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1840 /**************************
1841 * CALCULATE INTERACTIONS *
1842 **************************/
1844 if (gmx_mm_any_lt(rsq02,rcutoff2))
1847 r02 = _mm_mul_ps(rsq02,rinv02);
1849 /* EWALD ELECTROSTATICS */
1851 /* Analytical PME correction */
1852 zeta2 = _mm_mul_ps(beta2,rsq02);
1853 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1854 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1855 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1856 felec = _mm_mul_ps(qq02,felec);
1857 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1858 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1859 velec = _mm_mul_ps(qq02,velec);
1861 d = _mm_sub_ps(r02,rswitch);
1862 d = _mm_max_ps(d,_mm_setzero_ps());
1863 d2 = _mm_mul_ps(d,d);
1864 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1866 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1868 /* Evaluate switch function */
1869 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1870 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1871 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1875 fscal = _mm_and_ps(fscal,cutoff_mask);
1877 /* Update vectorial force */
1878 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1879 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1880 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1882 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1883 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1884 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1888 /**************************
1889 * CALCULATE INTERACTIONS *
1890 **************************/
1892 if (gmx_mm_any_lt(rsq10,rcutoff2))
1895 r10 = _mm_mul_ps(rsq10,rinv10);
1897 /* EWALD ELECTROSTATICS */
1899 /* Analytical PME correction */
1900 zeta2 = _mm_mul_ps(beta2,rsq10);
1901 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1902 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1903 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1904 felec = _mm_mul_ps(qq10,felec);
1905 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1906 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1907 velec = _mm_mul_ps(qq10,velec);
1909 d = _mm_sub_ps(r10,rswitch);
1910 d = _mm_max_ps(d,_mm_setzero_ps());
1911 d2 = _mm_mul_ps(d,d);
1912 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1914 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1916 /* Evaluate switch function */
1917 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1918 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1919 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1923 fscal = _mm_and_ps(fscal,cutoff_mask);
1925 /* Update vectorial force */
1926 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1927 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1928 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1930 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1931 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1932 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1936 /**************************
1937 * CALCULATE INTERACTIONS *
1938 **************************/
1940 if (gmx_mm_any_lt(rsq11,rcutoff2))
1943 r11 = _mm_mul_ps(rsq11,rinv11);
1945 /* EWALD ELECTROSTATICS */
1947 /* Analytical PME correction */
1948 zeta2 = _mm_mul_ps(beta2,rsq11);
1949 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1950 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1951 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1952 felec = _mm_mul_ps(qq11,felec);
1953 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1954 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1955 velec = _mm_mul_ps(qq11,velec);
1957 d = _mm_sub_ps(r11,rswitch);
1958 d = _mm_max_ps(d,_mm_setzero_ps());
1959 d2 = _mm_mul_ps(d,d);
1960 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1962 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1964 /* Evaluate switch function */
1965 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1966 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1967 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1971 fscal = _mm_and_ps(fscal,cutoff_mask);
1973 /* Update vectorial force */
1974 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1975 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1976 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1978 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1979 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1980 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 if (gmx_mm_any_lt(rsq12,rcutoff2))
1991 r12 = _mm_mul_ps(rsq12,rinv12);
1993 /* EWALD ELECTROSTATICS */
1995 /* Analytical PME correction */
1996 zeta2 = _mm_mul_ps(beta2,rsq12);
1997 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1998 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1999 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2000 felec = _mm_mul_ps(qq12,felec);
2001 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2002 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2003 velec = _mm_mul_ps(qq12,velec);
2005 d = _mm_sub_ps(r12,rswitch);
2006 d = _mm_max_ps(d,_mm_setzero_ps());
2007 d2 = _mm_mul_ps(d,d);
2008 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2010 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2012 /* Evaluate switch function */
2013 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2014 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2015 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2019 fscal = _mm_and_ps(fscal,cutoff_mask);
2021 /* Update vectorial force */
2022 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2023 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2024 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2026 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2027 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2028 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2032 /**************************
2033 * CALCULATE INTERACTIONS *
2034 **************************/
2036 if (gmx_mm_any_lt(rsq20,rcutoff2))
2039 r20 = _mm_mul_ps(rsq20,rinv20);
2041 /* EWALD ELECTROSTATICS */
2043 /* Analytical PME correction */
2044 zeta2 = _mm_mul_ps(beta2,rsq20);
2045 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
2046 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2047 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2048 felec = _mm_mul_ps(qq20,felec);
2049 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2050 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2051 velec = _mm_mul_ps(qq20,velec);
2053 d = _mm_sub_ps(r20,rswitch);
2054 d = _mm_max_ps(d,_mm_setzero_ps());
2055 d2 = _mm_mul_ps(d,d);
2056 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2058 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2060 /* Evaluate switch function */
2061 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2062 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2063 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2067 fscal = _mm_and_ps(fscal,cutoff_mask);
2069 /* Update vectorial force */
2070 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2071 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2072 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2074 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2075 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2076 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2080 /**************************
2081 * CALCULATE INTERACTIONS *
2082 **************************/
2084 if (gmx_mm_any_lt(rsq21,rcutoff2))
2087 r21 = _mm_mul_ps(rsq21,rinv21);
2089 /* EWALD ELECTROSTATICS */
2091 /* Analytical PME correction */
2092 zeta2 = _mm_mul_ps(beta2,rsq21);
2093 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2094 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2095 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2096 felec = _mm_mul_ps(qq21,felec);
2097 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2098 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2099 velec = _mm_mul_ps(qq21,velec);
2101 d = _mm_sub_ps(r21,rswitch);
2102 d = _mm_max_ps(d,_mm_setzero_ps());
2103 d2 = _mm_mul_ps(d,d);
2104 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2106 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2108 /* Evaluate switch function */
2109 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2110 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2111 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2115 fscal = _mm_and_ps(fscal,cutoff_mask);
2117 /* Update vectorial force */
2118 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2119 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2120 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2122 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2123 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2124 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2128 /**************************
2129 * CALCULATE INTERACTIONS *
2130 **************************/
2132 if (gmx_mm_any_lt(rsq22,rcutoff2))
2135 r22 = _mm_mul_ps(rsq22,rinv22);
2137 /* EWALD ELECTROSTATICS */
2139 /* Analytical PME correction */
2140 zeta2 = _mm_mul_ps(beta2,rsq22);
2141 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2142 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2143 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2144 felec = _mm_mul_ps(qq22,felec);
2145 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2146 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2147 velec = _mm_mul_ps(qq22,velec);
2149 d = _mm_sub_ps(r22,rswitch);
2150 d = _mm_max_ps(d,_mm_setzero_ps());
2151 d2 = _mm_mul_ps(d,d);
2152 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2154 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2156 /* Evaluate switch function */
2157 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2158 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2159 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2163 fscal = _mm_and_ps(fscal,cutoff_mask);
2165 /* Update vectorial force */
2166 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2167 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2168 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2170 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2171 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2172 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2176 fjptrA = f+j_coord_offsetA;
2177 fjptrB = f+j_coord_offsetB;
2178 fjptrC = f+j_coord_offsetC;
2179 fjptrD = f+j_coord_offsetD;
2181 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2182 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2184 /* Inner loop uses 465 flops */
2187 if(jidx<j_index_end)
2190 /* Get j neighbor index, and coordinate index */
2191 jnrlistA = jjnr[jidx];
2192 jnrlistB = jjnr[jidx+1];
2193 jnrlistC = jjnr[jidx+2];
2194 jnrlistD = jjnr[jidx+3];
2195 /* Sign of each element will be negative for non-real atoms.
2196 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2197 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2199 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2200 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2201 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2202 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2203 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2204 j_coord_offsetA = DIM*jnrA;
2205 j_coord_offsetB = DIM*jnrB;
2206 j_coord_offsetC = DIM*jnrC;
2207 j_coord_offsetD = DIM*jnrD;
2209 /* load j atom coordinates */
2210 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2211 x+j_coord_offsetC,x+j_coord_offsetD,
2212 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2214 /* Calculate displacement vector */
2215 dx00 = _mm_sub_ps(ix0,jx0);
2216 dy00 = _mm_sub_ps(iy0,jy0);
2217 dz00 = _mm_sub_ps(iz0,jz0);
2218 dx01 = _mm_sub_ps(ix0,jx1);
2219 dy01 = _mm_sub_ps(iy0,jy1);
2220 dz01 = _mm_sub_ps(iz0,jz1);
2221 dx02 = _mm_sub_ps(ix0,jx2);
2222 dy02 = _mm_sub_ps(iy0,jy2);
2223 dz02 = _mm_sub_ps(iz0,jz2);
2224 dx10 = _mm_sub_ps(ix1,jx0);
2225 dy10 = _mm_sub_ps(iy1,jy0);
2226 dz10 = _mm_sub_ps(iz1,jz0);
2227 dx11 = _mm_sub_ps(ix1,jx1);
2228 dy11 = _mm_sub_ps(iy1,jy1);
2229 dz11 = _mm_sub_ps(iz1,jz1);
2230 dx12 = _mm_sub_ps(ix1,jx2);
2231 dy12 = _mm_sub_ps(iy1,jy2);
2232 dz12 = _mm_sub_ps(iz1,jz2);
2233 dx20 = _mm_sub_ps(ix2,jx0);
2234 dy20 = _mm_sub_ps(iy2,jy0);
2235 dz20 = _mm_sub_ps(iz2,jz0);
2236 dx21 = _mm_sub_ps(ix2,jx1);
2237 dy21 = _mm_sub_ps(iy2,jy1);
2238 dz21 = _mm_sub_ps(iz2,jz1);
2239 dx22 = _mm_sub_ps(ix2,jx2);
2240 dy22 = _mm_sub_ps(iy2,jy2);
2241 dz22 = _mm_sub_ps(iz2,jz2);
2243 /* Calculate squared distance and things based on it */
2244 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2245 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2246 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2247 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2248 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2249 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2250 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2251 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2252 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2254 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2255 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2256 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2257 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2258 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2259 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2260 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2261 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2262 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2264 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2265 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2266 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2267 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2268 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2269 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2270 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2271 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2272 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2274 fjx0 = _mm_setzero_ps();
2275 fjy0 = _mm_setzero_ps();
2276 fjz0 = _mm_setzero_ps();
2277 fjx1 = _mm_setzero_ps();
2278 fjy1 = _mm_setzero_ps();
2279 fjz1 = _mm_setzero_ps();
2280 fjx2 = _mm_setzero_ps();
2281 fjy2 = _mm_setzero_ps();
2282 fjz2 = _mm_setzero_ps();
2284 /**************************
2285 * CALCULATE INTERACTIONS *
2286 **************************/
2288 if (gmx_mm_any_lt(rsq00,rcutoff2))
2291 r00 = _mm_mul_ps(rsq00,rinv00);
2292 r00 = _mm_andnot_ps(dummy_mask,r00);
2294 /* EWALD ELECTROSTATICS */
2296 /* Analytical PME correction */
2297 zeta2 = _mm_mul_ps(beta2,rsq00);
2298 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
2299 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2300 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2301 felec = _mm_mul_ps(qq00,felec);
2302 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2303 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
2304 velec = _mm_mul_ps(qq00,velec);
2306 /* LENNARD-JONES DISPERSION/REPULSION */
2308 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2309 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2310 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2311 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
2312 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2314 d = _mm_sub_ps(r00,rswitch);
2315 d = _mm_max_ps(d,_mm_setzero_ps());
2316 d2 = _mm_mul_ps(d,d);
2317 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2319 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2321 /* Evaluate switch function */
2322 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2323 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2324 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2325 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2327 fscal = _mm_add_ps(felec,fvdw);
2329 fscal = _mm_and_ps(fscal,cutoff_mask);
2331 fscal = _mm_andnot_ps(dummy_mask,fscal);
2333 /* Update vectorial force */
2334 fix0 = _mm_macc_ps(dx00,fscal,fix0);
2335 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
2336 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
2338 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
2339 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
2340 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
2344 /**************************
2345 * CALCULATE INTERACTIONS *
2346 **************************/
2348 if (gmx_mm_any_lt(rsq01,rcutoff2))
2351 r01 = _mm_mul_ps(rsq01,rinv01);
2352 r01 = _mm_andnot_ps(dummy_mask,r01);
2354 /* EWALD ELECTROSTATICS */
2356 /* Analytical PME correction */
2357 zeta2 = _mm_mul_ps(beta2,rsq01);
2358 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
2359 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2360 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2361 felec = _mm_mul_ps(qq01,felec);
2362 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2363 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
2364 velec = _mm_mul_ps(qq01,velec);
2366 d = _mm_sub_ps(r01,rswitch);
2367 d = _mm_max_ps(d,_mm_setzero_ps());
2368 d2 = _mm_mul_ps(d,d);
2369 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2371 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2373 /* Evaluate switch function */
2374 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2375 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2376 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2380 fscal = _mm_and_ps(fscal,cutoff_mask);
2382 fscal = _mm_andnot_ps(dummy_mask,fscal);
2384 /* Update vectorial force */
2385 fix0 = _mm_macc_ps(dx01,fscal,fix0);
2386 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
2387 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
2389 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
2390 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
2391 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
2395 /**************************
2396 * CALCULATE INTERACTIONS *
2397 **************************/
2399 if (gmx_mm_any_lt(rsq02,rcutoff2))
2402 r02 = _mm_mul_ps(rsq02,rinv02);
2403 r02 = _mm_andnot_ps(dummy_mask,r02);
2405 /* EWALD ELECTROSTATICS */
2407 /* Analytical PME correction */
2408 zeta2 = _mm_mul_ps(beta2,rsq02);
2409 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
2410 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2411 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2412 felec = _mm_mul_ps(qq02,felec);
2413 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2414 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
2415 velec = _mm_mul_ps(qq02,velec);
2417 d = _mm_sub_ps(r02,rswitch);
2418 d = _mm_max_ps(d,_mm_setzero_ps());
2419 d2 = _mm_mul_ps(d,d);
2420 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2422 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2424 /* Evaluate switch function */
2425 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2426 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2427 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2431 fscal = _mm_and_ps(fscal,cutoff_mask);
2433 fscal = _mm_andnot_ps(dummy_mask,fscal);
2435 /* Update vectorial force */
2436 fix0 = _mm_macc_ps(dx02,fscal,fix0);
2437 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
2438 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
2440 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
2441 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
2442 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
2446 /**************************
2447 * CALCULATE INTERACTIONS *
2448 **************************/
2450 if (gmx_mm_any_lt(rsq10,rcutoff2))
2453 r10 = _mm_mul_ps(rsq10,rinv10);
2454 r10 = _mm_andnot_ps(dummy_mask,r10);
2456 /* EWALD ELECTROSTATICS */
2458 /* Analytical PME correction */
2459 zeta2 = _mm_mul_ps(beta2,rsq10);
2460 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
2461 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2462 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2463 felec = _mm_mul_ps(qq10,felec);
2464 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2465 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
2466 velec = _mm_mul_ps(qq10,velec);
2468 d = _mm_sub_ps(r10,rswitch);
2469 d = _mm_max_ps(d,_mm_setzero_ps());
2470 d2 = _mm_mul_ps(d,d);
2471 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2473 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2475 /* Evaluate switch function */
2476 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2477 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2478 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2482 fscal = _mm_and_ps(fscal,cutoff_mask);
2484 fscal = _mm_andnot_ps(dummy_mask,fscal);
2486 /* Update vectorial force */
2487 fix1 = _mm_macc_ps(dx10,fscal,fix1);
2488 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
2489 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
2491 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
2492 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
2493 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
2497 /**************************
2498 * CALCULATE INTERACTIONS *
2499 **************************/
2501 if (gmx_mm_any_lt(rsq11,rcutoff2))
2504 r11 = _mm_mul_ps(rsq11,rinv11);
2505 r11 = _mm_andnot_ps(dummy_mask,r11);
2507 /* EWALD ELECTROSTATICS */
2509 /* Analytical PME correction */
2510 zeta2 = _mm_mul_ps(beta2,rsq11);
2511 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
2512 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2513 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2514 felec = _mm_mul_ps(qq11,felec);
2515 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2516 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
2517 velec = _mm_mul_ps(qq11,velec);
2519 d = _mm_sub_ps(r11,rswitch);
2520 d = _mm_max_ps(d,_mm_setzero_ps());
2521 d2 = _mm_mul_ps(d,d);
2522 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2524 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2526 /* Evaluate switch function */
2527 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2528 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2529 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2533 fscal = _mm_and_ps(fscal,cutoff_mask);
2535 fscal = _mm_andnot_ps(dummy_mask,fscal);
2537 /* Update vectorial force */
2538 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2539 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2540 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2542 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2543 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2544 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2548 /**************************
2549 * CALCULATE INTERACTIONS *
2550 **************************/
2552 if (gmx_mm_any_lt(rsq12,rcutoff2))
2555 r12 = _mm_mul_ps(rsq12,rinv12);
2556 r12 = _mm_andnot_ps(dummy_mask,r12);
2558 /* EWALD ELECTROSTATICS */
2560 /* Analytical PME correction */
2561 zeta2 = _mm_mul_ps(beta2,rsq12);
2562 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
2563 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2564 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2565 felec = _mm_mul_ps(qq12,felec);
2566 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2567 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2568 velec = _mm_mul_ps(qq12,velec);
2570 d = _mm_sub_ps(r12,rswitch);
2571 d = _mm_max_ps(d,_mm_setzero_ps());
2572 d2 = _mm_mul_ps(d,d);
2573 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2575 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2577 /* Evaluate switch function */
2578 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2579 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2580 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2584 fscal = _mm_and_ps(fscal,cutoff_mask);
2586 fscal = _mm_andnot_ps(dummy_mask,fscal);
2588 /* Update vectorial force */
2589 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2590 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2591 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2593 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2594 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2595 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2599 /**************************
2600 * CALCULATE INTERACTIONS *
2601 **************************/
2603 if (gmx_mm_any_lt(rsq20,rcutoff2))
2606 r20 = _mm_mul_ps(rsq20,rinv20);
2607 r20 = _mm_andnot_ps(dummy_mask,r20);
2609 /* EWALD ELECTROSTATICS */
2611 /* Analytical PME correction */
2612 zeta2 = _mm_mul_ps(beta2,rsq20);
2613 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
2614 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2615 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2616 felec = _mm_mul_ps(qq20,felec);
2617 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2618 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2619 velec = _mm_mul_ps(qq20,velec);
2621 d = _mm_sub_ps(r20,rswitch);
2622 d = _mm_max_ps(d,_mm_setzero_ps());
2623 d2 = _mm_mul_ps(d,d);
2624 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2626 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2628 /* Evaluate switch function */
2629 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2630 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2631 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2635 fscal = _mm_and_ps(fscal,cutoff_mask);
2637 fscal = _mm_andnot_ps(dummy_mask,fscal);
2639 /* Update vectorial force */
2640 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2641 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2642 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2644 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2645 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2646 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2650 /**************************
2651 * CALCULATE INTERACTIONS *
2652 **************************/
2654 if (gmx_mm_any_lt(rsq21,rcutoff2))
2657 r21 = _mm_mul_ps(rsq21,rinv21);
2658 r21 = _mm_andnot_ps(dummy_mask,r21);
2660 /* EWALD ELECTROSTATICS */
2662 /* Analytical PME correction */
2663 zeta2 = _mm_mul_ps(beta2,rsq21);
2664 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2665 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2666 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2667 felec = _mm_mul_ps(qq21,felec);
2668 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2669 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2670 velec = _mm_mul_ps(qq21,velec);
2672 d = _mm_sub_ps(r21,rswitch);
2673 d = _mm_max_ps(d,_mm_setzero_ps());
2674 d2 = _mm_mul_ps(d,d);
2675 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2677 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2679 /* Evaluate switch function */
2680 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2681 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2682 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2686 fscal = _mm_and_ps(fscal,cutoff_mask);
2688 fscal = _mm_andnot_ps(dummy_mask,fscal);
2690 /* Update vectorial force */
2691 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2692 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2693 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2695 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2696 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2697 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2701 /**************************
2702 * CALCULATE INTERACTIONS *
2703 **************************/
2705 if (gmx_mm_any_lt(rsq22,rcutoff2))
2708 r22 = _mm_mul_ps(rsq22,rinv22);
2709 r22 = _mm_andnot_ps(dummy_mask,r22);
2711 /* EWALD ELECTROSTATICS */
2713 /* Analytical PME correction */
2714 zeta2 = _mm_mul_ps(beta2,rsq22);
2715 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2716 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2717 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2718 felec = _mm_mul_ps(qq22,felec);
2719 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2720 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2721 velec = _mm_mul_ps(qq22,velec);
2723 d = _mm_sub_ps(r22,rswitch);
2724 d = _mm_max_ps(d,_mm_setzero_ps());
2725 d2 = _mm_mul_ps(d,d);
2726 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2728 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2730 /* Evaluate switch function */
2731 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2732 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2733 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2737 fscal = _mm_and_ps(fscal,cutoff_mask);
2739 fscal = _mm_andnot_ps(dummy_mask,fscal);
2741 /* Update vectorial force */
2742 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2743 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2744 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2746 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2747 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2748 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2752 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2753 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2754 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2755 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2757 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2758 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2760 /* Inner loop uses 474 flops */
2763 /* End of innermost loop */
2765 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2766 f+i_coord_offset,fshift+i_shift_offset);
2768 /* Increment number of inner iterations */
2769 inneriter += j_index_end - j_index_start;
2771 /* Outer loop uses 18 flops */
2774 /* Increment number of outer iterations */
2777 /* Update outer/inner flops */
2779 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*474);