2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "gromacs/math/vec.h"
49 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
50 #include "kernelutil_x86_avx_128_fma_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
84 __m128 fscal,rcutoff,rcutoff2,jidxall;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
114 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
117 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
118 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
120 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
121 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
123 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
124 real rswitch_scalar,d_scalar;
125 __m128 dummy_mask,cutoff_mask;
126 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
127 __m128 one = _mm_set1_ps(1.0);
128 __m128 two = _mm_set1_ps(2.0);
134 jindex = nlist->jindex;
136 shiftidx = nlist->shift;
138 shiftvec = fr->shift_vec[0];
139 fshift = fr->fshift[0];
140 facel = _mm_set1_ps(fr->epsfac);
141 charge = mdatoms->chargeA;
142 nvdwtype = fr->ntype;
144 vdwtype = mdatoms->typeA;
146 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
147 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
148 beta2 = _mm_mul_ps(beta,beta);
149 beta3 = _mm_mul_ps(beta,beta2);
150 ewtab = fr->ic->tabq_coul_FDV0;
151 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
152 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
154 /* Setup water-specific parameters */
155 inr = nlist->iinr[0];
156 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
157 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
158 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
159 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
161 jq1 = _mm_set1_ps(charge[inr+1]);
162 jq2 = _mm_set1_ps(charge[inr+2]);
163 jq3 = _mm_set1_ps(charge[inr+3]);
164 vdwjidx0A = 2*vdwtype[inr+0];
165 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
166 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
167 qq11 = _mm_mul_ps(iq1,jq1);
168 qq12 = _mm_mul_ps(iq1,jq2);
169 qq13 = _mm_mul_ps(iq1,jq3);
170 qq21 = _mm_mul_ps(iq2,jq1);
171 qq22 = _mm_mul_ps(iq2,jq2);
172 qq23 = _mm_mul_ps(iq2,jq3);
173 qq31 = _mm_mul_ps(iq3,jq1);
174 qq32 = _mm_mul_ps(iq3,jq2);
175 qq33 = _mm_mul_ps(iq3,jq3);
177 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
178 rcutoff_scalar = fr->rcoulomb;
179 rcutoff = _mm_set1_ps(rcutoff_scalar);
180 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
182 rswitch_scalar = fr->rcoulomb_switch;
183 rswitch = _mm_set1_ps(rswitch_scalar);
184 /* Setup switch parameters */
185 d_scalar = rcutoff_scalar-rswitch_scalar;
186 d = _mm_set1_ps(d_scalar);
187 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
188 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
189 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
190 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
191 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
192 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
194 /* Avoid stupid compiler warnings */
195 jnrA = jnrB = jnrC = jnrD = 0;
204 for(iidx=0;iidx<4*DIM;iidx++)
209 /* Start outer loop over neighborlists */
210 for(iidx=0; iidx<nri; iidx++)
212 /* Load shift vector for this list */
213 i_shift_offset = DIM*shiftidx[iidx];
215 /* Load limits for loop over neighbors */
216 j_index_start = jindex[iidx];
217 j_index_end = jindex[iidx+1];
219 /* Get outer coordinate index */
221 i_coord_offset = DIM*inr;
223 /* Load i particle coords and add shift vector */
224 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
225 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
227 fix0 = _mm_setzero_ps();
228 fiy0 = _mm_setzero_ps();
229 fiz0 = _mm_setzero_ps();
230 fix1 = _mm_setzero_ps();
231 fiy1 = _mm_setzero_ps();
232 fiz1 = _mm_setzero_ps();
233 fix2 = _mm_setzero_ps();
234 fiy2 = _mm_setzero_ps();
235 fiz2 = _mm_setzero_ps();
236 fix3 = _mm_setzero_ps();
237 fiy3 = _mm_setzero_ps();
238 fiz3 = _mm_setzero_ps();
240 /* Reset potential sums */
241 velecsum = _mm_setzero_ps();
242 vvdwsum = _mm_setzero_ps();
244 /* Start inner kernel loop */
245 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
248 /* Get j neighbor index, and coordinate index */
253 j_coord_offsetA = DIM*jnrA;
254 j_coord_offsetB = DIM*jnrB;
255 j_coord_offsetC = DIM*jnrC;
256 j_coord_offsetD = DIM*jnrD;
258 /* load j atom coordinates */
259 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
260 x+j_coord_offsetC,x+j_coord_offsetD,
261 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
262 &jy2,&jz2,&jx3,&jy3,&jz3);
264 /* Calculate displacement vector */
265 dx00 = _mm_sub_ps(ix0,jx0);
266 dy00 = _mm_sub_ps(iy0,jy0);
267 dz00 = _mm_sub_ps(iz0,jz0);
268 dx11 = _mm_sub_ps(ix1,jx1);
269 dy11 = _mm_sub_ps(iy1,jy1);
270 dz11 = _mm_sub_ps(iz1,jz1);
271 dx12 = _mm_sub_ps(ix1,jx2);
272 dy12 = _mm_sub_ps(iy1,jy2);
273 dz12 = _mm_sub_ps(iz1,jz2);
274 dx13 = _mm_sub_ps(ix1,jx3);
275 dy13 = _mm_sub_ps(iy1,jy3);
276 dz13 = _mm_sub_ps(iz1,jz3);
277 dx21 = _mm_sub_ps(ix2,jx1);
278 dy21 = _mm_sub_ps(iy2,jy1);
279 dz21 = _mm_sub_ps(iz2,jz1);
280 dx22 = _mm_sub_ps(ix2,jx2);
281 dy22 = _mm_sub_ps(iy2,jy2);
282 dz22 = _mm_sub_ps(iz2,jz2);
283 dx23 = _mm_sub_ps(ix2,jx3);
284 dy23 = _mm_sub_ps(iy2,jy3);
285 dz23 = _mm_sub_ps(iz2,jz3);
286 dx31 = _mm_sub_ps(ix3,jx1);
287 dy31 = _mm_sub_ps(iy3,jy1);
288 dz31 = _mm_sub_ps(iz3,jz1);
289 dx32 = _mm_sub_ps(ix3,jx2);
290 dy32 = _mm_sub_ps(iy3,jy2);
291 dz32 = _mm_sub_ps(iz3,jz2);
292 dx33 = _mm_sub_ps(ix3,jx3);
293 dy33 = _mm_sub_ps(iy3,jy3);
294 dz33 = _mm_sub_ps(iz3,jz3);
296 /* Calculate squared distance and things based on it */
297 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
298 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
299 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
300 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
301 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
302 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
303 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
304 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
305 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
306 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
308 rinv00 = gmx_mm_invsqrt_ps(rsq00);
309 rinv11 = gmx_mm_invsqrt_ps(rsq11);
310 rinv12 = gmx_mm_invsqrt_ps(rsq12);
311 rinv13 = gmx_mm_invsqrt_ps(rsq13);
312 rinv21 = gmx_mm_invsqrt_ps(rsq21);
313 rinv22 = gmx_mm_invsqrt_ps(rsq22);
314 rinv23 = gmx_mm_invsqrt_ps(rsq23);
315 rinv31 = gmx_mm_invsqrt_ps(rsq31);
316 rinv32 = gmx_mm_invsqrt_ps(rsq32);
317 rinv33 = gmx_mm_invsqrt_ps(rsq33);
319 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
320 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
321 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
322 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
323 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
324 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
325 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
326 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
327 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
328 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
330 fjx0 = _mm_setzero_ps();
331 fjy0 = _mm_setzero_ps();
332 fjz0 = _mm_setzero_ps();
333 fjx1 = _mm_setzero_ps();
334 fjy1 = _mm_setzero_ps();
335 fjz1 = _mm_setzero_ps();
336 fjx2 = _mm_setzero_ps();
337 fjy2 = _mm_setzero_ps();
338 fjz2 = _mm_setzero_ps();
339 fjx3 = _mm_setzero_ps();
340 fjy3 = _mm_setzero_ps();
341 fjz3 = _mm_setzero_ps();
343 /**************************
344 * CALCULATE INTERACTIONS *
345 **************************/
347 if (gmx_mm_any_lt(rsq00,rcutoff2))
350 r00 = _mm_mul_ps(rsq00,rinv00);
352 /* LENNARD-JONES DISPERSION/REPULSION */
354 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
355 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
356 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
357 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
358 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
360 d = _mm_sub_ps(r00,rswitch);
361 d = _mm_max_ps(d,_mm_setzero_ps());
362 d2 = _mm_mul_ps(d,d);
363 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
365 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
367 /* Evaluate switch function */
368 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
369 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
370 vvdw = _mm_mul_ps(vvdw,sw);
371 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
373 /* Update potential sum for this i atom from the interaction with this j atom. */
374 vvdw = _mm_and_ps(vvdw,cutoff_mask);
375 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
379 fscal = _mm_and_ps(fscal,cutoff_mask);
381 /* Update vectorial force */
382 fix0 = _mm_macc_ps(dx00,fscal,fix0);
383 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
384 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
386 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
387 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
388 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
392 /**************************
393 * CALCULATE INTERACTIONS *
394 **************************/
396 if (gmx_mm_any_lt(rsq11,rcutoff2))
399 r11 = _mm_mul_ps(rsq11,rinv11);
401 /* EWALD ELECTROSTATICS */
403 /* Analytical PME correction */
404 zeta2 = _mm_mul_ps(beta2,rsq11);
405 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
406 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
407 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
408 felec = _mm_mul_ps(qq11,felec);
409 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
410 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
411 velec = _mm_mul_ps(qq11,velec);
413 d = _mm_sub_ps(r11,rswitch);
414 d = _mm_max_ps(d,_mm_setzero_ps());
415 d2 = _mm_mul_ps(d,d);
416 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
418 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
420 /* Evaluate switch function */
421 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
422 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
423 velec = _mm_mul_ps(velec,sw);
424 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
426 /* Update potential sum for this i atom from the interaction with this j atom. */
427 velec = _mm_and_ps(velec,cutoff_mask);
428 velecsum = _mm_add_ps(velecsum,velec);
432 fscal = _mm_and_ps(fscal,cutoff_mask);
434 /* Update vectorial force */
435 fix1 = _mm_macc_ps(dx11,fscal,fix1);
436 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
437 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
439 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
440 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
441 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
445 /**************************
446 * CALCULATE INTERACTIONS *
447 **************************/
449 if (gmx_mm_any_lt(rsq12,rcutoff2))
452 r12 = _mm_mul_ps(rsq12,rinv12);
454 /* EWALD ELECTROSTATICS */
456 /* Analytical PME correction */
457 zeta2 = _mm_mul_ps(beta2,rsq12);
458 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
459 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
460 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
461 felec = _mm_mul_ps(qq12,felec);
462 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
463 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
464 velec = _mm_mul_ps(qq12,velec);
466 d = _mm_sub_ps(r12,rswitch);
467 d = _mm_max_ps(d,_mm_setzero_ps());
468 d2 = _mm_mul_ps(d,d);
469 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
471 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
473 /* Evaluate switch function */
474 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
475 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
476 velec = _mm_mul_ps(velec,sw);
477 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
479 /* Update potential sum for this i atom from the interaction with this j atom. */
480 velec = _mm_and_ps(velec,cutoff_mask);
481 velecsum = _mm_add_ps(velecsum,velec);
485 fscal = _mm_and_ps(fscal,cutoff_mask);
487 /* Update vectorial force */
488 fix1 = _mm_macc_ps(dx12,fscal,fix1);
489 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
490 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
492 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
493 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
494 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
498 /**************************
499 * CALCULATE INTERACTIONS *
500 **************************/
502 if (gmx_mm_any_lt(rsq13,rcutoff2))
505 r13 = _mm_mul_ps(rsq13,rinv13);
507 /* EWALD ELECTROSTATICS */
509 /* Analytical PME correction */
510 zeta2 = _mm_mul_ps(beta2,rsq13);
511 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
512 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
513 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
514 felec = _mm_mul_ps(qq13,felec);
515 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
516 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
517 velec = _mm_mul_ps(qq13,velec);
519 d = _mm_sub_ps(r13,rswitch);
520 d = _mm_max_ps(d,_mm_setzero_ps());
521 d2 = _mm_mul_ps(d,d);
522 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
524 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
526 /* Evaluate switch function */
527 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
528 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
529 velec = _mm_mul_ps(velec,sw);
530 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
532 /* Update potential sum for this i atom from the interaction with this j atom. */
533 velec = _mm_and_ps(velec,cutoff_mask);
534 velecsum = _mm_add_ps(velecsum,velec);
538 fscal = _mm_and_ps(fscal,cutoff_mask);
540 /* Update vectorial force */
541 fix1 = _mm_macc_ps(dx13,fscal,fix1);
542 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
543 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
545 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
546 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
547 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
551 /**************************
552 * CALCULATE INTERACTIONS *
553 **************************/
555 if (gmx_mm_any_lt(rsq21,rcutoff2))
558 r21 = _mm_mul_ps(rsq21,rinv21);
560 /* EWALD ELECTROSTATICS */
562 /* Analytical PME correction */
563 zeta2 = _mm_mul_ps(beta2,rsq21);
564 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
565 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
566 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
567 felec = _mm_mul_ps(qq21,felec);
568 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
569 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
570 velec = _mm_mul_ps(qq21,velec);
572 d = _mm_sub_ps(r21,rswitch);
573 d = _mm_max_ps(d,_mm_setzero_ps());
574 d2 = _mm_mul_ps(d,d);
575 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
577 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
579 /* Evaluate switch function */
580 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
581 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
582 velec = _mm_mul_ps(velec,sw);
583 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
585 /* Update potential sum for this i atom from the interaction with this j atom. */
586 velec = _mm_and_ps(velec,cutoff_mask);
587 velecsum = _mm_add_ps(velecsum,velec);
591 fscal = _mm_and_ps(fscal,cutoff_mask);
593 /* Update vectorial force */
594 fix2 = _mm_macc_ps(dx21,fscal,fix2);
595 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
596 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
598 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
599 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
600 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
604 /**************************
605 * CALCULATE INTERACTIONS *
606 **************************/
608 if (gmx_mm_any_lt(rsq22,rcutoff2))
611 r22 = _mm_mul_ps(rsq22,rinv22);
613 /* EWALD ELECTROSTATICS */
615 /* Analytical PME correction */
616 zeta2 = _mm_mul_ps(beta2,rsq22);
617 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
618 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
619 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
620 felec = _mm_mul_ps(qq22,felec);
621 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
622 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
623 velec = _mm_mul_ps(qq22,velec);
625 d = _mm_sub_ps(r22,rswitch);
626 d = _mm_max_ps(d,_mm_setzero_ps());
627 d2 = _mm_mul_ps(d,d);
628 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
630 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
632 /* Evaluate switch function */
633 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
634 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
635 velec = _mm_mul_ps(velec,sw);
636 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
638 /* Update potential sum for this i atom from the interaction with this j atom. */
639 velec = _mm_and_ps(velec,cutoff_mask);
640 velecsum = _mm_add_ps(velecsum,velec);
644 fscal = _mm_and_ps(fscal,cutoff_mask);
646 /* Update vectorial force */
647 fix2 = _mm_macc_ps(dx22,fscal,fix2);
648 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
649 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
651 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
652 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
653 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
657 /**************************
658 * CALCULATE INTERACTIONS *
659 **************************/
661 if (gmx_mm_any_lt(rsq23,rcutoff2))
664 r23 = _mm_mul_ps(rsq23,rinv23);
666 /* EWALD ELECTROSTATICS */
668 /* Analytical PME correction */
669 zeta2 = _mm_mul_ps(beta2,rsq23);
670 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
671 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
672 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
673 felec = _mm_mul_ps(qq23,felec);
674 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
675 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
676 velec = _mm_mul_ps(qq23,velec);
678 d = _mm_sub_ps(r23,rswitch);
679 d = _mm_max_ps(d,_mm_setzero_ps());
680 d2 = _mm_mul_ps(d,d);
681 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
683 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
685 /* Evaluate switch function */
686 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
687 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
688 velec = _mm_mul_ps(velec,sw);
689 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
691 /* Update potential sum for this i atom from the interaction with this j atom. */
692 velec = _mm_and_ps(velec,cutoff_mask);
693 velecsum = _mm_add_ps(velecsum,velec);
697 fscal = _mm_and_ps(fscal,cutoff_mask);
699 /* Update vectorial force */
700 fix2 = _mm_macc_ps(dx23,fscal,fix2);
701 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
702 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
704 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
705 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
706 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
710 /**************************
711 * CALCULATE INTERACTIONS *
712 **************************/
714 if (gmx_mm_any_lt(rsq31,rcutoff2))
717 r31 = _mm_mul_ps(rsq31,rinv31);
719 /* EWALD ELECTROSTATICS */
721 /* Analytical PME correction */
722 zeta2 = _mm_mul_ps(beta2,rsq31);
723 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
724 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
725 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
726 felec = _mm_mul_ps(qq31,felec);
727 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
728 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
729 velec = _mm_mul_ps(qq31,velec);
731 d = _mm_sub_ps(r31,rswitch);
732 d = _mm_max_ps(d,_mm_setzero_ps());
733 d2 = _mm_mul_ps(d,d);
734 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
736 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
738 /* Evaluate switch function */
739 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
740 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
741 velec = _mm_mul_ps(velec,sw);
742 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
744 /* Update potential sum for this i atom from the interaction with this j atom. */
745 velec = _mm_and_ps(velec,cutoff_mask);
746 velecsum = _mm_add_ps(velecsum,velec);
750 fscal = _mm_and_ps(fscal,cutoff_mask);
752 /* Update vectorial force */
753 fix3 = _mm_macc_ps(dx31,fscal,fix3);
754 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
755 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
757 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
758 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
759 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
763 /**************************
764 * CALCULATE INTERACTIONS *
765 **************************/
767 if (gmx_mm_any_lt(rsq32,rcutoff2))
770 r32 = _mm_mul_ps(rsq32,rinv32);
772 /* EWALD ELECTROSTATICS */
774 /* Analytical PME correction */
775 zeta2 = _mm_mul_ps(beta2,rsq32);
776 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
777 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
778 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
779 felec = _mm_mul_ps(qq32,felec);
780 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
781 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
782 velec = _mm_mul_ps(qq32,velec);
784 d = _mm_sub_ps(r32,rswitch);
785 d = _mm_max_ps(d,_mm_setzero_ps());
786 d2 = _mm_mul_ps(d,d);
787 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
789 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
791 /* Evaluate switch function */
792 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
793 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
794 velec = _mm_mul_ps(velec,sw);
795 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
797 /* Update potential sum for this i atom from the interaction with this j atom. */
798 velec = _mm_and_ps(velec,cutoff_mask);
799 velecsum = _mm_add_ps(velecsum,velec);
803 fscal = _mm_and_ps(fscal,cutoff_mask);
805 /* Update vectorial force */
806 fix3 = _mm_macc_ps(dx32,fscal,fix3);
807 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
808 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
810 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
811 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
812 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
816 /**************************
817 * CALCULATE INTERACTIONS *
818 **************************/
820 if (gmx_mm_any_lt(rsq33,rcutoff2))
823 r33 = _mm_mul_ps(rsq33,rinv33);
825 /* EWALD ELECTROSTATICS */
827 /* Analytical PME correction */
828 zeta2 = _mm_mul_ps(beta2,rsq33);
829 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
830 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
831 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
832 felec = _mm_mul_ps(qq33,felec);
833 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
834 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
835 velec = _mm_mul_ps(qq33,velec);
837 d = _mm_sub_ps(r33,rswitch);
838 d = _mm_max_ps(d,_mm_setzero_ps());
839 d2 = _mm_mul_ps(d,d);
840 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
842 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
844 /* Evaluate switch function */
845 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
846 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
847 velec = _mm_mul_ps(velec,sw);
848 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
850 /* Update potential sum for this i atom from the interaction with this j atom. */
851 velec = _mm_and_ps(velec,cutoff_mask);
852 velecsum = _mm_add_ps(velecsum,velec);
856 fscal = _mm_and_ps(fscal,cutoff_mask);
858 /* Update vectorial force */
859 fix3 = _mm_macc_ps(dx33,fscal,fix3);
860 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
861 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
863 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
864 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
865 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
869 fjptrA = f+j_coord_offsetA;
870 fjptrB = f+j_coord_offsetB;
871 fjptrC = f+j_coord_offsetC;
872 fjptrD = f+j_coord_offsetD;
874 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
875 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
876 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
878 /* Inner loop uses 542 flops */
884 /* Get j neighbor index, and coordinate index */
885 jnrlistA = jjnr[jidx];
886 jnrlistB = jjnr[jidx+1];
887 jnrlistC = jjnr[jidx+2];
888 jnrlistD = jjnr[jidx+3];
889 /* Sign of each element will be negative for non-real atoms.
890 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
891 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
893 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
894 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
895 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
896 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
897 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
898 j_coord_offsetA = DIM*jnrA;
899 j_coord_offsetB = DIM*jnrB;
900 j_coord_offsetC = DIM*jnrC;
901 j_coord_offsetD = DIM*jnrD;
903 /* load j atom coordinates */
904 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
905 x+j_coord_offsetC,x+j_coord_offsetD,
906 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
907 &jy2,&jz2,&jx3,&jy3,&jz3);
909 /* Calculate displacement vector */
910 dx00 = _mm_sub_ps(ix0,jx0);
911 dy00 = _mm_sub_ps(iy0,jy0);
912 dz00 = _mm_sub_ps(iz0,jz0);
913 dx11 = _mm_sub_ps(ix1,jx1);
914 dy11 = _mm_sub_ps(iy1,jy1);
915 dz11 = _mm_sub_ps(iz1,jz1);
916 dx12 = _mm_sub_ps(ix1,jx2);
917 dy12 = _mm_sub_ps(iy1,jy2);
918 dz12 = _mm_sub_ps(iz1,jz2);
919 dx13 = _mm_sub_ps(ix1,jx3);
920 dy13 = _mm_sub_ps(iy1,jy3);
921 dz13 = _mm_sub_ps(iz1,jz3);
922 dx21 = _mm_sub_ps(ix2,jx1);
923 dy21 = _mm_sub_ps(iy2,jy1);
924 dz21 = _mm_sub_ps(iz2,jz1);
925 dx22 = _mm_sub_ps(ix2,jx2);
926 dy22 = _mm_sub_ps(iy2,jy2);
927 dz22 = _mm_sub_ps(iz2,jz2);
928 dx23 = _mm_sub_ps(ix2,jx3);
929 dy23 = _mm_sub_ps(iy2,jy3);
930 dz23 = _mm_sub_ps(iz2,jz3);
931 dx31 = _mm_sub_ps(ix3,jx1);
932 dy31 = _mm_sub_ps(iy3,jy1);
933 dz31 = _mm_sub_ps(iz3,jz1);
934 dx32 = _mm_sub_ps(ix3,jx2);
935 dy32 = _mm_sub_ps(iy3,jy2);
936 dz32 = _mm_sub_ps(iz3,jz2);
937 dx33 = _mm_sub_ps(ix3,jx3);
938 dy33 = _mm_sub_ps(iy3,jy3);
939 dz33 = _mm_sub_ps(iz3,jz3);
941 /* Calculate squared distance and things based on it */
942 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
943 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
944 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
945 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
946 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
947 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
948 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
949 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
950 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
951 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
953 rinv00 = gmx_mm_invsqrt_ps(rsq00);
954 rinv11 = gmx_mm_invsqrt_ps(rsq11);
955 rinv12 = gmx_mm_invsqrt_ps(rsq12);
956 rinv13 = gmx_mm_invsqrt_ps(rsq13);
957 rinv21 = gmx_mm_invsqrt_ps(rsq21);
958 rinv22 = gmx_mm_invsqrt_ps(rsq22);
959 rinv23 = gmx_mm_invsqrt_ps(rsq23);
960 rinv31 = gmx_mm_invsqrt_ps(rsq31);
961 rinv32 = gmx_mm_invsqrt_ps(rsq32);
962 rinv33 = gmx_mm_invsqrt_ps(rsq33);
964 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
965 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
966 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
967 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
968 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
969 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
970 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
971 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
972 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
973 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
975 fjx0 = _mm_setzero_ps();
976 fjy0 = _mm_setzero_ps();
977 fjz0 = _mm_setzero_ps();
978 fjx1 = _mm_setzero_ps();
979 fjy1 = _mm_setzero_ps();
980 fjz1 = _mm_setzero_ps();
981 fjx2 = _mm_setzero_ps();
982 fjy2 = _mm_setzero_ps();
983 fjz2 = _mm_setzero_ps();
984 fjx3 = _mm_setzero_ps();
985 fjy3 = _mm_setzero_ps();
986 fjz3 = _mm_setzero_ps();
988 /**************************
989 * CALCULATE INTERACTIONS *
990 **************************/
992 if (gmx_mm_any_lt(rsq00,rcutoff2))
995 r00 = _mm_mul_ps(rsq00,rinv00);
996 r00 = _mm_andnot_ps(dummy_mask,r00);
998 /* LENNARD-JONES DISPERSION/REPULSION */
1000 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1001 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1002 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1003 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1004 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1006 d = _mm_sub_ps(r00,rswitch);
1007 d = _mm_max_ps(d,_mm_setzero_ps());
1008 d2 = _mm_mul_ps(d,d);
1009 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1011 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1013 /* Evaluate switch function */
1014 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1015 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1016 vvdw = _mm_mul_ps(vvdw,sw);
1017 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1019 /* Update potential sum for this i atom from the interaction with this j atom. */
1020 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1021 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1022 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1026 fscal = _mm_and_ps(fscal,cutoff_mask);
1028 fscal = _mm_andnot_ps(dummy_mask,fscal);
1030 /* Update vectorial force */
1031 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1032 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1033 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1035 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1036 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1037 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1041 /**************************
1042 * CALCULATE INTERACTIONS *
1043 **************************/
1045 if (gmx_mm_any_lt(rsq11,rcutoff2))
1048 r11 = _mm_mul_ps(rsq11,rinv11);
1049 r11 = _mm_andnot_ps(dummy_mask,r11);
1051 /* EWALD ELECTROSTATICS */
1053 /* Analytical PME correction */
1054 zeta2 = _mm_mul_ps(beta2,rsq11);
1055 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1056 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1057 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1058 felec = _mm_mul_ps(qq11,felec);
1059 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1060 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1061 velec = _mm_mul_ps(qq11,velec);
1063 d = _mm_sub_ps(r11,rswitch);
1064 d = _mm_max_ps(d,_mm_setzero_ps());
1065 d2 = _mm_mul_ps(d,d);
1066 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1068 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1070 /* Evaluate switch function */
1071 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1072 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1073 velec = _mm_mul_ps(velec,sw);
1074 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1076 /* Update potential sum for this i atom from the interaction with this j atom. */
1077 velec = _mm_and_ps(velec,cutoff_mask);
1078 velec = _mm_andnot_ps(dummy_mask,velec);
1079 velecsum = _mm_add_ps(velecsum,velec);
1083 fscal = _mm_and_ps(fscal,cutoff_mask);
1085 fscal = _mm_andnot_ps(dummy_mask,fscal);
1087 /* Update vectorial force */
1088 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1089 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1090 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1092 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1093 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1094 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1098 /**************************
1099 * CALCULATE INTERACTIONS *
1100 **************************/
1102 if (gmx_mm_any_lt(rsq12,rcutoff2))
1105 r12 = _mm_mul_ps(rsq12,rinv12);
1106 r12 = _mm_andnot_ps(dummy_mask,r12);
1108 /* EWALD ELECTROSTATICS */
1110 /* Analytical PME correction */
1111 zeta2 = _mm_mul_ps(beta2,rsq12);
1112 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1113 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1114 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1115 felec = _mm_mul_ps(qq12,felec);
1116 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1117 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1118 velec = _mm_mul_ps(qq12,velec);
1120 d = _mm_sub_ps(r12,rswitch);
1121 d = _mm_max_ps(d,_mm_setzero_ps());
1122 d2 = _mm_mul_ps(d,d);
1123 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1125 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1127 /* Evaluate switch function */
1128 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1129 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1130 velec = _mm_mul_ps(velec,sw);
1131 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1133 /* Update potential sum for this i atom from the interaction with this j atom. */
1134 velec = _mm_and_ps(velec,cutoff_mask);
1135 velec = _mm_andnot_ps(dummy_mask,velec);
1136 velecsum = _mm_add_ps(velecsum,velec);
1140 fscal = _mm_and_ps(fscal,cutoff_mask);
1142 fscal = _mm_andnot_ps(dummy_mask,fscal);
1144 /* Update vectorial force */
1145 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1146 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1147 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1149 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1150 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1151 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1155 /**************************
1156 * CALCULATE INTERACTIONS *
1157 **************************/
1159 if (gmx_mm_any_lt(rsq13,rcutoff2))
1162 r13 = _mm_mul_ps(rsq13,rinv13);
1163 r13 = _mm_andnot_ps(dummy_mask,r13);
1165 /* EWALD ELECTROSTATICS */
1167 /* Analytical PME correction */
1168 zeta2 = _mm_mul_ps(beta2,rsq13);
1169 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
1170 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1171 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1172 felec = _mm_mul_ps(qq13,felec);
1173 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1174 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
1175 velec = _mm_mul_ps(qq13,velec);
1177 d = _mm_sub_ps(r13,rswitch);
1178 d = _mm_max_ps(d,_mm_setzero_ps());
1179 d2 = _mm_mul_ps(d,d);
1180 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1182 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1184 /* Evaluate switch function */
1185 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1186 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1187 velec = _mm_mul_ps(velec,sw);
1188 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1190 /* Update potential sum for this i atom from the interaction with this j atom. */
1191 velec = _mm_and_ps(velec,cutoff_mask);
1192 velec = _mm_andnot_ps(dummy_mask,velec);
1193 velecsum = _mm_add_ps(velecsum,velec);
1197 fscal = _mm_and_ps(fscal,cutoff_mask);
1199 fscal = _mm_andnot_ps(dummy_mask,fscal);
1201 /* Update vectorial force */
1202 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1203 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1204 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1206 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1207 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1208 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1212 /**************************
1213 * CALCULATE INTERACTIONS *
1214 **************************/
1216 if (gmx_mm_any_lt(rsq21,rcutoff2))
1219 r21 = _mm_mul_ps(rsq21,rinv21);
1220 r21 = _mm_andnot_ps(dummy_mask,r21);
1222 /* EWALD ELECTROSTATICS */
1224 /* Analytical PME correction */
1225 zeta2 = _mm_mul_ps(beta2,rsq21);
1226 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1227 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1228 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1229 felec = _mm_mul_ps(qq21,felec);
1230 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1231 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1232 velec = _mm_mul_ps(qq21,velec);
1234 d = _mm_sub_ps(r21,rswitch);
1235 d = _mm_max_ps(d,_mm_setzero_ps());
1236 d2 = _mm_mul_ps(d,d);
1237 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1239 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1241 /* Evaluate switch function */
1242 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1243 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1244 velec = _mm_mul_ps(velec,sw);
1245 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1247 /* Update potential sum for this i atom from the interaction with this j atom. */
1248 velec = _mm_and_ps(velec,cutoff_mask);
1249 velec = _mm_andnot_ps(dummy_mask,velec);
1250 velecsum = _mm_add_ps(velecsum,velec);
1254 fscal = _mm_and_ps(fscal,cutoff_mask);
1256 fscal = _mm_andnot_ps(dummy_mask,fscal);
1258 /* Update vectorial force */
1259 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1260 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1261 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1263 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1264 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1265 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1269 /**************************
1270 * CALCULATE INTERACTIONS *
1271 **************************/
1273 if (gmx_mm_any_lt(rsq22,rcutoff2))
1276 r22 = _mm_mul_ps(rsq22,rinv22);
1277 r22 = _mm_andnot_ps(dummy_mask,r22);
1279 /* EWALD ELECTROSTATICS */
1281 /* Analytical PME correction */
1282 zeta2 = _mm_mul_ps(beta2,rsq22);
1283 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1284 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1285 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1286 felec = _mm_mul_ps(qq22,felec);
1287 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1288 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1289 velec = _mm_mul_ps(qq22,velec);
1291 d = _mm_sub_ps(r22,rswitch);
1292 d = _mm_max_ps(d,_mm_setzero_ps());
1293 d2 = _mm_mul_ps(d,d);
1294 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1296 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1298 /* Evaluate switch function */
1299 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1300 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1301 velec = _mm_mul_ps(velec,sw);
1302 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1304 /* Update potential sum for this i atom from the interaction with this j atom. */
1305 velec = _mm_and_ps(velec,cutoff_mask);
1306 velec = _mm_andnot_ps(dummy_mask,velec);
1307 velecsum = _mm_add_ps(velecsum,velec);
1311 fscal = _mm_and_ps(fscal,cutoff_mask);
1313 fscal = _mm_andnot_ps(dummy_mask,fscal);
1315 /* Update vectorial force */
1316 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1317 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1318 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1320 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1321 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1322 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1326 /**************************
1327 * CALCULATE INTERACTIONS *
1328 **************************/
1330 if (gmx_mm_any_lt(rsq23,rcutoff2))
1333 r23 = _mm_mul_ps(rsq23,rinv23);
1334 r23 = _mm_andnot_ps(dummy_mask,r23);
1336 /* EWALD ELECTROSTATICS */
1338 /* Analytical PME correction */
1339 zeta2 = _mm_mul_ps(beta2,rsq23);
1340 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
1341 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1342 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1343 felec = _mm_mul_ps(qq23,felec);
1344 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1345 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
1346 velec = _mm_mul_ps(qq23,velec);
1348 d = _mm_sub_ps(r23,rswitch);
1349 d = _mm_max_ps(d,_mm_setzero_ps());
1350 d2 = _mm_mul_ps(d,d);
1351 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1353 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1355 /* Evaluate switch function */
1356 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1357 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1358 velec = _mm_mul_ps(velec,sw);
1359 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1361 /* Update potential sum for this i atom from the interaction with this j atom. */
1362 velec = _mm_and_ps(velec,cutoff_mask);
1363 velec = _mm_andnot_ps(dummy_mask,velec);
1364 velecsum = _mm_add_ps(velecsum,velec);
1368 fscal = _mm_and_ps(fscal,cutoff_mask);
1370 fscal = _mm_andnot_ps(dummy_mask,fscal);
1372 /* Update vectorial force */
1373 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1374 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1375 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1377 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1378 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1379 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1383 /**************************
1384 * CALCULATE INTERACTIONS *
1385 **************************/
1387 if (gmx_mm_any_lt(rsq31,rcutoff2))
1390 r31 = _mm_mul_ps(rsq31,rinv31);
1391 r31 = _mm_andnot_ps(dummy_mask,r31);
1393 /* EWALD ELECTROSTATICS */
1395 /* Analytical PME correction */
1396 zeta2 = _mm_mul_ps(beta2,rsq31);
1397 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
1398 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1399 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1400 felec = _mm_mul_ps(qq31,felec);
1401 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1402 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
1403 velec = _mm_mul_ps(qq31,velec);
1405 d = _mm_sub_ps(r31,rswitch);
1406 d = _mm_max_ps(d,_mm_setzero_ps());
1407 d2 = _mm_mul_ps(d,d);
1408 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1410 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1412 /* Evaluate switch function */
1413 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1414 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1415 velec = _mm_mul_ps(velec,sw);
1416 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1418 /* Update potential sum for this i atom from the interaction with this j atom. */
1419 velec = _mm_and_ps(velec,cutoff_mask);
1420 velec = _mm_andnot_ps(dummy_mask,velec);
1421 velecsum = _mm_add_ps(velecsum,velec);
1425 fscal = _mm_and_ps(fscal,cutoff_mask);
1427 fscal = _mm_andnot_ps(dummy_mask,fscal);
1429 /* Update vectorial force */
1430 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1431 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1432 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1434 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1435 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1436 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1440 /**************************
1441 * CALCULATE INTERACTIONS *
1442 **************************/
1444 if (gmx_mm_any_lt(rsq32,rcutoff2))
1447 r32 = _mm_mul_ps(rsq32,rinv32);
1448 r32 = _mm_andnot_ps(dummy_mask,r32);
1450 /* EWALD ELECTROSTATICS */
1452 /* Analytical PME correction */
1453 zeta2 = _mm_mul_ps(beta2,rsq32);
1454 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
1455 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1456 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1457 felec = _mm_mul_ps(qq32,felec);
1458 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1459 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
1460 velec = _mm_mul_ps(qq32,velec);
1462 d = _mm_sub_ps(r32,rswitch);
1463 d = _mm_max_ps(d,_mm_setzero_ps());
1464 d2 = _mm_mul_ps(d,d);
1465 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1467 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1469 /* Evaluate switch function */
1470 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1471 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1472 velec = _mm_mul_ps(velec,sw);
1473 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1475 /* Update potential sum for this i atom from the interaction with this j atom. */
1476 velec = _mm_and_ps(velec,cutoff_mask);
1477 velec = _mm_andnot_ps(dummy_mask,velec);
1478 velecsum = _mm_add_ps(velecsum,velec);
1482 fscal = _mm_and_ps(fscal,cutoff_mask);
1484 fscal = _mm_andnot_ps(dummy_mask,fscal);
1486 /* Update vectorial force */
1487 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1488 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1489 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1491 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1492 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1493 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1497 /**************************
1498 * CALCULATE INTERACTIONS *
1499 **************************/
1501 if (gmx_mm_any_lt(rsq33,rcutoff2))
1504 r33 = _mm_mul_ps(rsq33,rinv33);
1505 r33 = _mm_andnot_ps(dummy_mask,r33);
1507 /* EWALD ELECTROSTATICS */
1509 /* Analytical PME correction */
1510 zeta2 = _mm_mul_ps(beta2,rsq33);
1511 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
1512 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1513 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1514 felec = _mm_mul_ps(qq33,felec);
1515 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1516 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
1517 velec = _mm_mul_ps(qq33,velec);
1519 d = _mm_sub_ps(r33,rswitch);
1520 d = _mm_max_ps(d,_mm_setzero_ps());
1521 d2 = _mm_mul_ps(d,d);
1522 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1524 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1526 /* Evaluate switch function */
1527 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1528 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1529 velec = _mm_mul_ps(velec,sw);
1530 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1532 /* Update potential sum for this i atom from the interaction with this j atom. */
1533 velec = _mm_and_ps(velec,cutoff_mask);
1534 velec = _mm_andnot_ps(dummy_mask,velec);
1535 velecsum = _mm_add_ps(velecsum,velec);
1539 fscal = _mm_and_ps(fscal,cutoff_mask);
1541 fscal = _mm_andnot_ps(dummy_mask,fscal);
1543 /* Update vectorial force */
1544 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1545 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1546 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1548 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1549 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1550 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1554 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1555 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1556 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1557 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1559 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1560 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1561 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1563 /* Inner loop uses 552 flops */
1566 /* End of innermost loop */
1568 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1569 f+i_coord_offset,fshift+i_shift_offset);
1572 /* Update potential energies */
1573 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1574 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1576 /* Increment number of inner iterations */
1577 inneriter += j_index_end - j_index_start;
1579 /* Outer loop uses 26 flops */
1582 /* Increment number of outer iterations */
1585 /* Update outer/inner flops */
1587 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*552);
1590 * GROMACS nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_single
1591 * Electrostatics interaction: Ewald (with potential switch)
1592 * VdW interaction: LennardJones (with potential switch)
1593 * Geometry: Water4-Water4
1594 * Calculate force/pot: Force
1597 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_single
1598 (t_nblist * gmx_restrict nlist,
1599 rvec * gmx_restrict xx,
1600 rvec * gmx_restrict ff,
1601 t_forcerec * gmx_restrict fr,
1602 t_mdatoms * gmx_restrict mdatoms,
1603 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1604 t_nrnb * gmx_restrict nrnb)
1606 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1607 * just 0 for non-waters.
1608 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1609 * jnr indices corresponding to data put in the four positions in the SIMD register.
1611 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1612 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1613 int jnrA,jnrB,jnrC,jnrD;
1614 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1615 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1616 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1617 real rcutoff_scalar;
1618 real *shiftvec,*fshift,*x,*f;
1619 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1620 real scratch[4*DIM];
1621 __m128 fscal,rcutoff,rcutoff2,jidxall;
1623 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1625 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1627 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1629 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1630 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1631 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1632 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1633 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1634 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1635 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1636 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1637 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1638 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1639 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1640 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1641 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1642 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1643 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1644 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1645 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1646 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1647 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1648 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1651 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1654 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1655 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1657 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1658 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1660 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1661 real rswitch_scalar,d_scalar;
1662 __m128 dummy_mask,cutoff_mask;
1663 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1664 __m128 one = _mm_set1_ps(1.0);
1665 __m128 two = _mm_set1_ps(2.0);
1671 jindex = nlist->jindex;
1673 shiftidx = nlist->shift;
1675 shiftvec = fr->shift_vec[0];
1676 fshift = fr->fshift[0];
1677 facel = _mm_set1_ps(fr->epsfac);
1678 charge = mdatoms->chargeA;
1679 nvdwtype = fr->ntype;
1680 vdwparam = fr->nbfp;
1681 vdwtype = mdatoms->typeA;
1683 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1684 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1685 beta2 = _mm_mul_ps(beta,beta);
1686 beta3 = _mm_mul_ps(beta,beta2);
1687 ewtab = fr->ic->tabq_coul_FDV0;
1688 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1689 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1691 /* Setup water-specific parameters */
1692 inr = nlist->iinr[0];
1693 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1694 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1695 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1696 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1698 jq1 = _mm_set1_ps(charge[inr+1]);
1699 jq2 = _mm_set1_ps(charge[inr+2]);
1700 jq3 = _mm_set1_ps(charge[inr+3]);
1701 vdwjidx0A = 2*vdwtype[inr+0];
1702 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1703 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1704 qq11 = _mm_mul_ps(iq1,jq1);
1705 qq12 = _mm_mul_ps(iq1,jq2);
1706 qq13 = _mm_mul_ps(iq1,jq3);
1707 qq21 = _mm_mul_ps(iq2,jq1);
1708 qq22 = _mm_mul_ps(iq2,jq2);
1709 qq23 = _mm_mul_ps(iq2,jq3);
1710 qq31 = _mm_mul_ps(iq3,jq1);
1711 qq32 = _mm_mul_ps(iq3,jq2);
1712 qq33 = _mm_mul_ps(iq3,jq3);
1714 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1715 rcutoff_scalar = fr->rcoulomb;
1716 rcutoff = _mm_set1_ps(rcutoff_scalar);
1717 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1719 rswitch_scalar = fr->rcoulomb_switch;
1720 rswitch = _mm_set1_ps(rswitch_scalar);
1721 /* Setup switch parameters */
1722 d_scalar = rcutoff_scalar-rswitch_scalar;
1723 d = _mm_set1_ps(d_scalar);
1724 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1725 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1726 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1727 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1728 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1729 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1731 /* Avoid stupid compiler warnings */
1732 jnrA = jnrB = jnrC = jnrD = 0;
1733 j_coord_offsetA = 0;
1734 j_coord_offsetB = 0;
1735 j_coord_offsetC = 0;
1736 j_coord_offsetD = 0;
1741 for(iidx=0;iidx<4*DIM;iidx++)
1743 scratch[iidx] = 0.0;
1746 /* Start outer loop over neighborlists */
1747 for(iidx=0; iidx<nri; iidx++)
1749 /* Load shift vector for this list */
1750 i_shift_offset = DIM*shiftidx[iidx];
1752 /* Load limits for loop over neighbors */
1753 j_index_start = jindex[iidx];
1754 j_index_end = jindex[iidx+1];
1756 /* Get outer coordinate index */
1758 i_coord_offset = DIM*inr;
1760 /* Load i particle coords and add shift vector */
1761 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1762 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1764 fix0 = _mm_setzero_ps();
1765 fiy0 = _mm_setzero_ps();
1766 fiz0 = _mm_setzero_ps();
1767 fix1 = _mm_setzero_ps();
1768 fiy1 = _mm_setzero_ps();
1769 fiz1 = _mm_setzero_ps();
1770 fix2 = _mm_setzero_ps();
1771 fiy2 = _mm_setzero_ps();
1772 fiz2 = _mm_setzero_ps();
1773 fix3 = _mm_setzero_ps();
1774 fiy3 = _mm_setzero_ps();
1775 fiz3 = _mm_setzero_ps();
1777 /* Start inner kernel loop */
1778 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1781 /* Get j neighbor index, and coordinate index */
1783 jnrB = jjnr[jidx+1];
1784 jnrC = jjnr[jidx+2];
1785 jnrD = jjnr[jidx+3];
1786 j_coord_offsetA = DIM*jnrA;
1787 j_coord_offsetB = DIM*jnrB;
1788 j_coord_offsetC = DIM*jnrC;
1789 j_coord_offsetD = DIM*jnrD;
1791 /* load j atom coordinates */
1792 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1793 x+j_coord_offsetC,x+j_coord_offsetD,
1794 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1795 &jy2,&jz2,&jx3,&jy3,&jz3);
1797 /* Calculate displacement vector */
1798 dx00 = _mm_sub_ps(ix0,jx0);
1799 dy00 = _mm_sub_ps(iy0,jy0);
1800 dz00 = _mm_sub_ps(iz0,jz0);
1801 dx11 = _mm_sub_ps(ix1,jx1);
1802 dy11 = _mm_sub_ps(iy1,jy1);
1803 dz11 = _mm_sub_ps(iz1,jz1);
1804 dx12 = _mm_sub_ps(ix1,jx2);
1805 dy12 = _mm_sub_ps(iy1,jy2);
1806 dz12 = _mm_sub_ps(iz1,jz2);
1807 dx13 = _mm_sub_ps(ix1,jx3);
1808 dy13 = _mm_sub_ps(iy1,jy3);
1809 dz13 = _mm_sub_ps(iz1,jz3);
1810 dx21 = _mm_sub_ps(ix2,jx1);
1811 dy21 = _mm_sub_ps(iy2,jy1);
1812 dz21 = _mm_sub_ps(iz2,jz1);
1813 dx22 = _mm_sub_ps(ix2,jx2);
1814 dy22 = _mm_sub_ps(iy2,jy2);
1815 dz22 = _mm_sub_ps(iz2,jz2);
1816 dx23 = _mm_sub_ps(ix2,jx3);
1817 dy23 = _mm_sub_ps(iy2,jy3);
1818 dz23 = _mm_sub_ps(iz2,jz3);
1819 dx31 = _mm_sub_ps(ix3,jx1);
1820 dy31 = _mm_sub_ps(iy3,jy1);
1821 dz31 = _mm_sub_ps(iz3,jz1);
1822 dx32 = _mm_sub_ps(ix3,jx2);
1823 dy32 = _mm_sub_ps(iy3,jy2);
1824 dz32 = _mm_sub_ps(iz3,jz2);
1825 dx33 = _mm_sub_ps(ix3,jx3);
1826 dy33 = _mm_sub_ps(iy3,jy3);
1827 dz33 = _mm_sub_ps(iz3,jz3);
1829 /* Calculate squared distance and things based on it */
1830 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1831 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1832 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1833 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1834 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1835 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1836 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1837 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1838 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1839 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1841 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1842 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1843 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1844 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1845 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1846 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1847 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1848 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1849 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1850 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1852 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1853 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1854 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1855 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1856 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1857 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1858 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1859 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1860 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1861 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1863 fjx0 = _mm_setzero_ps();
1864 fjy0 = _mm_setzero_ps();
1865 fjz0 = _mm_setzero_ps();
1866 fjx1 = _mm_setzero_ps();
1867 fjy1 = _mm_setzero_ps();
1868 fjz1 = _mm_setzero_ps();
1869 fjx2 = _mm_setzero_ps();
1870 fjy2 = _mm_setzero_ps();
1871 fjz2 = _mm_setzero_ps();
1872 fjx3 = _mm_setzero_ps();
1873 fjy3 = _mm_setzero_ps();
1874 fjz3 = _mm_setzero_ps();
1876 /**************************
1877 * CALCULATE INTERACTIONS *
1878 **************************/
1880 if (gmx_mm_any_lt(rsq00,rcutoff2))
1883 r00 = _mm_mul_ps(rsq00,rinv00);
1885 /* LENNARD-JONES DISPERSION/REPULSION */
1887 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1888 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1889 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1890 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1891 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1893 d = _mm_sub_ps(r00,rswitch);
1894 d = _mm_max_ps(d,_mm_setzero_ps());
1895 d2 = _mm_mul_ps(d,d);
1896 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1898 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1900 /* Evaluate switch function */
1901 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1902 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1903 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1907 fscal = _mm_and_ps(fscal,cutoff_mask);
1909 /* Update vectorial force */
1910 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1911 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1912 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1914 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1915 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1916 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1920 /**************************
1921 * CALCULATE INTERACTIONS *
1922 **************************/
1924 if (gmx_mm_any_lt(rsq11,rcutoff2))
1927 r11 = _mm_mul_ps(rsq11,rinv11);
1929 /* EWALD ELECTROSTATICS */
1931 /* Analytical PME correction */
1932 zeta2 = _mm_mul_ps(beta2,rsq11);
1933 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1934 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1935 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1936 felec = _mm_mul_ps(qq11,felec);
1937 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1938 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1939 velec = _mm_mul_ps(qq11,velec);
1941 d = _mm_sub_ps(r11,rswitch);
1942 d = _mm_max_ps(d,_mm_setzero_ps());
1943 d2 = _mm_mul_ps(d,d);
1944 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1946 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1948 /* Evaluate switch function */
1949 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1950 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1951 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1955 fscal = _mm_and_ps(fscal,cutoff_mask);
1957 /* Update vectorial force */
1958 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1959 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1960 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1962 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1963 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1964 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1968 /**************************
1969 * CALCULATE INTERACTIONS *
1970 **************************/
1972 if (gmx_mm_any_lt(rsq12,rcutoff2))
1975 r12 = _mm_mul_ps(rsq12,rinv12);
1977 /* EWALD ELECTROSTATICS */
1979 /* Analytical PME correction */
1980 zeta2 = _mm_mul_ps(beta2,rsq12);
1981 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1982 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1983 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1984 felec = _mm_mul_ps(qq12,felec);
1985 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1986 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1987 velec = _mm_mul_ps(qq12,velec);
1989 d = _mm_sub_ps(r12,rswitch);
1990 d = _mm_max_ps(d,_mm_setzero_ps());
1991 d2 = _mm_mul_ps(d,d);
1992 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1994 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1996 /* Evaluate switch function */
1997 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1998 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1999 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2003 fscal = _mm_and_ps(fscal,cutoff_mask);
2005 /* Update vectorial force */
2006 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2007 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2008 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2010 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2011 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2012 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2016 /**************************
2017 * CALCULATE INTERACTIONS *
2018 **************************/
2020 if (gmx_mm_any_lt(rsq13,rcutoff2))
2023 r13 = _mm_mul_ps(rsq13,rinv13);
2025 /* EWALD ELECTROSTATICS */
2027 /* Analytical PME correction */
2028 zeta2 = _mm_mul_ps(beta2,rsq13);
2029 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
2030 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2031 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2032 felec = _mm_mul_ps(qq13,felec);
2033 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2034 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
2035 velec = _mm_mul_ps(qq13,velec);
2037 d = _mm_sub_ps(r13,rswitch);
2038 d = _mm_max_ps(d,_mm_setzero_ps());
2039 d2 = _mm_mul_ps(d,d);
2040 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2042 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2044 /* Evaluate switch function */
2045 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2046 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2047 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2051 fscal = _mm_and_ps(fscal,cutoff_mask);
2053 /* Update vectorial force */
2054 fix1 = _mm_macc_ps(dx13,fscal,fix1);
2055 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
2056 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
2058 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
2059 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
2060 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
2064 /**************************
2065 * CALCULATE INTERACTIONS *
2066 **************************/
2068 if (gmx_mm_any_lt(rsq21,rcutoff2))
2071 r21 = _mm_mul_ps(rsq21,rinv21);
2073 /* EWALD ELECTROSTATICS */
2075 /* Analytical PME correction */
2076 zeta2 = _mm_mul_ps(beta2,rsq21);
2077 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2078 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2079 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2080 felec = _mm_mul_ps(qq21,felec);
2081 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2082 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2083 velec = _mm_mul_ps(qq21,velec);
2085 d = _mm_sub_ps(r21,rswitch);
2086 d = _mm_max_ps(d,_mm_setzero_ps());
2087 d2 = _mm_mul_ps(d,d);
2088 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2090 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2092 /* Evaluate switch function */
2093 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2094 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2095 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2099 fscal = _mm_and_ps(fscal,cutoff_mask);
2101 /* Update vectorial force */
2102 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2103 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2104 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2106 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2107 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2108 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2112 /**************************
2113 * CALCULATE INTERACTIONS *
2114 **************************/
2116 if (gmx_mm_any_lt(rsq22,rcutoff2))
2119 r22 = _mm_mul_ps(rsq22,rinv22);
2121 /* EWALD ELECTROSTATICS */
2123 /* Analytical PME correction */
2124 zeta2 = _mm_mul_ps(beta2,rsq22);
2125 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2126 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2127 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2128 felec = _mm_mul_ps(qq22,felec);
2129 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2130 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2131 velec = _mm_mul_ps(qq22,velec);
2133 d = _mm_sub_ps(r22,rswitch);
2134 d = _mm_max_ps(d,_mm_setzero_ps());
2135 d2 = _mm_mul_ps(d,d);
2136 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2138 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2140 /* Evaluate switch function */
2141 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2142 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2143 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2147 fscal = _mm_and_ps(fscal,cutoff_mask);
2149 /* Update vectorial force */
2150 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2151 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2152 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2154 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2155 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2156 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2160 /**************************
2161 * CALCULATE INTERACTIONS *
2162 **************************/
2164 if (gmx_mm_any_lt(rsq23,rcutoff2))
2167 r23 = _mm_mul_ps(rsq23,rinv23);
2169 /* EWALD ELECTROSTATICS */
2171 /* Analytical PME correction */
2172 zeta2 = _mm_mul_ps(beta2,rsq23);
2173 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
2174 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2175 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2176 felec = _mm_mul_ps(qq23,felec);
2177 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2178 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
2179 velec = _mm_mul_ps(qq23,velec);
2181 d = _mm_sub_ps(r23,rswitch);
2182 d = _mm_max_ps(d,_mm_setzero_ps());
2183 d2 = _mm_mul_ps(d,d);
2184 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2186 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2188 /* Evaluate switch function */
2189 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2190 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2191 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2195 fscal = _mm_and_ps(fscal,cutoff_mask);
2197 /* Update vectorial force */
2198 fix2 = _mm_macc_ps(dx23,fscal,fix2);
2199 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
2200 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
2202 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
2203 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
2204 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
2208 /**************************
2209 * CALCULATE INTERACTIONS *
2210 **************************/
2212 if (gmx_mm_any_lt(rsq31,rcutoff2))
2215 r31 = _mm_mul_ps(rsq31,rinv31);
2217 /* EWALD ELECTROSTATICS */
2219 /* Analytical PME correction */
2220 zeta2 = _mm_mul_ps(beta2,rsq31);
2221 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
2222 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2223 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2224 felec = _mm_mul_ps(qq31,felec);
2225 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2226 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
2227 velec = _mm_mul_ps(qq31,velec);
2229 d = _mm_sub_ps(r31,rswitch);
2230 d = _mm_max_ps(d,_mm_setzero_ps());
2231 d2 = _mm_mul_ps(d,d);
2232 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2234 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2236 /* Evaluate switch function */
2237 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2238 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2239 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2243 fscal = _mm_and_ps(fscal,cutoff_mask);
2245 /* Update vectorial force */
2246 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2247 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2248 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2250 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2251 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2252 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2256 /**************************
2257 * CALCULATE INTERACTIONS *
2258 **************************/
2260 if (gmx_mm_any_lt(rsq32,rcutoff2))
2263 r32 = _mm_mul_ps(rsq32,rinv32);
2265 /* EWALD ELECTROSTATICS */
2267 /* Analytical PME correction */
2268 zeta2 = _mm_mul_ps(beta2,rsq32);
2269 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
2270 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2271 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2272 felec = _mm_mul_ps(qq32,felec);
2273 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2274 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
2275 velec = _mm_mul_ps(qq32,velec);
2277 d = _mm_sub_ps(r32,rswitch);
2278 d = _mm_max_ps(d,_mm_setzero_ps());
2279 d2 = _mm_mul_ps(d,d);
2280 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2282 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2284 /* Evaluate switch function */
2285 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2286 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2287 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2291 fscal = _mm_and_ps(fscal,cutoff_mask);
2293 /* Update vectorial force */
2294 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2295 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2296 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2298 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2299 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2300 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2304 /**************************
2305 * CALCULATE INTERACTIONS *
2306 **************************/
2308 if (gmx_mm_any_lt(rsq33,rcutoff2))
2311 r33 = _mm_mul_ps(rsq33,rinv33);
2313 /* EWALD ELECTROSTATICS */
2315 /* Analytical PME correction */
2316 zeta2 = _mm_mul_ps(beta2,rsq33);
2317 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
2318 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2319 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2320 felec = _mm_mul_ps(qq33,felec);
2321 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2322 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
2323 velec = _mm_mul_ps(qq33,velec);
2325 d = _mm_sub_ps(r33,rswitch);
2326 d = _mm_max_ps(d,_mm_setzero_ps());
2327 d2 = _mm_mul_ps(d,d);
2328 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2330 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2332 /* Evaluate switch function */
2333 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2334 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2335 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2339 fscal = _mm_and_ps(fscal,cutoff_mask);
2341 /* Update vectorial force */
2342 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2343 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2344 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2346 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2347 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2348 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2352 fjptrA = f+j_coord_offsetA;
2353 fjptrB = f+j_coord_offsetB;
2354 fjptrC = f+j_coord_offsetC;
2355 fjptrD = f+j_coord_offsetD;
2357 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2358 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2359 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2361 /* Inner loop uses 512 flops */
2364 if(jidx<j_index_end)
2367 /* Get j neighbor index, and coordinate index */
2368 jnrlistA = jjnr[jidx];
2369 jnrlistB = jjnr[jidx+1];
2370 jnrlistC = jjnr[jidx+2];
2371 jnrlistD = jjnr[jidx+3];
2372 /* Sign of each element will be negative for non-real atoms.
2373 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2374 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2376 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2377 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2378 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2379 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2380 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2381 j_coord_offsetA = DIM*jnrA;
2382 j_coord_offsetB = DIM*jnrB;
2383 j_coord_offsetC = DIM*jnrC;
2384 j_coord_offsetD = DIM*jnrD;
2386 /* load j atom coordinates */
2387 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2388 x+j_coord_offsetC,x+j_coord_offsetD,
2389 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2390 &jy2,&jz2,&jx3,&jy3,&jz3);
2392 /* Calculate displacement vector */
2393 dx00 = _mm_sub_ps(ix0,jx0);
2394 dy00 = _mm_sub_ps(iy0,jy0);
2395 dz00 = _mm_sub_ps(iz0,jz0);
2396 dx11 = _mm_sub_ps(ix1,jx1);
2397 dy11 = _mm_sub_ps(iy1,jy1);
2398 dz11 = _mm_sub_ps(iz1,jz1);
2399 dx12 = _mm_sub_ps(ix1,jx2);
2400 dy12 = _mm_sub_ps(iy1,jy2);
2401 dz12 = _mm_sub_ps(iz1,jz2);
2402 dx13 = _mm_sub_ps(ix1,jx3);
2403 dy13 = _mm_sub_ps(iy1,jy3);
2404 dz13 = _mm_sub_ps(iz1,jz3);
2405 dx21 = _mm_sub_ps(ix2,jx1);
2406 dy21 = _mm_sub_ps(iy2,jy1);
2407 dz21 = _mm_sub_ps(iz2,jz1);
2408 dx22 = _mm_sub_ps(ix2,jx2);
2409 dy22 = _mm_sub_ps(iy2,jy2);
2410 dz22 = _mm_sub_ps(iz2,jz2);
2411 dx23 = _mm_sub_ps(ix2,jx3);
2412 dy23 = _mm_sub_ps(iy2,jy3);
2413 dz23 = _mm_sub_ps(iz2,jz3);
2414 dx31 = _mm_sub_ps(ix3,jx1);
2415 dy31 = _mm_sub_ps(iy3,jy1);
2416 dz31 = _mm_sub_ps(iz3,jz1);
2417 dx32 = _mm_sub_ps(ix3,jx2);
2418 dy32 = _mm_sub_ps(iy3,jy2);
2419 dz32 = _mm_sub_ps(iz3,jz2);
2420 dx33 = _mm_sub_ps(ix3,jx3);
2421 dy33 = _mm_sub_ps(iy3,jy3);
2422 dz33 = _mm_sub_ps(iz3,jz3);
2424 /* Calculate squared distance and things based on it */
2425 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2426 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2427 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2428 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2429 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2430 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2431 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2432 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2433 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2434 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2436 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2437 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2438 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2439 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2440 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2441 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2442 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2443 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2444 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2445 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2447 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2448 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2449 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2450 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2451 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2452 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2453 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2454 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2455 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2456 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2458 fjx0 = _mm_setzero_ps();
2459 fjy0 = _mm_setzero_ps();
2460 fjz0 = _mm_setzero_ps();
2461 fjx1 = _mm_setzero_ps();
2462 fjy1 = _mm_setzero_ps();
2463 fjz1 = _mm_setzero_ps();
2464 fjx2 = _mm_setzero_ps();
2465 fjy2 = _mm_setzero_ps();
2466 fjz2 = _mm_setzero_ps();
2467 fjx3 = _mm_setzero_ps();
2468 fjy3 = _mm_setzero_ps();
2469 fjz3 = _mm_setzero_ps();
2471 /**************************
2472 * CALCULATE INTERACTIONS *
2473 **************************/
2475 if (gmx_mm_any_lt(rsq00,rcutoff2))
2478 r00 = _mm_mul_ps(rsq00,rinv00);
2479 r00 = _mm_andnot_ps(dummy_mask,r00);
2481 /* LENNARD-JONES DISPERSION/REPULSION */
2483 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2484 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2485 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2486 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
2487 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2489 d = _mm_sub_ps(r00,rswitch);
2490 d = _mm_max_ps(d,_mm_setzero_ps());
2491 d2 = _mm_mul_ps(d,d);
2492 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2494 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2496 /* Evaluate switch function */
2497 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2498 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2499 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2503 fscal = _mm_and_ps(fscal,cutoff_mask);
2505 fscal = _mm_andnot_ps(dummy_mask,fscal);
2507 /* Update vectorial force */
2508 fix0 = _mm_macc_ps(dx00,fscal,fix0);
2509 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
2510 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
2512 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
2513 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
2514 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
2518 /**************************
2519 * CALCULATE INTERACTIONS *
2520 **************************/
2522 if (gmx_mm_any_lt(rsq11,rcutoff2))
2525 r11 = _mm_mul_ps(rsq11,rinv11);
2526 r11 = _mm_andnot_ps(dummy_mask,r11);
2528 /* EWALD ELECTROSTATICS */
2530 /* Analytical PME correction */
2531 zeta2 = _mm_mul_ps(beta2,rsq11);
2532 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
2533 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2534 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2535 felec = _mm_mul_ps(qq11,felec);
2536 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2537 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
2538 velec = _mm_mul_ps(qq11,velec);
2540 d = _mm_sub_ps(r11,rswitch);
2541 d = _mm_max_ps(d,_mm_setzero_ps());
2542 d2 = _mm_mul_ps(d,d);
2543 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2545 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2547 /* Evaluate switch function */
2548 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2549 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2550 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2554 fscal = _mm_and_ps(fscal,cutoff_mask);
2556 fscal = _mm_andnot_ps(dummy_mask,fscal);
2558 /* Update vectorial force */
2559 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2560 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2561 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2563 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2564 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2565 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2569 /**************************
2570 * CALCULATE INTERACTIONS *
2571 **************************/
2573 if (gmx_mm_any_lt(rsq12,rcutoff2))
2576 r12 = _mm_mul_ps(rsq12,rinv12);
2577 r12 = _mm_andnot_ps(dummy_mask,r12);
2579 /* EWALD ELECTROSTATICS */
2581 /* Analytical PME correction */
2582 zeta2 = _mm_mul_ps(beta2,rsq12);
2583 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
2584 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2585 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2586 felec = _mm_mul_ps(qq12,felec);
2587 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2588 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2589 velec = _mm_mul_ps(qq12,velec);
2591 d = _mm_sub_ps(r12,rswitch);
2592 d = _mm_max_ps(d,_mm_setzero_ps());
2593 d2 = _mm_mul_ps(d,d);
2594 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2596 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2598 /* Evaluate switch function */
2599 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2600 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2601 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2605 fscal = _mm_and_ps(fscal,cutoff_mask);
2607 fscal = _mm_andnot_ps(dummy_mask,fscal);
2609 /* Update vectorial force */
2610 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2611 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2612 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2614 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2615 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2616 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2620 /**************************
2621 * CALCULATE INTERACTIONS *
2622 **************************/
2624 if (gmx_mm_any_lt(rsq13,rcutoff2))
2627 r13 = _mm_mul_ps(rsq13,rinv13);
2628 r13 = _mm_andnot_ps(dummy_mask,r13);
2630 /* EWALD ELECTROSTATICS */
2632 /* Analytical PME correction */
2633 zeta2 = _mm_mul_ps(beta2,rsq13);
2634 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
2635 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2636 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2637 felec = _mm_mul_ps(qq13,felec);
2638 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2639 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
2640 velec = _mm_mul_ps(qq13,velec);
2642 d = _mm_sub_ps(r13,rswitch);
2643 d = _mm_max_ps(d,_mm_setzero_ps());
2644 d2 = _mm_mul_ps(d,d);
2645 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2647 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2649 /* Evaluate switch function */
2650 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2651 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2652 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2656 fscal = _mm_and_ps(fscal,cutoff_mask);
2658 fscal = _mm_andnot_ps(dummy_mask,fscal);
2660 /* Update vectorial force */
2661 fix1 = _mm_macc_ps(dx13,fscal,fix1);
2662 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
2663 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
2665 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
2666 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
2667 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
2671 /**************************
2672 * CALCULATE INTERACTIONS *
2673 **************************/
2675 if (gmx_mm_any_lt(rsq21,rcutoff2))
2678 r21 = _mm_mul_ps(rsq21,rinv21);
2679 r21 = _mm_andnot_ps(dummy_mask,r21);
2681 /* EWALD ELECTROSTATICS */
2683 /* Analytical PME correction */
2684 zeta2 = _mm_mul_ps(beta2,rsq21);
2685 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2686 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2687 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2688 felec = _mm_mul_ps(qq21,felec);
2689 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2690 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2691 velec = _mm_mul_ps(qq21,velec);
2693 d = _mm_sub_ps(r21,rswitch);
2694 d = _mm_max_ps(d,_mm_setzero_ps());
2695 d2 = _mm_mul_ps(d,d);
2696 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2698 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2700 /* Evaluate switch function */
2701 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2702 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2703 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2707 fscal = _mm_and_ps(fscal,cutoff_mask);
2709 fscal = _mm_andnot_ps(dummy_mask,fscal);
2711 /* Update vectorial force */
2712 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2713 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2714 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2716 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2717 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2718 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2722 /**************************
2723 * CALCULATE INTERACTIONS *
2724 **************************/
2726 if (gmx_mm_any_lt(rsq22,rcutoff2))
2729 r22 = _mm_mul_ps(rsq22,rinv22);
2730 r22 = _mm_andnot_ps(dummy_mask,r22);
2732 /* EWALD ELECTROSTATICS */
2734 /* Analytical PME correction */
2735 zeta2 = _mm_mul_ps(beta2,rsq22);
2736 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2737 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2738 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2739 felec = _mm_mul_ps(qq22,felec);
2740 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2741 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2742 velec = _mm_mul_ps(qq22,velec);
2744 d = _mm_sub_ps(r22,rswitch);
2745 d = _mm_max_ps(d,_mm_setzero_ps());
2746 d2 = _mm_mul_ps(d,d);
2747 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2749 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2751 /* Evaluate switch function */
2752 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2753 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2754 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2758 fscal = _mm_and_ps(fscal,cutoff_mask);
2760 fscal = _mm_andnot_ps(dummy_mask,fscal);
2762 /* Update vectorial force */
2763 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2764 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2765 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2767 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2768 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2769 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2773 /**************************
2774 * CALCULATE INTERACTIONS *
2775 **************************/
2777 if (gmx_mm_any_lt(rsq23,rcutoff2))
2780 r23 = _mm_mul_ps(rsq23,rinv23);
2781 r23 = _mm_andnot_ps(dummy_mask,r23);
2783 /* EWALD ELECTROSTATICS */
2785 /* Analytical PME correction */
2786 zeta2 = _mm_mul_ps(beta2,rsq23);
2787 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
2788 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2789 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2790 felec = _mm_mul_ps(qq23,felec);
2791 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2792 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
2793 velec = _mm_mul_ps(qq23,velec);
2795 d = _mm_sub_ps(r23,rswitch);
2796 d = _mm_max_ps(d,_mm_setzero_ps());
2797 d2 = _mm_mul_ps(d,d);
2798 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2800 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2802 /* Evaluate switch function */
2803 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2804 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2805 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2809 fscal = _mm_and_ps(fscal,cutoff_mask);
2811 fscal = _mm_andnot_ps(dummy_mask,fscal);
2813 /* Update vectorial force */
2814 fix2 = _mm_macc_ps(dx23,fscal,fix2);
2815 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
2816 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
2818 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
2819 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
2820 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
2824 /**************************
2825 * CALCULATE INTERACTIONS *
2826 **************************/
2828 if (gmx_mm_any_lt(rsq31,rcutoff2))
2831 r31 = _mm_mul_ps(rsq31,rinv31);
2832 r31 = _mm_andnot_ps(dummy_mask,r31);
2834 /* EWALD ELECTROSTATICS */
2836 /* Analytical PME correction */
2837 zeta2 = _mm_mul_ps(beta2,rsq31);
2838 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
2839 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2840 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2841 felec = _mm_mul_ps(qq31,felec);
2842 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2843 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
2844 velec = _mm_mul_ps(qq31,velec);
2846 d = _mm_sub_ps(r31,rswitch);
2847 d = _mm_max_ps(d,_mm_setzero_ps());
2848 d2 = _mm_mul_ps(d,d);
2849 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2851 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2853 /* Evaluate switch function */
2854 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2855 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2856 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2860 fscal = _mm_and_ps(fscal,cutoff_mask);
2862 fscal = _mm_andnot_ps(dummy_mask,fscal);
2864 /* Update vectorial force */
2865 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2866 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2867 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2869 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2870 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2871 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2875 /**************************
2876 * CALCULATE INTERACTIONS *
2877 **************************/
2879 if (gmx_mm_any_lt(rsq32,rcutoff2))
2882 r32 = _mm_mul_ps(rsq32,rinv32);
2883 r32 = _mm_andnot_ps(dummy_mask,r32);
2885 /* EWALD ELECTROSTATICS */
2887 /* Analytical PME correction */
2888 zeta2 = _mm_mul_ps(beta2,rsq32);
2889 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
2890 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2891 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2892 felec = _mm_mul_ps(qq32,felec);
2893 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2894 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
2895 velec = _mm_mul_ps(qq32,velec);
2897 d = _mm_sub_ps(r32,rswitch);
2898 d = _mm_max_ps(d,_mm_setzero_ps());
2899 d2 = _mm_mul_ps(d,d);
2900 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2902 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2904 /* Evaluate switch function */
2905 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2906 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2907 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2911 fscal = _mm_and_ps(fscal,cutoff_mask);
2913 fscal = _mm_andnot_ps(dummy_mask,fscal);
2915 /* Update vectorial force */
2916 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2917 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2918 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2920 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2921 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2922 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2926 /**************************
2927 * CALCULATE INTERACTIONS *
2928 **************************/
2930 if (gmx_mm_any_lt(rsq33,rcutoff2))
2933 r33 = _mm_mul_ps(rsq33,rinv33);
2934 r33 = _mm_andnot_ps(dummy_mask,r33);
2936 /* EWALD ELECTROSTATICS */
2938 /* Analytical PME correction */
2939 zeta2 = _mm_mul_ps(beta2,rsq33);
2940 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
2941 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2942 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2943 felec = _mm_mul_ps(qq33,felec);
2944 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2945 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
2946 velec = _mm_mul_ps(qq33,velec);
2948 d = _mm_sub_ps(r33,rswitch);
2949 d = _mm_max_ps(d,_mm_setzero_ps());
2950 d2 = _mm_mul_ps(d,d);
2951 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2953 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2955 /* Evaluate switch function */
2956 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2957 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2958 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2962 fscal = _mm_and_ps(fscal,cutoff_mask);
2964 fscal = _mm_andnot_ps(dummy_mask,fscal);
2966 /* Update vectorial force */
2967 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2968 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2969 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2971 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2972 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2973 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2977 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2978 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2979 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2980 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2982 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2983 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2984 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2986 /* Inner loop uses 522 flops */
2989 /* End of innermost loop */
2991 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2992 f+i_coord_offset,fshift+i_shift_offset);
2994 /* Increment number of inner iterations */
2995 inneriter += j_index_end - j_index_start;
2997 /* Outer loop uses 24 flops */
3000 /* Increment number of outer iterations */
3003 /* Update outer/inner flops */
3005 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*522);