2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_single
52 * Electrostatics interaction: Ewald (with potential switch)
53 * VdW interaction: LennardJones (with potential switch)
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_avx_128_fma_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
118 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
119 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
121 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
122 real rswitch_scalar,d_scalar;
123 __m128 dummy_mask,cutoff_mask;
124 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
125 __m128 one = _mm_set1_ps(1.0);
126 __m128 two = _mm_set1_ps(2.0);
132 jindex = nlist->jindex;
134 shiftidx = nlist->shift;
136 shiftvec = fr->shift_vec[0];
137 fshift = fr->fshift[0];
138 facel = _mm_set1_ps(fr->epsfac);
139 charge = mdatoms->chargeA;
140 nvdwtype = fr->ntype;
142 vdwtype = mdatoms->typeA;
144 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
145 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
146 beta2 = _mm_mul_ps(beta,beta);
147 beta3 = _mm_mul_ps(beta,beta2);
148 ewtab = fr->ic->tabq_coul_FDV0;
149 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
150 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
152 /* Setup water-specific parameters */
153 inr = nlist->iinr[0];
154 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
155 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
156 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
157 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
159 jq1 = _mm_set1_ps(charge[inr+1]);
160 jq2 = _mm_set1_ps(charge[inr+2]);
161 jq3 = _mm_set1_ps(charge[inr+3]);
162 vdwjidx0A = 2*vdwtype[inr+0];
163 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
164 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
165 qq11 = _mm_mul_ps(iq1,jq1);
166 qq12 = _mm_mul_ps(iq1,jq2);
167 qq13 = _mm_mul_ps(iq1,jq3);
168 qq21 = _mm_mul_ps(iq2,jq1);
169 qq22 = _mm_mul_ps(iq2,jq2);
170 qq23 = _mm_mul_ps(iq2,jq3);
171 qq31 = _mm_mul_ps(iq3,jq1);
172 qq32 = _mm_mul_ps(iq3,jq2);
173 qq33 = _mm_mul_ps(iq3,jq3);
175 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
176 rcutoff_scalar = fr->rcoulomb;
177 rcutoff = _mm_set1_ps(rcutoff_scalar);
178 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
180 rswitch_scalar = fr->rcoulomb_switch;
181 rswitch = _mm_set1_ps(rswitch_scalar);
182 /* Setup switch parameters */
183 d_scalar = rcutoff_scalar-rswitch_scalar;
184 d = _mm_set1_ps(d_scalar);
185 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
186 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
187 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
188 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
189 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
190 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
192 /* Avoid stupid compiler warnings */
193 jnrA = jnrB = jnrC = jnrD = 0;
202 for(iidx=0;iidx<4*DIM;iidx++)
207 /* Start outer loop over neighborlists */
208 for(iidx=0; iidx<nri; iidx++)
210 /* Load shift vector for this list */
211 i_shift_offset = DIM*shiftidx[iidx];
213 /* Load limits for loop over neighbors */
214 j_index_start = jindex[iidx];
215 j_index_end = jindex[iidx+1];
217 /* Get outer coordinate index */
219 i_coord_offset = DIM*inr;
221 /* Load i particle coords and add shift vector */
222 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
223 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
225 fix0 = _mm_setzero_ps();
226 fiy0 = _mm_setzero_ps();
227 fiz0 = _mm_setzero_ps();
228 fix1 = _mm_setzero_ps();
229 fiy1 = _mm_setzero_ps();
230 fiz1 = _mm_setzero_ps();
231 fix2 = _mm_setzero_ps();
232 fiy2 = _mm_setzero_ps();
233 fiz2 = _mm_setzero_ps();
234 fix3 = _mm_setzero_ps();
235 fiy3 = _mm_setzero_ps();
236 fiz3 = _mm_setzero_ps();
238 /* Reset potential sums */
239 velecsum = _mm_setzero_ps();
240 vvdwsum = _mm_setzero_ps();
242 /* Start inner kernel loop */
243 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
246 /* Get j neighbor index, and coordinate index */
251 j_coord_offsetA = DIM*jnrA;
252 j_coord_offsetB = DIM*jnrB;
253 j_coord_offsetC = DIM*jnrC;
254 j_coord_offsetD = DIM*jnrD;
256 /* load j atom coordinates */
257 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
258 x+j_coord_offsetC,x+j_coord_offsetD,
259 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
260 &jy2,&jz2,&jx3,&jy3,&jz3);
262 /* Calculate displacement vector */
263 dx00 = _mm_sub_ps(ix0,jx0);
264 dy00 = _mm_sub_ps(iy0,jy0);
265 dz00 = _mm_sub_ps(iz0,jz0);
266 dx11 = _mm_sub_ps(ix1,jx1);
267 dy11 = _mm_sub_ps(iy1,jy1);
268 dz11 = _mm_sub_ps(iz1,jz1);
269 dx12 = _mm_sub_ps(ix1,jx2);
270 dy12 = _mm_sub_ps(iy1,jy2);
271 dz12 = _mm_sub_ps(iz1,jz2);
272 dx13 = _mm_sub_ps(ix1,jx3);
273 dy13 = _mm_sub_ps(iy1,jy3);
274 dz13 = _mm_sub_ps(iz1,jz3);
275 dx21 = _mm_sub_ps(ix2,jx1);
276 dy21 = _mm_sub_ps(iy2,jy1);
277 dz21 = _mm_sub_ps(iz2,jz1);
278 dx22 = _mm_sub_ps(ix2,jx2);
279 dy22 = _mm_sub_ps(iy2,jy2);
280 dz22 = _mm_sub_ps(iz2,jz2);
281 dx23 = _mm_sub_ps(ix2,jx3);
282 dy23 = _mm_sub_ps(iy2,jy3);
283 dz23 = _mm_sub_ps(iz2,jz3);
284 dx31 = _mm_sub_ps(ix3,jx1);
285 dy31 = _mm_sub_ps(iy3,jy1);
286 dz31 = _mm_sub_ps(iz3,jz1);
287 dx32 = _mm_sub_ps(ix3,jx2);
288 dy32 = _mm_sub_ps(iy3,jy2);
289 dz32 = _mm_sub_ps(iz3,jz2);
290 dx33 = _mm_sub_ps(ix3,jx3);
291 dy33 = _mm_sub_ps(iy3,jy3);
292 dz33 = _mm_sub_ps(iz3,jz3);
294 /* Calculate squared distance and things based on it */
295 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
296 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
297 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
298 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
299 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
300 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
301 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
302 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
303 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
304 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
306 rinv00 = gmx_mm_invsqrt_ps(rsq00);
307 rinv11 = gmx_mm_invsqrt_ps(rsq11);
308 rinv12 = gmx_mm_invsqrt_ps(rsq12);
309 rinv13 = gmx_mm_invsqrt_ps(rsq13);
310 rinv21 = gmx_mm_invsqrt_ps(rsq21);
311 rinv22 = gmx_mm_invsqrt_ps(rsq22);
312 rinv23 = gmx_mm_invsqrt_ps(rsq23);
313 rinv31 = gmx_mm_invsqrt_ps(rsq31);
314 rinv32 = gmx_mm_invsqrt_ps(rsq32);
315 rinv33 = gmx_mm_invsqrt_ps(rsq33);
317 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
318 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
319 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
320 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
321 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
322 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
323 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
324 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
325 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
326 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
328 fjx0 = _mm_setzero_ps();
329 fjy0 = _mm_setzero_ps();
330 fjz0 = _mm_setzero_ps();
331 fjx1 = _mm_setzero_ps();
332 fjy1 = _mm_setzero_ps();
333 fjz1 = _mm_setzero_ps();
334 fjx2 = _mm_setzero_ps();
335 fjy2 = _mm_setzero_ps();
336 fjz2 = _mm_setzero_ps();
337 fjx3 = _mm_setzero_ps();
338 fjy3 = _mm_setzero_ps();
339 fjz3 = _mm_setzero_ps();
341 /**************************
342 * CALCULATE INTERACTIONS *
343 **************************/
345 if (gmx_mm_any_lt(rsq00,rcutoff2))
348 r00 = _mm_mul_ps(rsq00,rinv00);
350 /* LENNARD-JONES DISPERSION/REPULSION */
352 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
353 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
354 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
355 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
356 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
358 d = _mm_sub_ps(r00,rswitch);
359 d = _mm_max_ps(d,_mm_setzero_ps());
360 d2 = _mm_mul_ps(d,d);
361 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
363 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
365 /* Evaluate switch function */
366 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
367 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
368 vvdw = _mm_mul_ps(vvdw,sw);
369 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
371 /* Update potential sum for this i atom from the interaction with this j atom. */
372 vvdw = _mm_and_ps(vvdw,cutoff_mask);
373 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
377 fscal = _mm_and_ps(fscal,cutoff_mask);
379 /* Update vectorial force */
380 fix0 = _mm_macc_ps(dx00,fscal,fix0);
381 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
382 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
384 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
385 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
386 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
390 /**************************
391 * CALCULATE INTERACTIONS *
392 **************************/
394 if (gmx_mm_any_lt(rsq11,rcutoff2))
397 r11 = _mm_mul_ps(rsq11,rinv11);
399 /* EWALD ELECTROSTATICS */
401 /* Analytical PME correction */
402 zeta2 = _mm_mul_ps(beta2,rsq11);
403 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
404 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
405 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
406 felec = _mm_mul_ps(qq11,felec);
407 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
408 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
409 velec = _mm_mul_ps(qq11,velec);
411 d = _mm_sub_ps(r11,rswitch);
412 d = _mm_max_ps(d,_mm_setzero_ps());
413 d2 = _mm_mul_ps(d,d);
414 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
416 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
418 /* Evaluate switch function */
419 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
420 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
421 velec = _mm_mul_ps(velec,sw);
422 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
424 /* Update potential sum for this i atom from the interaction with this j atom. */
425 velec = _mm_and_ps(velec,cutoff_mask);
426 velecsum = _mm_add_ps(velecsum,velec);
430 fscal = _mm_and_ps(fscal,cutoff_mask);
432 /* Update vectorial force */
433 fix1 = _mm_macc_ps(dx11,fscal,fix1);
434 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
435 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
437 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
438 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
439 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
443 /**************************
444 * CALCULATE INTERACTIONS *
445 **************************/
447 if (gmx_mm_any_lt(rsq12,rcutoff2))
450 r12 = _mm_mul_ps(rsq12,rinv12);
452 /* EWALD ELECTROSTATICS */
454 /* Analytical PME correction */
455 zeta2 = _mm_mul_ps(beta2,rsq12);
456 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
457 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
458 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
459 felec = _mm_mul_ps(qq12,felec);
460 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
461 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
462 velec = _mm_mul_ps(qq12,velec);
464 d = _mm_sub_ps(r12,rswitch);
465 d = _mm_max_ps(d,_mm_setzero_ps());
466 d2 = _mm_mul_ps(d,d);
467 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
469 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
471 /* Evaluate switch function */
472 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
473 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
474 velec = _mm_mul_ps(velec,sw);
475 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
477 /* Update potential sum for this i atom from the interaction with this j atom. */
478 velec = _mm_and_ps(velec,cutoff_mask);
479 velecsum = _mm_add_ps(velecsum,velec);
483 fscal = _mm_and_ps(fscal,cutoff_mask);
485 /* Update vectorial force */
486 fix1 = _mm_macc_ps(dx12,fscal,fix1);
487 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
488 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
490 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
491 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
492 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
496 /**************************
497 * CALCULATE INTERACTIONS *
498 **************************/
500 if (gmx_mm_any_lt(rsq13,rcutoff2))
503 r13 = _mm_mul_ps(rsq13,rinv13);
505 /* EWALD ELECTROSTATICS */
507 /* Analytical PME correction */
508 zeta2 = _mm_mul_ps(beta2,rsq13);
509 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
510 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
511 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
512 felec = _mm_mul_ps(qq13,felec);
513 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
514 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
515 velec = _mm_mul_ps(qq13,velec);
517 d = _mm_sub_ps(r13,rswitch);
518 d = _mm_max_ps(d,_mm_setzero_ps());
519 d2 = _mm_mul_ps(d,d);
520 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
522 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
524 /* Evaluate switch function */
525 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
526 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
527 velec = _mm_mul_ps(velec,sw);
528 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
530 /* Update potential sum for this i atom from the interaction with this j atom. */
531 velec = _mm_and_ps(velec,cutoff_mask);
532 velecsum = _mm_add_ps(velecsum,velec);
536 fscal = _mm_and_ps(fscal,cutoff_mask);
538 /* Update vectorial force */
539 fix1 = _mm_macc_ps(dx13,fscal,fix1);
540 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
541 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
543 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
544 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
545 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
549 /**************************
550 * CALCULATE INTERACTIONS *
551 **************************/
553 if (gmx_mm_any_lt(rsq21,rcutoff2))
556 r21 = _mm_mul_ps(rsq21,rinv21);
558 /* EWALD ELECTROSTATICS */
560 /* Analytical PME correction */
561 zeta2 = _mm_mul_ps(beta2,rsq21);
562 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
563 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
564 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
565 felec = _mm_mul_ps(qq21,felec);
566 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
567 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
568 velec = _mm_mul_ps(qq21,velec);
570 d = _mm_sub_ps(r21,rswitch);
571 d = _mm_max_ps(d,_mm_setzero_ps());
572 d2 = _mm_mul_ps(d,d);
573 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
575 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
577 /* Evaluate switch function */
578 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
579 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
580 velec = _mm_mul_ps(velec,sw);
581 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
583 /* Update potential sum for this i atom from the interaction with this j atom. */
584 velec = _mm_and_ps(velec,cutoff_mask);
585 velecsum = _mm_add_ps(velecsum,velec);
589 fscal = _mm_and_ps(fscal,cutoff_mask);
591 /* Update vectorial force */
592 fix2 = _mm_macc_ps(dx21,fscal,fix2);
593 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
594 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
596 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
597 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
598 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
602 /**************************
603 * CALCULATE INTERACTIONS *
604 **************************/
606 if (gmx_mm_any_lt(rsq22,rcutoff2))
609 r22 = _mm_mul_ps(rsq22,rinv22);
611 /* EWALD ELECTROSTATICS */
613 /* Analytical PME correction */
614 zeta2 = _mm_mul_ps(beta2,rsq22);
615 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
616 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
617 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
618 felec = _mm_mul_ps(qq22,felec);
619 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
620 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
621 velec = _mm_mul_ps(qq22,velec);
623 d = _mm_sub_ps(r22,rswitch);
624 d = _mm_max_ps(d,_mm_setzero_ps());
625 d2 = _mm_mul_ps(d,d);
626 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
628 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
630 /* Evaluate switch function */
631 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
632 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
633 velec = _mm_mul_ps(velec,sw);
634 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
636 /* Update potential sum for this i atom from the interaction with this j atom. */
637 velec = _mm_and_ps(velec,cutoff_mask);
638 velecsum = _mm_add_ps(velecsum,velec);
642 fscal = _mm_and_ps(fscal,cutoff_mask);
644 /* Update vectorial force */
645 fix2 = _mm_macc_ps(dx22,fscal,fix2);
646 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
647 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
649 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
650 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
651 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
655 /**************************
656 * CALCULATE INTERACTIONS *
657 **************************/
659 if (gmx_mm_any_lt(rsq23,rcutoff2))
662 r23 = _mm_mul_ps(rsq23,rinv23);
664 /* EWALD ELECTROSTATICS */
666 /* Analytical PME correction */
667 zeta2 = _mm_mul_ps(beta2,rsq23);
668 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
669 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
670 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
671 felec = _mm_mul_ps(qq23,felec);
672 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
673 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
674 velec = _mm_mul_ps(qq23,velec);
676 d = _mm_sub_ps(r23,rswitch);
677 d = _mm_max_ps(d,_mm_setzero_ps());
678 d2 = _mm_mul_ps(d,d);
679 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
681 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
683 /* Evaluate switch function */
684 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
685 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
686 velec = _mm_mul_ps(velec,sw);
687 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
689 /* Update potential sum for this i atom from the interaction with this j atom. */
690 velec = _mm_and_ps(velec,cutoff_mask);
691 velecsum = _mm_add_ps(velecsum,velec);
695 fscal = _mm_and_ps(fscal,cutoff_mask);
697 /* Update vectorial force */
698 fix2 = _mm_macc_ps(dx23,fscal,fix2);
699 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
700 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
702 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
703 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
704 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
708 /**************************
709 * CALCULATE INTERACTIONS *
710 **************************/
712 if (gmx_mm_any_lt(rsq31,rcutoff2))
715 r31 = _mm_mul_ps(rsq31,rinv31);
717 /* EWALD ELECTROSTATICS */
719 /* Analytical PME correction */
720 zeta2 = _mm_mul_ps(beta2,rsq31);
721 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
722 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
723 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
724 felec = _mm_mul_ps(qq31,felec);
725 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
726 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
727 velec = _mm_mul_ps(qq31,velec);
729 d = _mm_sub_ps(r31,rswitch);
730 d = _mm_max_ps(d,_mm_setzero_ps());
731 d2 = _mm_mul_ps(d,d);
732 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
734 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
736 /* Evaluate switch function */
737 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
738 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
739 velec = _mm_mul_ps(velec,sw);
740 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
742 /* Update potential sum for this i atom from the interaction with this j atom. */
743 velec = _mm_and_ps(velec,cutoff_mask);
744 velecsum = _mm_add_ps(velecsum,velec);
748 fscal = _mm_and_ps(fscal,cutoff_mask);
750 /* Update vectorial force */
751 fix3 = _mm_macc_ps(dx31,fscal,fix3);
752 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
753 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
755 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
756 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
757 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
761 /**************************
762 * CALCULATE INTERACTIONS *
763 **************************/
765 if (gmx_mm_any_lt(rsq32,rcutoff2))
768 r32 = _mm_mul_ps(rsq32,rinv32);
770 /* EWALD ELECTROSTATICS */
772 /* Analytical PME correction */
773 zeta2 = _mm_mul_ps(beta2,rsq32);
774 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
775 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
776 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
777 felec = _mm_mul_ps(qq32,felec);
778 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
779 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
780 velec = _mm_mul_ps(qq32,velec);
782 d = _mm_sub_ps(r32,rswitch);
783 d = _mm_max_ps(d,_mm_setzero_ps());
784 d2 = _mm_mul_ps(d,d);
785 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
787 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
789 /* Evaluate switch function */
790 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
791 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
792 velec = _mm_mul_ps(velec,sw);
793 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
795 /* Update potential sum for this i atom from the interaction with this j atom. */
796 velec = _mm_and_ps(velec,cutoff_mask);
797 velecsum = _mm_add_ps(velecsum,velec);
801 fscal = _mm_and_ps(fscal,cutoff_mask);
803 /* Update vectorial force */
804 fix3 = _mm_macc_ps(dx32,fscal,fix3);
805 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
806 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
808 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
809 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
810 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
814 /**************************
815 * CALCULATE INTERACTIONS *
816 **************************/
818 if (gmx_mm_any_lt(rsq33,rcutoff2))
821 r33 = _mm_mul_ps(rsq33,rinv33);
823 /* EWALD ELECTROSTATICS */
825 /* Analytical PME correction */
826 zeta2 = _mm_mul_ps(beta2,rsq33);
827 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
828 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
829 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
830 felec = _mm_mul_ps(qq33,felec);
831 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
832 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
833 velec = _mm_mul_ps(qq33,velec);
835 d = _mm_sub_ps(r33,rswitch);
836 d = _mm_max_ps(d,_mm_setzero_ps());
837 d2 = _mm_mul_ps(d,d);
838 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
840 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
842 /* Evaluate switch function */
843 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
844 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
845 velec = _mm_mul_ps(velec,sw);
846 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 velec = _mm_and_ps(velec,cutoff_mask);
850 velecsum = _mm_add_ps(velecsum,velec);
854 fscal = _mm_and_ps(fscal,cutoff_mask);
856 /* Update vectorial force */
857 fix3 = _mm_macc_ps(dx33,fscal,fix3);
858 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
859 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
861 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
862 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
863 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
867 fjptrA = f+j_coord_offsetA;
868 fjptrB = f+j_coord_offsetB;
869 fjptrC = f+j_coord_offsetC;
870 fjptrD = f+j_coord_offsetD;
872 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
873 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
874 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
876 /* Inner loop uses 542 flops */
882 /* Get j neighbor index, and coordinate index */
883 jnrlistA = jjnr[jidx];
884 jnrlistB = jjnr[jidx+1];
885 jnrlistC = jjnr[jidx+2];
886 jnrlistD = jjnr[jidx+3];
887 /* Sign of each element will be negative for non-real atoms.
888 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
889 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
891 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
892 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
893 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
894 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
895 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
896 j_coord_offsetA = DIM*jnrA;
897 j_coord_offsetB = DIM*jnrB;
898 j_coord_offsetC = DIM*jnrC;
899 j_coord_offsetD = DIM*jnrD;
901 /* load j atom coordinates */
902 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
903 x+j_coord_offsetC,x+j_coord_offsetD,
904 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
905 &jy2,&jz2,&jx3,&jy3,&jz3);
907 /* Calculate displacement vector */
908 dx00 = _mm_sub_ps(ix0,jx0);
909 dy00 = _mm_sub_ps(iy0,jy0);
910 dz00 = _mm_sub_ps(iz0,jz0);
911 dx11 = _mm_sub_ps(ix1,jx1);
912 dy11 = _mm_sub_ps(iy1,jy1);
913 dz11 = _mm_sub_ps(iz1,jz1);
914 dx12 = _mm_sub_ps(ix1,jx2);
915 dy12 = _mm_sub_ps(iy1,jy2);
916 dz12 = _mm_sub_ps(iz1,jz2);
917 dx13 = _mm_sub_ps(ix1,jx3);
918 dy13 = _mm_sub_ps(iy1,jy3);
919 dz13 = _mm_sub_ps(iz1,jz3);
920 dx21 = _mm_sub_ps(ix2,jx1);
921 dy21 = _mm_sub_ps(iy2,jy1);
922 dz21 = _mm_sub_ps(iz2,jz1);
923 dx22 = _mm_sub_ps(ix2,jx2);
924 dy22 = _mm_sub_ps(iy2,jy2);
925 dz22 = _mm_sub_ps(iz2,jz2);
926 dx23 = _mm_sub_ps(ix2,jx3);
927 dy23 = _mm_sub_ps(iy2,jy3);
928 dz23 = _mm_sub_ps(iz2,jz3);
929 dx31 = _mm_sub_ps(ix3,jx1);
930 dy31 = _mm_sub_ps(iy3,jy1);
931 dz31 = _mm_sub_ps(iz3,jz1);
932 dx32 = _mm_sub_ps(ix3,jx2);
933 dy32 = _mm_sub_ps(iy3,jy2);
934 dz32 = _mm_sub_ps(iz3,jz2);
935 dx33 = _mm_sub_ps(ix3,jx3);
936 dy33 = _mm_sub_ps(iy3,jy3);
937 dz33 = _mm_sub_ps(iz3,jz3);
939 /* Calculate squared distance and things based on it */
940 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
941 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
942 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
943 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
944 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
945 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
946 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
947 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
948 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
949 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
951 rinv00 = gmx_mm_invsqrt_ps(rsq00);
952 rinv11 = gmx_mm_invsqrt_ps(rsq11);
953 rinv12 = gmx_mm_invsqrt_ps(rsq12);
954 rinv13 = gmx_mm_invsqrt_ps(rsq13);
955 rinv21 = gmx_mm_invsqrt_ps(rsq21);
956 rinv22 = gmx_mm_invsqrt_ps(rsq22);
957 rinv23 = gmx_mm_invsqrt_ps(rsq23);
958 rinv31 = gmx_mm_invsqrt_ps(rsq31);
959 rinv32 = gmx_mm_invsqrt_ps(rsq32);
960 rinv33 = gmx_mm_invsqrt_ps(rsq33);
962 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
963 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
964 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
965 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
966 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
967 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
968 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
969 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
970 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
971 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
973 fjx0 = _mm_setzero_ps();
974 fjy0 = _mm_setzero_ps();
975 fjz0 = _mm_setzero_ps();
976 fjx1 = _mm_setzero_ps();
977 fjy1 = _mm_setzero_ps();
978 fjz1 = _mm_setzero_ps();
979 fjx2 = _mm_setzero_ps();
980 fjy2 = _mm_setzero_ps();
981 fjz2 = _mm_setzero_ps();
982 fjx3 = _mm_setzero_ps();
983 fjy3 = _mm_setzero_ps();
984 fjz3 = _mm_setzero_ps();
986 /**************************
987 * CALCULATE INTERACTIONS *
988 **************************/
990 if (gmx_mm_any_lt(rsq00,rcutoff2))
993 r00 = _mm_mul_ps(rsq00,rinv00);
994 r00 = _mm_andnot_ps(dummy_mask,r00);
996 /* LENNARD-JONES DISPERSION/REPULSION */
998 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
999 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1000 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1001 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1002 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1004 d = _mm_sub_ps(r00,rswitch);
1005 d = _mm_max_ps(d,_mm_setzero_ps());
1006 d2 = _mm_mul_ps(d,d);
1007 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1009 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1011 /* Evaluate switch function */
1012 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1013 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1014 vvdw = _mm_mul_ps(vvdw,sw);
1015 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1017 /* Update potential sum for this i atom from the interaction with this j atom. */
1018 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1019 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1020 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1024 fscal = _mm_and_ps(fscal,cutoff_mask);
1026 fscal = _mm_andnot_ps(dummy_mask,fscal);
1028 /* Update vectorial force */
1029 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1030 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1031 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1033 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1034 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1035 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1039 /**************************
1040 * CALCULATE INTERACTIONS *
1041 **************************/
1043 if (gmx_mm_any_lt(rsq11,rcutoff2))
1046 r11 = _mm_mul_ps(rsq11,rinv11);
1047 r11 = _mm_andnot_ps(dummy_mask,r11);
1049 /* EWALD ELECTROSTATICS */
1051 /* Analytical PME correction */
1052 zeta2 = _mm_mul_ps(beta2,rsq11);
1053 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1054 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1055 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1056 felec = _mm_mul_ps(qq11,felec);
1057 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1058 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1059 velec = _mm_mul_ps(qq11,velec);
1061 d = _mm_sub_ps(r11,rswitch);
1062 d = _mm_max_ps(d,_mm_setzero_ps());
1063 d2 = _mm_mul_ps(d,d);
1064 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1066 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1068 /* Evaluate switch function */
1069 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1070 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1071 velec = _mm_mul_ps(velec,sw);
1072 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1074 /* Update potential sum for this i atom from the interaction with this j atom. */
1075 velec = _mm_and_ps(velec,cutoff_mask);
1076 velec = _mm_andnot_ps(dummy_mask,velec);
1077 velecsum = _mm_add_ps(velecsum,velec);
1081 fscal = _mm_and_ps(fscal,cutoff_mask);
1083 fscal = _mm_andnot_ps(dummy_mask,fscal);
1085 /* Update vectorial force */
1086 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1087 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1088 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1090 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1091 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1092 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1096 /**************************
1097 * CALCULATE INTERACTIONS *
1098 **************************/
1100 if (gmx_mm_any_lt(rsq12,rcutoff2))
1103 r12 = _mm_mul_ps(rsq12,rinv12);
1104 r12 = _mm_andnot_ps(dummy_mask,r12);
1106 /* EWALD ELECTROSTATICS */
1108 /* Analytical PME correction */
1109 zeta2 = _mm_mul_ps(beta2,rsq12);
1110 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1111 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1112 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1113 felec = _mm_mul_ps(qq12,felec);
1114 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1115 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1116 velec = _mm_mul_ps(qq12,velec);
1118 d = _mm_sub_ps(r12,rswitch);
1119 d = _mm_max_ps(d,_mm_setzero_ps());
1120 d2 = _mm_mul_ps(d,d);
1121 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1123 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1125 /* Evaluate switch function */
1126 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1127 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1128 velec = _mm_mul_ps(velec,sw);
1129 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1131 /* Update potential sum for this i atom from the interaction with this j atom. */
1132 velec = _mm_and_ps(velec,cutoff_mask);
1133 velec = _mm_andnot_ps(dummy_mask,velec);
1134 velecsum = _mm_add_ps(velecsum,velec);
1138 fscal = _mm_and_ps(fscal,cutoff_mask);
1140 fscal = _mm_andnot_ps(dummy_mask,fscal);
1142 /* Update vectorial force */
1143 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1144 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1145 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1147 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1148 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1149 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1153 /**************************
1154 * CALCULATE INTERACTIONS *
1155 **************************/
1157 if (gmx_mm_any_lt(rsq13,rcutoff2))
1160 r13 = _mm_mul_ps(rsq13,rinv13);
1161 r13 = _mm_andnot_ps(dummy_mask,r13);
1163 /* EWALD ELECTROSTATICS */
1165 /* Analytical PME correction */
1166 zeta2 = _mm_mul_ps(beta2,rsq13);
1167 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
1168 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1169 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1170 felec = _mm_mul_ps(qq13,felec);
1171 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1172 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
1173 velec = _mm_mul_ps(qq13,velec);
1175 d = _mm_sub_ps(r13,rswitch);
1176 d = _mm_max_ps(d,_mm_setzero_ps());
1177 d2 = _mm_mul_ps(d,d);
1178 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1180 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1182 /* Evaluate switch function */
1183 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1184 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1185 velec = _mm_mul_ps(velec,sw);
1186 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1188 /* Update potential sum for this i atom from the interaction with this j atom. */
1189 velec = _mm_and_ps(velec,cutoff_mask);
1190 velec = _mm_andnot_ps(dummy_mask,velec);
1191 velecsum = _mm_add_ps(velecsum,velec);
1195 fscal = _mm_and_ps(fscal,cutoff_mask);
1197 fscal = _mm_andnot_ps(dummy_mask,fscal);
1199 /* Update vectorial force */
1200 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1201 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1202 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1204 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1205 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1206 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1210 /**************************
1211 * CALCULATE INTERACTIONS *
1212 **************************/
1214 if (gmx_mm_any_lt(rsq21,rcutoff2))
1217 r21 = _mm_mul_ps(rsq21,rinv21);
1218 r21 = _mm_andnot_ps(dummy_mask,r21);
1220 /* EWALD ELECTROSTATICS */
1222 /* Analytical PME correction */
1223 zeta2 = _mm_mul_ps(beta2,rsq21);
1224 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1225 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1226 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1227 felec = _mm_mul_ps(qq21,felec);
1228 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1229 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1230 velec = _mm_mul_ps(qq21,velec);
1232 d = _mm_sub_ps(r21,rswitch);
1233 d = _mm_max_ps(d,_mm_setzero_ps());
1234 d2 = _mm_mul_ps(d,d);
1235 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1237 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1239 /* Evaluate switch function */
1240 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1241 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1242 velec = _mm_mul_ps(velec,sw);
1243 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1245 /* Update potential sum for this i atom from the interaction with this j atom. */
1246 velec = _mm_and_ps(velec,cutoff_mask);
1247 velec = _mm_andnot_ps(dummy_mask,velec);
1248 velecsum = _mm_add_ps(velecsum,velec);
1252 fscal = _mm_and_ps(fscal,cutoff_mask);
1254 fscal = _mm_andnot_ps(dummy_mask,fscal);
1256 /* Update vectorial force */
1257 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1258 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1259 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1261 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1262 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1263 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1267 /**************************
1268 * CALCULATE INTERACTIONS *
1269 **************************/
1271 if (gmx_mm_any_lt(rsq22,rcutoff2))
1274 r22 = _mm_mul_ps(rsq22,rinv22);
1275 r22 = _mm_andnot_ps(dummy_mask,r22);
1277 /* EWALD ELECTROSTATICS */
1279 /* Analytical PME correction */
1280 zeta2 = _mm_mul_ps(beta2,rsq22);
1281 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1282 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1283 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1284 felec = _mm_mul_ps(qq22,felec);
1285 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1286 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1287 velec = _mm_mul_ps(qq22,velec);
1289 d = _mm_sub_ps(r22,rswitch);
1290 d = _mm_max_ps(d,_mm_setzero_ps());
1291 d2 = _mm_mul_ps(d,d);
1292 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1294 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1296 /* Evaluate switch function */
1297 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1298 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1299 velec = _mm_mul_ps(velec,sw);
1300 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1302 /* Update potential sum for this i atom from the interaction with this j atom. */
1303 velec = _mm_and_ps(velec,cutoff_mask);
1304 velec = _mm_andnot_ps(dummy_mask,velec);
1305 velecsum = _mm_add_ps(velecsum,velec);
1309 fscal = _mm_and_ps(fscal,cutoff_mask);
1311 fscal = _mm_andnot_ps(dummy_mask,fscal);
1313 /* Update vectorial force */
1314 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1315 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1316 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1318 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1319 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1320 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1324 /**************************
1325 * CALCULATE INTERACTIONS *
1326 **************************/
1328 if (gmx_mm_any_lt(rsq23,rcutoff2))
1331 r23 = _mm_mul_ps(rsq23,rinv23);
1332 r23 = _mm_andnot_ps(dummy_mask,r23);
1334 /* EWALD ELECTROSTATICS */
1336 /* Analytical PME correction */
1337 zeta2 = _mm_mul_ps(beta2,rsq23);
1338 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
1339 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1340 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1341 felec = _mm_mul_ps(qq23,felec);
1342 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1343 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
1344 velec = _mm_mul_ps(qq23,velec);
1346 d = _mm_sub_ps(r23,rswitch);
1347 d = _mm_max_ps(d,_mm_setzero_ps());
1348 d2 = _mm_mul_ps(d,d);
1349 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1351 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1353 /* Evaluate switch function */
1354 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1355 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1356 velec = _mm_mul_ps(velec,sw);
1357 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1359 /* Update potential sum for this i atom from the interaction with this j atom. */
1360 velec = _mm_and_ps(velec,cutoff_mask);
1361 velec = _mm_andnot_ps(dummy_mask,velec);
1362 velecsum = _mm_add_ps(velecsum,velec);
1366 fscal = _mm_and_ps(fscal,cutoff_mask);
1368 fscal = _mm_andnot_ps(dummy_mask,fscal);
1370 /* Update vectorial force */
1371 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1372 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1373 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1375 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1376 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1377 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1381 /**************************
1382 * CALCULATE INTERACTIONS *
1383 **************************/
1385 if (gmx_mm_any_lt(rsq31,rcutoff2))
1388 r31 = _mm_mul_ps(rsq31,rinv31);
1389 r31 = _mm_andnot_ps(dummy_mask,r31);
1391 /* EWALD ELECTROSTATICS */
1393 /* Analytical PME correction */
1394 zeta2 = _mm_mul_ps(beta2,rsq31);
1395 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
1396 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1397 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1398 felec = _mm_mul_ps(qq31,felec);
1399 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1400 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
1401 velec = _mm_mul_ps(qq31,velec);
1403 d = _mm_sub_ps(r31,rswitch);
1404 d = _mm_max_ps(d,_mm_setzero_ps());
1405 d2 = _mm_mul_ps(d,d);
1406 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1408 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1410 /* Evaluate switch function */
1411 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1412 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1413 velec = _mm_mul_ps(velec,sw);
1414 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1416 /* Update potential sum for this i atom from the interaction with this j atom. */
1417 velec = _mm_and_ps(velec,cutoff_mask);
1418 velec = _mm_andnot_ps(dummy_mask,velec);
1419 velecsum = _mm_add_ps(velecsum,velec);
1423 fscal = _mm_and_ps(fscal,cutoff_mask);
1425 fscal = _mm_andnot_ps(dummy_mask,fscal);
1427 /* Update vectorial force */
1428 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1429 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1430 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1432 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1433 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1434 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1438 /**************************
1439 * CALCULATE INTERACTIONS *
1440 **************************/
1442 if (gmx_mm_any_lt(rsq32,rcutoff2))
1445 r32 = _mm_mul_ps(rsq32,rinv32);
1446 r32 = _mm_andnot_ps(dummy_mask,r32);
1448 /* EWALD ELECTROSTATICS */
1450 /* Analytical PME correction */
1451 zeta2 = _mm_mul_ps(beta2,rsq32);
1452 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
1453 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1454 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1455 felec = _mm_mul_ps(qq32,felec);
1456 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1457 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
1458 velec = _mm_mul_ps(qq32,velec);
1460 d = _mm_sub_ps(r32,rswitch);
1461 d = _mm_max_ps(d,_mm_setzero_ps());
1462 d2 = _mm_mul_ps(d,d);
1463 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1465 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1467 /* Evaluate switch function */
1468 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1469 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1470 velec = _mm_mul_ps(velec,sw);
1471 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1473 /* Update potential sum for this i atom from the interaction with this j atom. */
1474 velec = _mm_and_ps(velec,cutoff_mask);
1475 velec = _mm_andnot_ps(dummy_mask,velec);
1476 velecsum = _mm_add_ps(velecsum,velec);
1480 fscal = _mm_and_ps(fscal,cutoff_mask);
1482 fscal = _mm_andnot_ps(dummy_mask,fscal);
1484 /* Update vectorial force */
1485 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1486 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1487 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1489 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1490 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1491 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1495 /**************************
1496 * CALCULATE INTERACTIONS *
1497 **************************/
1499 if (gmx_mm_any_lt(rsq33,rcutoff2))
1502 r33 = _mm_mul_ps(rsq33,rinv33);
1503 r33 = _mm_andnot_ps(dummy_mask,r33);
1505 /* EWALD ELECTROSTATICS */
1507 /* Analytical PME correction */
1508 zeta2 = _mm_mul_ps(beta2,rsq33);
1509 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
1510 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1511 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1512 felec = _mm_mul_ps(qq33,felec);
1513 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1514 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
1515 velec = _mm_mul_ps(qq33,velec);
1517 d = _mm_sub_ps(r33,rswitch);
1518 d = _mm_max_ps(d,_mm_setzero_ps());
1519 d2 = _mm_mul_ps(d,d);
1520 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1522 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1524 /* Evaluate switch function */
1525 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1526 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1527 velec = _mm_mul_ps(velec,sw);
1528 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1530 /* Update potential sum for this i atom from the interaction with this j atom. */
1531 velec = _mm_and_ps(velec,cutoff_mask);
1532 velec = _mm_andnot_ps(dummy_mask,velec);
1533 velecsum = _mm_add_ps(velecsum,velec);
1537 fscal = _mm_and_ps(fscal,cutoff_mask);
1539 fscal = _mm_andnot_ps(dummy_mask,fscal);
1541 /* Update vectorial force */
1542 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1543 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1544 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1546 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1547 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1548 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1552 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1553 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1554 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1555 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1557 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1558 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1559 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1561 /* Inner loop uses 552 flops */
1564 /* End of innermost loop */
1566 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1567 f+i_coord_offset,fshift+i_shift_offset);
1570 /* Update potential energies */
1571 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1572 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1574 /* Increment number of inner iterations */
1575 inneriter += j_index_end - j_index_start;
1577 /* Outer loop uses 26 flops */
1580 /* Increment number of outer iterations */
1583 /* Update outer/inner flops */
1585 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*552);
1588 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_single
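1589 * Electrostatics interaction: Ewald (potential switched between rcoulomb_switch and the cutoff)
1590 * VdW interaction: LennardJones (potential switched between rcoulomb_switch and the cutoff)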
1591 * Geometry: Water4-Water4
1592 * Calculate force/pot: Force
1595 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_avx_128_fma_single
1596 (t_nblist * gmx_restrict nlist,
1597 rvec * gmx_restrict xx,
1598 rvec * gmx_restrict ff,
1599 t_forcerec * gmx_restrict fr,
1600 t_mdatoms * gmx_restrict mdatoms,
1601 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1602 t_nrnb * gmx_restrict nrnb)
1604 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1605 * just 0 for non-waters.
1606 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1607 * jnr indices corresponding to data put in the four positions in the SIMD register.
1609 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1610 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1611 int jnrA,jnrB,jnrC,jnrD;
1612 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1613 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1614 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1615 real rcutoff_scalar;
1616 real *shiftvec,*fshift,*x,*f;
1617 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1618 real scratch[4*DIM];
1619 __m128 fscal,rcutoff,rcutoff2,jidxall;
1621 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1623 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1625 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1627 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1628 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1629 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1630 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1631 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1632 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1633 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1634 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1635 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1636 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1637 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1638 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1639 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1640 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1641 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1642 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1643 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1644 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1645 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1646 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1649 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1652 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1653 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1655 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1656 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1658 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1659 real rswitch_scalar,d_scalar;
1660 __m128 dummy_mask,cutoff_mask;
1661 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1662 __m128 one = _mm_set1_ps(1.0);
1663 __m128 two = _mm_set1_ps(2.0);
1669 jindex = nlist->jindex;
1671 shiftidx = nlist->shift;
1673 shiftvec = fr->shift_vec[0];
1674 fshift = fr->fshift[0];
1675 facel = _mm_set1_ps(fr->epsfac);
1676 charge = mdatoms->chargeA;
1677 nvdwtype = fr->ntype;
1678 vdwparam = fr->nbfp;
1679 vdwtype = mdatoms->typeA;
1681 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1682 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1683 beta2 = _mm_mul_ps(beta,beta);
1684 beta3 = _mm_mul_ps(beta,beta2);
1685 ewtab = fr->ic->tabq_coul_FDV0;
1686 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1687 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1689 /* Setup water-specific parameters */
1690 inr = nlist->iinr[0];
1691 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1692 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1693 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1694 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1696 jq1 = _mm_set1_ps(charge[inr+1]);
1697 jq2 = _mm_set1_ps(charge[inr+2]);
1698 jq3 = _mm_set1_ps(charge[inr+3]);
1699 vdwjidx0A = 2*vdwtype[inr+0];
1700 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1701 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1702 qq11 = _mm_mul_ps(iq1,jq1);
1703 qq12 = _mm_mul_ps(iq1,jq2);
1704 qq13 = _mm_mul_ps(iq1,jq3);
1705 qq21 = _mm_mul_ps(iq2,jq1);
1706 qq22 = _mm_mul_ps(iq2,jq2);
1707 qq23 = _mm_mul_ps(iq2,jq3);
1708 qq31 = _mm_mul_ps(iq3,jq1);
1709 qq32 = _mm_mul_ps(iq3,jq2);
1710 qq33 = _mm_mul_ps(iq3,jq3);
1712 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1713 rcutoff_scalar = fr->rcoulomb;
1714 rcutoff = _mm_set1_ps(rcutoff_scalar);
1715 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1717 rswitch_scalar = fr->rcoulomb_switch;
1718 rswitch = _mm_set1_ps(rswitch_scalar);
1719 /* Setup switch parameters */
1720 d_scalar = rcutoff_scalar-rswitch_scalar;
1721 d = _mm_set1_ps(d_scalar);
1722 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1723 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1724 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1725 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1726 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1727 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1729 /* Avoid stupid compiler warnings */
1730 jnrA = jnrB = jnrC = jnrD = 0;
1731 j_coord_offsetA = 0;
1732 j_coord_offsetB = 0;
1733 j_coord_offsetC = 0;
1734 j_coord_offsetD = 0;
1739 for(iidx=0;iidx<4*DIM;iidx++)
1741 scratch[iidx] = 0.0;
1744 /* Start outer loop over neighborlists */
1745 for(iidx=0; iidx<nri; iidx++)
1747 /* Load shift vector for this list */
1748 i_shift_offset = DIM*shiftidx[iidx];
1750 /* Load limits for loop over neighbors */
1751 j_index_start = jindex[iidx];
1752 j_index_end = jindex[iidx+1];
1754 /* Get outer coordinate index */
1756 i_coord_offset = DIM*inr;
1758 /* Load i particle coords and add shift vector */
1759 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1760 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1762 fix0 = _mm_setzero_ps();
1763 fiy0 = _mm_setzero_ps();
1764 fiz0 = _mm_setzero_ps();
1765 fix1 = _mm_setzero_ps();
1766 fiy1 = _mm_setzero_ps();
1767 fiz1 = _mm_setzero_ps();
1768 fix2 = _mm_setzero_ps();
1769 fiy2 = _mm_setzero_ps();
1770 fiz2 = _mm_setzero_ps();
1771 fix3 = _mm_setzero_ps();
1772 fiy3 = _mm_setzero_ps();
1773 fiz3 = _mm_setzero_ps();
1775 /* Start inner kernel loop */
1776 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1779 /* Get j neighbor index, and coordinate index */
1781 jnrB = jjnr[jidx+1];
1782 jnrC = jjnr[jidx+2];
1783 jnrD = jjnr[jidx+3];
1784 j_coord_offsetA = DIM*jnrA;
1785 j_coord_offsetB = DIM*jnrB;
1786 j_coord_offsetC = DIM*jnrC;
1787 j_coord_offsetD = DIM*jnrD;
1789 /* load j atom coordinates */
1790 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1791 x+j_coord_offsetC,x+j_coord_offsetD,
1792 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1793 &jy2,&jz2,&jx3,&jy3,&jz3);
1795 /* Calculate displacement vector */
1796 dx00 = _mm_sub_ps(ix0,jx0);
1797 dy00 = _mm_sub_ps(iy0,jy0);
1798 dz00 = _mm_sub_ps(iz0,jz0);
1799 dx11 = _mm_sub_ps(ix1,jx1);
1800 dy11 = _mm_sub_ps(iy1,jy1);
1801 dz11 = _mm_sub_ps(iz1,jz1);
1802 dx12 = _mm_sub_ps(ix1,jx2);
1803 dy12 = _mm_sub_ps(iy1,jy2);
1804 dz12 = _mm_sub_ps(iz1,jz2);
1805 dx13 = _mm_sub_ps(ix1,jx3);
1806 dy13 = _mm_sub_ps(iy1,jy3);
1807 dz13 = _mm_sub_ps(iz1,jz3);
1808 dx21 = _mm_sub_ps(ix2,jx1);
1809 dy21 = _mm_sub_ps(iy2,jy1);
1810 dz21 = _mm_sub_ps(iz2,jz1);
1811 dx22 = _mm_sub_ps(ix2,jx2);
1812 dy22 = _mm_sub_ps(iy2,jy2);
1813 dz22 = _mm_sub_ps(iz2,jz2);
1814 dx23 = _mm_sub_ps(ix2,jx3);
1815 dy23 = _mm_sub_ps(iy2,jy3);
1816 dz23 = _mm_sub_ps(iz2,jz3);
1817 dx31 = _mm_sub_ps(ix3,jx1);
1818 dy31 = _mm_sub_ps(iy3,jy1);
1819 dz31 = _mm_sub_ps(iz3,jz1);
1820 dx32 = _mm_sub_ps(ix3,jx2);
1821 dy32 = _mm_sub_ps(iy3,jy2);
1822 dz32 = _mm_sub_ps(iz3,jz2);
1823 dx33 = _mm_sub_ps(ix3,jx3);
1824 dy33 = _mm_sub_ps(iy3,jy3);
1825 dz33 = _mm_sub_ps(iz3,jz3);
1827 /* Calculate squared distance and things based on it */
1828 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1829 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1830 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1831 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1832 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1833 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1834 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1835 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1836 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1837 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1839 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1840 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1841 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1842 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1843 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1844 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1845 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1846 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1847 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1848 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1850 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1851 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1852 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1853 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1854 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1855 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1856 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1857 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1858 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1859 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1861 fjx0 = _mm_setzero_ps();
1862 fjy0 = _mm_setzero_ps();
1863 fjz0 = _mm_setzero_ps();
1864 fjx1 = _mm_setzero_ps();
1865 fjy1 = _mm_setzero_ps();
1866 fjz1 = _mm_setzero_ps();
1867 fjx2 = _mm_setzero_ps();
1868 fjy2 = _mm_setzero_ps();
1869 fjz2 = _mm_setzero_ps();
1870 fjx3 = _mm_setzero_ps();
1871 fjy3 = _mm_setzero_ps();
1872 fjz3 = _mm_setzero_ps();
1874 /**************************
1875 * CALCULATE INTERACTIONS *
1876 **************************/
1878 if (gmx_mm_any_lt(rsq00,rcutoff2))
1881 r00 = _mm_mul_ps(rsq00,rinv00);
1883 /* LENNARD-JONES DISPERSION/REPULSION */
1885 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1886 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1887 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1888 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1889 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1891 d = _mm_sub_ps(r00,rswitch);
1892 d = _mm_max_ps(d,_mm_setzero_ps());
1893 d2 = _mm_mul_ps(d,d);
1894 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1896 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1898 /* Evaluate switch function */
1899 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1900 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1901 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1905 fscal = _mm_and_ps(fscal,cutoff_mask);
1907 /* Update vectorial force */
1908 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1909 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1910 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1912 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1913 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1914 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1918 /**************************
1919 * CALCULATE INTERACTIONS *
1920 **************************/
1922 if (gmx_mm_any_lt(rsq11,rcutoff2))
1925 r11 = _mm_mul_ps(rsq11,rinv11);
1927 /* EWALD ELECTROSTATICS */
1929 /* Analytical PME correction */
1930 zeta2 = _mm_mul_ps(beta2,rsq11);
1931 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1932 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1933 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1934 felec = _mm_mul_ps(qq11,felec);
1935 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1936 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1937 velec = _mm_mul_ps(qq11,velec);
1939 d = _mm_sub_ps(r11,rswitch);
1940 d = _mm_max_ps(d,_mm_setzero_ps());
1941 d2 = _mm_mul_ps(d,d);
1942 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1944 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1946 /* Evaluate switch function */
1947 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1948 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1949 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1953 fscal = _mm_and_ps(fscal,cutoff_mask);
1955 /* Update vectorial force */
1956 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1957 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1958 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1960 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1961 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1962 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1966 /**************************
1967 * CALCULATE INTERACTIONS *
1968 **************************/
1970 if (gmx_mm_any_lt(rsq12,rcutoff2))
1973 r12 = _mm_mul_ps(rsq12,rinv12);
1975 /* EWALD ELECTROSTATICS */
1977 /* Analytical PME correction */
1978 zeta2 = _mm_mul_ps(beta2,rsq12);
1979 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1980 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1981 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1982 felec = _mm_mul_ps(qq12,felec);
1983 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1984 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1985 velec = _mm_mul_ps(qq12,velec);
1987 d = _mm_sub_ps(r12,rswitch);
1988 d = _mm_max_ps(d,_mm_setzero_ps());
1989 d2 = _mm_mul_ps(d,d);
1990 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1992 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1994 /* Evaluate switch function */
1995 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1996 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1997 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2001 fscal = _mm_and_ps(fscal,cutoff_mask);
2003 /* Update vectorial force */
2004 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2005 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2006 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2008 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2009 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2010 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2014 /**************************
2015 * CALCULATE INTERACTIONS *
2016 **************************/
2018 if (gmx_mm_any_lt(rsq13,rcutoff2))
2021 r13 = _mm_mul_ps(rsq13,rinv13);
2023 /* EWALD ELECTROSTATICS */
2025 /* Analytical PME correction */
2026 zeta2 = _mm_mul_ps(beta2,rsq13);
2027 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
2028 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2029 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2030 felec = _mm_mul_ps(qq13,felec);
2031 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2032 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
2033 velec = _mm_mul_ps(qq13,velec);
2035 d = _mm_sub_ps(r13,rswitch);
2036 d = _mm_max_ps(d,_mm_setzero_ps());
2037 d2 = _mm_mul_ps(d,d);
2038 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2040 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2042 /* Evaluate switch function */
2043 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2044 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2045 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2049 fscal = _mm_and_ps(fscal,cutoff_mask);
2051 /* Update vectorial force */
2052 fix1 = _mm_macc_ps(dx13,fscal,fix1);
2053 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
2054 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
2056 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
2057 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
2058 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
2062 /**************************
2063 * CALCULATE INTERACTIONS *
2064 **************************/
2066 if (gmx_mm_any_lt(rsq21,rcutoff2))
2069 r21 = _mm_mul_ps(rsq21,rinv21);
2071 /* EWALD ELECTROSTATICS */
2073 /* Analytical PME correction */
2074 zeta2 = _mm_mul_ps(beta2,rsq21);
2075 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2076 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2077 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2078 felec = _mm_mul_ps(qq21,felec);
2079 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2080 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2081 velec = _mm_mul_ps(qq21,velec);
2083 d = _mm_sub_ps(r21,rswitch);
2084 d = _mm_max_ps(d,_mm_setzero_ps());
2085 d2 = _mm_mul_ps(d,d);
2086 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2088 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2090 /* Evaluate switch function */
2091 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2092 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2093 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2097 fscal = _mm_and_ps(fscal,cutoff_mask);
2099 /* Update vectorial force */
2100 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2101 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2102 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2104 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2105 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2106 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2110 /**************************
2111 * CALCULATE INTERACTIONS *
2112 **************************/
2114 if (gmx_mm_any_lt(rsq22,rcutoff2))
2117 r22 = _mm_mul_ps(rsq22,rinv22);
2119 /* EWALD ELECTROSTATICS */
2121 /* Analytical PME correction */
2122 zeta2 = _mm_mul_ps(beta2,rsq22);
2123 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2124 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2125 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2126 felec = _mm_mul_ps(qq22,felec);
2127 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2128 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2129 velec = _mm_mul_ps(qq22,velec);
2131 d = _mm_sub_ps(r22,rswitch);
2132 d = _mm_max_ps(d,_mm_setzero_ps());
2133 d2 = _mm_mul_ps(d,d);
2134 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2136 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2138 /* Evaluate switch function */
2139 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2140 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2141 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2145 fscal = _mm_and_ps(fscal,cutoff_mask);
2147 /* Update vectorial force */
2148 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2149 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2150 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2152 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2153 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2154 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2158 /**************************
2159 * CALCULATE INTERACTIONS *
2160 **************************/
2162 if (gmx_mm_any_lt(rsq23,rcutoff2))
2165 r23 = _mm_mul_ps(rsq23,rinv23);
2167 /* EWALD ELECTROSTATICS */
2169 /* Analytical PME correction */
2170 zeta2 = _mm_mul_ps(beta2,rsq23);
2171 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
2172 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2173 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2174 felec = _mm_mul_ps(qq23,felec);
2175 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2176 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
2177 velec = _mm_mul_ps(qq23,velec);
2179 d = _mm_sub_ps(r23,rswitch);
2180 d = _mm_max_ps(d,_mm_setzero_ps());
2181 d2 = _mm_mul_ps(d,d);
2182 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2184 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2186 /* Evaluate switch function */
2187 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2188 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2189 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2193 fscal = _mm_and_ps(fscal,cutoff_mask);
2195 /* Update vectorial force */
2196 fix2 = _mm_macc_ps(dx23,fscal,fix2);
2197 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
2198 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
2200 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
2201 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
2202 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
2206 /**************************
2207 * CALCULATE INTERACTIONS *
2208 **************************/
2210 if (gmx_mm_any_lt(rsq31,rcutoff2))
2213 r31 = _mm_mul_ps(rsq31,rinv31);
2215 /* EWALD ELECTROSTATICS */
2217 /* Analytical PME correction */
2218 zeta2 = _mm_mul_ps(beta2,rsq31);
2219 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
2220 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2221 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2222 felec = _mm_mul_ps(qq31,felec);
2223 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2224 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
2225 velec = _mm_mul_ps(qq31,velec);
2227 d = _mm_sub_ps(r31,rswitch);
2228 d = _mm_max_ps(d,_mm_setzero_ps());
2229 d2 = _mm_mul_ps(d,d);
2230 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2232 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2234 /* Evaluate switch function */
2235 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2236 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2237 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2241 fscal = _mm_and_ps(fscal,cutoff_mask);
2243 /* Update vectorial force */
2244 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2245 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2246 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2248 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2249 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2250 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2254 /**************************
2255 * CALCULATE INTERACTIONS *
2256 **************************/
2258 if (gmx_mm_any_lt(rsq32,rcutoff2))
2261 r32 = _mm_mul_ps(rsq32,rinv32);
2263 /* EWALD ELECTROSTATICS */
2265 /* Analytical PME correction */
2266 zeta2 = _mm_mul_ps(beta2,rsq32);
2267 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
2268 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2269 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2270 felec = _mm_mul_ps(qq32,felec);
2271 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2272 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
2273 velec = _mm_mul_ps(qq32,velec);
2275 d = _mm_sub_ps(r32,rswitch);
2276 d = _mm_max_ps(d,_mm_setzero_ps());
2277 d2 = _mm_mul_ps(d,d);
2278 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2280 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2282 /* Evaluate switch function */
2283 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2284 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2285 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2289 fscal = _mm_and_ps(fscal,cutoff_mask);
2291 /* Update vectorial force */
2292 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2293 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2294 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2296 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2297 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2298 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2302 /**************************
2303 * CALCULATE INTERACTIONS *
2304 **************************/
2306 if (gmx_mm_any_lt(rsq33,rcutoff2))
2309 r33 = _mm_mul_ps(rsq33,rinv33);
2311 /* EWALD ELECTROSTATICS */
2313 /* Analytical PME correction */
2314 zeta2 = _mm_mul_ps(beta2,rsq33);
2315 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
2316 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2317 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2318 felec = _mm_mul_ps(qq33,felec);
2319 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2320 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
2321 velec = _mm_mul_ps(qq33,velec);
2323 d = _mm_sub_ps(r33,rswitch);
2324 d = _mm_max_ps(d,_mm_setzero_ps());
2325 d2 = _mm_mul_ps(d,d);
2326 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2328 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2330 /* Evaluate switch function */
2331 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2332 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2333 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2337 fscal = _mm_and_ps(fscal,cutoff_mask);
2339 /* Update vectorial force */
2340 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2341 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2342 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2344 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2345 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2346 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2350 fjptrA = f+j_coord_offsetA;
2351 fjptrB = f+j_coord_offsetB;
2352 fjptrC = f+j_coord_offsetC;
2353 fjptrD = f+j_coord_offsetD;
2355 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2356 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2357 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2359 /* Inner loop uses 512 flops */
2362 if(jidx<j_index_end)
2365 /* Get j neighbor index, and coordinate index */
2366 jnrlistA = jjnr[jidx];
2367 jnrlistB = jjnr[jidx+1];
2368 jnrlistC = jjnr[jidx+2];
2369 jnrlistD = jjnr[jidx+3];
2370 /* Sign of each element will be negative for non-real atoms.
2371 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2372 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2374 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2375 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2376 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2377 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2378 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2379 j_coord_offsetA = DIM*jnrA;
2380 j_coord_offsetB = DIM*jnrB;
2381 j_coord_offsetC = DIM*jnrC;
2382 j_coord_offsetD = DIM*jnrD;
2384 /* load j atom coordinates */
2385 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2386 x+j_coord_offsetC,x+j_coord_offsetD,
2387 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2388 &jy2,&jz2,&jx3,&jy3,&jz3);
2390 /* Calculate displacement vector */
2391 dx00 = _mm_sub_ps(ix0,jx0);
2392 dy00 = _mm_sub_ps(iy0,jy0);
2393 dz00 = _mm_sub_ps(iz0,jz0);
2394 dx11 = _mm_sub_ps(ix1,jx1);
2395 dy11 = _mm_sub_ps(iy1,jy1);
2396 dz11 = _mm_sub_ps(iz1,jz1);
2397 dx12 = _mm_sub_ps(ix1,jx2);
2398 dy12 = _mm_sub_ps(iy1,jy2);
2399 dz12 = _mm_sub_ps(iz1,jz2);
2400 dx13 = _mm_sub_ps(ix1,jx3);
2401 dy13 = _mm_sub_ps(iy1,jy3);
2402 dz13 = _mm_sub_ps(iz1,jz3);
2403 dx21 = _mm_sub_ps(ix2,jx1);
2404 dy21 = _mm_sub_ps(iy2,jy1);
2405 dz21 = _mm_sub_ps(iz2,jz1);
2406 dx22 = _mm_sub_ps(ix2,jx2);
2407 dy22 = _mm_sub_ps(iy2,jy2);
2408 dz22 = _mm_sub_ps(iz2,jz2);
2409 dx23 = _mm_sub_ps(ix2,jx3);
2410 dy23 = _mm_sub_ps(iy2,jy3);
2411 dz23 = _mm_sub_ps(iz2,jz3);
2412 dx31 = _mm_sub_ps(ix3,jx1);
2413 dy31 = _mm_sub_ps(iy3,jy1);
2414 dz31 = _mm_sub_ps(iz3,jz1);
2415 dx32 = _mm_sub_ps(ix3,jx2);
2416 dy32 = _mm_sub_ps(iy3,jy2);
2417 dz32 = _mm_sub_ps(iz3,jz2);
2418 dx33 = _mm_sub_ps(ix3,jx3);
2419 dy33 = _mm_sub_ps(iy3,jy3);
2420 dz33 = _mm_sub_ps(iz3,jz3);
2422 /* Calculate squared distance and things based on it */
2423 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2424 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2425 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2426 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2427 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2428 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2429 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2430 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2431 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2432 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2434 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2435 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2436 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2437 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2438 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2439 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2440 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2441 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2442 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2443 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2445 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2446 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2447 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2448 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2449 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2450 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2451 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2452 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2453 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2454 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2456 fjx0 = _mm_setzero_ps();
2457 fjy0 = _mm_setzero_ps();
2458 fjz0 = _mm_setzero_ps();
2459 fjx1 = _mm_setzero_ps();
2460 fjy1 = _mm_setzero_ps();
2461 fjz1 = _mm_setzero_ps();
2462 fjx2 = _mm_setzero_ps();
2463 fjy2 = _mm_setzero_ps();
2464 fjz2 = _mm_setzero_ps();
2465 fjx3 = _mm_setzero_ps();
2466 fjy3 = _mm_setzero_ps();
2467 fjz3 = _mm_setzero_ps();
2469 /**************************
2470 * CALCULATE INTERACTIONS *
2471 **************************/
2473 if (gmx_mm_any_lt(rsq00,rcutoff2))
2476 r00 = _mm_mul_ps(rsq00,rinv00);
2477 r00 = _mm_andnot_ps(dummy_mask,r00);
2479 /* LENNARD-JONES DISPERSION/REPULSION */
2481 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2482 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2483 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2484 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
2485 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2487 d = _mm_sub_ps(r00,rswitch);
2488 d = _mm_max_ps(d,_mm_setzero_ps());
2489 d2 = _mm_mul_ps(d,d);
2490 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2492 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2494 /* Evaluate switch function */
2495 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2496 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2497 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2501 fscal = _mm_and_ps(fscal,cutoff_mask);
2503 fscal = _mm_andnot_ps(dummy_mask,fscal);
2505 /* Update vectorial force */
2506 fix0 = _mm_macc_ps(dx00,fscal,fix0);
2507 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
2508 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
2510 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
2511 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
2512 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
2516 /**************************
2517 * CALCULATE INTERACTIONS *
2518 **************************/
2520 if (gmx_mm_any_lt(rsq11,rcutoff2))
2523 r11 = _mm_mul_ps(rsq11,rinv11);
2524 r11 = _mm_andnot_ps(dummy_mask,r11);
2526 /* EWALD ELECTROSTATICS */
2528 /* Analytical PME correction */
2529 zeta2 = _mm_mul_ps(beta2,rsq11);
2530 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
2531 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2532 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2533 felec = _mm_mul_ps(qq11,felec);
2534 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2535 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
2536 velec = _mm_mul_ps(qq11,velec);
2538 d = _mm_sub_ps(r11,rswitch);
2539 d = _mm_max_ps(d,_mm_setzero_ps());
2540 d2 = _mm_mul_ps(d,d);
2541 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2543 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2545 /* Evaluate switch function */
2546 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2547 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2548 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2552 fscal = _mm_and_ps(fscal,cutoff_mask);
2554 fscal = _mm_andnot_ps(dummy_mask,fscal);
2556 /* Update vectorial force */
2557 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2558 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2559 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2561 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2562 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2563 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2567 /**************************
2568 * CALCULATE INTERACTIONS *
2569 **************************/
2571 if (gmx_mm_any_lt(rsq12,rcutoff2))
2574 r12 = _mm_mul_ps(rsq12,rinv12);
2575 r12 = _mm_andnot_ps(dummy_mask,r12);
2577 /* EWALD ELECTROSTATICS */
2579 /* Analytical PME correction */
2580 zeta2 = _mm_mul_ps(beta2,rsq12);
2581 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
2582 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2583 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2584 felec = _mm_mul_ps(qq12,felec);
2585 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2586 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2587 velec = _mm_mul_ps(qq12,velec);
2589 d = _mm_sub_ps(r12,rswitch);
2590 d = _mm_max_ps(d,_mm_setzero_ps());
2591 d2 = _mm_mul_ps(d,d);
2592 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2594 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2596 /* Evaluate switch function */
2597 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2598 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2599 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2603 fscal = _mm_and_ps(fscal,cutoff_mask);
2605 fscal = _mm_andnot_ps(dummy_mask,fscal);
2607 /* Update vectorial force */
2608 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2609 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2610 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2612 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2613 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2614 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2618 /**************************
2619 * CALCULATE INTERACTIONS *
2620 **************************/
2622 if (gmx_mm_any_lt(rsq13,rcutoff2))
2625 r13 = _mm_mul_ps(rsq13,rinv13);
2626 r13 = _mm_andnot_ps(dummy_mask,r13);
2628 /* EWALD ELECTROSTATICS */
2630 /* Analytical PME correction */
2631 zeta2 = _mm_mul_ps(beta2,rsq13);
2632 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
2633 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2634 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2635 felec = _mm_mul_ps(qq13,felec);
2636 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2637 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
2638 velec = _mm_mul_ps(qq13,velec);
2640 d = _mm_sub_ps(r13,rswitch);
2641 d = _mm_max_ps(d,_mm_setzero_ps());
2642 d2 = _mm_mul_ps(d,d);
2643 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2645 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2647 /* Evaluate switch function */
2648 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2649 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2650 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2654 fscal = _mm_and_ps(fscal,cutoff_mask);
2656 fscal = _mm_andnot_ps(dummy_mask,fscal);
2658 /* Update vectorial force */
2659 fix1 = _mm_macc_ps(dx13,fscal,fix1);
2660 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
2661 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
2663 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
2664 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
2665 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
2669 /**************************
2670 * CALCULATE INTERACTIONS *
2671 **************************/
2673 if (gmx_mm_any_lt(rsq21,rcutoff2))
2676 r21 = _mm_mul_ps(rsq21,rinv21);
2677 r21 = _mm_andnot_ps(dummy_mask,r21);
2679 /* EWALD ELECTROSTATICS */
2681 /* Analytical PME correction */
2682 zeta2 = _mm_mul_ps(beta2,rsq21);
2683 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2684 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2685 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2686 felec = _mm_mul_ps(qq21,felec);
2687 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2688 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2689 velec = _mm_mul_ps(qq21,velec);
2691 d = _mm_sub_ps(r21,rswitch);
2692 d = _mm_max_ps(d,_mm_setzero_ps());
2693 d2 = _mm_mul_ps(d,d);
2694 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2696 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2698 /* Evaluate switch function */
2699 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2700 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2701 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2705 fscal = _mm_and_ps(fscal,cutoff_mask);
2707 fscal = _mm_andnot_ps(dummy_mask,fscal);
2709 /* Update vectorial force */
2710 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2711 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2712 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2714 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2715 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2716 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2720 /**************************
2721 * CALCULATE INTERACTIONS *
2722 **************************/
2724 if (gmx_mm_any_lt(rsq22,rcutoff2))
2727 r22 = _mm_mul_ps(rsq22,rinv22);
2728 r22 = _mm_andnot_ps(dummy_mask,r22);
2730 /* EWALD ELECTROSTATICS */
2732 /* Analytical PME correction */
2733 zeta2 = _mm_mul_ps(beta2,rsq22);
2734 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2735 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2736 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2737 felec = _mm_mul_ps(qq22,felec);
2738 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2739 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2740 velec = _mm_mul_ps(qq22,velec);
2742 d = _mm_sub_ps(r22,rswitch);
2743 d = _mm_max_ps(d,_mm_setzero_ps());
2744 d2 = _mm_mul_ps(d,d);
2745 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2747 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2749 /* Evaluate switch function */
2750 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2751 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2752 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2756 fscal = _mm_and_ps(fscal,cutoff_mask);
2758 fscal = _mm_andnot_ps(dummy_mask,fscal);
2760 /* Update vectorial force */
2761 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2762 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2763 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2765 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2766 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2767 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2771 /**************************
2772 * CALCULATE INTERACTIONS *
2773 **************************/
2775 if (gmx_mm_any_lt(rsq23,rcutoff2))
2778 r23 = _mm_mul_ps(rsq23,rinv23);
2779 r23 = _mm_andnot_ps(dummy_mask,r23);
2781 /* EWALD ELECTROSTATICS */
2783 /* Analytical PME correction */
2784 zeta2 = _mm_mul_ps(beta2,rsq23);
2785 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
2786 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2787 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2788 felec = _mm_mul_ps(qq23,felec);
2789 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2790 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
2791 velec = _mm_mul_ps(qq23,velec);
2793 d = _mm_sub_ps(r23,rswitch);
2794 d = _mm_max_ps(d,_mm_setzero_ps());
2795 d2 = _mm_mul_ps(d,d);
2796 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2798 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2800 /* Evaluate switch function */
2801 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2802 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2803 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2807 fscal = _mm_and_ps(fscal,cutoff_mask);
2809 fscal = _mm_andnot_ps(dummy_mask,fscal);
2811 /* Update vectorial force */
2812 fix2 = _mm_macc_ps(dx23,fscal,fix2);
2813 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
2814 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
2816 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
2817 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
2818 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
2822 /**************************
2823 * CALCULATE INTERACTIONS *
2824 **************************/
2826 if (gmx_mm_any_lt(rsq31,rcutoff2))
2829 r31 = _mm_mul_ps(rsq31,rinv31);
2830 r31 = _mm_andnot_ps(dummy_mask,r31);
2832 /* EWALD ELECTROSTATICS */
2834 /* Analytical PME correction */
2835 zeta2 = _mm_mul_ps(beta2,rsq31);
2836 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
2837 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2838 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2839 felec = _mm_mul_ps(qq31,felec);
2840 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2841 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
2842 velec = _mm_mul_ps(qq31,velec);
2844 d = _mm_sub_ps(r31,rswitch);
2845 d = _mm_max_ps(d,_mm_setzero_ps());
2846 d2 = _mm_mul_ps(d,d);
2847 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2849 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2851 /* Evaluate switch function */
2852 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2853 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2854 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2858 fscal = _mm_and_ps(fscal,cutoff_mask);
2860 fscal = _mm_andnot_ps(dummy_mask,fscal);
2862 /* Update vectorial force */
2863 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2864 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2865 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2867 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2868 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2869 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2873 /**************************
2874 * CALCULATE INTERACTIONS *
2875 **************************/
2877 if (gmx_mm_any_lt(rsq32,rcutoff2))
2880 r32 = _mm_mul_ps(rsq32,rinv32);
2881 r32 = _mm_andnot_ps(dummy_mask,r32);
2883 /* EWALD ELECTROSTATICS */
2885 /* Analytical PME correction */
2886 zeta2 = _mm_mul_ps(beta2,rsq32);
2887 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
2888 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2889 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2890 felec = _mm_mul_ps(qq32,felec);
2891 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2892 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
2893 velec = _mm_mul_ps(qq32,velec);
2895 d = _mm_sub_ps(r32,rswitch);
2896 d = _mm_max_ps(d,_mm_setzero_ps());
2897 d2 = _mm_mul_ps(d,d);
2898 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2900 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2902 /* Evaluate switch function */
2903 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2904 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2905 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2909 fscal = _mm_and_ps(fscal,cutoff_mask);
2911 fscal = _mm_andnot_ps(dummy_mask,fscal);
2913 /* Update vectorial force */
2914 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2915 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2916 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2918 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2919 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2920 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2924 /**************************
2925 * CALCULATE INTERACTIONS *
2926 **************************/
2928 if (gmx_mm_any_lt(rsq33,rcutoff2))
2931 r33 = _mm_mul_ps(rsq33,rinv33);
2932 r33 = _mm_andnot_ps(dummy_mask,r33);
2934 /* EWALD ELECTROSTATICS */
2936 /* Analytical PME correction */
2937 zeta2 = _mm_mul_ps(beta2,rsq33);
2938 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
2939 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2940 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2941 felec = _mm_mul_ps(qq33,felec);
2942 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2943 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
2944 velec = _mm_mul_ps(qq33,velec);
2946 d = _mm_sub_ps(r33,rswitch);
2947 d = _mm_max_ps(d,_mm_setzero_ps());
2948 d2 = _mm_mul_ps(d,d);
2949 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2951 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2953 /* Evaluate switch function */
2954 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2955 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2956 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2960 fscal = _mm_and_ps(fscal,cutoff_mask);
2962 fscal = _mm_andnot_ps(dummy_mask,fscal);
2964 /* Update vectorial force */
2965 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2966 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2967 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2969 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2970 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2971 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2975 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2976 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2977 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2978 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2980 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2981 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2982 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2984 /* Inner loop uses 522 flops */
2987 /* End of innermost loop */
2989 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2990 f+i_coord_offset,fshift+i_shift_offset);
2992 /* Increment number of inner iterations */
2993 inneriter += j_index_end - j_index_start;
2995 /* Outer loop uses 24 flops */
2998 /* Increment number of outer iterations */
3001 /* Update outer/inner flops */
3003 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*522);