2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_128_fma_single
52 * Electrostatics interaction: ReactionField (with cutoff, RFCut)
53 * VdW interaction: LennardJones (with potential switch, LJSw)
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_128_fma_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
117 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
118 real rswitch_scalar,d_scalar;
119 __m128 dummy_mask,cutoff_mask;
120 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
121 __m128 one = _mm_set1_ps(1.0);
122 __m128 two = _mm_set1_ps(2.0);
128 jindex = nlist->jindex;
130 shiftidx = nlist->shift;
132 shiftvec = fr->shift_vec[0];
133 fshift = fr->fshift[0];
134 facel = _mm_set1_ps(fr->epsfac);
135 charge = mdatoms->chargeA;
136 krf = _mm_set1_ps(fr->ic->k_rf);
137 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
138 crf = _mm_set1_ps(fr->ic->c_rf);
139 nvdwtype = fr->ntype;
141 vdwtype = mdatoms->typeA;
143 /* Setup water-specific parameters */
144 inr = nlist->iinr[0];
145 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
146 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
147 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
148 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
150 jq1 = _mm_set1_ps(charge[inr+1]);
151 jq2 = _mm_set1_ps(charge[inr+2]);
152 jq3 = _mm_set1_ps(charge[inr+3]);
153 vdwjidx0A = 2*vdwtype[inr+0];
154 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
155 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
156 qq11 = _mm_mul_ps(iq1,jq1);
157 qq12 = _mm_mul_ps(iq1,jq2);
158 qq13 = _mm_mul_ps(iq1,jq3);
159 qq21 = _mm_mul_ps(iq2,jq1);
160 qq22 = _mm_mul_ps(iq2,jq2);
161 qq23 = _mm_mul_ps(iq2,jq3);
162 qq31 = _mm_mul_ps(iq3,jq1);
163 qq32 = _mm_mul_ps(iq3,jq2);
164 qq33 = _mm_mul_ps(iq3,jq3);
166 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
167 rcutoff_scalar = fr->rcoulomb;
168 rcutoff = _mm_set1_ps(rcutoff_scalar);
169 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
171 rswitch_scalar = fr->rvdw_switch;
172 rswitch = _mm_set1_ps(rswitch_scalar);
173 /* Setup switch parameters */
174 d_scalar = rcutoff_scalar-rswitch_scalar;
175 d = _mm_set1_ps(d_scalar);
176 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
177 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
178 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
179 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
180 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
181 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
183 /* Avoid stupid compiler warnings */
184 jnrA = jnrB = jnrC = jnrD = 0;
193 for(iidx=0;iidx<4*DIM;iidx++)
198 /* Start outer loop over neighborlists */
199 for(iidx=0; iidx<nri; iidx++)
201 /* Load shift vector for this list */
202 i_shift_offset = DIM*shiftidx[iidx];
204 /* Load limits for loop over neighbors */
205 j_index_start = jindex[iidx];
206 j_index_end = jindex[iidx+1];
208 /* Get outer coordinate index */
210 i_coord_offset = DIM*inr;
212 /* Load i particle coords and add shift vector */
213 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
214 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
216 fix0 = _mm_setzero_ps();
217 fiy0 = _mm_setzero_ps();
218 fiz0 = _mm_setzero_ps();
219 fix1 = _mm_setzero_ps();
220 fiy1 = _mm_setzero_ps();
221 fiz1 = _mm_setzero_ps();
222 fix2 = _mm_setzero_ps();
223 fiy2 = _mm_setzero_ps();
224 fiz2 = _mm_setzero_ps();
225 fix3 = _mm_setzero_ps();
226 fiy3 = _mm_setzero_ps();
227 fiz3 = _mm_setzero_ps();
229 /* Reset potential sums */
230 velecsum = _mm_setzero_ps();
231 vvdwsum = _mm_setzero_ps();
233 /* Start inner kernel loop */
234 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
237 /* Get j neighbor index, and coordinate index */
242 j_coord_offsetA = DIM*jnrA;
243 j_coord_offsetB = DIM*jnrB;
244 j_coord_offsetC = DIM*jnrC;
245 j_coord_offsetD = DIM*jnrD;
247 /* load j atom coordinates */
248 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
249 x+j_coord_offsetC,x+j_coord_offsetD,
250 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
251 &jy2,&jz2,&jx3,&jy3,&jz3);
253 /* Calculate displacement vector */
254 dx00 = _mm_sub_ps(ix0,jx0);
255 dy00 = _mm_sub_ps(iy0,jy0);
256 dz00 = _mm_sub_ps(iz0,jz0);
257 dx11 = _mm_sub_ps(ix1,jx1);
258 dy11 = _mm_sub_ps(iy1,jy1);
259 dz11 = _mm_sub_ps(iz1,jz1);
260 dx12 = _mm_sub_ps(ix1,jx2);
261 dy12 = _mm_sub_ps(iy1,jy2);
262 dz12 = _mm_sub_ps(iz1,jz2);
263 dx13 = _mm_sub_ps(ix1,jx3);
264 dy13 = _mm_sub_ps(iy1,jy3);
265 dz13 = _mm_sub_ps(iz1,jz3);
266 dx21 = _mm_sub_ps(ix2,jx1);
267 dy21 = _mm_sub_ps(iy2,jy1);
268 dz21 = _mm_sub_ps(iz2,jz1);
269 dx22 = _mm_sub_ps(ix2,jx2);
270 dy22 = _mm_sub_ps(iy2,jy2);
271 dz22 = _mm_sub_ps(iz2,jz2);
272 dx23 = _mm_sub_ps(ix2,jx3);
273 dy23 = _mm_sub_ps(iy2,jy3);
274 dz23 = _mm_sub_ps(iz2,jz3);
275 dx31 = _mm_sub_ps(ix3,jx1);
276 dy31 = _mm_sub_ps(iy3,jy1);
277 dz31 = _mm_sub_ps(iz3,jz1);
278 dx32 = _mm_sub_ps(ix3,jx2);
279 dy32 = _mm_sub_ps(iy3,jy2);
280 dz32 = _mm_sub_ps(iz3,jz2);
281 dx33 = _mm_sub_ps(ix3,jx3);
282 dy33 = _mm_sub_ps(iy3,jy3);
283 dz33 = _mm_sub_ps(iz3,jz3);
285 /* Calculate squared distance and things based on it */
286 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
287 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
288 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
289 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
290 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
291 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
292 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
293 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
294 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
295 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
297 rinv00 = gmx_mm_invsqrt_ps(rsq00);
298 rinv11 = gmx_mm_invsqrt_ps(rsq11);
299 rinv12 = gmx_mm_invsqrt_ps(rsq12);
300 rinv13 = gmx_mm_invsqrt_ps(rsq13);
301 rinv21 = gmx_mm_invsqrt_ps(rsq21);
302 rinv22 = gmx_mm_invsqrt_ps(rsq22);
303 rinv23 = gmx_mm_invsqrt_ps(rsq23);
304 rinv31 = gmx_mm_invsqrt_ps(rsq31);
305 rinv32 = gmx_mm_invsqrt_ps(rsq32);
306 rinv33 = gmx_mm_invsqrt_ps(rsq33);
308 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
309 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
310 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
311 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
312 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
313 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
314 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
315 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
316 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
317 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
319 fjx0 = _mm_setzero_ps();
320 fjy0 = _mm_setzero_ps();
321 fjz0 = _mm_setzero_ps();
322 fjx1 = _mm_setzero_ps();
323 fjy1 = _mm_setzero_ps();
324 fjz1 = _mm_setzero_ps();
325 fjx2 = _mm_setzero_ps();
326 fjy2 = _mm_setzero_ps();
327 fjz2 = _mm_setzero_ps();
328 fjx3 = _mm_setzero_ps();
329 fjy3 = _mm_setzero_ps();
330 fjz3 = _mm_setzero_ps();
332 /**************************
333 * CALCULATE INTERACTIONS *
334 **************************/
336 if (gmx_mm_any_lt(rsq00,rcutoff2))
339 r00 = _mm_mul_ps(rsq00,rinv00);
341 /* LENNARD-JONES DISPERSION/REPULSION */
343 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
344 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
345 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
346 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
347 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
349 d = _mm_sub_ps(r00,rswitch);
350 d = _mm_max_ps(d,_mm_setzero_ps());
351 d2 = _mm_mul_ps(d,d);
352 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
354 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
356 /* Evaluate switch function */
357 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
358 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
359 vvdw = _mm_mul_ps(vvdw,sw);
360 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 vvdw = _mm_and_ps(vvdw,cutoff_mask);
364 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
368 fscal = _mm_and_ps(fscal,cutoff_mask);
370 /* Update vectorial force */
371 fix0 = _mm_macc_ps(dx00,fscal,fix0);
372 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
373 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
375 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
376 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
377 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
381 /**************************
382 * CALCULATE INTERACTIONS *
383 **************************/
385 if (gmx_mm_any_lt(rsq11,rcutoff2))
388 /* REACTION-FIELD ELECTROSTATICS */
389 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_macc_ps(krf,rsq11,rinv11),crf));
390 felec = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
392 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
394 /* Update potential sum for this i atom from the interaction with this j atom. */
395 velec = _mm_and_ps(velec,cutoff_mask);
396 velecsum = _mm_add_ps(velecsum,velec);
400 fscal = _mm_and_ps(fscal,cutoff_mask);
402 /* Update vectorial force */
403 fix1 = _mm_macc_ps(dx11,fscal,fix1);
404 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
405 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
407 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
408 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
409 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
413 /**************************
414 * CALCULATE INTERACTIONS *
415 **************************/
417 if (gmx_mm_any_lt(rsq12,rcutoff2))
420 /* REACTION-FIELD ELECTROSTATICS */
421 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_macc_ps(krf,rsq12,rinv12),crf));
422 felec = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
424 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
426 /* Update potential sum for this i atom from the interaction with this j atom. */
427 velec = _mm_and_ps(velec,cutoff_mask);
428 velecsum = _mm_add_ps(velecsum,velec);
432 fscal = _mm_and_ps(fscal,cutoff_mask);
434 /* Update vectorial force */
435 fix1 = _mm_macc_ps(dx12,fscal,fix1);
436 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
437 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
439 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
440 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
441 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
445 /**************************
446 * CALCULATE INTERACTIONS *
447 **************************/
449 if (gmx_mm_any_lt(rsq13,rcutoff2))
452 /* REACTION-FIELD ELECTROSTATICS */
453 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_macc_ps(krf,rsq13,rinv13),crf));
454 felec = _mm_mul_ps(qq13,_mm_msub_ps(rinv13,rinvsq13,krf2));
456 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velec = _mm_and_ps(velec,cutoff_mask);
460 velecsum = _mm_add_ps(velecsum,velec);
464 fscal = _mm_and_ps(fscal,cutoff_mask);
466 /* Update vectorial force */
467 fix1 = _mm_macc_ps(dx13,fscal,fix1);
468 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
469 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
471 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
472 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
473 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
477 /**************************
478 * CALCULATE INTERACTIONS *
479 **************************/
481 if (gmx_mm_any_lt(rsq21,rcutoff2))
484 /* REACTION-FIELD ELECTROSTATICS */
485 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_macc_ps(krf,rsq21,rinv21),crf));
486 felec = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
488 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
490 /* Update potential sum for this i atom from the interaction with this j atom. */
491 velec = _mm_and_ps(velec,cutoff_mask);
492 velecsum = _mm_add_ps(velecsum,velec);
496 fscal = _mm_and_ps(fscal,cutoff_mask);
498 /* Update vectorial force */
499 fix2 = _mm_macc_ps(dx21,fscal,fix2);
500 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
501 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
503 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
504 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
505 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
509 /**************************
510 * CALCULATE INTERACTIONS *
511 **************************/
513 if (gmx_mm_any_lt(rsq22,rcutoff2))
516 /* REACTION-FIELD ELECTROSTATICS */
517 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_macc_ps(krf,rsq22,rinv22),crf));
518 felec = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
520 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velec = _mm_and_ps(velec,cutoff_mask);
524 velecsum = _mm_add_ps(velecsum,velec);
528 fscal = _mm_and_ps(fscal,cutoff_mask);
530 /* Update vectorial force */
531 fix2 = _mm_macc_ps(dx22,fscal,fix2);
532 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
533 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
535 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
536 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
537 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
541 /**************************
542 * CALCULATE INTERACTIONS *
543 **************************/
545 if (gmx_mm_any_lt(rsq23,rcutoff2))
548 /* REACTION-FIELD ELECTROSTATICS */
549 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_macc_ps(krf,rsq23,rinv23),crf));
550 felec = _mm_mul_ps(qq23,_mm_msub_ps(rinv23,rinvsq23,krf2));
552 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
554 /* Update potential sum for this i atom from the interaction with this j atom. */
555 velec = _mm_and_ps(velec,cutoff_mask);
556 velecsum = _mm_add_ps(velecsum,velec);
560 fscal = _mm_and_ps(fscal,cutoff_mask);
562 /* Update vectorial force */
563 fix2 = _mm_macc_ps(dx23,fscal,fix2);
564 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
565 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
567 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
568 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
569 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
573 /**************************
574 * CALCULATE INTERACTIONS *
575 **************************/
577 if (gmx_mm_any_lt(rsq31,rcutoff2))
580 /* REACTION-FIELD ELECTROSTATICS */
581 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_macc_ps(krf,rsq31,rinv31),crf));
582 felec = _mm_mul_ps(qq31,_mm_msub_ps(rinv31,rinvsq31,krf2));
584 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
586 /* Update potential sum for this i atom from the interaction with this j atom. */
587 velec = _mm_and_ps(velec,cutoff_mask);
588 velecsum = _mm_add_ps(velecsum,velec);
592 fscal = _mm_and_ps(fscal,cutoff_mask);
594 /* Update vectorial force */
595 fix3 = _mm_macc_ps(dx31,fscal,fix3);
596 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
597 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
599 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
600 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
601 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
605 /**************************
606 * CALCULATE INTERACTIONS *
607 **************************/
609 if (gmx_mm_any_lt(rsq32,rcutoff2))
612 /* REACTION-FIELD ELECTROSTATICS */
613 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_macc_ps(krf,rsq32,rinv32),crf));
614 felec = _mm_mul_ps(qq32,_mm_msub_ps(rinv32,rinvsq32,krf2));
616 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
618 /* Update potential sum for this i atom from the interaction with this j atom. */
619 velec = _mm_and_ps(velec,cutoff_mask);
620 velecsum = _mm_add_ps(velecsum,velec);
624 fscal = _mm_and_ps(fscal,cutoff_mask);
626 /* Update vectorial force */
627 fix3 = _mm_macc_ps(dx32,fscal,fix3);
628 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
629 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
631 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
632 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
633 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
637 /**************************
638 * CALCULATE INTERACTIONS *
639 **************************/
641 if (gmx_mm_any_lt(rsq33,rcutoff2))
644 /* REACTION-FIELD ELECTROSTATICS */
645 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_macc_ps(krf,rsq33,rinv33),crf));
646 felec = _mm_mul_ps(qq33,_mm_msub_ps(rinv33,rinvsq33,krf2));
648 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
650 /* Update potential sum for this i atom from the interaction with this j atom. */
651 velec = _mm_and_ps(velec,cutoff_mask);
652 velecsum = _mm_add_ps(velecsum,velec);
656 fscal = _mm_and_ps(fscal,cutoff_mask);
658 /* Update vectorial force */
659 fix3 = _mm_macc_ps(dx33,fscal,fix3);
660 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
661 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
663 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
664 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
665 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
669 fjptrA = f+j_coord_offsetA;
670 fjptrB = f+j_coord_offsetB;
671 fjptrC = f+j_coord_offsetC;
672 fjptrD = f+j_coord_offsetD;
674 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
675 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
676 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
678 /* Inner loop uses 416 flops */
684 /* Get j neighbor index, and coordinate index */
685 jnrlistA = jjnr[jidx];
686 jnrlistB = jjnr[jidx+1];
687 jnrlistC = jjnr[jidx+2];
688 jnrlistD = jjnr[jidx+3];
689 /* Sign of each element will be negative for non-real atoms.
690 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
691 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
693 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
694 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
695 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
696 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
697 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
698 j_coord_offsetA = DIM*jnrA;
699 j_coord_offsetB = DIM*jnrB;
700 j_coord_offsetC = DIM*jnrC;
701 j_coord_offsetD = DIM*jnrD;
703 /* load j atom coordinates */
704 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
705 x+j_coord_offsetC,x+j_coord_offsetD,
706 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
707 &jy2,&jz2,&jx3,&jy3,&jz3);
709 /* Calculate displacement vector */
710 dx00 = _mm_sub_ps(ix0,jx0);
711 dy00 = _mm_sub_ps(iy0,jy0);
712 dz00 = _mm_sub_ps(iz0,jz0);
713 dx11 = _mm_sub_ps(ix1,jx1);
714 dy11 = _mm_sub_ps(iy1,jy1);
715 dz11 = _mm_sub_ps(iz1,jz1);
716 dx12 = _mm_sub_ps(ix1,jx2);
717 dy12 = _mm_sub_ps(iy1,jy2);
718 dz12 = _mm_sub_ps(iz1,jz2);
719 dx13 = _mm_sub_ps(ix1,jx3);
720 dy13 = _mm_sub_ps(iy1,jy3);
721 dz13 = _mm_sub_ps(iz1,jz3);
722 dx21 = _mm_sub_ps(ix2,jx1);
723 dy21 = _mm_sub_ps(iy2,jy1);
724 dz21 = _mm_sub_ps(iz2,jz1);
725 dx22 = _mm_sub_ps(ix2,jx2);
726 dy22 = _mm_sub_ps(iy2,jy2);
727 dz22 = _mm_sub_ps(iz2,jz2);
728 dx23 = _mm_sub_ps(ix2,jx3);
729 dy23 = _mm_sub_ps(iy2,jy3);
730 dz23 = _mm_sub_ps(iz2,jz3);
731 dx31 = _mm_sub_ps(ix3,jx1);
732 dy31 = _mm_sub_ps(iy3,jy1);
733 dz31 = _mm_sub_ps(iz3,jz1);
734 dx32 = _mm_sub_ps(ix3,jx2);
735 dy32 = _mm_sub_ps(iy3,jy2);
736 dz32 = _mm_sub_ps(iz3,jz2);
737 dx33 = _mm_sub_ps(ix3,jx3);
738 dy33 = _mm_sub_ps(iy3,jy3);
739 dz33 = _mm_sub_ps(iz3,jz3);
741 /* Calculate squared distance and things based on it */
742 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
743 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
744 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
745 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
746 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
747 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
748 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
749 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
750 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
751 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
753 rinv00 = gmx_mm_invsqrt_ps(rsq00);
754 rinv11 = gmx_mm_invsqrt_ps(rsq11);
755 rinv12 = gmx_mm_invsqrt_ps(rsq12);
756 rinv13 = gmx_mm_invsqrt_ps(rsq13);
757 rinv21 = gmx_mm_invsqrt_ps(rsq21);
758 rinv22 = gmx_mm_invsqrt_ps(rsq22);
759 rinv23 = gmx_mm_invsqrt_ps(rsq23);
760 rinv31 = gmx_mm_invsqrt_ps(rsq31);
761 rinv32 = gmx_mm_invsqrt_ps(rsq32);
762 rinv33 = gmx_mm_invsqrt_ps(rsq33);
764 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
765 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
766 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
767 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
768 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
769 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
770 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
771 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
772 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
773 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
775 fjx0 = _mm_setzero_ps();
776 fjy0 = _mm_setzero_ps();
777 fjz0 = _mm_setzero_ps();
778 fjx1 = _mm_setzero_ps();
779 fjy1 = _mm_setzero_ps();
780 fjz1 = _mm_setzero_ps();
781 fjx2 = _mm_setzero_ps();
782 fjy2 = _mm_setzero_ps();
783 fjz2 = _mm_setzero_ps();
784 fjx3 = _mm_setzero_ps();
785 fjy3 = _mm_setzero_ps();
786 fjz3 = _mm_setzero_ps();
788 /**************************
789 * CALCULATE INTERACTIONS *
790 **************************/
792 if (gmx_mm_any_lt(rsq00,rcutoff2))
795 r00 = _mm_mul_ps(rsq00,rinv00);
796 r00 = _mm_andnot_ps(dummy_mask,r00);
798 /* LENNARD-JONES DISPERSION/REPULSION */
800 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
801 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
802 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
803 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
804 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
806 d = _mm_sub_ps(r00,rswitch);
807 d = _mm_max_ps(d,_mm_setzero_ps());
808 d2 = _mm_mul_ps(d,d);
809 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
811 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
813 /* Evaluate switch function */
814 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
815 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
816 vvdw = _mm_mul_ps(vvdw,sw);
817 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
819 /* Update potential sum for this i atom from the interaction with this j atom. */
820 vvdw = _mm_and_ps(vvdw,cutoff_mask);
821 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
822 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
826 fscal = _mm_and_ps(fscal,cutoff_mask);
828 fscal = _mm_andnot_ps(dummy_mask,fscal);
830 /* Update vectorial force */
831 fix0 = _mm_macc_ps(dx00,fscal,fix0);
832 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
833 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
835 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
836 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
837 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
841 /**************************
842 * CALCULATE INTERACTIONS *
843 **************************/
845 if (gmx_mm_any_lt(rsq11,rcutoff2))
848 /* REACTION-FIELD ELECTROSTATICS */
849 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_macc_ps(krf,rsq11,rinv11),crf));
850 felec = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
852 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
854 /* Update potential sum for this i atom from the interaction with this j atom. */
855 velec = _mm_and_ps(velec,cutoff_mask);
856 velec = _mm_andnot_ps(dummy_mask,velec);
857 velecsum = _mm_add_ps(velecsum,velec);
861 fscal = _mm_and_ps(fscal,cutoff_mask);
863 fscal = _mm_andnot_ps(dummy_mask,fscal);
865 /* Update vectorial force */
866 fix1 = _mm_macc_ps(dx11,fscal,fix1);
867 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
868 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
870 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
871 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
872 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
876 /**************************
877 * CALCULATE INTERACTIONS *
878 **************************/
880 if (gmx_mm_any_lt(rsq12,rcutoff2))
883 /* REACTION-FIELD ELECTROSTATICS */
884 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_macc_ps(krf,rsq12,rinv12),crf));
885 felec = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
887 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
889 /* Update potential sum for this i atom from the interaction with this j atom. */
890 velec = _mm_and_ps(velec,cutoff_mask);
891 velec = _mm_andnot_ps(dummy_mask,velec);
892 velecsum = _mm_add_ps(velecsum,velec);
896 fscal = _mm_and_ps(fscal,cutoff_mask);
898 fscal = _mm_andnot_ps(dummy_mask,fscal);
900 /* Update vectorial force */
901 fix1 = _mm_macc_ps(dx12,fscal,fix1);
902 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
903 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
905 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
906 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
907 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
911 /**************************
912 * CALCULATE INTERACTIONS *
913 **************************/
915 if (gmx_mm_any_lt(rsq13,rcutoff2))
918 /* REACTION-FIELD ELECTROSTATICS */
919 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_macc_ps(krf,rsq13,rinv13),crf));
920 felec = _mm_mul_ps(qq13,_mm_msub_ps(rinv13,rinvsq13,krf2));
922 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
924 /* Update potential sum for this i atom from the interaction with this j atom. */
925 velec = _mm_and_ps(velec,cutoff_mask);
926 velec = _mm_andnot_ps(dummy_mask,velec);
927 velecsum = _mm_add_ps(velecsum,velec);
931 fscal = _mm_and_ps(fscal,cutoff_mask);
933 fscal = _mm_andnot_ps(dummy_mask,fscal);
935 /* Update vectorial force */
936 fix1 = _mm_macc_ps(dx13,fscal,fix1);
937 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
938 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
940 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
941 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
942 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
946 /**************************
947 * CALCULATE INTERACTIONS *
948 **************************/
950 if (gmx_mm_any_lt(rsq21,rcutoff2))
953 /* REACTION-FIELD ELECTROSTATICS */
954 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_macc_ps(krf,rsq21,rinv21),crf));
955 felec = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
957 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
959 /* Update potential sum for this i atom from the interaction with this j atom. */
960 velec = _mm_and_ps(velec,cutoff_mask);
961 velec = _mm_andnot_ps(dummy_mask,velec);
962 velecsum = _mm_add_ps(velecsum,velec);
966 fscal = _mm_and_ps(fscal,cutoff_mask);
968 fscal = _mm_andnot_ps(dummy_mask,fscal);
970 /* Update vectorial force */
971 fix2 = _mm_macc_ps(dx21,fscal,fix2);
972 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
973 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
975 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
976 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
977 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
981 /**************************
982 * CALCULATE INTERACTIONS *
983 **************************/
985 if (gmx_mm_any_lt(rsq22,rcutoff2))
988 /* REACTION-FIELD ELECTROSTATICS */
989 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_macc_ps(krf,rsq22,rinv22),crf));
990 felec = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
992 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
994 /* Update potential sum for this i atom from the interaction with this j atom. */
995 velec = _mm_and_ps(velec,cutoff_mask);
996 velec = _mm_andnot_ps(dummy_mask,velec);
997 velecsum = _mm_add_ps(velecsum,velec);
1001 fscal = _mm_and_ps(fscal,cutoff_mask);
1003 fscal = _mm_andnot_ps(dummy_mask,fscal);
1005 /* Update vectorial force */
1006 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1007 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1008 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1010 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1011 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1012 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1016 /**************************
1017 * CALCULATE INTERACTIONS *
1018 **************************/
1020 if (gmx_mm_any_lt(rsq23,rcutoff2))
1023 /* REACTION-FIELD ELECTROSTATICS */
1024 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_macc_ps(krf,rsq23,rinv23),crf));
1025 felec = _mm_mul_ps(qq23,_mm_msub_ps(rinv23,rinvsq23,krf2));
1027 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1029 /* Update potential sum for this i atom from the interaction with this j atom. */
1030 velec = _mm_and_ps(velec,cutoff_mask);
1031 velec = _mm_andnot_ps(dummy_mask,velec);
1032 velecsum = _mm_add_ps(velecsum,velec);
1036 fscal = _mm_and_ps(fscal,cutoff_mask);
1038 fscal = _mm_andnot_ps(dummy_mask,fscal);
1040 /* Update vectorial force */
1041 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1042 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1043 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1045 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1046 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1047 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1051 /**************************
1052 * CALCULATE INTERACTIONS *
1053 **************************/
1055 if (gmx_mm_any_lt(rsq31,rcutoff2))
1058 /* REACTION-FIELD ELECTROSTATICS */
1059 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_macc_ps(krf,rsq31,rinv31),crf));
1060 felec = _mm_mul_ps(qq31,_mm_msub_ps(rinv31,rinvsq31,krf2));
1062 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1064 /* Update potential sum for this i atom from the interaction with this j atom. */
1065 velec = _mm_and_ps(velec,cutoff_mask);
1066 velec = _mm_andnot_ps(dummy_mask,velec);
1067 velecsum = _mm_add_ps(velecsum,velec);
1071 fscal = _mm_and_ps(fscal,cutoff_mask);
1073 fscal = _mm_andnot_ps(dummy_mask,fscal);
1075 /* Update vectorial force */
1076 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1077 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1078 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1080 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1081 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1082 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1086 /**************************
1087 * CALCULATE INTERACTIONS *
1088 **************************/
1090 if (gmx_mm_any_lt(rsq32,rcutoff2))
1093 /* REACTION-FIELD ELECTROSTATICS */
1094 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_macc_ps(krf,rsq32,rinv32),crf));
1095 felec = _mm_mul_ps(qq32,_mm_msub_ps(rinv32,rinvsq32,krf2));
1097 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1099 /* Update potential sum for this i atom from the interaction with this j atom. */
1100 velec = _mm_and_ps(velec,cutoff_mask);
1101 velec = _mm_andnot_ps(dummy_mask,velec);
1102 velecsum = _mm_add_ps(velecsum,velec);
1106 fscal = _mm_and_ps(fscal,cutoff_mask);
1108 fscal = _mm_andnot_ps(dummy_mask,fscal);
1110 /* Update vectorial force */
1111 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1112 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1113 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1115 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1116 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1117 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1121 /**************************
1122 * CALCULATE INTERACTIONS *
1123 **************************/
1125 if (gmx_mm_any_lt(rsq33,rcutoff2))
1128 /* REACTION-FIELD ELECTROSTATICS */
1129 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_macc_ps(krf,rsq33,rinv33),crf));
1130 felec = _mm_mul_ps(qq33,_mm_msub_ps(rinv33,rinvsq33,krf2));
1132 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1134 /* Update potential sum for this i atom from the interaction with this j atom. */
1135 velec = _mm_and_ps(velec,cutoff_mask);
1136 velec = _mm_andnot_ps(dummy_mask,velec);
1137 velecsum = _mm_add_ps(velecsum,velec);
1141 fscal = _mm_and_ps(fscal,cutoff_mask);
1143 fscal = _mm_andnot_ps(dummy_mask,fscal);
1145 /* Update vectorial force */
1146 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1147 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1148 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1150 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1151 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1152 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1156 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1157 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1158 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1159 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1161 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1162 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1163 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1165 /* Inner loop uses 417 flops */
1168 /* End of innermost loop */
1170 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1171 f+i_coord_offset,fshift+i_shift_offset);
1174 /* Update potential energies */
1175 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1176 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1178 /* Increment number of inner iterations */
1179 inneriter += j_index_end - j_index_start;
1181 /* Outer loop uses 26 flops */
1184 /* Increment number of outer iterations */
1187 /* Update outer/inner flops */
1189 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*417);
1192 * GROMACS nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_128_fma_single
1193 * Electrostatics interaction: ReactionField
1194 * VdW interaction: LennardJones
1195 * Geometry: Water4-Water4
1196 * Calculate force/pot: Force
/*
 * Force-only nonbonded kernel for water-water pairs with four interaction
 * sites per molecule (sites 0..3 on both the i and the j side).  It works on
 * 128-bit SIMD registers, so four j molecules are handled per inner iteration.
 *
 * What the visible code does for every i/j molecule pair:
 *  - site 0 with site 0: Lennard-Jones (c6_00/c12_00 taken from fr->nbfp) with
 *    a switch function applied beyond fr->rvdw_switch;
 *  - sites 1..3 with sites 1..3 (nine pairs): reaction-field electrostatics,
 *    felec = qq*(rinv*rinvsq - krf2).
 * Every pair is masked with rsq < rcutoff2, the cutoff being fr->rcoulomb.
 *
 * Parameters:
 *  nlist        neighbor list; jindex, shift and iinr[0] are read below.
 *  xx, ff       coordinate and force arrays.  The code below indexes the
 *               locals x and f, whose assignment is not part of this listing;
 *               presumably x = xx[0] and f = ff[0] -- TODO confirm.
 *  fr           force record: shift_vec, fshift, epsfac, the reaction-field
 *               constants ic->k_rf and ic->c_rf, the LJ table (ntype, nbfp)
 *               and the radii rcoulomb and rvdw_switch.
 *  mdatoms      per-atom charges (chargeA) and LJ type indices (typeA).
 *  kernel_data  not referenced by this force-only variant (marked gmx_unused).
 *  nrnb         flop accounting; updated once through inc_nrnb() at the end.
 *
 * NOTE(review): several statements that the remaining code depends on do not
 * appear in this listing (for example the assignments of fscal from felec or
 * fvdw, of nri, jjnr, x and f, of inr inside the outer loop, and of jnrA in
 * the main inner loop), and no brace-only lines are shown.  Presumably they
 * were dropped when the listing was produced; confirm against the generated
 * file before relying on the text below.
 */
1199 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_128_fma_single
1200 (t_nblist * gmx_restrict nlist,
1201 rvec * gmx_restrict xx,
1202 rvec * gmx_restrict ff,
1203 t_forcerec * gmx_restrict fr,
1204 t_mdatoms * gmx_restrict mdatoms,
1205 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1206 t_nrnb * gmx_restrict nrnb)
1208 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1209 * just 0 for non-waters.
1210 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1211 * jnr indices corresponding to data put in the four positions in the SIMD register.
1213 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1214 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1215 int jnrA,jnrB,jnrC,jnrD;
1216 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1217 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1218 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1219 real rcutoff_scalar;
1220 real *shiftvec,*fshift,*x,*f;
1221 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1222 real scratch[4*DIM];
1223 __m128 fscal,rcutoff,rcutoff2,jidxall;
1225 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1227 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1229 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1231 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1232 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1233 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1234 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1235 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1236 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1237 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1238 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1239 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1240 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1241 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1242 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1243 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1244 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1245 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1246 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1247 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1248 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1249 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1250 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1253 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1256 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1257 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1258 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1259 real rswitch_scalar,d_scalar;
1260 __m128 dummy_mask,cutoff_mask;
1261 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1262 __m128 one = _mm_set1_ps(1.0);
1263 __m128 two = _mm_set1_ps(2.0);
1269 jindex = nlist->jindex;
1271 shiftidx = nlist->shift;
1273 shiftvec = fr->shift_vec[0];
1274 fshift = fr->fshift[0];
1275 facel = _mm_set1_ps(fr->epsfac);
1276 charge = mdatoms->chargeA;
1277 krf = _mm_set1_ps(fr->ic->k_rf);
1278 krf2 = _mm_set1_ps(fr->ic->k_rf*2.0);
1279 crf = _mm_set1_ps(fr->ic->c_rf);
1280 nvdwtype = fr->ntype;
1281 vdwparam = fr->nbfp;
1282 vdwtype = mdatoms->typeA;
1284 /* Setup water-specific parameters */
/* Charges and the LJ type are read once, for the first i entry of the list
 * (nlist->iinr[0]), and the same atoms supply both the i-side and the j-side
 * parameters.  The i-side charges are pre-multiplied by facel (fr->epsfac),
 * so the qq products below already contain that factor.
 */
1285 inr = nlist->iinr[0];
1286 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1287 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1288 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1289 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1291 jq1 = _mm_set1_ps(charge[inr+1]);
1292 jq2 = _mm_set1_ps(charge[inr+2]);
1293 jq3 = _mm_set1_ps(charge[inr+3]);
1294 vdwjidx0A = 2*vdwtype[inr+0];
1295 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1296 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1297 qq11 = _mm_mul_ps(iq1,jq1);
1298 qq12 = _mm_mul_ps(iq1,jq2);
1299 qq13 = _mm_mul_ps(iq1,jq3);
1300 qq21 = _mm_mul_ps(iq2,jq1);
1301 qq22 = _mm_mul_ps(iq2,jq2);
1302 qq23 = _mm_mul_ps(iq2,jq3);
1303 qq31 = _mm_mul_ps(iq3,jq1);
1304 qq32 = _mm_mul_ps(iq3,jq2);
1305 qq33 = _mm_mul_ps(iq3,jq3);
1307 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1308 rcutoff_scalar = fr->rcoulomb;
1309 rcutoff = _mm_set1_ps(rcutoff_scalar);
1310 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1312 rswitch_scalar = fr->rvdw_switch;
1313 rswitch = _mm_set1_ps(rswitch_scalar);
1314 /* Setup switch parameters */
/* d_scalar is the width of the switching region (cutoff minus switch radius).
 * swV3..swV5 and swF2..swF4 are the coefficients of the sw and dsw polynomials
 * evaluated in the site 0 - site 0 Lennard-Jones interaction further down.
 */
1315 d_scalar = rcutoff_scalar-rswitch_scalar;
1316 d = _mm_set1_ps(d_scalar);
1317 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1318 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1319 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1320 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1321 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1322 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1324 /* Avoid stupid compiler warnings */
1325 jnrA = jnrB = jnrC = jnrD = 0;
1326 j_coord_offsetA = 0;
1327 j_coord_offsetB = 0;
1328 j_coord_offsetC = 0;
1329 j_coord_offsetD = 0;
/* Clear the scratch buffer; the masked epilogue uses it as the force target
 * for padding entries (negative jjnr values).
 */
1334 for(iidx=0;iidx<4*DIM;iidx++)
1336 scratch[iidx] = 0.0;
1339 /* Start outer loop over neighborlists */
1340 for(iidx=0; iidx<nri; iidx++)
1342 /* Load shift vector for this list */
1343 i_shift_offset = DIM*shiftidx[iidx];
1345 /* Load limits for loop over neighbors */
1346 j_index_start = jindex[iidx];
1347 j_index_end = jindex[iidx+1];
1349 /* Get outer coordinate index */
1351 i_coord_offset = DIM*inr;
1353 /* Load i particle coords and add shift vector */
1354 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1355 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1357 fix0 = _mm_setzero_ps();
1358 fiy0 = _mm_setzero_ps();
1359 fiz0 = _mm_setzero_ps();
1360 fix1 = _mm_setzero_ps();
1361 fiy1 = _mm_setzero_ps();
1362 fiz1 = _mm_setzero_ps();
1363 fix2 = _mm_setzero_ps();
1364 fiy2 = _mm_setzero_ps();
1365 fiz2 = _mm_setzero_ps();
1366 fix3 = _mm_setzero_ps();
1367 fiy3 = _mm_setzero_ps();
1368 fiz3 = _mm_setzero_ps();
1370 /* Start inner kernel loop */
/* Main loop: advances four j entries at a time and keeps running only while
 * the fourth entry of the group, jjnr[jidx+3], is non-negative.  dummy_mask
 * is not applied anywhere in this loop.
 * NOTE(review): jnrA is used below but its assignment is not shown in this
 * listing -- presumably jnrA = jjnr[jidx]; confirm.
 */
1371 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1374 /* Get j neighbor index, and coordinate index */
1376 jnrB = jjnr[jidx+1];
1377 jnrC = jjnr[jidx+2];
1378 jnrD = jjnr[jidx+3];
1379 j_coord_offsetA = DIM*jnrA;
1380 j_coord_offsetB = DIM*jnrB;
1381 j_coord_offsetC = DIM*jnrC;
1382 j_coord_offsetD = DIM*jnrD;
1384 /* load j atom coordinates */
1385 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1386 x+j_coord_offsetC,x+j_coord_offsetD,
1387 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1388 &jy2,&jz2,&jx3,&jy3,&jz3);
1390 /* Calculate displacement vector */
1391 dx00 = _mm_sub_ps(ix0,jx0);
1392 dy00 = _mm_sub_ps(iy0,jy0);
1393 dz00 = _mm_sub_ps(iz0,jz0);
1394 dx11 = _mm_sub_ps(ix1,jx1);
1395 dy11 = _mm_sub_ps(iy1,jy1);
1396 dz11 = _mm_sub_ps(iz1,jz1);
1397 dx12 = _mm_sub_ps(ix1,jx2);
1398 dy12 = _mm_sub_ps(iy1,jy2);
1399 dz12 = _mm_sub_ps(iz1,jz2);
1400 dx13 = _mm_sub_ps(ix1,jx3);
1401 dy13 = _mm_sub_ps(iy1,jy3);
1402 dz13 = _mm_sub_ps(iz1,jz3);
1403 dx21 = _mm_sub_ps(ix2,jx1);
1404 dy21 = _mm_sub_ps(iy2,jy1);
1405 dz21 = _mm_sub_ps(iz2,jz1);
1406 dx22 = _mm_sub_ps(ix2,jx2);
1407 dy22 = _mm_sub_ps(iy2,jy2);
1408 dz22 = _mm_sub_ps(iz2,jz2);
1409 dx23 = _mm_sub_ps(ix2,jx3);
1410 dy23 = _mm_sub_ps(iy2,jy3);
1411 dz23 = _mm_sub_ps(iz2,jz3);
1412 dx31 = _mm_sub_ps(ix3,jx1);
1413 dy31 = _mm_sub_ps(iy3,jy1);
1414 dz31 = _mm_sub_ps(iz3,jz1);
1415 dx32 = _mm_sub_ps(ix3,jx2);
1416 dy32 = _mm_sub_ps(iy3,jy2);
1417 dz32 = _mm_sub_ps(iz3,jz2);
1418 dx33 = _mm_sub_ps(ix3,jx3);
1419 dy33 = _mm_sub_ps(iy3,jy3);
1420 dz33 = _mm_sub_ps(iz3,jz3);
1422 /* Calculate squared distance and things based on it */
1423 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1424 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1425 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1426 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1427 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1428 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1429 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1430 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1431 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1432 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1434 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1435 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1436 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1437 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1438 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1439 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1440 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1441 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1442 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1443 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1445 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1446 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1447 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1448 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1449 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1450 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1451 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1452 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1453 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1454 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1456 fjx0 = _mm_setzero_ps();
1457 fjy0 = _mm_setzero_ps();
1458 fjz0 = _mm_setzero_ps();
1459 fjx1 = _mm_setzero_ps();
1460 fjy1 = _mm_setzero_ps();
1461 fjz1 = _mm_setzero_ps();
1462 fjx2 = _mm_setzero_ps();
1463 fjy2 = _mm_setzero_ps();
1464 fjz2 = _mm_setzero_ps();
1465 fjx3 = _mm_setzero_ps();
1466 fjy3 = _mm_setzero_ps();
1467 fjz3 = _mm_setzero_ps();
1469 /**************************
1470 * CALCULATE INTERACTIONS *
1471 **************************/
/* Site 0 - site 0: Lennard-Jones with switch.  gmx_mm_any_lt() is defined
 * elsewhere; presumably it is true when at least one SIMD lane has
 * rsq00 < rcutoff2, so the block is skipped when no lane is inside the cutoff.
 */
1473 if (gmx_mm_any_lt(rsq00,rcutoff2))
1476 r00 = _mm_mul_ps(rsq00,rinv00);
1478 /* LENNARD-JONES DISPERSION/REPULSION */
1480 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1481 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1482 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1483 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1484 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
/* With _mm_macc_ps(a,b,c) = a*b + c the lines below evaluate
 *   d   = max(r00 - rswitch, 0)
 *   sw  = 1 + d^3 * (swV3 + swV4*d + swV5*d^2)
 *   dsw =     d^2 * (swF2 + swF3*d + swF4*d^2)
 * so sw is 1 and dsw is 0 for distances up to rswitch.
 */
1486 d = _mm_sub_ps(r00,rswitch);
1487 d = _mm_max_ps(d,_mm_setzero_ps());
1488 d2 = _mm_mul_ps(d,d);
1489 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1491 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1493 /* Evaluate switch function */
1494 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1495 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1496 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
/* NOTE(review): fscal is masked below, but no assignment of fvdw to fscal is
 * visible in this listing (the original line numbers jump from 1496 to 1500);
 * presumably "fscal = fvdw;" is missing here -- confirm.
 */
1500 fscal = _mm_and_ps(fscal,cutoff_mask);
1502 /* Update vectorial force */
1503 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1504 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1505 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
/* The same contribution is accumulated for the j atoms; the fj* registers
 * are handed to the decrement call after the last interaction block.
 */
1507 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1508 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1509 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1513 /**************************
1514 * CALCULATE INTERACTIONS *
1515 **************************/
/* Sites 1..3 against sites 1..3: the nine blocks that follow are identical
 * apart from the site indices.  Each one computes the reaction-field force
 * factor felec = qq*(rinv*rinvsq - krf2), using _mm_msub_ps(a,b,c) = a*b - c
 * and krf2 = 2*fr->ic->k_rf, masks it with the cutoff and accumulates it on
 * both the i and the j site.
 * NOTE(review): as in the Lennard-Jones block, the step that copies felec
 * into fscal is not shown in this listing -- confirm.
 */
1517 if (gmx_mm_any_lt(rsq11,rcutoff2))
1520 /* REACTION-FIELD ELECTROSTATICS */
1521 felec = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
1523 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1527 fscal = _mm_and_ps(fscal,cutoff_mask);
1529 /* Update vectorial force */
1530 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1531 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1532 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1534 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1535 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1536 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1540 /**************************
1541 * CALCULATE INTERACTIONS *
1542 **************************/
1544 if (gmx_mm_any_lt(rsq12,rcutoff2))
1547 /* REACTION-FIELD ELECTROSTATICS */
1548 felec = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
1550 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1554 fscal = _mm_and_ps(fscal,cutoff_mask);
1556 /* Update vectorial force */
1557 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1558 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1559 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1561 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1562 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1563 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1567 /**************************
1568 * CALCULATE INTERACTIONS *
1569 **************************/
1571 if (gmx_mm_any_lt(rsq13,rcutoff2))
1574 /* REACTION-FIELD ELECTROSTATICS */
1575 felec = _mm_mul_ps(qq13,_mm_msub_ps(rinv13,rinvsq13,krf2));
1577 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1581 fscal = _mm_and_ps(fscal,cutoff_mask);
1583 /* Update vectorial force */
1584 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1585 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1586 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1588 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1589 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1590 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1594 /**************************
1595 * CALCULATE INTERACTIONS *
1596 **************************/
1598 if (gmx_mm_any_lt(rsq21,rcutoff2))
1601 /* REACTION-FIELD ELECTROSTATICS */
1602 felec = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
1604 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1608 fscal = _mm_and_ps(fscal,cutoff_mask);
1610 /* Update vectorial force */
1611 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1612 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1613 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1615 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1616 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1617 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1621 /**************************
1622 * CALCULATE INTERACTIONS *
1623 **************************/
1625 if (gmx_mm_any_lt(rsq22,rcutoff2))
1628 /* REACTION-FIELD ELECTROSTATICS */
1629 felec = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
1631 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1635 fscal = _mm_and_ps(fscal,cutoff_mask);
1637 /* Update vectorial force */
1638 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1639 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1640 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1642 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1643 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1644 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 if (gmx_mm_any_lt(rsq23,rcutoff2))
1655 /* REACTION-FIELD ELECTROSTATICS */
1656 felec = _mm_mul_ps(qq23,_mm_msub_ps(rinv23,rinvsq23,krf2));
1658 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1662 fscal = _mm_and_ps(fscal,cutoff_mask);
1664 /* Update vectorial force */
1665 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1666 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1667 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1669 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1670 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1671 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1675 /**************************
1676 * CALCULATE INTERACTIONS *
1677 **************************/
1679 if (gmx_mm_any_lt(rsq31,rcutoff2))
1682 /* REACTION-FIELD ELECTROSTATICS */
1683 felec = _mm_mul_ps(qq31,_mm_msub_ps(rinv31,rinvsq31,krf2));
1685 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1689 fscal = _mm_and_ps(fscal,cutoff_mask);
1691 /* Update vectorial force */
1692 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1693 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1694 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1696 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1697 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1698 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1702 /**************************
1703 * CALCULATE INTERACTIONS *
1704 **************************/
1706 if (gmx_mm_any_lt(rsq32,rcutoff2))
1709 /* REACTION-FIELD ELECTROSTATICS */
1710 felec = _mm_mul_ps(qq32,_mm_msub_ps(rinv32,rinvsq32,krf2));
1712 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1716 fscal = _mm_and_ps(fscal,cutoff_mask);
1718 /* Update vectorial force */
1719 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1720 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1721 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1723 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1724 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1725 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1729 /**************************
1730 * CALCULATE INTERACTIONS *
1731 **************************/
1733 if (gmx_mm_any_lt(rsq33,rcutoff2))
1736 /* REACTION-FIELD ELECTROSTATICS */
1737 felec = _mm_mul_ps(qq33,_mm_msub_ps(rinv33,rinvsq33,krf2));
1739 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1743 fscal = _mm_and_ps(fscal,cutoff_mask);
1745 /* Update vectorial force */
1746 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1747 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1748 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1750 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1751 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1752 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
/* All four entries are real atoms in the main loop, so the j forces go
 * straight to f at the j coordinate offsets.
 */
1756 fjptrA = f+j_coord_offsetA;
1757 fjptrB = f+j_coord_offsetB;
1758 fjptrC = f+j_coord_offsetC;
1759 fjptrD = f+j_coord_offsetD;
1761 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1762 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1763 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1765 /* Inner loop uses 359 flops */
/* Epilogue: entered when the main loop stopped before j_index_end, which can
 * only happen because jjnr[jidx+3] of the current group is negative.  It
 * repeats the work of the main loop for that last group with dummy_mask
 * applied to the force factors.
 */
1768 if(jidx<j_index_end)
1771 /* Get j neighbor index, and coordinate index */
1772 jnrlistA = jjnr[jidx];
1773 jnrlistB = jjnr[jidx+1];
1774 jnrlistC = jjnr[jidx+2];
1775 jnrlistD = jjnr[jidx+3];
/* Negative (padding) entries are clamped to index 0 a few lines further down,
 * presumably so that the coordinate loads use a valid atom; whatever is
 * computed for those lanes is cleared again through dummy_mask.
 */
1776 /* Sign of each element will be negative for non-real atoms.
1777 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1778 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1780 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1781 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1782 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1783 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1784 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1785 j_coord_offsetA = DIM*jnrA;
1786 j_coord_offsetB = DIM*jnrB;
1787 j_coord_offsetC = DIM*jnrC;
1788 j_coord_offsetD = DIM*jnrD;
1790 /* load j atom coordinates */
1791 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1792 x+j_coord_offsetC,x+j_coord_offsetD,
1793 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1794 &jy2,&jz2,&jx3,&jy3,&jz3);
1796 /* Calculate displacement vector */
1797 dx00 = _mm_sub_ps(ix0,jx0);
1798 dy00 = _mm_sub_ps(iy0,jy0);
1799 dz00 = _mm_sub_ps(iz0,jz0);
1800 dx11 = _mm_sub_ps(ix1,jx1);
1801 dy11 = _mm_sub_ps(iy1,jy1);
1802 dz11 = _mm_sub_ps(iz1,jz1);
1803 dx12 = _mm_sub_ps(ix1,jx2);
1804 dy12 = _mm_sub_ps(iy1,jy2);
1805 dz12 = _mm_sub_ps(iz1,jz2);
1806 dx13 = _mm_sub_ps(ix1,jx3);
1807 dy13 = _mm_sub_ps(iy1,jy3);
1808 dz13 = _mm_sub_ps(iz1,jz3);
1809 dx21 = _mm_sub_ps(ix2,jx1);
1810 dy21 = _mm_sub_ps(iy2,jy1);
1811 dz21 = _mm_sub_ps(iz2,jz1);
1812 dx22 = _mm_sub_ps(ix2,jx2);
1813 dy22 = _mm_sub_ps(iy2,jy2);
1814 dz22 = _mm_sub_ps(iz2,jz2);
1815 dx23 = _mm_sub_ps(ix2,jx3);
1816 dy23 = _mm_sub_ps(iy2,jy3);
1817 dz23 = _mm_sub_ps(iz2,jz3);
1818 dx31 = _mm_sub_ps(ix3,jx1);
1819 dy31 = _mm_sub_ps(iy3,jy1);
1820 dz31 = _mm_sub_ps(iz3,jz1);
1821 dx32 = _mm_sub_ps(ix3,jx2);
1822 dy32 = _mm_sub_ps(iy3,jy2);
1823 dz32 = _mm_sub_ps(iz3,jz2);
1824 dx33 = _mm_sub_ps(ix3,jx3);
1825 dy33 = _mm_sub_ps(iy3,jy3);
1826 dz33 = _mm_sub_ps(iz3,jz3);
1828 /* Calculate squared distance and things based on it */
1829 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1830 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1831 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1832 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1833 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1834 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1835 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1836 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1837 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1838 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1840 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1841 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1842 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1843 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1844 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1845 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1846 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1847 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1848 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1849 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1851 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1852 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1853 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1854 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1855 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1856 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1857 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1858 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1859 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1860 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1862 fjx0 = _mm_setzero_ps();
1863 fjy0 = _mm_setzero_ps();
1864 fjz0 = _mm_setzero_ps();
1865 fjx1 = _mm_setzero_ps();
1866 fjy1 = _mm_setzero_ps();
1867 fjz1 = _mm_setzero_ps();
1868 fjx2 = _mm_setzero_ps();
1869 fjy2 = _mm_setzero_ps();
1870 fjz2 = _mm_setzero_ps();
1871 fjx3 = _mm_setzero_ps();
1872 fjy3 = _mm_setzero_ps();
1873 fjz3 = _mm_setzero_ps();
1875 /**************************
1876 * CALCULATE INTERACTIONS *
1877 **************************/
/* Site 0 - site 0 Lennard-Jones with switch, as in the main loop, plus the
 * dummy_mask handling for padding lanes.
 */
1879 if (gmx_mm_any_lt(rsq00,rcutoff2))
1882 r00 = _mm_mul_ps(rsq00,rinv00);
/* Zero the distance in padding lanes before it enters the switch polynomial. */
1883 r00 = _mm_andnot_ps(dummy_mask,r00);
1885 /* LENNARD-JONES DISPERSION/REPULSION */
1887 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1888 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1889 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1890 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1891 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1893 d = _mm_sub_ps(r00,rswitch);
1894 d = _mm_max_ps(d,_mm_setzero_ps());
1895 d2 = _mm_mul_ps(d,d);
1896 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1898 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1900 /* Evaluate switch function */
1901 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1902 fvdw = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1903 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
/* NOTE(review): the assignment of fvdw to fscal is not shown here either
 * (original line numbers jump from 1903 to 1907) -- confirm.
 */
1907 fscal = _mm_and_ps(fscal,cutoff_mask);
/* Discard the force factor of padding lanes. */
1909 fscal = _mm_andnot_ps(dummy_mask,fscal);
1911 /* Update vectorial force */
1912 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1913 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1914 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1916 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1917 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1918 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1922 /**************************
1923 * CALCULATE INTERACTIONS *
1924 **************************/
/* Nine reaction-field blocks for sites 1..3, as in the main loop; the only
 * addition is that fscal is also cleared in padding lanes via dummy_mask.
 */
1926 if (gmx_mm_any_lt(rsq11,rcutoff2))
1929 /* REACTION-FIELD ELECTROSTATICS */
1930 felec = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
1932 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1936 fscal = _mm_and_ps(fscal,cutoff_mask);
1938 fscal = _mm_andnot_ps(dummy_mask,fscal);
1940 /* Update vectorial force */
1941 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1942 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1943 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1945 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1946 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1947 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1951 /**************************
1952 * CALCULATE INTERACTIONS *
1953 **************************/
1955 if (gmx_mm_any_lt(rsq12,rcutoff2))
1958 /* REACTION-FIELD ELECTROSTATICS */
1959 felec = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
1961 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1965 fscal = _mm_and_ps(fscal,cutoff_mask);
1967 fscal = _mm_andnot_ps(dummy_mask,fscal);
1969 /* Update vectorial force */
1970 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1971 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1972 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1974 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1975 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1976 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1980 /**************************
1981 * CALCULATE INTERACTIONS *
1982 **************************/
1984 if (gmx_mm_any_lt(rsq13,rcutoff2))
1987 /* REACTION-FIELD ELECTROSTATICS */
1988 felec = _mm_mul_ps(qq13,_mm_msub_ps(rinv13,rinvsq13,krf2));
1990 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1994 fscal = _mm_and_ps(fscal,cutoff_mask);
1996 fscal = _mm_andnot_ps(dummy_mask,fscal);
1998 /* Update vectorial force */
1999 fix1 = _mm_macc_ps(dx13,fscal,fix1);
2000 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
2001 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
2003 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
2004 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
2005 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
2009 /**************************
2010 * CALCULATE INTERACTIONS *
2011 **************************/
2013 if (gmx_mm_any_lt(rsq21,rcutoff2))
2016 /* REACTION-FIELD ELECTROSTATICS */
2017 felec = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
2019 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2023 fscal = _mm_and_ps(fscal,cutoff_mask);
2025 fscal = _mm_andnot_ps(dummy_mask,fscal);
2027 /* Update vectorial force */
2028 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2029 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2030 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2032 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2033 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2034 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2038 /**************************
2039 * CALCULATE INTERACTIONS *
2040 **************************/
2042 if (gmx_mm_any_lt(rsq22,rcutoff2))
2045 /* REACTION-FIELD ELECTROSTATICS */
2046 felec = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
2048 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2052 fscal = _mm_and_ps(fscal,cutoff_mask);
2054 fscal = _mm_andnot_ps(dummy_mask,fscal);
2056 /* Update vectorial force */
2057 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2058 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2059 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2061 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2062 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2063 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2067 /**************************
2068 * CALCULATE INTERACTIONS *
2069 **************************/
2071 if (gmx_mm_any_lt(rsq23,rcutoff2))
2074 /* REACTION-FIELD ELECTROSTATICS */
2075 felec = _mm_mul_ps(qq23,_mm_msub_ps(rinv23,rinvsq23,krf2));
2077 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2081 fscal = _mm_and_ps(fscal,cutoff_mask);
2083 fscal = _mm_andnot_ps(dummy_mask,fscal);
2085 /* Update vectorial force */
2086 fix2 = _mm_macc_ps(dx23,fscal,fix2);
2087 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
2088 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
2090 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
2091 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
2092 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
2096 /**************************
2097 * CALCULATE INTERACTIONS *
2098 **************************/
2100 if (gmx_mm_any_lt(rsq31,rcutoff2))
2103 /* REACTION-FIELD ELECTROSTATICS */
2104 felec = _mm_mul_ps(qq31,_mm_msub_ps(rinv31,rinvsq31,krf2));
2106 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2110 fscal = _mm_and_ps(fscal,cutoff_mask);
2112 fscal = _mm_andnot_ps(dummy_mask,fscal);
2114 /* Update vectorial force */
2115 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2116 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2117 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2119 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2120 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2121 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2125 /**************************
2126 * CALCULATE INTERACTIONS *
2127 **************************/
2129 if (gmx_mm_any_lt(rsq32,rcutoff2))
2132 /* REACTION-FIELD ELECTROSTATICS */
2133 felec = _mm_mul_ps(qq32,_mm_msub_ps(rinv32,rinvsq32,krf2));
2135 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2139 fscal = _mm_and_ps(fscal,cutoff_mask);
2141 fscal = _mm_andnot_ps(dummy_mask,fscal);
2143 /* Update vectorial force */
2144 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2145 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2146 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2148 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2149 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2150 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2154 /**************************
2155 * CALCULATE INTERACTIONS *
2156 **************************/
2158 if (gmx_mm_any_lt(rsq33,rcutoff2))
2161 /* REACTION-FIELD ELECTROSTATICS */
2162 felec = _mm_mul_ps(qq33,_mm_msub_ps(rinv33,rinvsq33,krf2));
2164 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2168 fscal = _mm_and_ps(fscal,cutoff_mask);
2170 fscal = _mm_andnot_ps(dummy_mask,fscal);
2172 /* Update vectorial force */
2173 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2174 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2175 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2177 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2178 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2179 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
/* Padding entries get the zeroed scratch buffer as force target instead of f,
 * so the decrement call below does not touch the force of a real atom for
 * them.
 */
2183 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2184 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2185 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2186 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2188 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2189 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2190 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2192 /* Inner loop uses 360 flops */
2195 /* End of innermost loop */
/* Hand the accumulated i-site forces to the helper together with the force
 * and shift-force locations of this i molecule.
 */
2197 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2198 f+i_coord_offset,fshift+i_shift_offset);
2200 /* Increment number of inner iterations */
2201 inneriter += j_index_end - j_index_start;
2203 /* Outer loop uses 24 flops */
2206 /* Increment number of outer iterations */
2209 /* Update outer/inner flops */
/* NOTE(review): the statements that initialise and increment outeriter, and
 * that initialise inneriter, are not visible in this listing -- confirm.
 */
2211 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*360);