2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_128_fma_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: None
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_128_fma_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
89 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
90 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
92 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
94 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
98 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
101 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
102 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
103 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
104 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
110 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
111 real rswitch_scalar,d_scalar;
112 __m128 dummy_mask,cutoff_mask;
113 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
114 __m128 one = _mm_set1_ps(1.0);
115 __m128 two = _mm_set1_ps(2.0);
121 jindex = nlist->jindex;
123 shiftidx = nlist->shift;
125 shiftvec = fr->shift_vec[0];
126 fshift = fr->fshift[0];
127 facel = _mm_set1_ps(fr->epsfac);
128 charge = mdatoms->chargeA;
130 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
131 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
132 beta2 = _mm_mul_ps(beta,beta);
133 beta3 = _mm_mul_ps(beta,beta2);
134 ewtab = fr->ic->tabq_coul_FDV0;
135 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
136 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
138 /* Setup water-specific parameters */
139 inr = nlist->iinr[0];
140 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
141 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
142 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
144 jq1 = _mm_set1_ps(charge[inr+1]);
145 jq2 = _mm_set1_ps(charge[inr+2]);
146 jq3 = _mm_set1_ps(charge[inr+3]);
147 qq11 = _mm_mul_ps(iq1,jq1);
148 qq12 = _mm_mul_ps(iq1,jq2);
149 qq13 = _mm_mul_ps(iq1,jq3);
150 qq21 = _mm_mul_ps(iq2,jq1);
151 qq22 = _mm_mul_ps(iq2,jq2);
152 qq23 = _mm_mul_ps(iq2,jq3);
153 qq31 = _mm_mul_ps(iq3,jq1);
154 qq32 = _mm_mul_ps(iq3,jq2);
155 qq33 = _mm_mul_ps(iq3,jq3);
157 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
158 rcutoff_scalar = fr->rcoulomb;
159 rcutoff = _mm_set1_ps(rcutoff_scalar);
160 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
162 rswitch_scalar = fr->rcoulomb_switch;
163 rswitch = _mm_set1_ps(rswitch_scalar);
164 /* Setup switch parameters */
165 d_scalar = rcutoff_scalar-rswitch_scalar;
166 d = _mm_set1_ps(d_scalar);
167 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
168 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
169 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
170 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
171 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
172 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
174 /* Avoid stupid compiler warnings */
175 jnrA = jnrB = jnrC = jnrD = 0;
184 for(iidx=0;iidx<4*DIM;iidx++)
189 /* Start outer loop over neighborlists */
190 for(iidx=0; iidx<nri; iidx++)
192 /* Load shift vector for this list */
193 i_shift_offset = DIM*shiftidx[iidx];
195 /* Load limits for loop over neighbors */
196 j_index_start = jindex[iidx];
197 j_index_end = jindex[iidx+1];
199 /* Get outer coordinate index */
201 i_coord_offset = DIM*inr;
203 /* Load i particle coords and add shift vector */
204 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
205 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
207 fix1 = _mm_setzero_ps();
208 fiy1 = _mm_setzero_ps();
209 fiz1 = _mm_setzero_ps();
210 fix2 = _mm_setzero_ps();
211 fiy2 = _mm_setzero_ps();
212 fiz2 = _mm_setzero_ps();
213 fix3 = _mm_setzero_ps();
214 fiy3 = _mm_setzero_ps();
215 fiz3 = _mm_setzero_ps();
217 /* Reset potential sums */
218 velecsum = _mm_setzero_ps();
220 /* Start inner kernel loop */
221 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
224 /* Get j neighbor index, and coordinate index */
229 j_coord_offsetA = DIM*jnrA;
230 j_coord_offsetB = DIM*jnrB;
231 j_coord_offsetC = DIM*jnrC;
232 j_coord_offsetD = DIM*jnrD;
234 /* load j atom coordinates */
235 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
236 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
237 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
239 /* Calculate displacement vector */
240 dx11 = _mm_sub_ps(ix1,jx1);
241 dy11 = _mm_sub_ps(iy1,jy1);
242 dz11 = _mm_sub_ps(iz1,jz1);
243 dx12 = _mm_sub_ps(ix1,jx2);
244 dy12 = _mm_sub_ps(iy1,jy2);
245 dz12 = _mm_sub_ps(iz1,jz2);
246 dx13 = _mm_sub_ps(ix1,jx3);
247 dy13 = _mm_sub_ps(iy1,jy3);
248 dz13 = _mm_sub_ps(iz1,jz3);
249 dx21 = _mm_sub_ps(ix2,jx1);
250 dy21 = _mm_sub_ps(iy2,jy1);
251 dz21 = _mm_sub_ps(iz2,jz1);
252 dx22 = _mm_sub_ps(ix2,jx2);
253 dy22 = _mm_sub_ps(iy2,jy2);
254 dz22 = _mm_sub_ps(iz2,jz2);
255 dx23 = _mm_sub_ps(ix2,jx3);
256 dy23 = _mm_sub_ps(iy2,jy3);
257 dz23 = _mm_sub_ps(iz2,jz3);
258 dx31 = _mm_sub_ps(ix3,jx1);
259 dy31 = _mm_sub_ps(iy3,jy1);
260 dz31 = _mm_sub_ps(iz3,jz1);
261 dx32 = _mm_sub_ps(ix3,jx2);
262 dy32 = _mm_sub_ps(iy3,jy2);
263 dz32 = _mm_sub_ps(iz3,jz2);
264 dx33 = _mm_sub_ps(ix3,jx3);
265 dy33 = _mm_sub_ps(iy3,jy3);
266 dz33 = _mm_sub_ps(iz3,jz3);
268 /* Calculate squared distance and things based on it */
269 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
270 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
271 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
272 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
273 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
274 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
275 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
276 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
277 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
279 rinv11 = gmx_mm_invsqrt_ps(rsq11);
280 rinv12 = gmx_mm_invsqrt_ps(rsq12);
281 rinv13 = gmx_mm_invsqrt_ps(rsq13);
282 rinv21 = gmx_mm_invsqrt_ps(rsq21);
283 rinv22 = gmx_mm_invsqrt_ps(rsq22);
284 rinv23 = gmx_mm_invsqrt_ps(rsq23);
285 rinv31 = gmx_mm_invsqrt_ps(rsq31);
286 rinv32 = gmx_mm_invsqrt_ps(rsq32);
287 rinv33 = gmx_mm_invsqrt_ps(rsq33);
289 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
290 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
291 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
292 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
293 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
294 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
295 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
296 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
297 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
299 fjx1 = _mm_setzero_ps();
300 fjy1 = _mm_setzero_ps();
301 fjz1 = _mm_setzero_ps();
302 fjx2 = _mm_setzero_ps();
303 fjy2 = _mm_setzero_ps();
304 fjz2 = _mm_setzero_ps();
305 fjx3 = _mm_setzero_ps();
306 fjy3 = _mm_setzero_ps();
307 fjz3 = _mm_setzero_ps();
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 if (gmx_mm_any_lt(rsq11,rcutoff2))
316 r11 = _mm_mul_ps(rsq11,rinv11);
318 /* EWALD ELECTROSTATICS */
320 /* Analytical PME correction */
321 zeta2 = _mm_mul_ps(beta2,rsq11);
322 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
323 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
324 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
325 felec = _mm_mul_ps(qq11,felec);
326 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
327 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
328 velec = _mm_mul_ps(qq11,velec);
330 d = _mm_sub_ps(r11,rswitch);
331 d = _mm_max_ps(d,_mm_setzero_ps());
332 d2 = _mm_mul_ps(d,d);
333 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
335 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
337 /* Evaluate switch function */
338 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
339 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
340 velec = _mm_mul_ps(velec,sw);
341 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 velec = _mm_and_ps(velec,cutoff_mask);
345 velecsum = _mm_add_ps(velecsum,velec);
349 fscal = _mm_and_ps(fscal,cutoff_mask);
351 /* Update vectorial force */
352 fix1 = _mm_macc_ps(dx11,fscal,fix1);
353 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
354 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
356 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
357 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
358 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
362 /**************************
363 * CALCULATE INTERACTIONS *
364 **************************/
366 if (gmx_mm_any_lt(rsq12,rcutoff2))
369 r12 = _mm_mul_ps(rsq12,rinv12);
371 /* EWALD ELECTROSTATICS */
373 /* Analytical PME correction */
374 zeta2 = _mm_mul_ps(beta2,rsq12);
375 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
376 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
377 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
378 felec = _mm_mul_ps(qq12,felec);
379 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
380 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
381 velec = _mm_mul_ps(qq12,velec);
383 d = _mm_sub_ps(r12,rswitch);
384 d = _mm_max_ps(d,_mm_setzero_ps());
385 d2 = _mm_mul_ps(d,d);
386 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
388 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
390 /* Evaluate switch function */
391 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
392 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
393 velec = _mm_mul_ps(velec,sw);
394 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
396 /* Update potential sum for this i atom from the interaction with this j atom. */
397 velec = _mm_and_ps(velec,cutoff_mask);
398 velecsum = _mm_add_ps(velecsum,velec);
402 fscal = _mm_and_ps(fscal,cutoff_mask);
404 /* Update vectorial force */
405 fix1 = _mm_macc_ps(dx12,fscal,fix1);
406 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
407 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
409 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
410 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
411 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
415 /**************************
416 * CALCULATE INTERACTIONS *
417 **************************/
419 if (gmx_mm_any_lt(rsq13,rcutoff2))
422 r13 = _mm_mul_ps(rsq13,rinv13);
424 /* EWALD ELECTROSTATICS */
426 /* Analytical PME correction */
427 zeta2 = _mm_mul_ps(beta2,rsq13);
428 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
429 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
430 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
431 felec = _mm_mul_ps(qq13,felec);
432 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
433 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
434 velec = _mm_mul_ps(qq13,velec);
436 d = _mm_sub_ps(r13,rswitch);
437 d = _mm_max_ps(d,_mm_setzero_ps());
438 d2 = _mm_mul_ps(d,d);
439 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
441 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
443 /* Evaluate switch function */
444 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
445 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
446 velec = _mm_mul_ps(velec,sw);
447 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
449 /* Update potential sum for this i atom from the interaction with this j atom. */
450 velec = _mm_and_ps(velec,cutoff_mask);
451 velecsum = _mm_add_ps(velecsum,velec);
455 fscal = _mm_and_ps(fscal,cutoff_mask);
457 /* Update vectorial force */
458 fix1 = _mm_macc_ps(dx13,fscal,fix1);
459 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
460 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
462 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
463 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
464 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
468 /**************************
469 * CALCULATE INTERACTIONS *
470 **************************/
472 if (gmx_mm_any_lt(rsq21,rcutoff2))
475 r21 = _mm_mul_ps(rsq21,rinv21);
477 /* EWALD ELECTROSTATICS */
479 /* Analytical PME correction */
480 zeta2 = _mm_mul_ps(beta2,rsq21);
481 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
482 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
483 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
484 felec = _mm_mul_ps(qq21,felec);
485 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
486 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
487 velec = _mm_mul_ps(qq21,velec);
489 d = _mm_sub_ps(r21,rswitch);
490 d = _mm_max_ps(d,_mm_setzero_ps());
491 d2 = _mm_mul_ps(d,d);
492 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
494 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
496 /* Evaluate switch function */
497 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
498 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
499 velec = _mm_mul_ps(velec,sw);
500 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
502 /* Update potential sum for this i atom from the interaction with this j atom. */
503 velec = _mm_and_ps(velec,cutoff_mask);
504 velecsum = _mm_add_ps(velecsum,velec);
508 fscal = _mm_and_ps(fscal,cutoff_mask);
510 /* Update vectorial force */
511 fix2 = _mm_macc_ps(dx21,fscal,fix2);
512 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
513 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
515 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
516 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
517 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
521 /**************************
522 * CALCULATE INTERACTIONS *
523 **************************/
525 if (gmx_mm_any_lt(rsq22,rcutoff2))
528 r22 = _mm_mul_ps(rsq22,rinv22);
530 /* EWALD ELECTROSTATICS */
532 /* Analytical PME correction */
533 zeta2 = _mm_mul_ps(beta2,rsq22);
534 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
535 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
536 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
537 felec = _mm_mul_ps(qq22,felec);
538 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
539 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
540 velec = _mm_mul_ps(qq22,velec);
542 d = _mm_sub_ps(r22,rswitch);
543 d = _mm_max_ps(d,_mm_setzero_ps());
544 d2 = _mm_mul_ps(d,d);
545 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
547 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
549 /* Evaluate switch function */
550 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
551 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
552 velec = _mm_mul_ps(velec,sw);
553 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
555 /* Update potential sum for this i atom from the interaction with this j atom. */
556 velec = _mm_and_ps(velec,cutoff_mask);
557 velecsum = _mm_add_ps(velecsum,velec);
561 fscal = _mm_and_ps(fscal,cutoff_mask);
563 /* Update vectorial force */
564 fix2 = _mm_macc_ps(dx22,fscal,fix2);
565 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
566 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
568 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
569 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
570 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
574 /**************************
575 * CALCULATE INTERACTIONS *
576 **************************/
578 if (gmx_mm_any_lt(rsq23,rcutoff2))
581 r23 = _mm_mul_ps(rsq23,rinv23);
583 /* EWALD ELECTROSTATICS */
585 /* Analytical PME correction */
586 zeta2 = _mm_mul_ps(beta2,rsq23);
587 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
588 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
589 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
590 felec = _mm_mul_ps(qq23,felec);
591 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
592 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
593 velec = _mm_mul_ps(qq23,velec);
595 d = _mm_sub_ps(r23,rswitch);
596 d = _mm_max_ps(d,_mm_setzero_ps());
597 d2 = _mm_mul_ps(d,d);
598 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
600 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
602 /* Evaluate switch function */
603 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
604 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
605 velec = _mm_mul_ps(velec,sw);
606 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
608 /* Update potential sum for this i atom from the interaction with this j atom. */
609 velec = _mm_and_ps(velec,cutoff_mask);
610 velecsum = _mm_add_ps(velecsum,velec);
614 fscal = _mm_and_ps(fscal,cutoff_mask);
616 /* Update vectorial force */
617 fix2 = _mm_macc_ps(dx23,fscal,fix2);
618 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
619 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
621 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
622 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
623 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
627 /**************************
628 * CALCULATE INTERACTIONS *
629 **************************/
631 if (gmx_mm_any_lt(rsq31,rcutoff2))
634 r31 = _mm_mul_ps(rsq31,rinv31);
636 /* EWALD ELECTROSTATICS */
638 /* Analytical PME correction */
639 zeta2 = _mm_mul_ps(beta2,rsq31);
640 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
641 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
642 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
643 felec = _mm_mul_ps(qq31,felec);
644 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
645 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
646 velec = _mm_mul_ps(qq31,velec);
648 d = _mm_sub_ps(r31,rswitch);
649 d = _mm_max_ps(d,_mm_setzero_ps());
650 d2 = _mm_mul_ps(d,d);
651 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
653 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
655 /* Evaluate switch function */
656 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
657 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
658 velec = _mm_mul_ps(velec,sw);
659 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
661 /* Update potential sum for this i atom from the interaction with this j atom. */
662 velec = _mm_and_ps(velec,cutoff_mask);
663 velecsum = _mm_add_ps(velecsum,velec);
667 fscal = _mm_and_ps(fscal,cutoff_mask);
669 /* Update vectorial force */
670 fix3 = _mm_macc_ps(dx31,fscal,fix3);
671 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
672 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
674 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
675 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
676 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
680 /**************************
681 * CALCULATE INTERACTIONS *
682 **************************/
684 if (gmx_mm_any_lt(rsq32,rcutoff2))
687 r32 = _mm_mul_ps(rsq32,rinv32);
689 /* EWALD ELECTROSTATICS */
691 /* Analytical PME correction */
692 zeta2 = _mm_mul_ps(beta2,rsq32);
693 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
694 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
695 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
696 felec = _mm_mul_ps(qq32,felec);
697 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
698 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
699 velec = _mm_mul_ps(qq32,velec);
701 d = _mm_sub_ps(r32,rswitch);
702 d = _mm_max_ps(d,_mm_setzero_ps());
703 d2 = _mm_mul_ps(d,d);
704 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
706 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
708 /* Evaluate switch function */
709 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
710 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
711 velec = _mm_mul_ps(velec,sw);
712 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
714 /* Update potential sum for this i atom from the interaction with this j atom. */
715 velec = _mm_and_ps(velec,cutoff_mask);
716 velecsum = _mm_add_ps(velecsum,velec);
720 fscal = _mm_and_ps(fscal,cutoff_mask);
722 /* Update vectorial force */
723 fix3 = _mm_macc_ps(dx32,fscal,fix3);
724 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
725 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
727 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
728 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
729 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
733 /**************************
734 * CALCULATE INTERACTIONS *
735 **************************/
737 if (gmx_mm_any_lt(rsq33,rcutoff2))
740 r33 = _mm_mul_ps(rsq33,rinv33);
742 /* EWALD ELECTROSTATICS */
744 /* Analytical PME correction */
745 zeta2 = _mm_mul_ps(beta2,rsq33);
746 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
747 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
748 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
749 felec = _mm_mul_ps(qq33,felec);
750 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
751 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
752 velec = _mm_mul_ps(qq33,velec);
754 d = _mm_sub_ps(r33,rswitch);
755 d = _mm_max_ps(d,_mm_setzero_ps());
756 d2 = _mm_mul_ps(d,d);
757 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
759 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
761 /* Evaluate switch function */
762 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
763 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
764 velec = _mm_mul_ps(velec,sw);
765 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
767 /* Update potential sum for this i atom from the interaction with this j atom. */
768 velec = _mm_and_ps(velec,cutoff_mask);
769 velecsum = _mm_add_ps(velecsum,velec);
773 fscal = _mm_and_ps(fscal,cutoff_mask);
775 /* Update vectorial force */
776 fix3 = _mm_macc_ps(dx33,fscal,fix3);
777 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
778 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
780 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
781 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
782 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
786 fjptrA = f+j_coord_offsetA;
787 fjptrB = f+j_coord_offsetB;
788 fjptrC = f+j_coord_offsetC;
789 fjptrD = f+j_coord_offsetD;
791 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
792 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
794 /* Inner loop uses 477 flops */
800 /* Get j neighbor index, and coordinate index */
801 jnrlistA = jjnr[jidx];
802 jnrlistB = jjnr[jidx+1];
803 jnrlistC = jjnr[jidx+2];
804 jnrlistD = jjnr[jidx+3];
805 /* Sign of each element will be negative for non-real atoms.
806 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
807 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
809 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
810 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
811 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
812 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
813 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
814 j_coord_offsetA = DIM*jnrA;
815 j_coord_offsetB = DIM*jnrB;
816 j_coord_offsetC = DIM*jnrC;
817 j_coord_offsetD = DIM*jnrD;
819 /* load j atom coordinates */
820 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
821 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
822 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
824 /* Calculate displacement vector */
825 dx11 = _mm_sub_ps(ix1,jx1);
826 dy11 = _mm_sub_ps(iy1,jy1);
827 dz11 = _mm_sub_ps(iz1,jz1);
828 dx12 = _mm_sub_ps(ix1,jx2);
829 dy12 = _mm_sub_ps(iy1,jy2);
830 dz12 = _mm_sub_ps(iz1,jz2);
831 dx13 = _mm_sub_ps(ix1,jx3);
832 dy13 = _mm_sub_ps(iy1,jy3);
833 dz13 = _mm_sub_ps(iz1,jz3);
834 dx21 = _mm_sub_ps(ix2,jx1);
835 dy21 = _mm_sub_ps(iy2,jy1);
836 dz21 = _mm_sub_ps(iz2,jz1);
837 dx22 = _mm_sub_ps(ix2,jx2);
838 dy22 = _mm_sub_ps(iy2,jy2);
839 dz22 = _mm_sub_ps(iz2,jz2);
840 dx23 = _mm_sub_ps(ix2,jx3);
841 dy23 = _mm_sub_ps(iy2,jy3);
842 dz23 = _mm_sub_ps(iz2,jz3);
843 dx31 = _mm_sub_ps(ix3,jx1);
844 dy31 = _mm_sub_ps(iy3,jy1);
845 dz31 = _mm_sub_ps(iz3,jz1);
846 dx32 = _mm_sub_ps(ix3,jx2);
847 dy32 = _mm_sub_ps(iy3,jy2);
848 dz32 = _mm_sub_ps(iz3,jz2);
849 dx33 = _mm_sub_ps(ix3,jx3);
850 dy33 = _mm_sub_ps(iy3,jy3);
851 dz33 = _mm_sub_ps(iz3,jz3);
853 /* Calculate squared distance and things based on it */
854 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
855 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
856 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
857 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
858 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
859 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
860 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
861 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
862 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
864 rinv11 = gmx_mm_invsqrt_ps(rsq11);
865 rinv12 = gmx_mm_invsqrt_ps(rsq12);
866 rinv13 = gmx_mm_invsqrt_ps(rsq13);
867 rinv21 = gmx_mm_invsqrt_ps(rsq21);
868 rinv22 = gmx_mm_invsqrt_ps(rsq22);
869 rinv23 = gmx_mm_invsqrt_ps(rsq23);
870 rinv31 = gmx_mm_invsqrt_ps(rsq31);
871 rinv32 = gmx_mm_invsqrt_ps(rsq32);
872 rinv33 = gmx_mm_invsqrt_ps(rsq33);
874 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
875 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
876 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
877 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
878 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
879 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
880 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
881 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
882 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
884 fjx1 = _mm_setzero_ps();
885 fjy1 = _mm_setzero_ps();
886 fjz1 = _mm_setzero_ps();
887 fjx2 = _mm_setzero_ps();
888 fjy2 = _mm_setzero_ps();
889 fjz2 = _mm_setzero_ps();
890 fjx3 = _mm_setzero_ps();
891 fjy3 = _mm_setzero_ps();
892 fjz3 = _mm_setzero_ps();
894 /**************************
895 * CALCULATE INTERACTIONS *
896 **************************/
898 if (gmx_mm_any_lt(rsq11,rcutoff2))
901 r11 = _mm_mul_ps(rsq11,rinv11);
902 r11 = _mm_andnot_ps(dummy_mask,r11);
904 /* EWALD ELECTROSTATICS */
906 /* Analytical PME correction */
907 zeta2 = _mm_mul_ps(beta2,rsq11);
908 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
909 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
910 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
911 felec = _mm_mul_ps(qq11,felec);
912 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
913 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
914 velec = _mm_mul_ps(qq11,velec);
916 d = _mm_sub_ps(r11,rswitch);
917 d = _mm_max_ps(d,_mm_setzero_ps());
918 d2 = _mm_mul_ps(d,d);
919 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
921 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
923 /* Evaluate switch function */
924 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
925 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
926 velec = _mm_mul_ps(velec,sw);
927 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
929 /* Update potential sum for this i atom from the interaction with this j atom. */
930 velec = _mm_and_ps(velec,cutoff_mask);
931 velec = _mm_andnot_ps(dummy_mask,velec);
932 velecsum = _mm_add_ps(velecsum,velec);
936 fscal = _mm_and_ps(fscal,cutoff_mask);
938 fscal = _mm_andnot_ps(dummy_mask,fscal);
940 /* Update vectorial force */
941 fix1 = _mm_macc_ps(dx11,fscal,fix1);
942 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
943 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
945 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
946 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
947 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
951 /**************************
952 * CALCULATE INTERACTIONS *
953 **************************/
955 if (gmx_mm_any_lt(rsq12,rcutoff2))
958 r12 = _mm_mul_ps(rsq12,rinv12);
959 r12 = _mm_andnot_ps(dummy_mask,r12);
961 /* EWALD ELECTROSTATICS */
963 /* Analytical PME correction */
964 zeta2 = _mm_mul_ps(beta2,rsq12);
965 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
966 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
967 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
968 felec = _mm_mul_ps(qq12,felec);
969 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
970 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
971 velec = _mm_mul_ps(qq12,velec);
973 d = _mm_sub_ps(r12,rswitch);
974 d = _mm_max_ps(d,_mm_setzero_ps());
975 d2 = _mm_mul_ps(d,d);
976 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
978 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
980 /* Evaluate switch function */
981 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
982 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
983 velec = _mm_mul_ps(velec,sw);
984 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
986 /* Update potential sum for this i atom from the interaction with this j atom. */
987 velec = _mm_and_ps(velec,cutoff_mask);
988 velec = _mm_andnot_ps(dummy_mask,velec);
989 velecsum = _mm_add_ps(velecsum,velec);
993 fscal = _mm_and_ps(fscal,cutoff_mask);
995 fscal = _mm_andnot_ps(dummy_mask,fscal);
997 /* Update vectorial force */
998 fix1 = _mm_macc_ps(dx12,fscal,fix1);
999 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1000 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1002 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1003 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1004 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1008 /**************************
1009 * CALCULATE INTERACTIONS *
1010 **************************/
1012 if (gmx_mm_any_lt(rsq13,rcutoff2))
1015 r13 = _mm_mul_ps(rsq13,rinv13);
1016 r13 = _mm_andnot_ps(dummy_mask,r13);
1018 /* EWALD ELECTROSTATICS */
1020 /* Analytical PME correction */
1021 zeta2 = _mm_mul_ps(beta2,rsq13);
1022 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
1023 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1024 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1025 felec = _mm_mul_ps(qq13,felec);
1026 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1027 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
1028 velec = _mm_mul_ps(qq13,velec);
1030 d = _mm_sub_ps(r13,rswitch);
1031 d = _mm_max_ps(d,_mm_setzero_ps());
1032 d2 = _mm_mul_ps(d,d);
1033 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1035 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1037 /* Evaluate switch function */
1038 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1039 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1040 velec = _mm_mul_ps(velec,sw);
1041 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1043 /* Update potential sum for this i atom from the interaction with this j atom. */
1044 velec = _mm_and_ps(velec,cutoff_mask);
1045 velec = _mm_andnot_ps(dummy_mask,velec);
1046 velecsum = _mm_add_ps(velecsum,velec);
1050 fscal = _mm_and_ps(fscal,cutoff_mask);
1052 fscal = _mm_andnot_ps(dummy_mask,fscal);
1054 /* Update vectorial force */
1055 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1056 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1057 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1059 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1060 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1061 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1065 /**************************
1066 * CALCULATE INTERACTIONS *
1067 **************************/
1069 if (gmx_mm_any_lt(rsq21,rcutoff2))
1072 r21 = _mm_mul_ps(rsq21,rinv21);
1073 r21 = _mm_andnot_ps(dummy_mask,r21);
1075 /* EWALD ELECTROSTATICS */
1077 /* Analytical PME correction */
1078 zeta2 = _mm_mul_ps(beta2,rsq21);
1079 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1080 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1081 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1082 felec = _mm_mul_ps(qq21,felec);
1083 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1084 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1085 velec = _mm_mul_ps(qq21,velec);
1087 d = _mm_sub_ps(r21,rswitch);
1088 d = _mm_max_ps(d,_mm_setzero_ps());
1089 d2 = _mm_mul_ps(d,d);
1090 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1092 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1094 /* Evaluate switch function */
1095 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1096 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1097 velec = _mm_mul_ps(velec,sw);
1098 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1100 /* Update potential sum for this i atom from the interaction with this j atom. */
1101 velec = _mm_and_ps(velec,cutoff_mask);
1102 velec = _mm_andnot_ps(dummy_mask,velec);
1103 velecsum = _mm_add_ps(velecsum,velec);
1107 fscal = _mm_and_ps(fscal,cutoff_mask);
1109 fscal = _mm_andnot_ps(dummy_mask,fscal);
1111 /* Update vectorial force */
1112 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1113 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1114 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1116 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1117 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1118 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1122 /**************************
1123 * CALCULATE INTERACTIONS *
1124 **************************/
1126 if (gmx_mm_any_lt(rsq22,rcutoff2))
1129 r22 = _mm_mul_ps(rsq22,rinv22);
1130 r22 = _mm_andnot_ps(dummy_mask,r22);
1132 /* EWALD ELECTROSTATICS */
1134 /* Analytical PME correction */
1135 zeta2 = _mm_mul_ps(beta2,rsq22);
1136 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1137 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1138 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1139 felec = _mm_mul_ps(qq22,felec);
1140 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1141 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1142 velec = _mm_mul_ps(qq22,velec);
1144 d = _mm_sub_ps(r22,rswitch);
1145 d = _mm_max_ps(d,_mm_setzero_ps());
1146 d2 = _mm_mul_ps(d,d);
1147 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1149 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1151 /* Evaluate switch function */
1152 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1153 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1154 velec = _mm_mul_ps(velec,sw);
1155 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1157 /* Update potential sum for this i atom from the interaction with this j atom. */
1158 velec = _mm_and_ps(velec,cutoff_mask);
1159 velec = _mm_andnot_ps(dummy_mask,velec);
1160 velecsum = _mm_add_ps(velecsum,velec);
1164 fscal = _mm_and_ps(fscal,cutoff_mask);
1166 fscal = _mm_andnot_ps(dummy_mask,fscal);
1168 /* Update vectorial force */
1169 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1170 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1171 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1173 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1174 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1175 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1179 /**************************
1180 * CALCULATE INTERACTIONS *
1181 **************************/
1183 if (gmx_mm_any_lt(rsq23,rcutoff2))
1186 r23 = _mm_mul_ps(rsq23,rinv23);
1187 r23 = _mm_andnot_ps(dummy_mask,r23);
1189 /* EWALD ELECTROSTATICS */
1191 /* Analytical PME correction */
1192 zeta2 = _mm_mul_ps(beta2,rsq23);
1193 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
1194 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1195 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1196 felec = _mm_mul_ps(qq23,felec);
1197 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1198 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
1199 velec = _mm_mul_ps(qq23,velec);
1201 d = _mm_sub_ps(r23,rswitch);
1202 d = _mm_max_ps(d,_mm_setzero_ps());
1203 d2 = _mm_mul_ps(d,d);
1204 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1206 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1208 /* Evaluate switch function */
1209 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1210 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1211 velec = _mm_mul_ps(velec,sw);
1212 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1214 /* Update potential sum for this i atom from the interaction with this j atom. */
1215 velec = _mm_and_ps(velec,cutoff_mask);
1216 velec = _mm_andnot_ps(dummy_mask,velec);
1217 velecsum = _mm_add_ps(velecsum,velec);
1221 fscal = _mm_and_ps(fscal,cutoff_mask);
1223 fscal = _mm_andnot_ps(dummy_mask,fscal);
1225 /* Update vectorial force */
1226 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1227 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1228 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1230 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1231 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1232 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1236 /**************************
1237 * CALCULATE INTERACTIONS *
1238 **************************/
1240 if (gmx_mm_any_lt(rsq31,rcutoff2))
1243 r31 = _mm_mul_ps(rsq31,rinv31);
1244 r31 = _mm_andnot_ps(dummy_mask,r31);
1246 /* EWALD ELECTROSTATICS */
1248 /* Analytical PME correction */
1249 zeta2 = _mm_mul_ps(beta2,rsq31);
1250 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
1251 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1252 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1253 felec = _mm_mul_ps(qq31,felec);
1254 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1255 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
1256 velec = _mm_mul_ps(qq31,velec);
1258 d = _mm_sub_ps(r31,rswitch);
1259 d = _mm_max_ps(d,_mm_setzero_ps());
1260 d2 = _mm_mul_ps(d,d);
1261 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1263 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1265 /* Evaluate switch function */
1266 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1267 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1268 velec = _mm_mul_ps(velec,sw);
1269 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1271 /* Update potential sum for this i atom from the interaction with this j atom. */
1272 velec = _mm_and_ps(velec,cutoff_mask);
1273 velec = _mm_andnot_ps(dummy_mask,velec);
1274 velecsum = _mm_add_ps(velecsum,velec);
1278 fscal = _mm_and_ps(fscal,cutoff_mask);
1280 fscal = _mm_andnot_ps(dummy_mask,fscal);
1282 /* Update vectorial force */
1283 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1284 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1285 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1287 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1288 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1289 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1293 /**************************
1294 * CALCULATE INTERACTIONS *
1295 **************************/
1297 if (gmx_mm_any_lt(rsq32,rcutoff2))
1300 r32 = _mm_mul_ps(rsq32,rinv32);
1301 r32 = _mm_andnot_ps(dummy_mask,r32);
1303 /* EWALD ELECTROSTATICS */
1305 /* Analytical PME correction */
1306 zeta2 = _mm_mul_ps(beta2,rsq32);
1307 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
1308 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1309 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1310 felec = _mm_mul_ps(qq32,felec);
1311 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1312 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
1313 velec = _mm_mul_ps(qq32,velec);
1315 d = _mm_sub_ps(r32,rswitch);
1316 d = _mm_max_ps(d,_mm_setzero_ps());
1317 d2 = _mm_mul_ps(d,d);
1318 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1320 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1322 /* Evaluate switch function */
1323 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1324 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1325 velec = _mm_mul_ps(velec,sw);
1326 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1328 /* Update potential sum for this i atom from the interaction with this j atom. */
1329 velec = _mm_and_ps(velec,cutoff_mask);
1330 velec = _mm_andnot_ps(dummy_mask,velec);
1331 velecsum = _mm_add_ps(velecsum,velec);
1335 fscal = _mm_and_ps(fscal,cutoff_mask);
1337 fscal = _mm_andnot_ps(dummy_mask,fscal);
1339 /* Update vectorial force */
1340 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1341 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1342 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1344 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1345 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1346 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1350 /**************************
1351 * CALCULATE INTERACTIONS *
1352 **************************/
1354 if (gmx_mm_any_lt(rsq33,rcutoff2))
1357 r33 = _mm_mul_ps(rsq33,rinv33);
1358 r33 = _mm_andnot_ps(dummy_mask,r33);
1360 /* EWALD ELECTROSTATICS */
1362 /* Analytical PME correction */
1363 zeta2 = _mm_mul_ps(beta2,rsq33);
1364 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
1365 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1366 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1367 felec = _mm_mul_ps(qq33,felec);
1368 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1369 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
1370 velec = _mm_mul_ps(qq33,velec);
1372 d = _mm_sub_ps(r33,rswitch);
1373 d = _mm_max_ps(d,_mm_setzero_ps());
1374 d2 = _mm_mul_ps(d,d);
1375 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1377 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1379 /* Evaluate switch function */
1380 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1381 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1382 velec = _mm_mul_ps(velec,sw);
1383 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1385 /* Update potential sum for this i atom from the interaction with this j atom. */
1386 velec = _mm_and_ps(velec,cutoff_mask);
1387 velec = _mm_andnot_ps(dummy_mask,velec);
1388 velecsum = _mm_add_ps(velecsum,velec);
1392 fscal = _mm_and_ps(fscal,cutoff_mask);
1394 fscal = _mm_andnot_ps(dummy_mask,fscal);
1396 /* Update vectorial force */
1397 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1398 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1399 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1401 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1402 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1403 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1407 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1408 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1409 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1410 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1412 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1413 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1415 /* Inner loop uses 486 flops */
1418 /* End of innermost loop */
1420 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1421 f+i_coord_offset+DIM,fshift+i_shift_offset);
1424 /* Update potential energies */
1425 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1427 /* Increment number of inner iterations */
1428 inneriter += j_index_end - j_index_start;
1430 /* Outer loop uses 19 flops */
1433 /* Increment number of outer iterations */
1436 /* Update outer/inner flops */
1438 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*486);
1441 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_128_fma_single
1442 * Electrostatics interaction: Ewald
1443 * VdW interaction: None
1444 * Geometry: Water4-Water4
1445 * Calculate force/pot: Force
1448 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_128_fma_single
1449 (t_nblist * gmx_restrict nlist,
1450 rvec * gmx_restrict xx,
1451 rvec * gmx_restrict ff,
1452 t_forcerec * gmx_restrict fr,
1453 t_mdatoms * gmx_restrict mdatoms,
1454 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1455 t_nrnb * gmx_restrict nrnb)
1457 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1458 * just 0 for non-waters.
1459 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1460 * jnr indices corresponding to data put in the four positions in the SIMD register.
1462 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1463 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1464 int jnrA,jnrB,jnrC,jnrD;
1465 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1466 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1467 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1468 real rcutoff_scalar;
1469 real *shiftvec,*fshift,*x,*f;
1470 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1471 real scratch[4*DIM];
1472 __m128 fscal,rcutoff,rcutoff2,jidxall;
1474 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1476 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1478 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1479 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1480 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1481 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1482 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1483 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1484 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1485 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1486 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1487 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1488 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1489 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1490 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1491 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1492 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1493 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1494 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1497 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1498 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1500 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1501 real rswitch_scalar,d_scalar;
1502 __m128 dummy_mask,cutoff_mask;
1503 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1504 __m128 one = _mm_set1_ps(1.0);
1505 __m128 two = _mm_set1_ps(2.0);
1511 jindex = nlist->jindex;
1513 shiftidx = nlist->shift;
1515 shiftvec = fr->shift_vec[0];
1516 fshift = fr->fshift[0];
1517 facel = _mm_set1_ps(fr->epsfac);
1518 charge = mdatoms->chargeA;
1520 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1521 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1522 beta2 = _mm_mul_ps(beta,beta);
1523 beta3 = _mm_mul_ps(beta,beta2);
1524 ewtab = fr->ic->tabq_coul_FDV0;
1525 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1526 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1528 /* Setup water-specific parameters */
1529 inr = nlist->iinr[0];
1530 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1531 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1532 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1534 jq1 = _mm_set1_ps(charge[inr+1]);
1535 jq2 = _mm_set1_ps(charge[inr+2]);
1536 jq3 = _mm_set1_ps(charge[inr+3]);
1537 qq11 = _mm_mul_ps(iq1,jq1);
1538 qq12 = _mm_mul_ps(iq1,jq2);
1539 qq13 = _mm_mul_ps(iq1,jq3);
1540 qq21 = _mm_mul_ps(iq2,jq1);
1541 qq22 = _mm_mul_ps(iq2,jq2);
1542 qq23 = _mm_mul_ps(iq2,jq3);
1543 qq31 = _mm_mul_ps(iq3,jq1);
1544 qq32 = _mm_mul_ps(iq3,jq2);
1545 qq33 = _mm_mul_ps(iq3,jq3);
1547 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1548 rcutoff_scalar = fr->rcoulomb;
1549 rcutoff = _mm_set1_ps(rcutoff_scalar);
1550 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1552 rswitch_scalar = fr->rcoulomb_switch;
1553 rswitch = _mm_set1_ps(rswitch_scalar);
1554 /* Setup switch parameters */
1555 d_scalar = rcutoff_scalar-rswitch_scalar;
1556 d = _mm_set1_ps(d_scalar);
1557 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1558 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1559 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1560 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1561 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1562 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1564 /* Avoid stupid compiler warnings */
1565 jnrA = jnrB = jnrC = jnrD = 0;
1566 j_coord_offsetA = 0;
1567 j_coord_offsetB = 0;
1568 j_coord_offsetC = 0;
1569 j_coord_offsetD = 0;
1574 for(iidx=0;iidx<4*DIM;iidx++)
1576 scratch[iidx] = 0.0;
1579 /* Start outer loop over neighborlists */
1580 for(iidx=0; iidx<nri; iidx++)
1582 /* Load shift vector for this list */
1583 i_shift_offset = DIM*shiftidx[iidx];
1585 /* Load limits for loop over neighbors */
1586 j_index_start = jindex[iidx];
1587 j_index_end = jindex[iidx+1];
1589 /* Get outer coordinate index */
1591 i_coord_offset = DIM*inr;
1593 /* Load i particle coords and add shift vector */
1594 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1595 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1597 fix1 = _mm_setzero_ps();
1598 fiy1 = _mm_setzero_ps();
1599 fiz1 = _mm_setzero_ps();
1600 fix2 = _mm_setzero_ps();
1601 fiy2 = _mm_setzero_ps();
1602 fiz2 = _mm_setzero_ps();
1603 fix3 = _mm_setzero_ps();
1604 fiy3 = _mm_setzero_ps();
1605 fiz3 = _mm_setzero_ps();
1607 /* Start inner kernel loop */
1608 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1611 /* Get j neighbor index, and coordinate index */
1613 jnrB = jjnr[jidx+1];
1614 jnrC = jjnr[jidx+2];
1615 jnrD = jjnr[jidx+3];
1616 j_coord_offsetA = DIM*jnrA;
1617 j_coord_offsetB = DIM*jnrB;
1618 j_coord_offsetC = DIM*jnrC;
1619 j_coord_offsetD = DIM*jnrD;
1621 /* load j atom coordinates */
1622 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1623 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1624 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1626 /* Calculate displacement vector */
1627 dx11 = _mm_sub_ps(ix1,jx1);
1628 dy11 = _mm_sub_ps(iy1,jy1);
1629 dz11 = _mm_sub_ps(iz1,jz1);
1630 dx12 = _mm_sub_ps(ix1,jx2);
1631 dy12 = _mm_sub_ps(iy1,jy2);
1632 dz12 = _mm_sub_ps(iz1,jz2);
1633 dx13 = _mm_sub_ps(ix1,jx3);
1634 dy13 = _mm_sub_ps(iy1,jy3);
1635 dz13 = _mm_sub_ps(iz1,jz3);
1636 dx21 = _mm_sub_ps(ix2,jx1);
1637 dy21 = _mm_sub_ps(iy2,jy1);
1638 dz21 = _mm_sub_ps(iz2,jz1);
1639 dx22 = _mm_sub_ps(ix2,jx2);
1640 dy22 = _mm_sub_ps(iy2,jy2);
1641 dz22 = _mm_sub_ps(iz2,jz2);
1642 dx23 = _mm_sub_ps(ix2,jx3);
1643 dy23 = _mm_sub_ps(iy2,jy3);
1644 dz23 = _mm_sub_ps(iz2,jz3);
1645 dx31 = _mm_sub_ps(ix3,jx1);
1646 dy31 = _mm_sub_ps(iy3,jy1);
1647 dz31 = _mm_sub_ps(iz3,jz1);
1648 dx32 = _mm_sub_ps(ix3,jx2);
1649 dy32 = _mm_sub_ps(iy3,jy2);
1650 dz32 = _mm_sub_ps(iz3,jz2);
1651 dx33 = _mm_sub_ps(ix3,jx3);
1652 dy33 = _mm_sub_ps(iy3,jy3);
1653 dz33 = _mm_sub_ps(iz3,jz3);
1655 /* Calculate squared distance and things based on it */
1656 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1657 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1658 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1659 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1660 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1661 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1662 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1663 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1664 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1666 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1667 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1668 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1669 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1670 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1671 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1672 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1673 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1674 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1676 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1677 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1678 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1679 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1680 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1681 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1682 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1683 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1684 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1686 fjx1 = _mm_setzero_ps();
1687 fjy1 = _mm_setzero_ps();
1688 fjz1 = _mm_setzero_ps();
1689 fjx2 = _mm_setzero_ps();
1690 fjy2 = _mm_setzero_ps();
1691 fjz2 = _mm_setzero_ps();
1692 fjx3 = _mm_setzero_ps();
1693 fjy3 = _mm_setzero_ps();
1694 fjz3 = _mm_setzero_ps();
1696 /**************************
1697 * CALCULATE INTERACTIONS *
1698 **************************/
1700 if (gmx_mm_any_lt(rsq11,rcutoff2))
1703 r11 = _mm_mul_ps(rsq11,rinv11);
1705 /* EWALD ELECTROSTATICS */
1707 /* Analytical PME correction */
1708 zeta2 = _mm_mul_ps(beta2,rsq11);
1709 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1710 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1711 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1712 felec = _mm_mul_ps(qq11,felec);
1713 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1714 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1715 velec = _mm_mul_ps(qq11,velec);
1717 d = _mm_sub_ps(r11,rswitch);
1718 d = _mm_max_ps(d,_mm_setzero_ps());
1719 d2 = _mm_mul_ps(d,d);
1720 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1722 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1724 /* Evaluate switch function */
1725 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1726 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1727 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1731 fscal = _mm_and_ps(fscal,cutoff_mask);
1733 /* Update vectorial force */
1734 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1735 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1736 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1738 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1739 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1740 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1744 /**************************
1745 * CALCULATE INTERACTIONS *
1746 **************************/
1748 if (gmx_mm_any_lt(rsq12,rcutoff2))
1751 r12 = _mm_mul_ps(rsq12,rinv12);
1753 /* EWALD ELECTROSTATICS */
1755 /* Analytical PME correction */
1756 zeta2 = _mm_mul_ps(beta2,rsq12);
1757 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1758 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1759 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1760 felec = _mm_mul_ps(qq12,felec);
1761 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1762 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1763 velec = _mm_mul_ps(qq12,velec);
1765 d = _mm_sub_ps(r12,rswitch);
1766 d = _mm_max_ps(d,_mm_setzero_ps());
1767 d2 = _mm_mul_ps(d,d);
1768 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1770 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1772 /* Evaluate switch function */
1773 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1774 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1775 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1779 fscal = _mm_and_ps(fscal,cutoff_mask);
1781 /* Update vectorial force */
1782 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1783 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1784 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1786 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1787 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1788 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1792 /**************************
1793 * CALCULATE INTERACTIONS *
1794 **************************/
1796 if (gmx_mm_any_lt(rsq13,rcutoff2))
1799 r13 = _mm_mul_ps(rsq13,rinv13);
1801 /* EWALD ELECTROSTATICS */
1803 /* Analytical PME correction */
1804 zeta2 = _mm_mul_ps(beta2,rsq13);
1805 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
1806 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1807 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1808 felec = _mm_mul_ps(qq13,felec);
1809 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1810 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
1811 velec = _mm_mul_ps(qq13,velec);
1813 d = _mm_sub_ps(r13,rswitch);
1814 d = _mm_max_ps(d,_mm_setzero_ps());
1815 d2 = _mm_mul_ps(d,d);
1816 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1818 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1820 /* Evaluate switch function */
1821 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1822 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1823 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1827 fscal = _mm_and_ps(fscal,cutoff_mask);
1829 /* Update vectorial force */
1830 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1831 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1832 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1834 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1835 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1836 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1840 /**************************
1841 * CALCULATE INTERACTIONS *
1842 **************************/
1844 if (gmx_mm_any_lt(rsq21,rcutoff2))
1847 r21 = _mm_mul_ps(rsq21,rinv21);
1849 /* EWALD ELECTROSTATICS */
1851 /* Analytical PME correction */
1852 zeta2 = _mm_mul_ps(beta2,rsq21);
1853 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1854 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1855 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1856 felec = _mm_mul_ps(qq21,felec);
1857 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1858 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1859 velec = _mm_mul_ps(qq21,velec);
1861 d = _mm_sub_ps(r21,rswitch);
1862 d = _mm_max_ps(d,_mm_setzero_ps());
1863 d2 = _mm_mul_ps(d,d);
1864 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1866 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1868 /* Evaluate switch function */
1869 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1870 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1871 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1875 fscal = _mm_and_ps(fscal,cutoff_mask);
1877 /* Update vectorial force */
1878 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1879 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1880 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1882 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1883 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1884 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1888 /**************************
1889 * CALCULATE INTERACTIONS *
1890 **************************/
1892 if (gmx_mm_any_lt(rsq22,rcutoff2))
1895 r22 = _mm_mul_ps(rsq22,rinv22);
1897 /* EWALD ELECTROSTATICS */
1899 /* Analytical PME correction */
1900 zeta2 = _mm_mul_ps(beta2,rsq22);
1901 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1902 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1903 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1904 felec = _mm_mul_ps(qq22,felec);
1905 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1906 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1907 velec = _mm_mul_ps(qq22,velec);
1909 d = _mm_sub_ps(r22,rswitch);
1910 d = _mm_max_ps(d,_mm_setzero_ps());
1911 d2 = _mm_mul_ps(d,d);
1912 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1914 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1916 /* Evaluate switch function */
1917 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1918 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1919 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1923 fscal = _mm_and_ps(fscal,cutoff_mask);
1925 /* Update vectorial force */
1926 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1927 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1928 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1930 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1931 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1932 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1936 /**************************
1937 * CALCULATE INTERACTIONS *
1938 **************************/
1940 if (gmx_mm_any_lt(rsq23,rcutoff2))
1943 r23 = _mm_mul_ps(rsq23,rinv23);
1945 /* EWALD ELECTROSTATICS */
1947 /* Analytical PME correction */
1948 zeta2 = _mm_mul_ps(beta2,rsq23);
1949 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
1950 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1951 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1952 felec = _mm_mul_ps(qq23,felec);
1953 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1954 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
1955 velec = _mm_mul_ps(qq23,velec);
1957 d = _mm_sub_ps(r23,rswitch);
1958 d = _mm_max_ps(d,_mm_setzero_ps());
1959 d2 = _mm_mul_ps(d,d);
1960 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1962 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1964 /* Evaluate switch function */
1965 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1966 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1967 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1971 fscal = _mm_and_ps(fscal,cutoff_mask);
1973 /* Update vectorial force */
1974 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1975 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1976 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1978 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1979 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1980 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 if (gmx_mm_any_lt(rsq31,rcutoff2))
1991 r31 = _mm_mul_ps(rsq31,rinv31);
1993 /* EWALD ELECTROSTATICS */
1995 /* Analytical PME correction */
1996 zeta2 = _mm_mul_ps(beta2,rsq31);
1997 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
1998 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1999 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2000 felec = _mm_mul_ps(qq31,felec);
2001 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2002 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
2003 velec = _mm_mul_ps(qq31,velec);
2005 d = _mm_sub_ps(r31,rswitch);
2006 d = _mm_max_ps(d,_mm_setzero_ps());
2007 d2 = _mm_mul_ps(d,d);
2008 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2010 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2012 /* Evaluate switch function */
2013 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2014 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2015 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2019 fscal = _mm_and_ps(fscal,cutoff_mask);
2021 /* Update vectorial force */
2022 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2023 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2024 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2026 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2027 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2028 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2032 /**************************
2033 * CALCULATE INTERACTIONS *
2034 **************************/
2036 if (gmx_mm_any_lt(rsq32,rcutoff2))
2039 r32 = _mm_mul_ps(rsq32,rinv32);
2041 /* EWALD ELECTROSTATICS */
2043 /* Analytical PME correction */
2044 zeta2 = _mm_mul_ps(beta2,rsq32);
2045 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
2046 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2047 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2048 felec = _mm_mul_ps(qq32,felec);
2049 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2050 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
2051 velec = _mm_mul_ps(qq32,velec);
2053 d = _mm_sub_ps(r32,rswitch);
2054 d = _mm_max_ps(d,_mm_setzero_ps());
2055 d2 = _mm_mul_ps(d,d);
2056 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2058 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2060 /* Evaluate switch function */
2061 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2062 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2063 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2067 fscal = _mm_and_ps(fscal,cutoff_mask);
2069 /* Update vectorial force */
2070 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2071 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2072 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2074 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2075 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2076 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2080 /**************************
2081 * CALCULATE INTERACTIONS *
2082 **************************/
2084 if (gmx_mm_any_lt(rsq33,rcutoff2))
2087 r33 = _mm_mul_ps(rsq33,rinv33);
2089 /* EWALD ELECTROSTATICS */
2091 /* Analytical PME correction */
2092 zeta2 = _mm_mul_ps(beta2,rsq33);
2093 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
2094 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2095 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2096 felec = _mm_mul_ps(qq33,felec);
2097 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2098 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
2099 velec = _mm_mul_ps(qq33,velec);
2101 d = _mm_sub_ps(r33,rswitch);
2102 d = _mm_max_ps(d,_mm_setzero_ps());
2103 d2 = _mm_mul_ps(d,d);
2104 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2106 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2108 /* Evaluate switch function */
2109 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2110 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2111 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2115 fscal = _mm_and_ps(fscal,cutoff_mask);
2117 /* Update vectorial force */
2118 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2119 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2120 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2122 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2123 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2124 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2128 fjptrA = f+j_coord_offsetA;
2129 fjptrB = f+j_coord_offsetB;
2130 fjptrC = f+j_coord_offsetC;
2131 fjptrD = f+j_coord_offsetD;
2133 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2134 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2136 /* Inner loop uses 450 flops */
2139 if(jidx<j_index_end)
2142 /* Get j neighbor index, and coordinate index */
2143 jnrlistA = jjnr[jidx];
2144 jnrlistB = jjnr[jidx+1];
2145 jnrlistC = jjnr[jidx+2];
2146 jnrlistD = jjnr[jidx+3];
2147 /* Sign of each element will be negative for non-real atoms.
2148 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2149 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2151 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2152 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2153 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2154 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2155 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2156 j_coord_offsetA = DIM*jnrA;
2157 j_coord_offsetB = DIM*jnrB;
2158 j_coord_offsetC = DIM*jnrC;
2159 j_coord_offsetD = DIM*jnrD;
2161 /* load j atom coordinates */
2162 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2163 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2164 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2166 /* Calculate displacement vector */
2167 dx11 = _mm_sub_ps(ix1,jx1);
2168 dy11 = _mm_sub_ps(iy1,jy1);
2169 dz11 = _mm_sub_ps(iz1,jz1);
2170 dx12 = _mm_sub_ps(ix1,jx2);
2171 dy12 = _mm_sub_ps(iy1,jy2);
2172 dz12 = _mm_sub_ps(iz1,jz2);
2173 dx13 = _mm_sub_ps(ix1,jx3);
2174 dy13 = _mm_sub_ps(iy1,jy3);
2175 dz13 = _mm_sub_ps(iz1,jz3);
2176 dx21 = _mm_sub_ps(ix2,jx1);
2177 dy21 = _mm_sub_ps(iy2,jy1);
2178 dz21 = _mm_sub_ps(iz2,jz1);
2179 dx22 = _mm_sub_ps(ix2,jx2);
2180 dy22 = _mm_sub_ps(iy2,jy2);
2181 dz22 = _mm_sub_ps(iz2,jz2);
2182 dx23 = _mm_sub_ps(ix2,jx3);
2183 dy23 = _mm_sub_ps(iy2,jy3);
2184 dz23 = _mm_sub_ps(iz2,jz3);
2185 dx31 = _mm_sub_ps(ix3,jx1);
2186 dy31 = _mm_sub_ps(iy3,jy1);
2187 dz31 = _mm_sub_ps(iz3,jz1);
2188 dx32 = _mm_sub_ps(ix3,jx2);
2189 dy32 = _mm_sub_ps(iy3,jy2);
2190 dz32 = _mm_sub_ps(iz3,jz2);
2191 dx33 = _mm_sub_ps(ix3,jx3);
2192 dy33 = _mm_sub_ps(iy3,jy3);
2193 dz33 = _mm_sub_ps(iz3,jz3);
2195 /* Calculate squared distance and things based on it */
2196 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2197 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2198 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2199 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2200 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2201 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2202 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2203 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2204 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2206 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2207 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2208 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2209 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2210 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2211 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2212 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2213 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2214 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2216 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2217 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2218 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2219 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2220 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2221 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2222 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2223 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2224 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2226 fjx1 = _mm_setzero_ps();
2227 fjy1 = _mm_setzero_ps();
2228 fjz1 = _mm_setzero_ps();
2229 fjx2 = _mm_setzero_ps();
2230 fjy2 = _mm_setzero_ps();
2231 fjz2 = _mm_setzero_ps();
2232 fjx3 = _mm_setzero_ps();
2233 fjy3 = _mm_setzero_ps();
2234 fjz3 = _mm_setzero_ps();
2236 /**************************
2237 * CALCULATE INTERACTIONS *
2238 **************************/
2240 if (gmx_mm_any_lt(rsq11,rcutoff2))
2243 r11 = _mm_mul_ps(rsq11,rinv11);
2244 r11 = _mm_andnot_ps(dummy_mask,r11);
2246 /* EWALD ELECTROSTATICS */
2248 /* Analytical PME correction */
2249 zeta2 = _mm_mul_ps(beta2,rsq11);
2250 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
2251 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2252 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2253 felec = _mm_mul_ps(qq11,felec);
2254 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2255 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
2256 velec = _mm_mul_ps(qq11,velec);
2258 d = _mm_sub_ps(r11,rswitch);
2259 d = _mm_max_ps(d,_mm_setzero_ps());
2260 d2 = _mm_mul_ps(d,d);
2261 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2263 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2265 /* Evaluate switch function */
2266 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2267 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2268 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2272 fscal = _mm_and_ps(fscal,cutoff_mask);
2274 fscal = _mm_andnot_ps(dummy_mask,fscal);
2276 /* Update vectorial force */
2277 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2278 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2279 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2281 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2282 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2283 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2287 /**************************
2288 * CALCULATE INTERACTIONS *
2289 **************************/
2291 if (gmx_mm_any_lt(rsq12,rcutoff2))
2294 r12 = _mm_mul_ps(rsq12,rinv12);
2295 r12 = _mm_andnot_ps(dummy_mask,r12);
2297 /* EWALD ELECTROSTATICS */
2299 /* Analytical PME correction */
2300 zeta2 = _mm_mul_ps(beta2,rsq12);
2301 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
2302 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2303 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2304 felec = _mm_mul_ps(qq12,felec);
2305 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2306 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2307 velec = _mm_mul_ps(qq12,velec);
2309 d = _mm_sub_ps(r12,rswitch);
2310 d = _mm_max_ps(d,_mm_setzero_ps());
2311 d2 = _mm_mul_ps(d,d);
2312 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2314 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2316 /* Evaluate switch function */
2317 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2318 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2319 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2323 fscal = _mm_and_ps(fscal,cutoff_mask);
2325 fscal = _mm_andnot_ps(dummy_mask,fscal);
2327 /* Update vectorial force */
2328 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2329 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2330 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2332 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2333 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2334 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2338 /**************************
2339 * CALCULATE INTERACTIONS *
2340 **************************/
2342 if (gmx_mm_any_lt(rsq13,rcutoff2))
2345 r13 = _mm_mul_ps(rsq13,rinv13);
2346 r13 = _mm_andnot_ps(dummy_mask,r13);
2348 /* EWALD ELECTROSTATICS */
2350 /* Analytical PME correction */
2351 zeta2 = _mm_mul_ps(beta2,rsq13);
2352 rinv3 = _mm_mul_ps(rinvsq13,rinv13);
2353 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2354 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2355 felec = _mm_mul_ps(qq13,felec);
2356 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2357 velec = _mm_nmacc_ps(pmecorrV,beta,rinv13);
2358 velec = _mm_mul_ps(qq13,velec);
2360 d = _mm_sub_ps(r13,rswitch);
2361 d = _mm_max_ps(d,_mm_setzero_ps());
2362 d2 = _mm_mul_ps(d,d);
2363 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2365 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2367 /* Evaluate switch function */
2368 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2369 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2370 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2374 fscal = _mm_and_ps(fscal,cutoff_mask);
2376 fscal = _mm_andnot_ps(dummy_mask,fscal);
2378 /* Update vectorial force */
2379 fix1 = _mm_macc_ps(dx13,fscal,fix1);
2380 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
2381 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
2383 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
2384 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
2385 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
2389 /**************************
2390 * CALCULATE INTERACTIONS *
2391 **************************/
2393 if (gmx_mm_any_lt(rsq21,rcutoff2))
2396 r21 = _mm_mul_ps(rsq21,rinv21);
2397 r21 = _mm_andnot_ps(dummy_mask,r21);
2399 /* EWALD ELECTROSTATICS */
2401 /* Analytical PME correction */
2402 zeta2 = _mm_mul_ps(beta2,rsq21);
2403 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2404 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2405 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2406 felec = _mm_mul_ps(qq21,felec);
2407 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2408 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2409 velec = _mm_mul_ps(qq21,velec);
2411 d = _mm_sub_ps(r21,rswitch);
2412 d = _mm_max_ps(d,_mm_setzero_ps());
2413 d2 = _mm_mul_ps(d,d);
2414 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2416 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2418 /* Evaluate switch function */
2419 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2420 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2421 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2425 fscal = _mm_and_ps(fscal,cutoff_mask);
2427 fscal = _mm_andnot_ps(dummy_mask,fscal);
2429 /* Update vectorial force */
2430 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2431 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2432 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2434 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2435 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2436 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2440 /**************************
2441 * CALCULATE INTERACTIONS *
2442 **************************/
2444 if (gmx_mm_any_lt(rsq22,rcutoff2))
2447 r22 = _mm_mul_ps(rsq22,rinv22);
2448 r22 = _mm_andnot_ps(dummy_mask,r22);
2450 /* EWALD ELECTROSTATICS */
2452 /* Analytical PME correction */
2453 zeta2 = _mm_mul_ps(beta2,rsq22);
2454 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2455 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2456 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2457 felec = _mm_mul_ps(qq22,felec);
2458 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2459 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2460 velec = _mm_mul_ps(qq22,velec);
2462 d = _mm_sub_ps(r22,rswitch);
2463 d = _mm_max_ps(d,_mm_setzero_ps());
2464 d2 = _mm_mul_ps(d,d);
2465 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2467 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2469 /* Evaluate switch function */
2470 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2471 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2472 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2476 fscal = _mm_and_ps(fscal,cutoff_mask);
2478 fscal = _mm_andnot_ps(dummy_mask,fscal);
2480 /* Update vectorial force */
2481 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2482 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2483 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2485 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2486 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2487 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2491 /**************************
2492 * CALCULATE INTERACTIONS *
2493 **************************/
2495 if (gmx_mm_any_lt(rsq23,rcutoff2))
2498 r23 = _mm_mul_ps(rsq23,rinv23);
2499 r23 = _mm_andnot_ps(dummy_mask,r23);
2501 /* EWALD ELECTROSTATICS */
2503 /* Analytical PME correction */
2504 zeta2 = _mm_mul_ps(beta2,rsq23);
2505 rinv3 = _mm_mul_ps(rinvsq23,rinv23);
2506 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2507 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2508 felec = _mm_mul_ps(qq23,felec);
2509 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2510 velec = _mm_nmacc_ps(pmecorrV,beta,rinv23);
2511 velec = _mm_mul_ps(qq23,velec);
2513 d = _mm_sub_ps(r23,rswitch);
2514 d = _mm_max_ps(d,_mm_setzero_ps());
2515 d2 = _mm_mul_ps(d,d);
2516 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2518 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2520 /* Evaluate switch function */
2521 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2522 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2523 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2527 fscal = _mm_and_ps(fscal,cutoff_mask);
2529 fscal = _mm_andnot_ps(dummy_mask,fscal);
2531 /* Update vectorial force */
2532 fix2 = _mm_macc_ps(dx23,fscal,fix2);
2533 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
2534 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
2536 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
2537 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
2538 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
2542 /**************************
2543 * CALCULATE INTERACTIONS *
2544 **************************/
2546 if (gmx_mm_any_lt(rsq31,rcutoff2))
2549 r31 = _mm_mul_ps(rsq31,rinv31);
2550 r31 = _mm_andnot_ps(dummy_mask,r31);
2552 /* EWALD ELECTROSTATICS */
2554 /* Analytical PME correction */
2555 zeta2 = _mm_mul_ps(beta2,rsq31);
2556 rinv3 = _mm_mul_ps(rinvsq31,rinv31);
2557 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2558 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2559 felec = _mm_mul_ps(qq31,felec);
2560 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2561 velec = _mm_nmacc_ps(pmecorrV,beta,rinv31);
2562 velec = _mm_mul_ps(qq31,velec);
2564 d = _mm_sub_ps(r31,rswitch);
2565 d = _mm_max_ps(d,_mm_setzero_ps());
2566 d2 = _mm_mul_ps(d,d);
2567 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2569 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2571 /* Evaluate switch function */
2572 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2573 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2574 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2578 fscal = _mm_and_ps(fscal,cutoff_mask);
2580 fscal = _mm_andnot_ps(dummy_mask,fscal);
2582 /* Update vectorial force */
2583 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2584 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2585 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2587 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2588 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2589 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2593 /**************************
2594 * CALCULATE INTERACTIONS *
2595 **************************/
2597 if (gmx_mm_any_lt(rsq32,rcutoff2))
2600 r32 = _mm_mul_ps(rsq32,rinv32);
2601 r32 = _mm_andnot_ps(dummy_mask,r32);
2603 /* EWALD ELECTROSTATICS */
2605 /* Analytical PME correction */
2606 zeta2 = _mm_mul_ps(beta2,rsq32);
2607 rinv3 = _mm_mul_ps(rinvsq32,rinv32);
2608 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2609 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2610 felec = _mm_mul_ps(qq32,felec);
2611 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2612 velec = _mm_nmacc_ps(pmecorrV,beta,rinv32);
2613 velec = _mm_mul_ps(qq32,velec);
2615 d = _mm_sub_ps(r32,rswitch);
2616 d = _mm_max_ps(d,_mm_setzero_ps());
2617 d2 = _mm_mul_ps(d,d);
2618 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2620 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2622 /* Evaluate switch function */
2623 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2624 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2625 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2629 fscal = _mm_and_ps(fscal,cutoff_mask);
2631 fscal = _mm_andnot_ps(dummy_mask,fscal);
2633 /* Update vectorial force */
2634 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2635 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2636 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2638 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2639 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2640 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2644 /**************************
2645 * CALCULATE INTERACTIONS *
2646 **************************/
2648 if (gmx_mm_any_lt(rsq33,rcutoff2))
2651 r33 = _mm_mul_ps(rsq33,rinv33);
2652 r33 = _mm_andnot_ps(dummy_mask,r33);
2654 /* EWALD ELECTROSTATICS */
2656 /* Analytical PME correction */
2657 zeta2 = _mm_mul_ps(beta2,rsq33);
2658 rinv3 = _mm_mul_ps(rinvsq33,rinv33);
2659 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2660 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2661 felec = _mm_mul_ps(qq33,felec);
2662 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2663 velec = _mm_nmacc_ps(pmecorrV,beta,rinv33);
2664 velec = _mm_mul_ps(qq33,velec);
2666 d = _mm_sub_ps(r33,rswitch);
2667 d = _mm_max_ps(d,_mm_setzero_ps());
2668 d2 = _mm_mul_ps(d,d);
2669 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2671 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2673 /* Evaluate switch function */
2674 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2675 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2676 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2680 fscal = _mm_and_ps(fscal,cutoff_mask);
2682 fscal = _mm_andnot_ps(dummy_mask,fscal);
2684 /* Update vectorial force */
2685 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2686 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2687 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2689 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2690 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2691 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2695 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2696 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2697 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2698 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2700 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2701 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2703 /* Inner loop uses 459 flops */
2706 /* End of innermost loop */
2708 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2709 f+i_coord_offset+DIM,fshift+i_shift_offset);
2711 /* Increment number of inner iterations */
2712 inneriter += j_index_end - j_index_start;
2714 /* Outer loop uses 18 flops */
2717 /* Increment number of outer iterations */
2720 /* Update outer/inner flops */
2722 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*459);