2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_single
52 * Electrostatics interaction: Ewald, with a potential switch applied between rcoulomb_switch and rcoulomb
53 * VdW interaction: None
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
110 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
111 real rswitch_scalar,d_scalar;
112 __m128 dummy_mask,cutoff_mask;
113 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
114 __m128 one = _mm_set1_ps(1.0);
115 __m128 two = _mm_set1_ps(2.0);
121 jindex = nlist->jindex;
123 shiftidx = nlist->shift;
125 shiftvec = fr->shift_vec[0];
126 fshift = fr->fshift[0];
127 facel = _mm_set1_ps(fr->epsfac);
128 charge = mdatoms->chargeA;
130 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
131 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
132 beta2 = _mm_mul_ps(beta,beta);
133 beta3 = _mm_mul_ps(beta,beta2);
134 ewtab = fr->ic->tabq_coul_FDV0;
135 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
136 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
138 /* Setup water-specific parameters */
139 inr = nlist->iinr[0];
140 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
141 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
142 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
144 jq0 = _mm_set1_ps(charge[inr+0]);
145 jq1 = _mm_set1_ps(charge[inr+1]);
146 jq2 = _mm_set1_ps(charge[inr+2]);
147 qq00 = _mm_mul_ps(iq0,jq0);
148 qq01 = _mm_mul_ps(iq0,jq1);
149 qq02 = _mm_mul_ps(iq0,jq2);
150 qq10 = _mm_mul_ps(iq1,jq0);
151 qq11 = _mm_mul_ps(iq1,jq1);
152 qq12 = _mm_mul_ps(iq1,jq2);
153 qq20 = _mm_mul_ps(iq2,jq0);
154 qq21 = _mm_mul_ps(iq2,jq1);
155 qq22 = _mm_mul_ps(iq2,jq2);
157 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
158 rcutoff_scalar = fr->rcoulomb;
159 rcutoff = _mm_set1_ps(rcutoff_scalar);
160 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
162 rswitch_scalar = fr->rcoulomb_switch;
163 rswitch = _mm_set1_ps(rswitch_scalar);
164 /* Setup switch parameters */
165 d_scalar = rcutoff_scalar-rswitch_scalar;
166 d = _mm_set1_ps(d_scalar);
167 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
168 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
169 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
170 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
171 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
172 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
174 /* Avoid stupid compiler warnings */
175 jnrA = jnrB = jnrC = jnrD = 0;
184 for(iidx=0;iidx<4*DIM;iidx++)
189 /* Start outer loop over neighborlists */
190 for(iidx=0; iidx<nri; iidx++)
192 /* Load shift vector for this list */
193 i_shift_offset = DIM*shiftidx[iidx];
195 /* Load limits for loop over neighbors */
196 j_index_start = jindex[iidx];
197 j_index_end = jindex[iidx+1];
199 /* Get outer coordinate index */
201 i_coord_offset = DIM*inr;
203 /* Load i particle coords and add shift vector */
204 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
205 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
207 fix0 = _mm_setzero_ps();
208 fiy0 = _mm_setzero_ps();
209 fiz0 = _mm_setzero_ps();
210 fix1 = _mm_setzero_ps();
211 fiy1 = _mm_setzero_ps();
212 fiz1 = _mm_setzero_ps();
213 fix2 = _mm_setzero_ps();
214 fiy2 = _mm_setzero_ps();
215 fiz2 = _mm_setzero_ps();
217 /* Reset potential sums */
218 velecsum = _mm_setzero_ps();
220 /* Start inner kernel loop */
221 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
224 /* Get j neighbor index, and coordinate index */
229 j_coord_offsetA = DIM*jnrA;
230 j_coord_offsetB = DIM*jnrB;
231 j_coord_offsetC = DIM*jnrC;
232 j_coord_offsetD = DIM*jnrD;
234 /* load j atom coordinates */
235 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
236 x+j_coord_offsetC,x+j_coord_offsetD,
237 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
239 /* Calculate displacement vector */
240 dx00 = _mm_sub_ps(ix0,jx0);
241 dy00 = _mm_sub_ps(iy0,jy0);
242 dz00 = _mm_sub_ps(iz0,jz0);
243 dx01 = _mm_sub_ps(ix0,jx1);
244 dy01 = _mm_sub_ps(iy0,jy1);
245 dz01 = _mm_sub_ps(iz0,jz1);
246 dx02 = _mm_sub_ps(ix0,jx2);
247 dy02 = _mm_sub_ps(iy0,jy2);
248 dz02 = _mm_sub_ps(iz0,jz2);
249 dx10 = _mm_sub_ps(ix1,jx0);
250 dy10 = _mm_sub_ps(iy1,jy0);
251 dz10 = _mm_sub_ps(iz1,jz0);
252 dx11 = _mm_sub_ps(ix1,jx1);
253 dy11 = _mm_sub_ps(iy1,jy1);
254 dz11 = _mm_sub_ps(iz1,jz1);
255 dx12 = _mm_sub_ps(ix1,jx2);
256 dy12 = _mm_sub_ps(iy1,jy2);
257 dz12 = _mm_sub_ps(iz1,jz2);
258 dx20 = _mm_sub_ps(ix2,jx0);
259 dy20 = _mm_sub_ps(iy2,jy0);
260 dz20 = _mm_sub_ps(iz2,jz0);
261 dx21 = _mm_sub_ps(ix2,jx1);
262 dy21 = _mm_sub_ps(iy2,jy1);
263 dz21 = _mm_sub_ps(iz2,jz1);
264 dx22 = _mm_sub_ps(ix2,jx2);
265 dy22 = _mm_sub_ps(iy2,jy2);
266 dz22 = _mm_sub_ps(iz2,jz2);
268 /* Calculate squared distance and things based on it */
269 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
270 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
271 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
272 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
273 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
274 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
275 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
276 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
277 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
279 rinv00 = gmx_mm_invsqrt_ps(rsq00);
280 rinv01 = gmx_mm_invsqrt_ps(rsq01);
281 rinv02 = gmx_mm_invsqrt_ps(rsq02);
282 rinv10 = gmx_mm_invsqrt_ps(rsq10);
283 rinv11 = gmx_mm_invsqrt_ps(rsq11);
284 rinv12 = gmx_mm_invsqrt_ps(rsq12);
285 rinv20 = gmx_mm_invsqrt_ps(rsq20);
286 rinv21 = gmx_mm_invsqrt_ps(rsq21);
287 rinv22 = gmx_mm_invsqrt_ps(rsq22);
289 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
290 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
291 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
292 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
293 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
294 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
295 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
296 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
297 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
299 fjx0 = _mm_setzero_ps();
300 fjy0 = _mm_setzero_ps();
301 fjz0 = _mm_setzero_ps();
302 fjx1 = _mm_setzero_ps();
303 fjy1 = _mm_setzero_ps();
304 fjz1 = _mm_setzero_ps();
305 fjx2 = _mm_setzero_ps();
306 fjy2 = _mm_setzero_ps();
307 fjz2 = _mm_setzero_ps();
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 if (gmx_mm_any_lt(rsq00,rcutoff2))
316 r00 = _mm_mul_ps(rsq00,rinv00);
318 /* EWALD ELECTROSTATICS */
320 /* Analytical PME correction */
321 zeta2 = _mm_mul_ps(beta2,rsq00);
322 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
323 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
324 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
325 felec = _mm_mul_ps(qq00,felec);
326 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
327 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
328 velec = _mm_mul_ps(qq00,velec);
330 d = _mm_sub_ps(r00,rswitch);
331 d = _mm_max_ps(d,_mm_setzero_ps());
332 d2 = _mm_mul_ps(d,d);
333 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
335 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
337 /* Evaluate switch function */
338 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
339 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
340 velec = _mm_mul_ps(velec,sw);
341 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 velec = _mm_and_ps(velec,cutoff_mask);
345 velecsum = _mm_add_ps(velecsum,velec);
349 fscal = _mm_and_ps(fscal,cutoff_mask);
351 /* Update vectorial force */
352 fix0 = _mm_macc_ps(dx00,fscal,fix0);
353 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
354 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
356 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
357 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
358 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
362 /**************************
363 * CALCULATE INTERACTIONS *
364 **************************/
366 if (gmx_mm_any_lt(rsq01,rcutoff2))
369 r01 = _mm_mul_ps(rsq01,rinv01);
371 /* EWALD ELECTROSTATICS */
373 /* Analytical PME correction */
374 zeta2 = _mm_mul_ps(beta2,rsq01);
375 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
376 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
377 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
378 felec = _mm_mul_ps(qq01,felec);
379 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
380 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
381 velec = _mm_mul_ps(qq01,velec);
383 d = _mm_sub_ps(r01,rswitch);
384 d = _mm_max_ps(d,_mm_setzero_ps());
385 d2 = _mm_mul_ps(d,d);
386 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
388 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
390 /* Evaluate switch function */
391 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
392 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
393 velec = _mm_mul_ps(velec,sw);
394 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
396 /* Update potential sum for this i atom from the interaction with this j atom. */
397 velec = _mm_and_ps(velec,cutoff_mask);
398 velecsum = _mm_add_ps(velecsum,velec);
402 fscal = _mm_and_ps(fscal,cutoff_mask);
404 /* Update vectorial force */
405 fix0 = _mm_macc_ps(dx01,fscal,fix0);
406 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
407 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
409 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
410 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
411 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
415 /**************************
416 * CALCULATE INTERACTIONS *
417 **************************/
419 if (gmx_mm_any_lt(rsq02,rcutoff2))
422 r02 = _mm_mul_ps(rsq02,rinv02);
424 /* EWALD ELECTROSTATICS */
426 /* Analytical PME correction */
427 zeta2 = _mm_mul_ps(beta2,rsq02);
428 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
429 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
430 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
431 felec = _mm_mul_ps(qq02,felec);
432 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
433 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
434 velec = _mm_mul_ps(qq02,velec);
436 d = _mm_sub_ps(r02,rswitch);
437 d = _mm_max_ps(d,_mm_setzero_ps());
438 d2 = _mm_mul_ps(d,d);
439 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
441 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
443 /* Evaluate switch function */
444 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
445 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
446 velec = _mm_mul_ps(velec,sw);
447 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
449 /* Update potential sum for this i atom from the interaction with this j atom. */
450 velec = _mm_and_ps(velec,cutoff_mask);
451 velecsum = _mm_add_ps(velecsum,velec);
455 fscal = _mm_and_ps(fscal,cutoff_mask);
457 /* Update vectorial force */
458 fix0 = _mm_macc_ps(dx02,fscal,fix0);
459 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
460 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
462 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
463 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
464 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
468 /**************************
469 * CALCULATE INTERACTIONS *
470 **************************/
472 if (gmx_mm_any_lt(rsq10,rcutoff2))
475 r10 = _mm_mul_ps(rsq10,rinv10);
477 /* EWALD ELECTROSTATICS */
479 /* Analytical PME correction */
480 zeta2 = _mm_mul_ps(beta2,rsq10);
481 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
482 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
483 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
484 felec = _mm_mul_ps(qq10,felec);
485 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
486 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
487 velec = _mm_mul_ps(qq10,velec);
489 d = _mm_sub_ps(r10,rswitch);
490 d = _mm_max_ps(d,_mm_setzero_ps());
491 d2 = _mm_mul_ps(d,d);
492 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
494 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
496 /* Evaluate switch function */
497 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
498 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
499 velec = _mm_mul_ps(velec,sw);
500 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
502 /* Update potential sum for this i atom from the interaction with this j atom. */
503 velec = _mm_and_ps(velec,cutoff_mask);
504 velecsum = _mm_add_ps(velecsum,velec);
508 fscal = _mm_and_ps(fscal,cutoff_mask);
510 /* Update vectorial force */
511 fix1 = _mm_macc_ps(dx10,fscal,fix1);
512 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
513 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
515 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
516 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
517 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
521 /**************************
522 * CALCULATE INTERACTIONS *
523 **************************/
525 if (gmx_mm_any_lt(rsq11,rcutoff2))
528 r11 = _mm_mul_ps(rsq11,rinv11);
530 /* EWALD ELECTROSTATICS */
532 /* Analytical PME correction */
533 zeta2 = _mm_mul_ps(beta2,rsq11);
534 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
535 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
536 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
537 felec = _mm_mul_ps(qq11,felec);
538 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
539 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
540 velec = _mm_mul_ps(qq11,velec);
542 d = _mm_sub_ps(r11,rswitch);
543 d = _mm_max_ps(d,_mm_setzero_ps());
544 d2 = _mm_mul_ps(d,d);
545 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
547 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
549 /* Evaluate switch function */
550 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
551 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
552 velec = _mm_mul_ps(velec,sw);
553 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
555 /* Update potential sum for this i atom from the interaction with this j atom. */
556 velec = _mm_and_ps(velec,cutoff_mask);
557 velecsum = _mm_add_ps(velecsum,velec);
561 fscal = _mm_and_ps(fscal,cutoff_mask);
563 /* Update vectorial force */
564 fix1 = _mm_macc_ps(dx11,fscal,fix1);
565 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
566 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
568 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
569 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
570 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
574 /**************************
575 * CALCULATE INTERACTIONS *
576 **************************/
578 if (gmx_mm_any_lt(rsq12,rcutoff2))
581 r12 = _mm_mul_ps(rsq12,rinv12);
583 /* EWALD ELECTROSTATICS */
585 /* Analytical PME correction */
586 zeta2 = _mm_mul_ps(beta2,rsq12);
587 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
588 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
589 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
590 felec = _mm_mul_ps(qq12,felec);
591 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
592 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
593 velec = _mm_mul_ps(qq12,velec);
595 d = _mm_sub_ps(r12,rswitch);
596 d = _mm_max_ps(d,_mm_setzero_ps());
597 d2 = _mm_mul_ps(d,d);
598 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
600 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
602 /* Evaluate switch function */
603 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
604 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
605 velec = _mm_mul_ps(velec,sw);
606 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
608 /* Update potential sum for this i atom from the interaction with this j atom. */
609 velec = _mm_and_ps(velec,cutoff_mask);
610 velecsum = _mm_add_ps(velecsum,velec);
614 fscal = _mm_and_ps(fscal,cutoff_mask);
616 /* Update vectorial force */
617 fix1 = _mm_macc_ps(dx12,fscal,fix1);
618 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
619 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
621 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
622 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
623 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
627 /**************************
628 * CALCULATE INTERACTIONS *
629 **************************/
631 if (gmx_mm_any_lt(rsq20,rcutoff2))
634 r20 = _mm_mul_ps(rsq20,rinv20);
636 /* EWALD ELECTROSTATICS */
638 /* Analytical PME correction */
639 zeta2 = _mm_mul_ps(beta2,rsq20);
640 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
641 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
642 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
643 felec = _mm_mul_ps(qq20,felec);
644 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
645 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
646 velec = _mm_mul_ps(qq20,velec);
648 d = _mm_sub_ps(r20,rswitch);
649 d = _mm_max_ps(d,_mm_setzero_ps());
650 d2 = _mm_mul_ps(d,d);
651 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
653 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
655 /* Evaluate switch function */
656 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
657 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
658 velec = _mm_mul_ps(velec,sw);
659 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
661 /* Update potential sum for this i atom from the interaction with this j atom. */
662 velec = _mm_and_ps(velec,cutoff_mask);
663 velecsum = _mm_add_ps(velecsum,velec);
667 fscal = _mm_and_ps(fscal,cutoff_mask);
669 /* Update vectorial force */
670 fix2 = _mm_macc_ps(dx20,fscal,fix2);
671 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
672 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
674 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
675 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
676 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
680 /**************************
681 * CALCULATE INTERACTIONS *
682 **************************/
684 if (gmx_mm_any_lt(rsq21,rcutoff2))
687 r21 = _mm_mul_ps(rsq21,rinv21);
689 /* EWALD ELECTROSTATICS */
691 /* Analytical PME correction */
692 zeta2 = _mm_mul_ps(beta2,rsq21);
693 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
694 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
695 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
696 felec = _mm_mul_ps(qq21,felec);
697 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
698 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
699 velec = _mm_mul_ps(qq21,velec);
701 d = _mm_sub_ps(r21,rswitch);
702 d = _mm_max_ps(d,_mm_setzero_ps());
703 d2 = _mm_mul_ps(d,d);
704 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
706 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
708 /* Evaluate switch function */
709 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
710 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
711 velec = _mm_mul_ps(velec,sw);
712 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
714 /* Update potential sum for this i atom from the interaction with this j atom. */
715 velec = _mm_and_ps(velec,cutoff_mask);
716 velecsum = _mm_add_ps(velecsum,velec);
720 fscal = _mm_and_ps(fscal,cutoff_mask);
722 /* Update vectorial force */
723 fix2 = _mm_macc_ps(dx21,fscal,fix2);
724 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
725 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
727 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
728 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
729 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
733 /**************************
734 * CALCULATE INTERACTIONS *
735 **************************/
737 if (gmx_mm_any_lt(rsq22,rcutoff2))
740 r22 = _mm_mul_ps(rsq22,rinv22);
742 /* EWALD ELECTROSTATICS */
744 /* Analytical PME correction */
745 zeta2 = _mm_mul_ps(beta2,rsq22);
746 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
747 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
748 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
749 felec = _mm_mul_ps(qq22,felec);
750 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
751 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
752 velec = _mm_mul_ps(qq22,velec);
754 d = _mm_sub_ps(r22,rswitch);
755 d = _mm_max_ps(d,_mm_setzero_ps());
756 d2 = _mm_mul_ps(d,d);
757 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
759 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
761 /* Evaluate switch function */
762 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
763 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
764 velec = _mm_mul_ps(velec,sw);
765 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
767 /* Update potential sum for this i atom from the interaction with this j atom. */
768 velec = _mm_and_ps(velec,cutoff_mask);
769 velecsum = _mm_add_ps(velecsum,velec);
773 fscal = _mm_and_ps(fscal,cutoff_mask);
775 /* Update vectorial force */
776 fix2 = _mm_macc_ps(dx22,fscal,fix2);
777 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
778 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
780 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
781 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
782 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
786 fjptrA = f+j_coord_offsetA;
787 fjptrB = f+j_coord_offsetB;
788 fjptrC = f+j_coord_offsetC;
789 fjptrD = f+j_coord_offsetD;
791 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
792 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
794 /* Inner loop uses 477 flops */
800 /* Get j neighbor index, and coordinate index */
801 jnrlistA = jjnr[jidx];
802 jnrlistB = jjnr[jidx+1];
803 jnrlistC = jjnr[jidx+2];
804 jnrlistD = jjnr[jidx+3];
805 /* Sign of each element will be negative for non-real atoms.
806 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
807 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
809 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
810 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
811 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
812 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
813 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
814 j_coord_offsetA = DIM*jnrA;
815 j_coord_offsetB = DIM*jnrB;
816 j_coord_offsetC = DIM*jnrC;
817 j_coord_offsetD = DIM*jnrD;
819 /* load j atom coordinates */
820 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
821 x+j_coord_offsetC,x+j_coord_offsetD,
822 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
824 /* Calculate displacement vector */
825 dx00 = _mm_sub_ps(ix0,jx0);
826 dy00 = _mm_sub_ps(iy0,jy0);
827 dz00 = _mm_sub_ps(iz0,jz0);
828 dx01 = _mm_sub_ps(ix0,jx1);
829 dy01 = _mm_sub_ps(iy0,jy1);
830 dz01 = _mm_sub_ps(iz0,jz1);
831 dx02 = _mm_sub_ps(ix0,jx2);
832 dy02 = _mm_sub_ps(iy0,jy2);
833 dz02 = _mm_sub_ps(iz0,jz2);
834 dx10 = _mm_sub_ps(ix1,jx0);
835 dy10 = _mm_sub_ps(iy1,jy0);
836 dz10 = _mm_sub_ps(iz1,jz0);
837 dx11 = _mm_sub_ps(ix1,jx1);
838 dy11 = _mm_sub_ps(iy1,jy1);
839 dz11 = _mm_sub_ps(iz1,jz1);
840 dx12 = _mm_sub_ps(ix1,jx2);
841 dy12 = _mm_sub_ps(iy1,jy2);
842 dz12 = _mm_sub_ps(iz1,jz2);
843 dx20 = _mm_sub_ps(ix2,jx0);
844 dy20 = _mm_sub_ps(iy2,jy0);
845 dz20 = _mm_sub_ps(iz2,jz0);
846 dx21 = _mm_sub_ps(ix2,jx1);
847 dy21 = _mm_sub_ps(iy2,jy1);
848 dz21 = _mm_sub_ps(iz2,jz1);
849 dx22 = _mm_sub_ps(ix2,jx2);
850 dy22 = _mm_sub_ps(iy2,jy2);
851 dz22 = _mm_sub_ps(iz2,jz2);
853 /* Calculate squared distance and things based on it */
854 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
855 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
856 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
857 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
858 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
859 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
860 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
861 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
862 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
864 rinv00 = gmx_mm_invsqrt_ps(rsq00);
865 rinv01 = gmx_mm_invsqrt_ps(rsq01);
866 rinv02 = gmx_mm_invsqrt_ps(rsq02);
867 rinv10 = gmx_mm_invsqrt_ps(rsq10);
868 rinv11 = gmx_mm_invsqrt_ps(rsq11);
869 rinv12 = gmx_mm_invsqrt_ps(rsq12);
870 rinv20 = gmx_mm_invsqrt_ps(rsq20);
871 rinv21 = gmx_mm_invsqrt_ps(rsq21);
872 rinv22 = gmx_mm_invsqrt_ps(rsq22);
874 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
875 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
876 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
877 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
878 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
879 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
880 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
881 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
882 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
884 fjx0 = _mm_setzero_ps();
885 fjy0 = _mm_setzero_ps();
886 fjz0 = _mm_setzero_ps();
887 fjx1 = _mm_setzero_ps();
888 fjy1 = _mm_setzero_ps();
889 fjz1 = _mm_setzero_ps();
890 fjx2 = _mm_setzero_ps();
891 fjy2 = _mm_setzero_ps();
892 fjz2 = _mm_setzero_ps();
894 /**************************
895 * CALCULATE INTERACTIONS *
896 **************************/
898 if (gmx_mm_any_lt(rsq00,rcutoff2))
901 r00 = _mm_mul_ps(rsq00,rinv00);
902 r00 = _mm_andnot_ps(dummy_mask,r00);
904 /* EWALD ELECTROSTATICS */
906 /* Analytical PME correction */
907 zeta2 = _mm_mul_ps(beta2,rsq00);
908 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
909 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
910 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
911 felec = _mm_mul_ps(qq00,felec);
912 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
913 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
914 velec = _mm_mul_ps(qq00,velec);
916 d = _mm_sub_ps(r00,rswitch);
917 d = _mm_max_ps(d,_mm_setzero_ps());
918 d2 = _mm_mul_ps(d,d);
919 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
921 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
923 /* Evaluate switch function */
924 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
925 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
926 velec = _mm_mul_ps(velec,sw);
927 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
929 /* Update potential sum for this i atom from the interaction with this j atom. */
930 velec = _mm_and_ps(velec,cutoff_mask);
931 velec = _mm_andnot_ps(dummy_mask,velec);
932 velecsum = _mm_add_ps(velecsum,velec);
936 fscal = _mm_and_ps(fscal,cutoff_mask);
938 fscal = _mm_andnot_ps(dummy_mask,fscal);
940 /* Update vectorial force */
941 fix0 = _mm_macc_ps(dx00,fscal,fix0);
942 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
943 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
945 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
946 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
947 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
951 /**************************
952 * CALCULATE INTERACTIONS *
953 **************************/
955 if (gmx_mm_any_lt(rsq01,rcutoff2))
958 r01 = _mm_mul_ps(rsq01,rinv01);
959 r01 = _mm_andnot_ps(dummy_mask,r01);
961 /* EWALD ELECTROSTATICS */
963 /* Analytical PME correction */
964 zeta2 = _mm_mul_ps(beta2,rsq01);
965 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
966 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
967 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
968 felec = _mm_mul_ps(qq01,felec);
969 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
970 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
971 velec = _mm_mul_ps(qq01,velec);
973 d = _mm_sub_ps(r01,rswitch);
974 d = _mm_max_ps(d,_mm_setzero_ps());
975 d2 = _mm_mul_ps(d,d);
976 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
978 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
980 /* Evaluate switch function */
981 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
982 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
983 velec = _mm_mul_ps(velec,sw);
984 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
986 /* Update potential sum for this i atom from the interaction with this j atom. */
987 velec = _mm_and_ps(velec,cutoff_mask);
988 velec = _mm_andnot_ps(dummy_mask,velec);
989 velecsum = _mm_add_ps(velecsum,velec);
993 fscal = _mm_and_ps(fscal,cutoff_mask);
995 fscal = _mm_andnot_ps(dummy_mask,fscal);
997 /* Update vectorial force */
998 fix0 = _mm_macc_ps(dx01,fscal,fix0);
999 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1000 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1002 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1003 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1004 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1008 /**************************
1009 * CALCULATE INTERACTIONS *
1010 **************************/
1012 if (gmx_mm_any_lt(rsq02,rcutoff2))
1015 r02 = _mm_mul_ps(rsq02,rinv02);
1016 r02 = _mm_andnot_ps(dummy_mask,r02);
1018 /* EWALD ELECTROSTATICS */
1020 /* Analytical PME correction */
1021 zeta2 = _mm_mul_ps(beta2,rsq02);
1022 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1023 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1024 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1025 felec = _mm_mul_ps(qq02,felec);
1026 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1027 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1028 velec = _mm_mul_ps(qq02,velec);
1030 d = _mm_sub_ps(r02,rswitch);
1031 d = _mm_max_ps(d,_mm_setzero_ps());
1032 d2 = _mm_mul_ps(d,d);
1033 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1035 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1037 /* Evaluate switch function */
1038 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1039 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1040 velec = _mm_mul_ps(velec,sw);
1041 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1043 /* Update potential sum for this i atom from the interaction with this j atom. */
1044 velec = _mm_and_ps(velec,cutoff_mask);
1045 velec = _mm_andnot_ps(dummy_mask,velec);
1046 velecsum = _mm_add_ps(velecsum,velec);
1050 fscal = _mm_and_ps(fscal,cutoff_mask);
1052 fscal = _mm_andnot_ps(dummy_mask,fscal);
1054 /* Update vectorial force */
1055 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1056 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1057 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1059 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1060 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1061 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1065 /**************************
1066 * CALCULATE INTERACTIONS *
1067 **************************/
1069 if (gmx_mm_any_lt(rsq10,rcutoff2))
1072 r10 = _mm_mul_ps(rsq10,rinv10);
1073 r10 = _mm_andnot_ps(dummy_mask,r10);
1075 /* EWALD ELECTROSTATICS */
1077 /* Analytical PME correction */
1078 zeta2 = _mm_mul_ps(beta2,rsq10);
1079 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1080 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1081 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1082 felec = _mm_mul_ps(qq10,felec);
1083 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1084 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1085 velec = _mm_mul_ps(qq10,velec);
1087 d = _mm_sub_ps(r10,rswitch);
1088 d = _mm_max_ps(d,_mm_setzero_ps());
1089 d2 = _mm_mul_ps(d,d);
1090 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1092 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1094 /* Evaluate switch function */
1095 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1096 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1097 velec = _mm_mul_ps(velec,sw);
1098 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1100 /* Update potential sum for this i atom from the interaction with this j atom. */
1101 velec = _mm_and_ps(velec,cutoff_mask);
1102 velec = _mm_andnot_ps(dummy_mask,velec);
1103 velecsum = _mm_add_ps(velecsum,velec);
1107 fscal = _mm_and_ps(fscal,cutoff_mask);
1109 fscal = _mm_andnot_ps(dummy_mask,fscal);
1111 /* Update vectorial force */
1112 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1113 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1114 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1116 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1117 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1118 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1122 /**************************
1123 * CALCULATE INTERACTIONS *
1124 **************************/
1126 if (gmx_mm_any_lt(rsq11,rcutoff2))
1129 r11 = _mm_mul_ps(rsq11,rinv11);
1130 r11 = _mm_andnot_ps(dummy_mask,r11);
1132 /* EWALD ELECTROSTATICS */
1134 /* Analytical PME correction */
1135 zeta2 = _mm_mul_ps(beta2,rsq11);
1136 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1137 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1138 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1139 felec = _mm_mul_ps(qq11,felec);
1140 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1141 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1142 velec = _mm_mul_ps(qq11,velec);
1144 d = _mm_sub_ps(r11,rswitch);
1145 d = _mm_max_ps(d,_mm_setzero_ps());
1146 d2 = _mm_mul_ps(d,d);
1147 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1149 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1151 /* Evaluate switch function */
1152 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1153 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1154 velec = _mm_mul_ps(velec,sw);
1155 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1157 /* Update potential sum for this i atom from the interaction with this j atom. */
1158 velec = _mm_and_ps(velec,cutoff_mask);
1159 velec = _mm_andnot_ps(dummy_mask,velec);
1160 velecsum = _mm_add_ps(velecsum,velec);
1164 fscal = _mm_and_ps(fscal,cutoff_mask);
1166 fscal = _mm_andnot_ps(dummy_mask,fscal);
1168 /* Update vectorial force */
1169 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1170 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1171 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1173 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1174 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1175 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1179 /**************************
1180 * CALCULATE INTERACTIONS *
1181 **************************/
1183 if (gmx_mm_any_lt(rsq12,rcutoff2))
1186 r12 = _mm_mul_ps(rsq12,rinv12);
1187 r12 = _mm_andnot_ps(dummy_mask,r12);
1189 /* EWALD ELECTROSTATICS */
1191 /* Analytical PME correction */
1192 zeta2 = _mm_mul_ps(beta2,rsq12);
1193 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1194 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1195 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1196 felec = _mm_mul_ps(qq12,felec);
1197 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1198 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1199 velec = _mm_mul_ps(qq12,velec);
1201 d = _mm_sub_ps(r12,rswitch);
1202 d = _mm_max_ps(d,_mm_setzero_ps());
1203 d2 = _mm_mul_ps(d,d);
1204 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1206 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1208 /* Evaluate switch function */
1209 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1210 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1211 velec = _mm_mul_ps(velec,sw);
1212 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1214 /* Update potential sum for this i atom from the interaction with this j atom. */
1215 velec = _mm_and_ps(velec,cutoff_mask);
1216 velec = _mm_andnot_ps(dummy_mask,velec);
1217 velecsum = _mm_add_ps(velecsum,velec);
1221 fscal = _mm_and_ps(fscal,cutoff_mask);
1223 fscal = _mm_andnot_ps(dummy_mask,fscal);
1225 /* Update vectorial force */
1226 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1227 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1228 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1230 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1231 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1232 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1236 /**************************
1237 * CALCULATE INTERACTIONS *
1238 **************************/
1240 if (gmx_mm_any_lt(rsq20,rcutoff2))
1243 r20 = _mm_mul_ps(rsq20,rinv20);
1244 r20 = _mm_andnot_ps(dummy_mask,r20);
1246 /* EWALD ELECTROSTATICS */
1248 /* Analytical PME correction */
1249 zeta2 = _mm_mul_ps(beta2,rsq20);
1250 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1251 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1252 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1253 felec = _mm_mul_ps(qq20,felec);
1254 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1255 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
1256 velec = _mm_mul_ps(qq20,velec);
1258 d = _mm_sub_ps(r20,rswitch);
1259 d = _mm_max_ps(d,_mm_setzero_ps());
1260 d2 = _mm_mul_ps(d,d);
1261 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1263 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1265 /* Evaluate switch function */
1266 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1267 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1268 velec = _mm_mul_ps(velec,sw);
1269 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1271 /* Update potential sum for this i atom from the interaction with this j atom. */
1272 velec = _mm_and_ps(velec,cutoff_mask);
1273 velec = _mm_andnot_ps(dummy_mask,velec);
1274 velecsum = _mm_add_ps(velecsum,velec);
1278 fscal = _mm_and_ps(fscal,cutoff_mask);
1280 fscal = _mm_andnot_ps(dummy_mask,fscal);
1282 /* Update vectorial force */
1283 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1284 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1285 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1287 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1288 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1289 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1293 /**************************
1294 * CALCULATE INTERACTIONS *
1295 **************************/
1297 if (gmx_mm_any_lt(rsq21,rcutoff2))
1300 r21 = _mm_mul_ps(rsq21,rinv21);
1301 r21 = _mm_andnot_ps(dummy_mask,r21);
1303 /* EWALD ELECTROSTATICS */
1305 /* Analytical PME correction */
1306 zeta2 = _mm_mul_ps(beta2,rsq21);
1307 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1308 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1309 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1310 felec = _mm_mul_ps(qq21,felec);
1311 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1312 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1313 velec = _mm_mul_ps(qq21,velec);
1315 d = _mm_sub_ps(r21,rswitch);
1316 d = _mm_max_ps(d,_mm_setzero_ps());
1317 d2 = _mm_mul_ps(d,d);
1318 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1320 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1322 /* Evaluate switch function */
1323 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1324 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1325 velec = _mm_mul_ps(velec,sw);
1326 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1328 /* Update potential sum for this i atom from the interaction with this j atom. */
1329 velec = _mm_and_ps(velec,cutoff_mask);
1330 velec = _mm_andnot_ps(dummy_mask,velec);
1331 velecsum = _mm_add_ps(velecsum,velec);
1335 fscal = _mm_and_ps(fscal,cutoff_mask);
1337 fscal = _mm_andnot_ps(dummy_mask,fscal);
1339 /* Update vectorial force */
1340 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1341 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1342 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1344 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1345 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1346 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1350 /**************************
1351 * CALCULATE INTERACTIONS *
1352 **************************/
1354 if (gmx_mm_any_lt(rsq22,rcutoff2))
1357 r22 = _mm_mul_ps(rsq22,rinv22);
1358 r22 = _mm_andnot_ps(dummy_mask,r22);
1360 /* EWALD ELECTROSTATICS */
1362 /* Analytical PME correction */
1363 zeta2 = _mm_mul_ps(beta2,rsq22);
1364 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1365 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1366 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1367 felec = _mm_mul_ps(qq22,felec);
1368 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1369 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1370 velec = _mm_mul_ps(qq22,velec);
1372 d = _mm_sub_ps(r22,rswitch);
1373 d = _mm_max_ps(d,_mm_setzero_ps());
1374 d2 = _mm_mul_ps(d,d);
1375 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1377 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1379 /* Evaluate switch function */
1380 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1381 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1382 velec = _mm_mul_ps(velec,sw);
1383 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1385 /* Update potential sum for this i atom from the interaction with this j atom. */
1386 velec = _mm_and_ps(velec,cutoff_mask);
1387 velec = _mm_andnot_ps(dummy_mask,velec);
1388 velecsum = _mm_add_ps(velecsum,velec);
1392 fscal = _mm_and_ps(fscal,cutoff_mask);
1394 fscal = _mm_andnot_ps(dummy_mask,fscal);
1396 /* Update vectorial force */
1397 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1398 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1399 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1401 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1402 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1403 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1407 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1408 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1409 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1410 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1412 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1413 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1415 /* Inner loop uses 486 flops */
1418 /* End of innermost loop */
1420 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1421 f+i_coord_offset,fshift+i_shift_offset);
1424 /* Update potential energies */
1425 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1427 /* Increment number of inner iterations */
1428 inneriter += j_index_end - j_index_start;
1430 /* Outer loop uses 19 flops */
1433 /* Increment number of outer iterations */
1436 /* Update outer/inner flops */
1438 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*486);
1441 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_single
1442 * Electrostatics interaction: Ewald
1443 * VdW interaction: None
1444 * Geometry: Water3-Water3
1445 * Calculate force/pot: Force
1448 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_single
1449 (t_nblist * gmx_restrict nlist,
1450 rvec * gmx_restrict xx,
1451 rvec * gmx_restrict ff,
1452 t_forcerec * gmx_restrict fr,
1453 t_mdatoms * gmx_restrict mdatoms,
1454 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1455 t_nrnb * gmx_restrict nrnb)
1457 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1458 * just 0 for non-waters.
1459 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1460 * jnr indices corresponding to data put in the four positions in the SIMD register.
1462 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1463 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1464 int jnrA,jnrB,jnrC,jnrD;
1465 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1466 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1467 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1468 real rcutoff_scalar;
1469 real *shiftvec,*fshift,*x,*f;
1470 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1471 real scratch[4*DIM];
1472 __m128 fscal,rcutoff,rcutoff2,jidxall;
1474 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1476 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1478 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1479 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1480 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1481 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1482 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1483 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1484 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1485 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1486 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1487 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1488 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1489 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1490 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1491 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1492 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1493 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1494 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1497 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1498 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1500 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1501 real rswitch_scalar,d_scalar;
1502 __m128 dummy_mask,cutoff_mask;
1503 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1504 __m128 one = _mm_set1_ps(1.0);
1505 __m128 two = _mm_set1_ps(2.0);
1511 jindex = nlist->jindex;
1513 shiftidx = nlist->shift;
1515 shiftvec = fr->shift_vec[0];
1516 fshift = fr->fshift[0];
1517 facel = _mm_set1_ps(fr->epsfac);
1518 charge = mdatoms->chargeA;
1520 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1521 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1522 beta2 = _mm_mul_ps(beta,beta);
1523 beta3 = _mm_mul_ps(beta,beta2);
1524 ewtab = fr->ic->tabq_coul_FDV0;
1525 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1526 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1528 /* Setup water-specific parameters */
1529 inr = nlist->iinr[0];
1530 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1531 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1532 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1534 jq0 = _mm_set1_ps(charge[inr+0]);
1535 jq1 = _mm_set1_ps(charge[inr+1]);
1536 jq2 = _mm_set1_ps(charge[inr+2]);
1537 qq00 = _mm_mul_ps(iq0,jq0);
1538 qq01 = _mm_mul_ps(iq0,jq1);
1539 qq02 = _mm_mul_ps(iq0,jq2);
1540 qq10 = _mm_mul_ps(iq1,jq0);
1541 qq11 = _mm_mul_ps(iq1,jq1);
1542 qq12 = _mm_mul_ps(iq1,jq2);
1543 qq20 = _mm_mul_ps(iq2,jq0);
1544 qq21 = _mm_mul_ps(iq2,jq1);
1545 qq22 = _mm_mul_ps(iq2,jq2);
1547 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1548 rcutoff_scalar = fr->rcoulomb;
1549 rcutoff = _mm_set1_ps(rcutoff_scalar);
1550 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1552 rswitch_scalar = fr->rcoulomb_switch;
1553 rswitch = _mm_set1_ps(rswitch_scalar);
1554 /* Setup switch parameters */
1555 d_scalar = rcutoff_scalar-rswitch_scalar;
1556 d = _mm_set1_ps(d_scalar);
1557 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1558 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1559 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1560 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1561 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1562 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1564 /* Avoid stupid compiler warnings */
1565 jnrA = jnrB = jnrC = jnrD = 0;
1566 j_coord_offsetA = 0;
1567 j_coord_offsetB = 0;
1568 j_coord_offsetC = 0;
1569 j_coord_offsetD = 0;
1574 for(iidx=0;iidx<4*DIM;iidx++)
1576 scratch[iidx] = 0.0;
1579 /* Start outer loop over neighborlists */
1580 for(iidx=0; iidx<nri; iidx++)
1582 /* Load shift vector for this list */
1583 i_shift_offset = DIM*shiftidx[iidx];
1585 /* Load limits for loop over neighbors */
1586 j_index_start = jindex[iidx];
1587 j_index_end = jindex[iidx+1];
1589 /* Get outer coordinate index */
1591 i_coord_offset = DIM*inr;
1593 /* Load i particle coords and add shift vector */
1594 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1595 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1597 fix0 = _mm_setzero_ps();
1598 fiy0 = _mm_setzero_ps();
1599 fiz0 = _mm_setzero_ps();
1600 fix1 = _mm_setzero_ps();
1601 fiy1 = _mm_setzero_ps();
1602 fiz1 = _mm_setzero_ps();
1603 fix2 = _mm_setzero_ps();
1604 fiy2 = _mm_setzero_ps();
1605 fiz2 = _mm_setzero_ps();
1607 /* Start inner kernel loop */
1608 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1611 /* Get j neighbor index, and coordinate index */
1613 jnrB = jjnr[jidx+1];
1614 jnrC = jjnr[jidx+2];
1615 jnrD = jjnr[jidx+3];
1616 j_coord_offsetA = DIM*jnrA;
1617 j_coord_offsetB = DIM*jnrB;
1618 j_coord_offsetC = DIM*jnrC;
1619 j_coord_offsetD = DIM*jnrD;
1621 /* load j atom coordinates */
1622 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1623 x+j_coord_offsetC,x+j_coord_offsetD,
1624 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1626 /* Calculate displacement vector */
1627 dx00 = _mm_sub_ps(ix0,jx0);
1628 dy00 = _mm_sub_ps(iy0,jy0);
1629 dz00 = _mm_sub_ps(iz0,jz0);
1630 dx01 = _mm_sub_ps(ix0,jx1);
1631 dy01 = _mm_sub_ps(iy0,jy1);
1632 dz01 = _mm_sub_ps(iz0,jz1);
1633 dx02 = _mm_sub_ps(ix0,jx2);
1634 dy02 = _mm_sub_ps(iy0,jy2);
1635 dz02 = _mm_sub_ps(iz0,jz2);
1636 dx10 = _mm_sub_ps(ix1,jx0);
1637 dy10 = _mm_sub_ps(iy1,jy0);
1638 dz10 = _mm_sub_ps(iz1,jz0);
1639 dx11 = _mm_sub_ps(ix1,jx1);
1640 dy11 = _mm_sub_ps(iy1,jy1);
1641 dz11 = _mm_sub_ps(iz1,jz1);
1642 dx12 = _mm_sub_ps(ix1,jx2);
1643 dy12 = _mm_sub_ps(iy1,jy2);
1644 dz12 = _mm_sub_ps(iz1,jz2);
1645 dx20 = _mm_sub_ps(ix2,jx0);
1646 dy20 = _mm_sub_ps(iy2,jy0);
1647 dz20 = _mm_sub_ps(iz2,jz0);
1648 dx21 = _mm_sub_ps(ix2,jx1);
1649 dy21 = _mm_sub_ps(iy2,jy1);
1650 dz21 = _mm_sub_ps(iz2,jz1);
1651 dx22 = _mm_sub_ps(ix2,jx2);
1652 dy22 = _mm_sub_ps(iy2,jy2);
1653 dz22 = _mm_sub_ps(iz2,jz2);
1655 /* Calculate squared distance and things based on it */
1656 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1657 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1658 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1659 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1660 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1661 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1662 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1663 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1664 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1666 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1667 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1668 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1669 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1670 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1671 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1672 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1673 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1674 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1676 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1677 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1678 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1679 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1680 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1681 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1682 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1683 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1684 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1686 fjx0 = _mm_setzero_ps();
1687 fjy0 = _mm_setzero_ps();
1688 fjz0 = _mm_setzero_ps();
1689 fjx1 = _mm_setzero_ps();
1690 fjy1 = _mm_setzero_ps();
1691 fjz1 = _mm_setzero_ps();
1692 fjx2 = _mm_setzero_ps();
1693 fjy2 = _mm_setzero_ps();
1694 fjz2 = _mm_setzero_ps();
1696 /**************************
1697 * CALCULATE INTERACTIONS *
1698 **************************/
1700 if (gmx_mm_any_lt(rsq00,rcutoff2))
1703 r00 = _mm_mul_ps(rsq00,rinv00);
1705 /* EWALD ELECTROSTATICS */
1707 /* Analytical PME correction */
1708 zeta2 = _mm_mul_ps(beta2,rsq00);
1709 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1710 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1711 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1712 felec = _mm_mul_ps(qq00,felec);
1713 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1714 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
1715 velec = _mm_mul_ps(qq00,velec);
1717 d = _mm_sub_ps(r00,rswitch);
1718 d = _mm_max_ps(d,_mm_setzero_ps());
1719 d2 = _mm_mul_ps(d,d);
1720 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1722 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1724 /* Evaluate switch function */
1725 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1726 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1727 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1731 fscal = _mm_and_ps(fscal,cutoff_mask);
1733 /* Update vectorial force */
1734 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1735 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1736 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1738 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1739 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1740 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1744 /**************************
1745 * CALCULATE INTERACTIONS *
1746 **************************/
1748 if (gmx_mm_any_lt(rsq01,rcutoff2))
1751 r01 = _mm_mul_ps(rsq01,rinv01);
1753 /* EWALD ELECTROSTATICS */
1755 /* Analytical PME correction */
1756 zeta2 = _mm_mul_ps(beta2,rsq01);
1757 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1758 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1759 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1760 felec = _mm_mul_ps(qq01,felec);
1761 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1762 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
1763 velec = _mm_mul_ps(qq01,velec);
1765 d = _mm_sub_ps(r01,rswitch);
1766 d = _mm_max_ps(d,_mm_setzero_ps());
1767 d2 = _mm_mul_ps(d,d);
1768 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1770 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1772 /* Evaluate switch function */
1773 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1774 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1775 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1779 fscal = _mm_and_ps(fscal,cutoff_mask);
1781 /* Update vectorial force */
1782 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1783 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1784 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1786 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1787 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1788 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1792 /**************************
1793 * CALCULATE INTERACTIONS *
1794 **************************/
1796 if (gmx_mm_any_lt(rsq02,rcutoff2))
1799 r02 = _mm_mul_ps(rsq02,rinv02);
1801 /* EWALD ELECTROSTATICS */
1803 /* Analytical PME correction */
1804 zeta2 = _mm_mul_ps(beta2,rsq02);
1805 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1806 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1807 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1808 felec = _mm_mul_ps(qq02,felec);
1809 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1810 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1811 velec = _mm_mul_ps(qq02,velec);
1813 d = _mm_sub_ps(r02,rswitch);
1814 d = _mm_max_ps(d,_mm_setzero_ps());
1815 d2 = _mm_mul_ps(d,d);
1816 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1818 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1820 /* Evaluate switch function */
1821 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1822 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1823 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1827 fscal = _mm_and_ps(fscal,cutoff_mask);
1829 /* Update vectorial force */
1830 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1831 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1832 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1834 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1835 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1836 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1840 /**************************
1841 * CALCULATE INTERACTIONS *
1842 **************************/
1844 if (gmx_mm_any_lt(rsq10,rcutoff2))
1847 r10 = _mm_mul_ps(rsq10,rinv10);
1849 /* EWALD ELECTROSTATICS */
1851 /* Analytical PME correction */
1852 zeta2 = _mm_mul_ps(beta2,rsq10);
1853 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1854 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1855 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1856 felec = _mm_mul_ps(qq10,felec);
1857 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1858 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1859 velec = _mm_mul_ps(qq10,velec);
1861 d = _mm_sub_ps(r10,rswitch);
1862 d = _mm_max_ps(d,_mm_setzero_ps());
1863 d2 = _mm_mul_ps(d,d);
1864 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1866 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1868 /* Evaluate switch function */
1869 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1870 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1871 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1875 fscal = _mm_and_ps(fscal,cutoff_mask);
1877 /* Update vectorial force */
1878 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1879 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1880 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1882 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1883 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1884 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1888 /**************************
1889 * CALCULATE INTERACTIONS *
1890 **************************/
1892 if (gmx_mm_any_lt(rsq11,rcutoff2))
1895 r11 = _mm_mul_ps(rsq11,rinv11);
1897 /* EWALD ELECTROSTATICS */
1899 /* Analytical PME correction */
1900 zeta2 = _mm_mul_ps(beta2,rsq11);
1901 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1902 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1903 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1904 felec = _mm_mul_ps(qq11,felec);
1905 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1906 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1907 velec = _mm_mul_ps(qq11,velec);
1909 d = _mm_sub_ps(r11,rswitch);
1910 d = _mm_max_ps(d,_mm_setzero_ps());
1911 d2 = _mm_mul_ps(d,d);
1912 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1914 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1916 /* Evaluate switch function */
1917 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1918 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1919 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1923 fscal = _mm_and_ps(fscal,cutoff_mask);
1925 /* Update vectorial force */
1926 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1927 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1928 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1930 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1931 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1932 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1936 /**************************
1937 * CALCULATE INTERACTIONS *
1938 **************************/
1940 if (gmx_mm_any_lt(rsq12,rcutoff2))
1943 r12 = _mm_mul_ps(rsq12,rinv12);
1945 /* EWALD ELECTROSTATICS */
1947 /* Analytical PME correction */
1948 zeta2 = _mm_mul_ps(beta2,rsq12);
1949 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1950 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1951 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1952 felec = _mm_mul_ps(qq12,felec);
1953 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1954 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1955 velec = _mm_mul_ps(qq12,velec);
1957 d = _mm_sub_ps(r12,rswitch);
1958 d = _mm_max_ps(d,_mm_setzero_ps());
1959 d2 = _mm_mul_ps(d,d);
1960 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1962 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1964 /* Evaluate switch function */
1965 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1966 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1967 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1971 fscal = _mm_and_ps(fscal,cutoff_mask);
1973 /* Update vectorial force */
1974 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1975 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1976 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1978 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1979 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1980 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 if (gmx_mm_any_lt(rsq20,rcutoff2))
1991 r20 = _mm_mul_ps(rsq20,rinv20);
1993 /* EWALD ELECTROSTATICS */
1995 /* Analytical PME correction */
1996 zeta2 = _mm_mul_ps(beta2,rsq20);
1997 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1998 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1999 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2000 felec = _mm_mul_ps(qq20,felec);
2001 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2002 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2003 velec = _mm_mul_ps(qq20,velec);
2005 d = _mm_sub_ps(r20,rswitch);
2006 d = _mm_max_ps(d,_mm_setzero_ps());
2007 d2 = _mm_mul_ps(d,d);
2008 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2010 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2012 /* Evaluate switch function */
2013 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2014 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2015 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2019 fscal = _mm_and_ps(fscal,cutoff_mask);
2021 /* Update vectorial force */
2022 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2023 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2024 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2026 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2027 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2028 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2032 /**************************
2033 * CALCULATE INTERACTIONS *
2034 **************************/
2036 if (gmx_mm_any_lt(rsq21,rcutoff2))
2039 r21 = _mm_mul_ps(rsq21,rinv21);
2041 /* EWALD ELECTROSTATICS */
2043 /* Analytical PME correction */
2044 zeta2 = _mm_mul_ps(beta2,rsq21);
2045 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2046 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2047 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2048 felec = _mm_mul_ps(qq21,felec);
2049 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2050 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2051 velec = _mm_mul_ps(qq21,velec);
2053 d = _mm_sub_ps(r21,rswitch);
2054 d = _mm_max_ps(d,_mm_setzero_ps());
2055 d2 = _mm_mul_ps(d,d);
2056 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2058 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2060 /* Evaluate switch function */
2061 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2062 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2063 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2067 fscal = _mm_and_ps(fscal,cutoff_mask);
2069 /* Update vectorial force */
2070 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2071 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2072 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2074 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2075 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2076 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2080 /**************************
2081 * CALCULATE INTERACTIONS *
2082 **************************/
2084 if (gmx_mm_any_lt(rsq22,rcutoff2))
2087 r22 = _mm_mul_ps(rsq22,rinv22);
2089 /* EWALD ELECTROSTATICS */
2091 /* Analytical PME correction */
2092 zeta2 = _mm_mul_ps(beta2,rsq22);
2093 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2094 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2095 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2096 felec = _mm_mul_ps(qq22,felec);
2097 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2098 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2099 velec = _mm_mul_ps(qq22,velec);
2101 d = _mm_sub_ps(r22,rswitch);
2102 d = _mm_max_ps(d,_mm_setzero_ps());
2103 d2 = _mm_mul_ps(d,d);
2104 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2106 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2108 /* Evaluate switch function */
2109 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2110 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2111 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2115 fscal = _mm_and_ps(fscal,cutoff_mask);
2117 /* Update vectorial force */
2118 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2119 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2120 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2122 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2123 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2124 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2128 fjptrA = f+j_coord_offsetA;
2129 fjptrB = f+j_coord_offsetB;
2130 fjptrC = f+j_coord_offsetC;
2131 fjptrD = f+j_coord_offsetD;
2133 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2134 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2136 /* Inner loop uses 450 flops */
2139 if(jidx<j_index_end)
2142 /* Get j neighbor index, and coordinate index */
2143 jnrlistA = jjnr[jidx];
2144 jnrlistB = jjnr[jidx+1];
2145 jnrlistC = jjnr[jidx+2];
2146 jnrlistD = jjnr[jidx+3];
2147 /* Sign of each element will be negative for non-real atoms.
2148 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2149 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2151 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2152 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2153 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2154 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2155 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2156 j_coord_offsetA = DIM*jnrA;
2157 j_coord_offsetB = DIM*jnrB;
2158 j_coord_offsetC = DIM*jnrC;
2159 j_coord_offsetD = DIM*jnrD;
2161 /* load j atom coordinates */
2162 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2163 x+j_coord_offsetC,x+j_coord_offsetD,
2164 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2166 /* Calculate displacement vector */
2167 dx00 = _mm_sub_ps(ix0,jx0);
2168 dy00 = _mm_sub_ps(iy0,jy0);
2169 dz00 = _mm_sub_ps(iz0,jz0);
2170 dx01 = _mm_sub_ps(ix0,jx1);
2171 dy01 = _mm_sub_ps(iy0,jy1);
2172 dz01 = _mm_sub_ps(iz0,jz1);
2173 dx02 = _mm_sub_ps(ix0,jx2);
2174 dy02 = _mm_sub_ps(iy0,jy2);
2175 dz02 = _mm_sub_ps(iz0,jz2);
2176 dx10 = _mm_sub_ps(ix1,jx0);
2177 dy10 = _mm_sub_ps(iy1,jy0);
2178 dz10 = _mm_sub_ps(iz1,jz0);
2179 dx11 = _mm_sub_ps(ix1,jx1);
2180 dy11 = _mm_sub_ps(iy1,jy1);
2181 dz11 = _mm_sub_ps(iz1,jz1);
2182 dx12 = _mm_sub_ps(ix1,jx2);
2183 dy12 = _mm_sub_ps(iy1,jy2);
2184 dz12 = _mm_sub_ps(iz1,jz2);
2185 dx20 = _mm_sub_ps(ix2,jx0);
2186 dy20 = _mm_sub_ps(iy2,jy0);
2187 dz20 = _mm_sub_ps(iz2,jz0);
2188 dx21 = _mm_sub_ps(ix2,jx1);
2189 dy21 = _mm_sub_ps(iy2,jy1);
2190 dz21 = _mm_sub_ps(iz2,jz1);
2191 dx22 = _mm_sub_ps(ix2,jx2);
2192 dy22 = _mm_sub_ps(iy2,jy2);
2193 dz22 = _mm_sub_ps(iz2,jz2);
2195 /* Calculate squared distance and things based on it */
2196 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2197 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2198 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2199 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2200 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2201 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2202 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2203 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2204 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2206 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2207 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2208 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2209 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2210 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2211 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2212 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2213 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2214 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2216 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2217 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2218 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2219 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2220 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2221 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2222 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2223 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2224 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2226 fjx0 = _mm_setzero_ps();
2227 fjy0 = _mm_setzero_ps();
2228 fjz0 = _mm_setzero_ps();
2229 fjx1 = _mm_setzero_ps();
2230 fjy1 = _mm_setzero_ps();
2231 fjz1 = _mm_setzero_ps();
2232 fjx2 = _mm_setzero_ps();
2233 fjy2 = _mm_setzero_ps();
2234 fjz2 = _mm_setzero_ps();
2236 /**************************
2237 * CALCULATE INTERACTIONS *
2238 **************************/
2240 if (gmx_mm_any_lt(rsq00,rcutoff2))
2243 r00 = _mm_mul_ps(rsq00,rinv00);
2244 r00 = _mm_andnot_ps(dummy_mask,r00);
2246 /* EWALD ELECTROSTATICS */
2248 /* Analytical PME correction */
2249 zeta2 = _mm_mul_ps(beta2,rsq00);
2250 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
2251 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2252 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2253 felec = _mm_mul_ps(qq00,felec);
2254 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2255 velec = _mm_nmacc_ps(pmecorrV,beta,rinv00);
2256 velec = _mm_mul_ps(qq00,velec);
2258 d = _mm_sub_ps(r00,rswitch);
2259 d = _mm_max_ps(d,_mm_setzero_ps());
2260 d2 = _mm_mul_ps(d,d);
2261 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2263 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2265 /* Evaluate switch function */
2266 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2267 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2268 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2272 fscal = _mm_and_ps(fscal,cutoff_mask);
2274 fscal = _mm_andnot_ps(dummy_mask,fscal);
2276 /* Update vectorial force */
2277 fix0 = _mm_macc_ps(dx00,fscal,fix0);
2278 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
2279 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
2281 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
2282 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
2283 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
2287 /**************************
2288 * CALCULATE INTERACTIONS *
2289 **************************/
2291 if (gmx_mm_any_lt(rsq01,rcutoff2))
2294 r01 = _mm_mul_ps(rsq01,rinv01);
2295 r01 = _mm_andnot_ps(dummy_mask,r01);
2297 /* EWALD ELECTROSTATICS */
2299 /* Analytical PME correction */
2300 zeta2 = _mm_mul_ps(beta2,rsq01);
2301 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
2302 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2303 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2304 felec = _mm_mul_ps(qq01,felec);
2305 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2306 velec = _mm_nmacc_ps(pmecorrV,beta,rinv01);
2307 velec = _mm_mul_ps(qq01,velec);
2309 d = _mm_sub_ps(r01,rswitch);
2310 d = _mm_max_ps(d,_mm_setzero_ps());
2311 d2 = _mm_mul_ps(d,d);
2312 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2314 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2316 /* Evaluate switch function */
2317 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2318 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2319 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2323 fscal = _mm_and_ps(fscal,cutoff_mask);
2325 fscal = _mm_andnot_ps(dummy_mask,fscal);
2327 /* Update vectorial force */
2328 fix0 = _mm_macc_ps(dx01,fscal,fix0);
2329 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
2330 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
2332 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
2333 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
2334 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
2338 /**************************
2339 * CALCULATE INTERACTIONS *
2340 **************************/
2342 if (gmx_mm_any_lt(rsq02,rcutoff2))
2345 r02 = _mm_mul_ps(rsq02,rinv02);
2346 r02 = _mm_andnot_ps(dummy_mask,r02);
2348 /* EWALD ELECTROSTATICS */
2350 /* Analytical PME correction */
2351 zeta2 = _mm_mul_ps(beta2,rsq02);
2352 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
2353 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2354 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2355 felec = _mm_mul_ps(qq02,felec);
2356 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2357 velec = _mm_nmacc_ps(pmecorrV,beta,rinv02);
2358 velec = _mm_mul_ps(qq02,velec);
2360 d = _mm_sub_ps(r02,rswitch);
2361 d = _mm_max_ps(d,_mm_setzero_ps());
2362 d2 = _mm_mul_ps(d,d);
2363 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2365 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2367 /* Evaluate switch function */
2368 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2369 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2370 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2374 fscal = _mm_and_ps(fscal,cutoff_mask);
2376 fscal = _mm_andnot_ps(dummy_mask,fscal);
2378 /* Update vectorial force */
2379 fix0 = _mm_macc_ps(dx02,fscal,fix0);
2380 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
2381 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
2383 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
2384 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
2385 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
2389 /**************************
2390 * CALCULATE INTERACTIONS *
2391 **************************/
2393 if (gmx_mm_any_lt(rsq10,rcutoff2))
2396 r10 = _mm_mul_ps(rsq10,rinv10);
2397 r10 = _mm_andnot_ps(dummy_mask,r10);
2399 /* EWALD ELECTROSTATICS */
2401 /* Analytical PME correction */
2402 zeta2 = _mm_mul_ps(beta2,rsq10);
2403 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
2404 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2405 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2406 felec = _mm_mul_ps(qq10,felec);
2407 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2408 velec = _mm_nmacc_ps(pmecorrV,beta,rinv10);
2409 velec = _mm_mul_ps(qq10,velec);
2411 d = _mm_sub_ps(r10,rswitch);
2412 d = _mm_max_ps(d,_mm_setzero_ps());
2413 d2 = _mm_mul_ps(d,d);
2414 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2416 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2418 /* Evaluate switch function */
2419 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2420 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2421 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2425 fscal = _mm_and_ps(fscal,cutoff_mask);
2427 fscal = _mm_andnot_ps(dummy_mask,fscal);
2429 /* Update vectorial force */
2430 fix1 = _mm_macc_ps(dx10,fscal,fix1);
2431 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
2432 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
2434 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
2435 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
2436 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
2440 /**************************
2441 * CALCULATE INTERACTIONS *
2442 **************************/
2444 if (gmx_mm_any_lt(rsq11,rcutoff2))
2447 r11 = _mm_mul_ps(rsq11,rinv11);
2448 r11 = _mm_andnot_ps(dummy_mask,r11);
2450 /* EWALD ELECTROSTATICS */
2452 /* Analytical PME correction */
2453 zeta2 = _mm_mul_ps(beta2,rsq11);
2454 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
2455 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2456 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2457 felec = _mm_mul_ps(qq11,felec);
2458 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2459 velec = _mm_nmacc_ps(pmecorrV,beta,rinv11);
2460 velec = _mm_mul_ps(qq11,velec);
2462 d = _mm_sub_ps(r11,rswitch);
2463 d = _mm_max_ps(d,_mm_setzero_ps());
2464 d2 = _mm_mul_ps(d,d);
2465 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2467 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2469 /* Evaluate switch function */
2470 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2471 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2472 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2476 fscal = _mm_and_ps(fscal,cutoff_mask);
2478 fscal = _mm_andnot_ps(dummy_mask,fscal);
2480 /* Update vectorial force */
2481 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2482 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2483 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2485 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2486 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2487 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2491 /**************************
2492 * CALCULATE INTERACTIONS *
2493 **************************/
2495 if (gmx_mm_any_lt(rsq12,rcutoff2))
2498 r12 = _mm_mul_ps(rsq12,rinv12);
2499 r12 = _mm_andnot_ps(dummy_mask,r12);
2501 /* EWALD ELECTROSTATICS */
2503 /* Analytical PME correction */
2504 zeta2 = _mm_mul_ps(beta2,rsq12);
2505 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
2506 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2507 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2508 felec = _mm_mul_ps(qq12,felec);
2509 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2510 velec = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2511 velec = _mm_mul_ps(qq12,velec);
2513 d = _mm_sub_ps(r12,rswitch);
2514 d = _mm_max_ps(d,_mm_setzero_ps());
2515 d2 = _mm_mul_ps(d,d);
2516 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2518 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2520 /* Evaluate switch function */
2521 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2522 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2523 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2527 fscal = _mm_and_ps(fscal,cutoff_mask);
2529 fscal = _mm_andnot_ps(dummy_mask,fscal);
2531 /* Update vectorial force */
2532 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2533 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2534 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2536 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2537 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2538 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2542 /**************************
2543 * CALCULATE INTERACTIONS *
2544 **************************/
2546 if (gmx_mm_any_lt(rsq20,rcutoff2))
2549 r20 = _mm_mul_ps(rsq20,rinv20);
2550 r20 = _mm_andnot_ps(dummy_mask,r20);
2552 /* EWALD ELECTROSTATICS */
2554 /* Analytical PME correction */
2555 zeta2 = _mm_mul_ps(beta2,rsq20);
2556 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
2557 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2558 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2559 felec = _mm_mul_ps(qq20,felec);
2560 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2561 velec = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2562 velec = _mm_mul_ps(qq20,velec);
2564 d = _mm_sub_ps(r20,rswitch);
2565 d = _mm_max_ps(d,_mm_setzero_ps());
2566 d2 = _mm_mul_ps(d,d);
2567 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2569 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2571 /* Evaluate switch function */
2572 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2573 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2574 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2578 fscal = _mm_and_ps(fscal,cutoff_mask);
2580 fscal = _mm_andnot_ps(dummy_mask,fscal);
2582 /* Update vectorial force */
2583 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2584 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2585 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2587 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2588 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2589 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2593 /**************************
2594 * CALCULATE INTERACTIONS *
2595 **************************/
2597 if (gmx_mm_any_lt(rsq21,rcutoff2))
2600 r21 = _mm_mul_ps(rsq21,rinv21);
2601 r21 = _mm_andnot_ps(dummy_mask,r21);
2603 /* EWALD ELECTROSTATICS */
2605 /* Analytical PME correction */
2606 zeta2 = _mm_mul_ps(beta2,rsq21);
2607 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2608 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2609 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2610 felec = _mm_mul_ps(qq21,felec);
2611 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2612 velec = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2613 velec = _mm_mul_ps(qq21,velec);
2615 d = _mm_sub_ps(r21,rswitch);
2616 d = _mm_max_ps(d,_mm_setzero_ps());
2617 d2 = _mm_mul_ps(d,d);
2618 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2620 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2622 /* Evaluate switch function */
2623 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2624 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2625 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2629 fscal = _mm_and_ps(fscal,cutoff_mask);
2631 fscal = _mm_andnot_ps(dummy_mask,fscal);
2633 /* Update vectorial force */
2634 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2635 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2636 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2638 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2639 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2640 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2644 /**************************
2645 * CALCULATE INTERACTIONS *
2646 **************************/
2648 if (gmx_mm_any_lt(rsq22,rcutoff2))
2651 r22 = _mm_mul_ps(rsq22,rinv22);
2652 r22 = _mm_andnot_ps(dummy_mask,r22);
2654 /* EWALD ELECTROSTATICS */
2656 /* Analytical PME correction */
2657 zeta2 = _mm_mul_ps(beta2,rsq22);
2658 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2659 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2660 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2661 felec = _mm_mul_ps(qq22,felec);
2662 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
2663 velec = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2664 velec = _mm_mul_ps(qq22,velec);
2666 d = _mm_sub_ps(r22,rswitch);
2667 d = _mm_max_ps(d,_mm_setzero_ps());
2668 d2 = _mm_mul_ps(d,d);
2669 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2671 dsw = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2673 /* Evaluate switch function */
2674 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2675 felec = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2676 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2680 fscal = _mm_and_ps(fscal,cutoff_mask);
2682 fscal = _mm_andnot_ps(dummy_mask,fscal);
2684 /* Update vectorial force */
2685 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2686 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2687 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2689 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2690 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2691 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2695 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2696 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2697 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2698 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2700 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2701 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2703 /* Inner loop uses 459 flops */
2706 /* End of innermost loop */
2708 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2709 f+i_coord_offset,fshift+i_shift_offset);
2711 /* Increment number of inner iterations */
2712 inneriter += j_index_end - j_index_start;
2714 /* Outer loop uses 18 flops */
2717 /* Increment number of outer iterations */
2720 /* Update outer/inner flops */
2722 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*459);