2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
111 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
113 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
116 __m128 dummy_mask,cutoff_mask;
117 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118 __m128 one = _mm_set1_ps(1.0);
119 __m128 two = _mm_set1_ps(2.0);
125 jindex = nlist->jindex;
127 shiftidx = nlist->shift;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = _mm_set1_ps(fr->epsfac);
132 charge = mdatoms->chargeA;
133 nvdwtype = fr->ntype;
135 vdwtype = mdatoms->typeA;
137 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
138 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
139 beta2 = _mm_mul_ps(beta,beta);
140 beta3 = _mm_mul_ps(beta,beta2);
141 ewtab = fr->ic->tabq_coul_FDV0;
142 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
143 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
145 /* Setup water-specific parameters */
146 inr = nlist->iinr[0];
147 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
148 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
149 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
150 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
152 jq0 = _mm_set1_ps(charge[inr+0]);
153 jq1 = _mm_set1_ps(charge[inr+1]);
154 jq2 = _mm_set1_ps(charge[inr+2]);
155 vdwjidx0A = 2*vdwtype[inr+0];
156 qq00 = _mm_mul_ps(iq0,jq0);
157 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
158 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
159 qq01 = _mm_mul_ps(iq0,jq1);
160 qq02 = _mm_mul_ps(iq0,jq2);
161 qq10 = _mm_mul_ps(iq1,jq0);
162 qq11 = _mm_mul_ps(iq1,jq1);
163 qq12 = _mm_mul_ps(iq1,jq2);
164 qq20 = _mm_mul_ps(iq2,jq0);
165 qq21 = _mm_mul_ps(iq2,jq1);
166 qq22 = _mm_mul_ps(iq2,jq2);
168 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169 rcutoff_scalar = fr->rcoulomb;
170 rcutoff = _mm_set1_ps(rcutoff_scalar);
171 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
173 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
174 rvdw = _mm_set1_ps(fr->rvdw);
176 /* Avoid stupid compiler warnings */
177 jnrA = jnrB = jnrC = jnrD = 0;
186 for(iidx=0;iidx<4*DIM;iidx++)
191 /* Start outer loop over neighborlists */
192 for(iidx=0; iidx<nri; iidx++)
194 /* Load shift vector for this list */
195 i_shift_offset = DIM*shiftidx[iidx];
197 /* Load limits for loop over neighbors */
198 j_index_start = jindex[iidx];
199 j_index_end = jindex[iidx+1];
201 /* Get outer coordinate index */
203 i_coord_offset = DIM*inr;
205 /* Load i particle coords and add shift vector */
206 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
207 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
209 fix0 = _mm_setzero_ps();
210 fiy0 = _mm_setzero_ps();
211 fiz0 = _mm_setzero_ps();
212 fix1 = _mm_setzero_ps();
213 fiy1 = _mm_setzero_ps();
214 fiz1 = _mm_setzero_ps();
215 fix2 = _mm_setzero_ps();
216 fiy2 = _mm_setzero_ps();
217 fiz2 = _mm_setzero_ps();
219 /* Reset potential sums */
220 velecsum = _mm_setzero_ps();
221 vvdwsum = _mm_setzero_ps();
223 /* Start inner kernel loop */
224 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
227 /* Get j neighbor index, and coordinate index */
232 j_coord_offsetA = DIM*jnrA;
233 j_coord_offsetB = DIM*jnrB;
234 j_coord_offsetC = DIM*jnrC;
235 j_coord_offsetD = DIM*jnrD;
237 /* load j atom coordinates */
238 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
239 x+j_coord_offsetC,x+j_coord_offsetD,
240 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
242 /* Calculate displacement vector */
243 dx00 = _mm_sub_ps(ix0,jx0);
244 dy00 = _mm_sub_ps(iy0,jy0);
245 dz00 = _mm_sub_ps(iz0,jz0);
246 dx01 = _mm_sub_ps(ix0,jx1);
247 dy01 = _mm_sub_ps(iy0,jy1);
248 dz01 = _mm_sub_ps(iz0,jz1);
249 dx02 = _mm_sub_ps(ix0,jx2);
250 dy02 = _mm_sub_ps(iy0,jy2);
251 dz02 = _mm_sub_ps(iz0,jz2);
252 dx10 = _mm_sub_ps(ix1,jx0);
253 dy10 = _mm_sub_ps(iy1,jy0);
254 dz10 = _mm_sub_ps(iz1,jz0);
255 dx11 = _mm_sub_ps(ix1,jx1);
256 dy11 = _mm_sub_ps(iy1,jy1);
257 dz11 = _mm_sub_ps(iz1,jz1);
258 dx12 = _mm_sub_ps(ix1,jx2);
259 dy12 = _mm_sub_ps(iy1,jy2);
260 dz12 = _mm_sub_ps(iz1,jz2);
261 dx20 = _mm_sub_ps(ix2,jx0);
262 dy20 = _mm_sub_ps(iy2,jy0);
263 dz20 = _mm_sub_ps(iz2,jz0);
264 dx21 = _mm_sub_ps(ix2,jx1);
265 dy21 = _mm_sub_ps(iy2,jy1);
266 dz21 = _mm_sub_ps(iz2,jz1);
267 dx22 = _mm_sub_ps(ix2,jx2);
268 dy22 = _mm_sub_ps(iy2,jy2);
269 dz22 = _mm_sub_ps(iz2,jz2);
271 /* Calculate squared distance and things based on it */
272 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
273 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
274 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
275 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
276 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
277 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
278 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
279 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
280 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
282 rinv00 = gmx_mm_invsqrt_ps(rsq00);
283 rinv01 = gmx_mm_invsqrt_ps(rsq01);
284 rinv02 = gmx_mm_invsqrt_ps(rsq02);
285 rinv10 = gmx_mm_invsqrt_ps(rsq10);
286 rinv11 = gmx_mm_invsqrt_ps(rsq11);
287 rinv12 = gmx_mm_invsqrt_ps(rsq12);
288 rinv20 = gmx_mm_invsqrt_ps(rsq20);
289 rinv21 = gmx_mm_invsqrt_ps(rsq21);
290 rinv22 = gmx_mm_invsqrt_ps(rsq22);
292 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
293 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
294 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
295 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
296 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
297 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
298 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
299 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
300 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
302 fjx0 = _mm_setzero_ps();
303 fjy0 = _mm_setzero_ps();
304 fjz0 = _mm_setzero_ps();
305 fjx1 = _mm_setzero_ps();
306 fjy1 = _mm_setzero_ps();
307 fjz1 = _mm_setzero_ps();
308 fjx2 = _mm_setzero_ps();
309 fjy2 = _mm_setzero_ps();
310 fjz2 = _mm_setzero_ps();
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 if (gmx_mm_any_lt(rsq00,rcutoff2))
319 r00 = _mm_mul_ps(rsq00,rinv00);
321 /* EWALD ELECTROSTATICS */
323 /* Analytical PME correction */
324 zeta2 = _mm_mul_ps(beta2,rsq00);
325 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
326 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
327 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
328 felec = _mm_mul_ps(qq00,felec);
329 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
330 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
331 velec = _mm_mul_ps(qq00,velec);
333 /* LENNARD-JONES DISPERSION/REPULSION */
335 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
336 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
337 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
338 vvdw = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
339 _mm_mul_ps( _mm_nmacc_ps(c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
340 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
342 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
344 /* Update potential sum for this i atom from the interaction with this j atom. */
345 velec = _mm_and_ps(velec,cutoff_mask);
346 velecsum = _mm_add_ps(velecsum,velec);
347 vvdw = _mm_and_ps(vvdw,cutoff_mask);
348 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
350 fscal = _mm_add_ps(felec,fvdw);
352 fscal = _mm_and_ps(fscal,cutoff_mask);
354 /* Update vectorial force */
355 fix0 = _mm_macc_ps(dx00,fscal,fix0);
356 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
357 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
359 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
360 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
361 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
365 /**************************
366 * CALCULATE INTERACTIONS *
367 **************************/
369 if (gmx_mm_any_lt(rsq01,rcutoff2))
372 r01 = _mm_mul_ps(rsq01,rinv01);
374 /* EWALD ELECTROSTATICS */
376 /* Analytical PME correction */
377 zeta2 = _mm_mul_ps(beta2,rsq01);
378 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
379 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
380 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
381 felec = _mm_mul_ps(qq01,felec);
382 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
383 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv01,sh_ewald));
384 velec = _mm_mul_ps(qq01,velec);
386 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
388 /* Update potential sum for this i atom from the interaction with this j atom. */
389 velec = _mm_and_ps(velec,cutoff_mask);
390 velecsum = _mm_add_ps(velecsum,velec);
394 fscal = _mm_and_ps(fscal,cutoff_mask);
396 /* Update vectorial force */
397 fix0 = _mm_macc_ps(dx01,fscal,fix0);
398 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
399 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
401 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
402 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
403 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
407 /**************************
408 * CALCULATE INTERACTIONS *
409 **************************/
411 if (gmx_mm_any_lt(rsq02,rcutoff2))
414 r02 = _mm_mul_ps(rsq02,rinv02);
416 /* EWALD ELECTROSTATICS */
418 /* Analytical PME correction */
419 zeta2 = _mm_mul_ps(beta2,rsq02);
420 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
421 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
422 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
423 felec = _mm_mul_ps(qq02,felec);
424 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
425 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv02,sh_ewald));
426 velec = _mm_mul_ps(qq02,velec);
428 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velec = _mm_and_ps(velec,cutoff_mask);
432 velecsum = _mm_add_ps(velecsum,velec);
436 fscal = _mm_and_ps(fscal,cutoff_mask);
438 /* Update vectorial force */
439 fix0 = _mm_macc_ps(dx02,fscal,fix0);
440 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
441 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
443 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
444 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
445 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 if (gmx_mm_any_lt(rsq10,rcutoff2))
456 r10 = _mm_mul_ps(rsq10,rinv10);
458 /* EWALD ELECTROSTATICS */
460 /* Analytical PME correction */
461 zeta2 = _mm_mul_ps(beta2,rsq10);
462 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
463 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
464 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
465 felec = _mm_mul_ps(qq10,felec);
466 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
467 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
468 velec = _mm_mul_ps(qq10,velec);
470 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
472 /* Update potential sum for this i atom from the interaction with this j atom. */
473 velec = _mm_and_ps(velec,cutoff_mask);
474 velecsum = _mm_add_ps(velecsum,velec);
478 fscal = _mm_and_ps(fscal,cutoff_mask);
480 /* Update vectorial force */
481 fix1 = _mm_macc_ps(dx10,fscal,fix1);
482 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
483 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
485 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
486 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
487 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
495 if (gmx_mm_any_lt(rsq11,rcutoff2))
498 r11 = _mm_mul_ps(rsq11,rinv11);
500 /* EWALD ELECTROSTATICS */
502 /* Analytical PME correction */
503 zeta2 = _mm_mul_ps(beta2,rsq11);
504 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
505 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
506 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
507 felec = _mm_mul_ps(qq11,felec);
508 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
509 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
510 velec = _mm_mul_ps(qq11,velec);
512 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
514 /* Update potential sum for this i atom from the interaction with this j atom. */
515 velec = _mm_and_ps(velec,cutoff_mask);
516 velecsum = _mm_add_ps(velecsum,velec);
520 fscal = _mm_and_ps(fscal,cutoff_mask);
522 /* Update vectorial force */
523 fix1 = _mm_macc_ps(dx11,fscal,fix1);
524 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
525 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
527 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
528 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
529 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
533 /**************************
534 * CALCULATE INTERACTIONS *
535 **************************/
537 if (gmx_mm_any_lt(rsq12,rcutoff2))
540 r12 = _mm_mul_ps(rsq12,rinv12);
542 /* EWALD ELECTROSTATICS */
544 /* Analytical PME correction */
545 zeta2 = _mm_mul_ps(beta2,rsq12);
546 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
547 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
548 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
549 felec = _mm_mul_ps(qq12,felec);
550 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
551 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
552 velec = _mm_mul_ps(qq12,velec);
554 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
556 /* Update potential sum for this i atom from the interaction with this j atom. */
557 velec = _mm_and_ps(velec,cutoff_mask);
558 velecsum = _mm_add_ps(velecsum,velec);
562 fscal = _mm_and_ps(fscal,cutoff_mask);
564 /* Update vectorial force */
565 fix1 = _mm_macc_ps(dx12,fscal,fix1);
566 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
567 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
569 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
570 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
571 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
575 /**************************
576 * CALCULATE INTERACTIONS *
577 **************************/
579 if (gmx_mm_any_lt(rsq20,rcutoff2))
582 r20 = _mm_mul_ps(rsq20,rinv20);
584 /* EWALD ELECTROSTATICS */
586 /* Analytical PME correction */
587 zeta2 = _mm_mul_ps(beta2,rsq20);
588 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
589 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
590 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
591 felec = _mm_mul_ps(qq20,felec);
592 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
593 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
594 velec = _mm_mul_ps(qq20,velec);
596 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
598 /* Update potential sum for this i atom from the interaction with this j atom. */
599 velec = _mm_and_ps(velec,cutoff_mask);
600 velecsum = _mm_add_ps(velecsum,velec);
604 fscal = _mm_and_ps(fscal,cutoff_mask);
606 /* Update vectorial force */
607 fix2 = _mm_macc_ps(dx20,fscal,fix2);
608 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
609 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
611 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
612 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
613 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
617 /**************************
618 * CALCULATE INTERACTIONS *
619 **************************/
621 if (gmx_mm_any_lt(rsq21,rcutoff2))
624 r21 = _mm_mul_ps(rsq21,rinv21);
626 /* EWALD ELECTROSTATICS */
628 /* Analytical PME correction */
629 zeta2 = _mm_mul_ps(beta2,rsq21);
630 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
631 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
632 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
633 felec = _mm_mul_ps(qq21,felec);
634 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
635 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
636 velec = _mm_mul_ps(qq21,velec);
638 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
640 /* Update potential sum for this i atom from the interaction with this j atom. */
641 velec = _mm_and_ps(velec,cutoff_mask);
642 velecsum = _mm_add_ps(velecsum,velec);
646 fscal = _mm_and_ps(fscal,cutoff_mask);
648 /* Update vectorial force */
649 fix2 = _mm_macc_ps(dx21,fscal,fix2);
650 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
651 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
653 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
654 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
655 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
659 /**************************
660 * CALCULATE INTERACTIONS *
661 **************************/
663 if (gmx_mm_any_lt(rsq22,rcutoff2))
666 r22 = _mm_mul_ps(rsq22,rinv22);
668 /* EWALD ELECTROSTATICS */
670 /* Analytical PME correction */
671 zeta2 = _mm_mul_ps(beta2,rsq22);
672 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
673 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
674 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
675 felec = _mm_mul_ps(qq22,felec);
676 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
677 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
678 velec = _mm_mul_ps(qq22,velec);
680 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
682 /* Update potential sum for this i atom from the interaction with this j atom. */
683 velec = _mm_and_ps(velec,cutoff_mask);
684 velecsum = _mm_add_ps(velecsum,velec);
688 fscal = _mm_and_ps(fscal,cutoff_mask);
690 /* Update vectorial force */
691 fix2 = _mm_macc_ps(dx22,fscal,fix2);
692 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
693 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
695 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
696 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
697 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
701 fjptrA = f+j_coord_offsetA;
702 fjptrB = f+j_coord_offsetB;
703 fjptrC = f+j_coord_offsetC;
704 fjptrD = f+j_coord_offsetD;
706 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
707 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
709 /* Inner loop uses 315 flops */
715 /* Get j neighbor index, and coordinate index */
716 jnrlistA = jjnr[jidx];
717 jnrlistB = jjnr[jidx+1];
718 jnrlistC = jjnr[jidx+2];
719 jnrlistD = jjnr[jidx+3];
720 /* Sign of each element will be negative for non-real atoms.
721 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
722 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
724 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
725 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
726 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
727 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
728 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
729 j_coord_offsetA = DIM*jnrA;
730 j_coord_offsetB = DIM*jnrB;
731 j_coord_offsetC = DIM*jnrC;
732 j_coord_offsetD = DIM*jnrD;
734 /* load j atom coordinates */
735 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
736 x+j_coord_offsetC,x+j_coord_offsetD,
737 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
739 /* Calculate displacement vector */
740 dx00 = _mm_sub_ps(ix0,jx0);
741 dy00 = _mm_sub_ps(iy0,jy0);
742 dz00 = _mm_sub_ps(iz0,jz0);
743 dx01 = _mm_sub_ps(ix0,jx1);
744 dy01 = _mm_sub_ps(iy0,jy1);
745 dz01 = _mm_sub_ps(iz0,jz1);
746 dx02 = _mm_sub_ps(ix0,jx2);
747 dy02 = _mm_sub_ps(iy0,jy2);
748 dz02 = _mm_sub_ps(iz0,jz2);
749 dx10 = _mm_sub_ps(ix1,jx0);
750 dy10 = _mm_sub_ps(iy1,jy0);
751 dz10 = _mm_sub_ps(iz1,jz0);
752 dx11 = _mm_sub_ps(ix1,jx1);
753 dy11 = _mm_sub_ps(iy1,jy1);
754 dz11 = _mm_sub_ps(iz1,jz1);
755 dx12 = _mm_sub_ps(ix1,jx2);
756 dy12 = _mm_sub_ps(iy1,jy2);
757 dz12 = _mm_sub_ps(iz1,jz2);
758 dx20 = _mm_sub_ps(ix2,jx0);
759 dy20 = _mm_sub_ps(iy2,jy0);
760 dz20 = _mm_sub_ps(iz2,jz0);
761 dx21 = _mm_sub_ps(ix2,jx1);
762 dy21 = _mm_sub_ps(iy2,jy1);
763 dz21 = _mm_sub_ps(iz2,jz1);
764 dx22 = _mm_sub_ps(ix2,jx2);
765 dy22 = _mm_sub_ps(iy2,jy2);
766 dz22 = _mm_sub_ps(iz2,jz2);
768 /* Calculate squared distance and things based on it */
769 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
770 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
771 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
772 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
773 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
774 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
775 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
776 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
777 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
779 rinv00 = gmx_mm_invsqrt_ps(rsq00);
780 rinv01 = gmx_mm_invsqrt_ps(rsq01);
781 rinv02 = gmx_mm_invsqrt_ps(rsq02);
782 rinv10 = gmx_mm_invsqrt_ps(rsq10);
783 rinv11 = gmx_mm_invsqrt_ps(rsq11);
784 rinv12 = gmx_mm_invsqrt_ps(rsq12);
785 rinv20 = gmx_mm_invsqrt_ps(rsq20);
786 rinv21 = gmx_mm_invsqrt_ps(rsq21);
787 rinv22 = gmx_mm_invsqrt_ps(rsq22);
789 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
790 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
791 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
792 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
793 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
794 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
795 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
796 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
797 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
799 fjx0 = _mm_setzero_ps();
800 fjy0 = _mm_setzero_ps();
801 fjz0 = _mm_setzero_ps();
802 fjx1 = _mm_setzero_ps();
803 fjy1 = _mm_setzero_ps();
804 fjz1 = _mm_setzero_ps();
805 fjx2 = _mm_setzero_ps();
806 fjy2 = _mm_setzero_ps();
807 fjz2 = _mm_setzero_ps();
809 /**************************
810 * CALCULATE INTERACTIONS *
811 **************************/
813 if (gmx_mm_any_lt(rsq00,rcutoff2))
816 r00 = _mm_mul_ps(rsq00,rinv00);
817 r00 = _mm_andnot_ps(dummy_mask,r00);
819 /* EWALD ELECTROSTATICS */
821 /* Analytical PME correction */
822 zeta2 = _mm_mul_ps(beta2,rsq00);
823 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
824 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
825 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
826 felec = _mm_mul_ps(qq00,felec);
827 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
828 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
829 velec = _mm_mul_ps(qq00,velec);
831 /* LENNARD-JONES DISPERSION/REPULSION */
833 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
834 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
835 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
836 vvdw = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
837 _mm_mul_ps( _mm_nmacc_ps(c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
838 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
840 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
842 /* Update potential sum for this i atom from the interaction with this j atom. */
843 velec = _mm_and_ps(velec,cutoff_mask);
844 velec = _mm_andnot_ps(dummy_mask,velec);
845 velecsum = _mm_add_ps(velecsum,velec);
846 vvdw = _mm_and_ps(vvdw,cutoff_mask);
847 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
848 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
850 fscal = _mm_add_ps(felec,fvdw);
852 fscal = _mm_and_ps(fscal,cutoff_mask);
854 fscal = _mm_andnot_ps(dummy_mask,fscal);
856 /* Update vectorial force */
857 fix0 = _mm_macc_ps(dx00,fscal,fix0);
858 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
859 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
861 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
862 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
863 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
867 /**************************
868 * CALCULATE INTERACTIONS *
869 **************************/
871 if (gmx_mm_any_lt(rsq01,rcutoff2))
874 r01 = _mm_mul_ps(rsq01,rinv01);
875 r01 = _mm_andnot_ps(dummy_mask,r01);
877 /* EWALD ELECTROSTATICS */
879 /* Analytical PME correction */
880 zeta2 = _mm_mul_ps(beta2,rsq01);
881 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
882 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
883 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
884 felec = _mm_mul_ps(qq01,felec);
885 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
886 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv01,sh_ewald));
887 velec = _mm_mul_ps(qq01,velec);
889 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
891 /* Update potential sum for this i atom from the interaction with this j atom. */
892 velec = _mm_and_ps(velec,cutoff_mask);
893 velec = _mm_andnot_ps(dummy_mask,velec);
894 velecsum = _mm_add_ps(velecsum,velec);
898 fscal = _mm_and_ps(fscal,cutoff_mask);
900 fscal = _mm_andnot_ps(dummy_mask,fscal);
902 /* Update vectorial force */
903 fix0 = _mm_macc_ps(dx01,fscal,fix0);
904 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
905 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
907 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
908 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
909 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
913 /**************************
914 * CALCULATE INTERACTIONS *
915 **************************/
917 if (gmx_mm_any_lt(rsq02,rcutoff2))
920 r02 = _mm_mul_ps(rsq02,rinv02);
921 r02 = _mm_andnot_ps(dummy_mask,r02);
923 /* EWALD ELECTROSTATICS */
925 /* Analytical PME correction */
926 zeta2 = _mm_mul_ps(beta2,rsq02);
927 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
928 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
929 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
930 felec = _mm_mul_ps(qq02,felec);
931 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
932 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv02,sh_ewald));
933 velec = _mm_mul_ps(qq02,velec);
935 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
937 /* Update potential sum for this i atom from the interaction with this j atom. */
938 velec = _mm_and_ps(velec,cutoff_mask);
939 velec = _mm_andnot_ps(dummy_mask,velec);
940 velecsum = _mm_add_ps(velecsum,velec);
944 fscal = _mm_and_ps(fscal,cutoff_mask);
946 fscal = _mm_andnot_ps(dummy_mask,fscal);
948 /* Update vectorial force */
949 fix0 = _mm_macc_ps(dx02,fscal,fix0);
950 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
951 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
953 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
954 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
955 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
959 /**************************
960 * CALCULATE INTERACTIONS *
961 **************************/
963 if (gmx_mm_any_lt(rsq10,rcutoff2))
966 r10 = _mm_mul_ps(rsq10,rinv10);
967 r10 = _mm_andnot_ps(dummy_mask,r10);
969 /* EWALD ELECTROSTATICS */
971 /* Analytical PME correction */
972 zeta2 = _mm_mul_ps(beta2,rsq10);
973 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
974 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
975 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
976 felec = _mm_mul_ps(qq10,felec);
977 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
978 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
979 velec = _mm_mul_ps(qq10,velec);
981 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
983 /* Update potential sum for this i atom from the interaction with this j atom. */
984 velec = _mm_and_ps(velec,cutoff_mask);
985 velec = _mm_andnot_ps(dummy_mask,velec);
986 velecsum = _mm_add_ps(velecsum,velec);
990 fscal = _mm_and_ps(fscal,cutoff_mask);
992 fscal = _mm_andnot_ps(dummy_mask,fscal);
994 /* Update vectorial force */
995 fix1 = _mm_macc_ps(dx10,fscal,fix1);
996 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
997 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
999 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1000 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1001 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1005 /**************************
1006 * CALCULATE INTERACTIONS *
1007 **************************/
1009 if (gmx_mm_any_lt(rsq11,rcutoff2))
1012 r11 = _mm_mul_ps(rsq11,rinv11);
1013 r11 = _mm_andnot_ps(dummy_mask,r11);
1015 /* EWALD ELECTROSTATICS */
1017 /* Analytical PME correction */
1018 zeta2 = _mm_mul_ps(beta2,rsq11);
1019 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1020 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1021 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1022 felec = _mm_mul_ps(qq11,felec);
1023 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1024 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
1025 velec = _mm_mul_ps(qq11,velec);
1027 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1029 /* Update potential sum for this i atom from the interaction with this j atom. */
1030 velec = _mm_and_ps(velec,cutoff_mask);
1031 velec = _mm_andnot_ps(dummy_mask,velec);
1032 velecsum = _mm_add_ps(velecsum,velec);
1036 fscal = _mm_and_ps(fscal,cutoff_mask);
1038 fscal = _mm_andnot_ps(dummy_mask,fscal);
1040 /* Update vectorial force */
1041 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1042 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1043 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1045 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1046 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1047 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1051 /**************************
1052 * CALCULATE INTERACTIONS *
1053 **************************/
1055 if (gmx_mm_any_lt(rsq12,rcutoff2))
1058 r12 = _mm_mul_ps(rsq12,rinv12);
1059 r12 = _mm_andnot_ps(dummy_mask,r12);
1061 /* EWALD ELECTROSTATICS */
1063 /* Analytical PME correction */
1064 zeta2 = _mm_mul_ps(beta2,rsq12);
1065 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1066 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1067 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1068 felec = _mm_mul_ps(qq12,felec);
1069 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1070 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
1071 velec = _mm_mul_ps(qq12,velec);
1073 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1075 /* Update potential sum for this i atom from the interaction with this j atom. */
1076 velec = _mm_and_ps(velec,cutoff_mask);
1077 velec = _mm_andnot_ps(dummy_mask,velec);
1078 velecsum = _mm_add_ps(velecsum,velec);
1082 fscal = _mm_and_ps(fscal,cutoff_mask);
1084 fscal = _mm_andnot_ps(dummy_mask,fscal);
1086 /* Update vectorial force */
1087 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1088 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1089 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1091 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1092 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1093 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1097 /**************************
1098 * CALCULATE INTERACTIONS *
1099 **************************/
1101 if (gmx_mm_any_lt(rsq20,rcutoff2))
1104 r20 = _mm_mul_ps(rsq20,rinv20);
1105 r20 = _mm_andnot_ps(dummy_mask,r20);
1107 /* EWALD ELECTROSTATICS */
1109 /* Analytical PME correction */
1110 zeta2 = _mm_mul_ps(beta2,rsq20);
1111 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1112 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1113 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1114 felec = _mm_mul_ps(qq20,felec);
1115 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1116 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
1117 velec = _mm_mul_ps(qq20,velec);
1119 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1121 /* Update potential sum for this i atom from the interaction with this j atom. */
1122 velec = _mm_and_ps(velec,cutoff_mask);
1123 velec = _mm_andnot_ps(dummy_mask,velec);
1124 velecsum = _mm_add_ps(velecsum,velec);
1128 fscal = _mm_and_ps(fscal,cutoff_mask);
1130 fscal = _mm_andnot_ps(dummy_mask,fscal);
1132 /* Update vectorial force */
1133 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1134 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1135 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1137 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1138 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1139 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1143 /**************************
1144 * CALCULATE INTERACTIONS *
1145 **************************/
1147 if (gmx_mm_any_lt(rsq21,rcutoff2))
1150 r21 = _mm_mul_ps(rsq21,rinv21);
1151 r21 = _mm_andnot_ps(dummy_mask,r21);
1153 /* EWALD ELECTROSTATICS */
1155 /* Analytical PME correction */
1156 zeta2 = _mm_mul_ps(beta2,rsq21);
1157 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1158 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1159 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1160 felec = _mm_mul_ps(qq21,felec);
1161 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1162 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
1163 velec = _mm_mul_ps(qq21,velec);
1165 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1167 /* Update potential sum for this i atom from the interaction with this j atom. */
1168 velec = _mm_and_ps(velec,cutoff_mask);
1169 velec = _mm_andnot_ps(dummy_mask,velec);
1170 velecsum = _mm_add_ps(velecsum,velec);
1174 fscal = _mm_and_ps(fscal,cutoff_mask);
1176 fscal = _mm_andnot_ps(dummy_mask,fscal);
1178 /* Update vectorial force */
1179 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1180 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1181 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1183 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1184 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1185 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1189 /**************************
1190 * CALCULATE INTERACTIONS *
1191 **************************/
1193 if (gmx_mm_any_lt(rsq22,rcutoff2))
1196 r22 = _mm_mul_ps(rsq22,rinv22);
1197 r22 = _mm_andnot_ps(dummy_mask,r22);
1199 /* EWALD ELECTROSTATICS */
1201 /* Analytical PME correction */
1202 zeta2 = _mm_mul_ps(beta2,rsq22);
1203 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1204 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1205 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1206 felec = _mm_mul_ps(qq22,felec);
1207 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1208 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
1209 velec = _mm_mul_ps(qq22,velec);
1211 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1213 /* Update potential sum for this i atom from the interaction with this j atom. */
1214 velec = _mm_and_ps(velec,cutoff_mask);
1215 velec = _mm_andnot_ps(dummy_mask,velec);
1216 velecsum = _mm_add_ps(velecsum,velec);
1220 fscal = _mm_and_ps(fscal,cutoff_mask);
1222 fscal = _mm_andnot_ps(dummy_mask,fscal);
1224 /* Update vectorial force */
1225 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1226 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1227 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1229 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1230 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1231 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1235 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1236 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1237 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1238 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1240 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1241 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1243 /* Inner loop uses 324 flops */
1246 /* End of innermost loop */
1248 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1249 f+i_coord_offset,fshift+i_shift_offset);
1252 /* Update potential energies */
1253 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1254 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1256 /* Increment number of inner iterations */
1257 inneriter += j_index_end - j_index_start;
1259 /* Outer loop uses 20 flops */
1262 /* Increment number of outer iterations */
1265 /* Update outer/inner flops */
1267 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*324);
1270 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_single
1271 * Electrostatics interaction: Ewald
1272 * VdW interaction: LennardJones
1273 * Geometry: Water3-Water3
1274 * Calculate force/pot: Force
1277 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_single
1278 (t_nblist * gmx_restrict nlist,
1279 rvec * gmx_restrict xx,
1280 rvec * gmx_restrict ff,
1281 t_forcerec * gmx_restrict fr,
1282 t_mdatoms * gmx_restrict mdatoms,
1283 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1284 t_nrnb * gmx_restrict nrnb)
1286 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1287 * just 0 for non-waters.
1288 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1289 * jnr indices corresponding to data put in the four positions in the SIMD register.
1291 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1292 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1293 int jnrA,jnrB,jnrC,jnrD;
1294 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1295 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1296 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1297 real rcutoff_scalar;
1298 real *shiftvec,*fshift,*x,*f;
1299 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1300 real scratch[4*DIM];
1301 __m128 fscal,rcutoff,rcutoff2,jidxall;
1303 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1305 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1307 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1308 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1309 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1310 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1311 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1312 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1313 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1314 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1315 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1316 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1317 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1318 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1319 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1320 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1321 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1322 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1323 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1326 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1329 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1330 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1332 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1333 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1335 __m128 dummy_mask,cutoff_mask;
1336 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1337 __m128 one = _mm_set1_ps(1.0);
1338 __m128 two = _mm_set1_ps(2.0);
1344 jindex = nlist->jindex;
1346 shiftidx = nlist->shift;
1348 shiftvec = fr->shift_vec[0];
1349 fshift = fr->fshift[0];
1350 facel = _mm_set1_ps(fr->epsfac);
1351 charge = mdatoms->chargeA;
1352 nvdwtype = fr->ntype;
1353 vdwparam = fr->nbfp;
1354 vdwtype = mdatoms->typeA;
1356 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1357 beta = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1358 beta2 = _mm_mul_ps(beta,beta);
1359 beta3 = _mm_mul_ps(beta,beta2);
1360 ewtab = fr->ic->tabq_coul_F;
1361 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1362 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1364 /* Setup water-specific parameters */
1365 inr = nlist->iinr[0];
1366 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1367 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1368 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1369 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1371 jq0 = _mm_set1_ps(charge[inr+0]);
1372 jq1 = _mm_set1_ps(charge[inr+1]);
1373 jq2 = _mm_set1_ps(charge[inr+2]);
1374 vdwjidx0A = 2*vdwtype[inr+0];
1375 qq00 = _mm_mul_ps(iq0,jq0);
1376 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1377 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1378 qq01 = _mm_mul_ps(iq0,jq1);
1379 qq02 = _mm_mul_ps(iq0,jq2);
1380 qq10 = _mm_mul_ps(iq1,jq0);
1381 qq11 = _mm_mul_ps(iq1,jq1);
1382 qq12 = _mm_mul_ps(iq1,jq2);
1383 qq20 = _mm_mul_ps(iq2,jq0);
1384 qq21 = _mm_mul_ps(iq2,jq1);
1385 qq22 = _mm_mul_ps(iq2,jq2);
1387 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1388 rcutoff_scalar = fr->rcoulomb;
1389 rcutoff = _mm_set1_ps(rcutoff_scalar);
1390 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1392 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1393 rvdw = _mm_set1_ps(fr->rvdw);
1395 /* Avoid stupid compiler warnings */
1396 jnrA = jnrB = jnrC = jnrD = 0;
1397 j_coord_offsetA = 0;
1398 j_coord_offsetB = 0;
1399 j_coord_offsetC = 0;
1400 j_coord_offsetD = 0;
1405 for(iidx=0;iidx<4*DIM;iidx++)
1407 scratch[iidx] = 0.0;
1410 /* Start outer loop over neighborlists */
1411 for(iidx=0; iidx<nri; iidx++)
1413 /* Load shift vector for this list */
1414 i_shift_offset = DIM*shiftidx[iidx];
1416 /* Load limits for loop over neighbors */
1417 j_index_start = jindex[iidx];
1418 j_index_end = jindex[iidx+1];
1420 /* Get outer coordinate index */
1422 i_coord_offset = DIM*inr;
1424 /* Load i particle coords and add shift vector */
1425 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1426 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1428 fix0 = _mm_setzero_ps();
1429 fiy0 = _mm_setzero_ps();
1430 fiz0 = _mm_setzero_ps();
1431 fix1 = _mm_setzero_ps();
1432 fiy1 = _mm_setzero_ps();
1433 fiz1 = _mm_setzero_ps();
1434 fix2 = _mm_setzero_ps();
1435 fiy2 = _mm_setzero_ps();
1436 fiz2 = _mm_setzero_ps();
1438 /* Start inner kernel loop */
1439 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1442 /* Get j neighbor index, and coordinate index */
1444 jnrB = jjnr[jidx+1];
1445 jnrC = jjnr[jidx+2];
1446 jnrD = jjnr[jidx+3];
1447 j_coord_offsetA = DIM*jnrA;
1448 j_coord_offsetB = DIM*jnrB;
1449 j_coord_offsetC = DIM*jnrC;
1450 j_coord_offsetD = DIM*jnrD;
1452 /* load j atom coordinates */
1453 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1454 x+j_coord_offsetC,x+j_coord_offsetD,
1455 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1457 /* Calculate displacement vector */
1458 dx00 = _mm_sub_ps(ix0,jx0);
1459 dy00 = _mm_sub_ps(iy0,jy0);
1460 dz00 = _mm_sub_ps(iz0,jz0);
1461 dx01 = _mm_sub_ps(ix0,jx1);
1462 dy01 = _mm_sub_ps(iy0,jy1);
1463 dz01 = _mm_sub_ps(iz0,jz1);
1464 dx02 = _mm_sub_ps(ix0,jx2);
1465 dy02 = _mm_sub_ps(iy0,jy2);
1466 dz02 = _mm_sub_ps(iz0,jz2);
1467 dx10 = _mm_sub_ps(ix1,jx0);
1468 dy10 = _mm_sub_ps(iy1,jy0);
1469 dz10 = _mm_sub_ps(iz1,jz0);
1470 dx11 = _mm_sub_ps(ix1,jx1);
1471 dy11 = _mm_sub_ps(iy1,jy1);
1472 dz11 = _mm_sub_ps(iz1,jz1);
1473 dx12 = _mm_sub_ps(ix1,jx2);
1474 dy12 = _mm_sub_ps(iy1,jy2);
1475 dz12 = _mm_sub_ps(iz1,jz2);
1476 dx20 = _mm_sub_ps(ix2,jx0);
1477 dy20 = _mm_sub_ps(iy2,jy0);
1478 dz20 = _mm_sub_ps(iz2,jz0);
1479 dx21 = _mm_sub_ps(ix2,jx1);
1480 dy21 = _mm_sub_ps(iy2,jy1);
1481 dz21 = _mm_sub_ps(iz2,jz1);
1482 dx22 = _mm_sub_ps(ix2,jx2);
1483 dy22 = _mm_sub_ps(iy2,jy2);
1484 dz22 = _mm_sub_ps(iz2,jz2);
1486 /* Calculate squared distance and things based on it */
1487 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1488 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1489 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1490 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1491 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1492 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1493 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1494 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1495 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1497 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1498 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1499 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1500 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1501 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1502 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1503 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1504 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1505 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1507 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1508 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1509 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1510 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1511 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1512 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1513 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1514 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1515 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1517 fjx0 = _mm_setzero_ps();
1518 fjy0 = _mm_setzero_ps();
1519 fjz0 = _mm_setzero_ps();
1520 fjx1 = _mm_setzero_ps();
1521 fjy1 = _mm_setzero_ps();
1522 fjz1 = _mm_setzero_ps();
1523 fjx2 = _mm_setzero_ps();
1524 fjy2 = _mm_setzero_ps();
1525 fjz2 = _mm_setzero_ps();
1527 /**************************
1528 * CALCULATE INTERACTIONS *
1529 **************************/
1531 if (gmx_mm_any_lt(rsq00,rcutoff2))
1534 r00 = _mm_mul_ps(rsq00,rinv00);
1536 /* EWALD ELECTROSTATICS */
1538 /* Analytical PME correction */
1539 zeta2 = _mm_mul_ps(beta2,rsq00);
1540 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1541 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1542 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1543 felec = _mm_mul_ps(qq00,felec);
1545 /* LENNARD-JONES DISPERSION/REPULSION */
1547 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1548 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1550 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1552 fscal = _mm_add_ps(felec,fvdw);
1554 fscal = _mm_and_ps(fscal,cutoff_mask);
1556 /* Update vectorial force */
1557 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1558 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1559 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1561 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1562 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1563 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1567 /**************************
1568 * CALCULATE INTERACTIONS *
1569 **************************/
1571 if (gmx_mm_any_lt(rsq01,rcutoff2))
1574 r01 = _mm_mul_ps(rsq01,rinv01);
1576 /* EWALD ELECTROSTATICS */
1578 /* Analytical PME correction */
1579 zeta2 = _mm_mul_ps(beta2,rsq01);
1580 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1581 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1582 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1583 felec = _mm_mul_ps(qq01,felec);
1585 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1589 fscal = _mm_and_ps(fscal,cutoff_mask);
1591 /* Update vectorial force */
1592 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1593 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1594 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1596 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1597 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1598 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1602 /**************************
1603 * CALCULATE INTERACTIONS *
1604 **************************/
1606 if (gmx_mm_any_lt(rsq02,rcutoff2))
1609 r02 = _mm_mul_ps(rsq02,rinv02);
1611 /* EWALD ELECTROSTATICS */
1613 /* Analytical PME correction */
1614 zeta2 = _mm_mul_ps(beta2,rsq02);
1615 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1616 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1617 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1618 felec = _mm_mul_ps(qq02,felec);
1620 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1624 fscal = _mm_and_ps(fscal,cutoff_mask);
1626 /* Update vectorial force */
1627 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1628 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1629 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1631 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1632 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1633 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1637 /**************************
1638 * CALCULATE INTERACTIONS *
1639 **************************/
1641 if (gmx_mm_any_lt(rsq10,rcutoff2))
1644 r10 = _mm_mul_ps(rsq10,rinv10);
1646 /* EWALD ELECTROSTATICS */
1648 /* Analytical PME correction */
1649 zeta2 = _mm_mul_ps(beta2,rsq10);
1650 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1651 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1652 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1653 felec = _mm_mul_ps(qq10,felec);
1655 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1659 fscal = _mm_and_ps(fscal,cutoff_mask);
1661 /* Update vectorial force */
1662 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1663 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1664 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1666 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1667 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1668 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1672 /**************************
1673 * CALCULATE INTERACTIONS *
1674 **************************/
1676 if (gmx_mm_any_lt(rsq11,rcutoff2))
1679 r11 = _mm_mul_ps(rsq11,rinv11);
1681 /* EWALD ELECTROSTATICS */
1683 /* Analytical PME correction */
1684 zeta2 = _mm_mul_ps(beta2,rsq11);
1685 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1686 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1687 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1688 felec = _mm_mul_ps(qq11,felec);
1690 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1694 fscal = _mm_and_ps(fscal,cutoff_mask);
1696 /* Update vectorial force */
1697 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1698 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1699 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1701 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1702 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1703 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1707 /**************************
1708 * CALCULATE INTERACTIONS *
1709 **************************/
1711 if (gmx_mm_any_lt(rsq12,rcutoff2))
1714 r12 = _mm_mul_ps(rsq12,rinv12);
1716 /* EWALD ELECTROSTATICS */
1718 /* Analytical PME correction */
1719 zeta2 = _mm_mul_ps(beta2,rsq12);
1720 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1721 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1722 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1723 felec = _mm_mul_ps(qq12,felec);
1725 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1729 fscal = _mm_and_ps(fscal,cutoff_mask);
1731 /* Update vectorial force */
1732 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1733 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1734 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1736 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1737 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1738 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1742 /**************************
1743 * CALCULATE INTERACTIONS *
1744 **************************/
1746 if (gmx_mm_any_lt(rsq20,rcutoff2))
1749 r20 = _mm_mul_ps(rsq20,rinv20);
1751 /* EWALD ELECTROSTATICS */
1753 /* Analytical PME correction */
1754 zeta2 = _mm_mul_ps(beta2,rsq20);
1755 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1756 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1757 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1758 felec = _mm_mul_ps(qq20,felec);
1760 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1764 fscal = _mm_and_ps(fscal,cutoff_mask);
1766 /* Update vectorial force */
1767 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1768 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1769 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1771 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1772 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1773 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1777 /**************************
1778 * CALCULATE INTERACTIONS *
1779 **************************/
1781 if (gmx_mm_any_lt(rsq21,rcutoff2))
1784 r21 = _mm_mul_ps(rsq21,rinv21);
1786 /* EWALD ELECTROSTATICS */
1788 /* Analytical PME correction */
1789 zeta2 = _mm_mul_ps(beta2,rsq21);
1790 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1791 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1792 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1793 felec = _mm_mul_ps(qq21,felec);
1795 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1799 fscal = _mm_and_ps(fscal,cutoff_mask);
1801 /* Update vectorial force */
1802 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1803 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1804 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1806 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1807 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1808 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1812 /**************************
1813 * CALCULATE INTERACTIONS *
1814 **************************/
1816 if (gmx_mm_any_lt(rsq22,rcutoff2))
1819 r22 = _mm_mul_ps(rsq22,rinv22);
1821 /* EWALD ELECTROSTATICS */
1823 /* Analytical PME correction */
1824 zeta2 = _mm_mul_ps(beta2,rsq22);
1825 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1826 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1827 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1828 felec = _mm_mul_ps(qq22,felec);
1830 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1834 fscal = _mm_and_ps(fscal,cutoff_mask);
1836 /* Update vectorial force */
1837 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1838 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1839 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1841 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1842 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1843 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1847 fjptrA = f+j_coord_offsetA;
1848 fjptrB = f+j_coord_offsetB;
1849 fjptrC = f+j_coord_offsetC;
1850 fjptrD = f+j_coord_offsetD;
1852 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1853 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1855 /* Inner loop uses 286 flops */
1858 if(jidx<j_index_end)
1861 /* Get j neighbor index, and coordinate index */
1862 jnrlistA = jjnr[jidx];
1863 jnrlistB = jjnr[jidx+1];
1864 jnrlistC = jjnr[jidx+2];
1865 jnrlistD = jjnr[jidx+3];
1866 /* Sign of each element will be negative for non-real atoms.
1867 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1868 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1870 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1871 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1872 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1873 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1874 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1875 j_coord_offsetA = DIM*jnrA;
1876 j_coord_offsetB = DIM*jnrB;
1877 j_coord_offsetC = DIM*jnrC;
1878 j_coord_offsetD = DIM*jnrD;
1880 /* load j atom coordinates */
1881 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1882 x+j_coord_offsetC,x+j_coord_offsetD,
1883 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1885 /* Calculate displacement vector */
1886 dx00 = _mm_sub_ps(ix0,jx0);
1887 dy00 = _mm_sub_ps(iy0,jy0);
1888 dz00 = _mm_sub_ps(iz0,jz0);
1889 dx01 = _mm_sub_ps(ix0,jx1);
1890 dy01 = _mm_sub_ps(iy0,jy1);
1891 dz01 = _mm_sub_ps(iz0,jz1);
1892 dx02 = _mm_sub_ps(ix0,jx2);
1893 dy02 = _mm_sub_ps(iy0,jy2);
1894 dz02 = _mm_sub_ps(iz0,jz2);
1895 dx10 = _mm_sub_ps(ix1,jx0);
1896 dy10 = _mm_sub_ps(iy1,jy0);
1897 dz10 = _mm_sub_ps(iz1,jz0);
1898 dx11 = _mm_sub_ps(ix1,jx1);
1899 dy11 = _mm_sub_ps(iy1,jy1);
1900 dz11 = _mm_sub_ps(iz1,jz1);
1901 dx12 = _mm_sub_ps(ix1,jx2);
1902 dy12 = _mm_sub_ps(iy1,jy2);
1903 dz12 = _mm_sub_ps(iz1,jz2);
1904 dx20 = _mm_sub_ps(ix2,jx0);
1905 dy20 = _mm_sub_ps(iy2,jy0);
1906 dz20 = _mm_sub_ps(iz2,jz0);
1907 dx21 = _mm_sub_ps(ix2,jx1);
1908 dy21 = _mm_sub_ps(iy2,jy1);
1909 dz21 = _mm_sub_ps(iz2,jz1);
1910 dx22 = _mm_sub_ps(ix2,jx2);
1911 dy22 = _mm_sub_ps(iy2,jy2);
1912 dz22 = _mm_sub_ps(iz2,jz2);
1914 /* Calculate squared distance and things based on it */
1915 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1916 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1917 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1918 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1919 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1920 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1921 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1922 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1923 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1925 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1926 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1927 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1928 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1929 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1930 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1931 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1932 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1933 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1935 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1936 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1937 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1938 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1939 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1940 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1941 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1942 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1943 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1945 fjx0 = _mm_setzero_ps();
1946 fjy0 = _mm_setzero_ps();
1947 fjz0 = _mm_setzero_ps();
1948 fjx1 = _mm_setzero_ps();
1949 fjy1 = _mm_setzero_ps();
1950 fjz1 = _mm_setzero_ps();
1951 fjx2 = _mm_setzero_ps();
1952 fjy2 = _mm_setzero_ps();
1953 fjz2 = _mm_setzero_ps();
1955 /**************************
1956 * CALCULATE INTERACTIONS *
1957 **************************/
1959 if (gmx_mm_any_lt(rsq00,rcutoff2))
1962 r00 = _mm_mul_ps(rsq00,rinv00);
1963 r00 = _mm_andnot_ps(dummy_mask,r00);
1965 /* EWALD ELECTROSTATICS */
1967 /* Analytical PME correction */
1968 zeta2 = _mm_mul_ps(beta2,rsq00);
1969 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1970 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1971 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1972 felec = _mm_mul_ps(qq00,felec);
1974 /* LENNARD-JONES DISPERSION/REPULSION */
1976 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1977 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1979 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1981 fscal = _mm_add_ps(felec,fvdw);
1983 fscal = _mm_and_ps(fscal,cutoff_mask);
1985 fscal = _mm_andnot_ps(dummy_mask,fscal);
1987 /* Update vectorial force */
1988 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1989 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1990 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1992 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1993 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1994 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1998 /**************************
1999 * CALCULATE INTERACTIONS *
2000 **************************/
2002 if (gmx_mm_any_lt(rsq01,rcutoff2))
2005 r01 = _mm_mul_ps(rsq01,rinv01);
2006 r01 = _mm_andnot_ps(dummy_mask,r01);
2008 /* EWALD ELECTROSTATICS */
2010 /* Analytical PME correction */
2011 zeta2 = _mm_mul_ps(beta2,rsq01);
2012 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
2013 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2014 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2015 felec = _mm_mul_ps(qq01,felec);
2017 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2021 fscal = _mm_and_ps(fscal,cutoff_mask);
2023 fscal = _mm_andnot_ps(dummy_mask,fscal);
2025 /* Update vectorial force */
2026 fix0 = _mm_macc_ps(dx01,fscal,fix0);
2027 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
2028 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
2030 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
2031 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
2032 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
2036 /**************************
2037 * CALCULATE INTERACTIONS *
2038 **************************/
2040 if (gmx_mm_any_lt(rsq02,rcutoff2))
2043 r02 = _mm_mul_ps(rsq02,rinv02);
2044 r02 = _mm_andnot_ps(dummy_mask,r02);
2046 /* EWALD ELECTROSTATICS */
2048 /* Analytical PME correction */
2049 zeta2 = _mm_mul_ps(beta2,rsq02);
2050 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
2051 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2052 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2053 felec = _mm_mul_ps(qq02,felec);
2055 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2059 fscal = _mm_and_ps(fscal,cutoff_mask);
2061 fscal = _mm_andnot_ps(dummy_mask,fscal);
2063 /* Update vectorial force */
2064 fix0 = _mm_macc_ps(dx02,fscal,fix0);
2065 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
2066 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
2068 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
2069 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
2070 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
2074 /**************************
2075 * CALCULATE INTERACTIONS *
2076 **************************/
2078 if (gmx_mm_any_lt(rsq10,rcutoff2))
2081 r10 = _mm_mul_ps(rsq10,rinv10);
2082 r10 = _mm_andnot_ps(dummy_mask,r10);
2084 /* EWALD ELECTROSTATICS */
2086 /* Analytical PME correction */
2087 zeta2 = _mm_mul_ps(beta2,rsq10);
2088 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
2089 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2090 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2091 felec = _mm_mul_ps(qq10,felec);
2093 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2097 fscal = _mm_and_ps(fscal,cutoff_mask);
2099 fscal = _mm_andnot_ps(dummy_mask,fscal);
2101 /* Update vectorial force */
2102 fix1 = _mm_macc_ps(dx10,fscal,fix1);
2103 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
2104 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
2106 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
2107 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
2108 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
2112 /**************************
2113 * CALCULATE INTERACTIONS *
2114 **************************/
2116 if (gmx_mm_any_lt(rsq11,rcutoff2))
2119 r11 = _mm_mul_ps(rsq11,rinv11);
2120 r11 = _mm_andnot_ps(dummy_mask,r11);
2122 /* EWALD ELECTROSTATICS */
2124 /* Analytical PME correction */
2125 zeta2 = _mm_mul_ps(beta2,rsq11);
2126 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
2127 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2128 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2129 felec = _mm_mul_ps(qq11,felec);
2131 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2135 fscal = _mm_and_ps(fscal,cutoff_mask);
2137 fscal = _mm_andnot_ps(dummy_mask,fscal);
2139 /* Update vectorial force */
2140 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2141 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2142 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2144 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2145 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2146 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2150 /**************************
2151 * CALCULATE INTERACTIONS *
2152 **************************/
2154 if (gmx_mm_any_lt(rsq12,rcutoff2))
2157 r12 = _mm_mul_ps(rsq12,rinv12);
2158 r12 = _mm_andnot_ps(dummy_mask,r12);
2160 /* EWALD ELECTROSTATICS */
2162 /* Analytical PME correction */
2163 zeta2 = _mm_mul_ps(beta2,rsq12);
2164 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
2165 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2166 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2167 felec = _mm_mul_ps(qq12,felec);
2169 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2173 fscal = _mm_and_ps(fscal,cutoff_mask);
2175 fscal = _mm_andnot_ps(dummy_mask,fscal);
2177 /* Update vectorial force */
2178 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2179 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2180 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2182 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2183 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2184 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2188 /**************************
2189 * CALCULATE INTERACTIONS *
2190 **************************/
2192 if (gmx_mm_any_lt(rsq20,rcutoff2))
2195 r20 = _mm_mul_ps(rsq20,rinv20);
2196 r20 = _mm_andnot_ps(dummy_mask,r20);
2198 /* EWALD ELECTROSTATICS */
2200 /* Analytical PME correction */
2201 zeta2 = _mm_mul_ps(beta2,rsq20);
2202 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
2203 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2204 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2205 felec = _mm_mul_ps(qq20,felec);
2207 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2211 fscal = _mm_and_ps(fscal,cutoff_mask);
2213 fscal = _mm_andnot_ps(dummy_mask,fscal);
2215 /* Update vectorial force */
2216 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2217 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2218 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2220 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2221 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2222 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2226 /**************************
2227 * CALCULATE INTERACTIONS *
2228 **************************/
2230 if (gmx_mm_any_lt(rsq21,rcutoff2))
2233 r21 = _mm_mul_ps(rsq21,rinv21);
2234 r21 = _mm_andnot_ps(dummy_mask,r21);
2236 /* EWALD ELECTROSTATICS */
2238 /* Analytical PME correction */
2239 zeta2 = _mm_mul_ps(beta2,rsq21);
2240 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2241 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2242 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2243 felec = _mm_mul_ps(qq21,felec);
2245 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2249 fscal = _mm_and_ps(fscal,cutoff_mask);
2251 fscal = _mm_andnot_ps(dummy_mask,fscal);
2253 /* Update vectorial force */
2254 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2255 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2256 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2258 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2259 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2260 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2264 /**************************
2265 * CALCULATE INTERACTIONS *
2266 **************************/
2268 if (gmx_mm_any_lt(rsq22,rcutoff2))
2271 r22 = _mm_mul_ps(rsq22,rinv22);
2272 r22 = _mm_andnot_ps(dummy_mask,r22);
2274 /* EWALD ELECTROSTATICS */
2276 /* Analytical PME correction */
2277 zeta2 = _mm_mul_ps(beta2,rsq22);
2278 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2279 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2280 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2281 felec = _mm_mul_ps(qq22,felec);
2283 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2287 fscal = _mm_and_ps(fscal,cutoff_mask);
2289 fscal = _mm_andnot_ps(dummy_mask,fscal);
2291 /* Update vectorial force */
2292 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2293 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2294 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2296 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2297 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2298 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2302 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2303 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2304 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2305 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2307 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2308 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2310 /* Inner loop uses 295 flops */
2313 /* End of innermost loop */
2315 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2316 f+i_coord_offset,fshift+i_shift_offset);
2318 /* Increment number of inner iterations */
2319 inneriter += j_index_end - j_index_start;
2321 /* Outer loop uses 18 flops */
2324 /* Increment number of outer iterations */
2327 /* Update outer/inner flops */
2329 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*295);