2 * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
99 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
100 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
102 __m128 dummy_mask,cutoff_mask;
103 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
104 __m128 one = _mm_set1_ps(1.0);
105 __m128 two = _mm_set1_ps(2.0);
111 jindex = nlist->jindex;
113 shiftidx = nlist->shift;
115 shiftvec = fr->shift_vec[0];
116 fshift = fr->fshift[0];
117 facel = _mm_set1_ps(fr->epsfac);
118 charge = mdatoms->chargeA;
119 nvdwtype = fr->ntype;
121 vdwtype = mdatoms->typeA;
123 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
124 beta = _mm_set1_ps(fr->ic->ewaldcoeff);
125 beta2 = _mm_mul_ps(beta,beta);
126 beta3 = _mm_mul_ps(beta,beta2);
127 ewtab = fr->ic->tabq_coul_FDV0;
128 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
129 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
134 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
135 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
136 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
138 jq0 = _mm_set1_ps(charge[inr+0]);
139 jq1 = _mm_set1_ps(charge[inr+1]);
140 jq2 = _mm_set1_ps(charge[inr+2]);
141 vdwjidx0A = 2*vdwtype[inr+0];
142 qq00 = _mm_mul_ps(iq0,jq0);
143 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
144 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
145 qq01 = _mm_mul_ps(iq0,jq1);
146 qq02 = _mm_mul_ps(iq0,jq2);
147 qq10 = _mm_mul_ps(iq1,jq0);
148 qq11 = _mm_mul_ps(iq1,jq1);
149 qq12 = _mm_mul_ps(iq1,jq2);
150 qq20 = _mm_mul_ps(iq2,jq0);
151 qq21 = _mm_mul_ps(iq2,jq1);
152 qq22 = _mm_mul_ps(iq2,jq2);
154 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155 rcutoff_scalar = fr->rcoulomb;
156 rcutoff = _mm_set1_ps(rcutoff_scalar);
157 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
159 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
160 rvdw = _mm_set1_ps(fr->rvdw);
162 /* Avoid stupid compiler warnings */
163 jnrA = jnrB = jnrC = jnrD = 0;
172 for(iidx=0;iidx<4*DIM;iidx++)
177 /* Start outer loop over neighborlists */
178 for(iidx=0; iidx<nri; iidx++)
180 /* Load shift vector for this list */
181 i_shift_offset = DIM*shiftidx[iidx];
183 /* Load limits for loop over neighbors */
184 j_index_start = jindex[iidx];
185 j_index_end = jindex[iidx+1];
187 /* Get outer coordinate index */
189 i_coord_offset = DIM*inr;
191 /* Load i particle coords and add shift vector */
192 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
193 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
195 fix0 = _mm_setzero_ps();
196 fiy0 = _mm_setzero_ps();
197 fiz0 = _mm_setzero_ps();
198 fix1 = _mm_setzero_ps();
199 fiy1 = _mm_setzero_ps();
200 fiz1 = _mm_setzero_ps();
201 fix2 = _mm_setzero_ps();
202 fiy2 = _mm_setzero_ps();
203 fiz2 = _mm_setzero_ps();
205 /* Reset potential sums */
206 velecsum = _mm_setzero_ps();
207 vvdwsum = _mm_setzero_ps();
209 /* Start inner kernel loop */
210 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
213 /* Get j neighbor index, and coordinate index */
218 j_coord_offsetA = DIM*jnrA;
219 j_coord_offsetB = DIM*jnrB;
220 j_coord_offsetC = DIM*jnrC;
221 j_coord_offsetD = DIM*jnrD;
223 /* load j atom coordinates */
224 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
225 x+j_coord_offsetC,x+j_coord_offsetD,
226 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
228 /* Calculate displacement vector */
229 dx00 = _mm_sub_ps(ix0,jx0);
230 dy00 = _mm_sub_ps(iy0,jy0);
231 dz00 = _mm_sub_ps(iz0,jz0);
232 dx01 = _mm_sub_ps(ix0,jx1);
233 dy01 = _mm_sub_ps(iy0,jy1);
234 dz01 = _mm_sub_ps(iz0,jz1);
235 dx02 = _mm_sub_ps(ix0,jx2);
236 dy02 = _mm_sub_ps(iy0,jy2);
237 dz02 = _mm_sub_ps(iz0,jz2);
238 dx10 = _mm_sub_ps(ix1,jx0);
239 dy10 = _mm_sub_ps(iy1,jy0);
240 dz10 = _mm_sub_ps(iz1,jz0);
241 dx11 = _mm_sub_ps(ix1,jx1);
242 dy11 = _mm_sub_ps(iy1,jy1);
243 dz11 = _mm_sub_ps(iz1,jz1);
244 dx12 = _mm_sub_ps(ix1,jx2);
245 dy12 = _mm_sub_ps(iy1,jy2);
246 dz12 = _mm_sub_ps(iz1,jz2);
247 dx20 = _mm_sub_ps(ix2,jx0);
248 dy20 = _mm_sub_ps(iy2,jy0);
249 dz20 = _mm_sub_ps(iz2,jz0);
250 dx21 = _mm_sub_ps(ix2,jx1);
251 dy21 = _mm_sub_ps(iy2,jy1);
252 dz21 = _mm_sub_ps(iz2,jz1);
253 dx22 = _mm_sub_ps(ix2,jx2);
254 dy22 = _mm_sub_ps(iy2,jy2);
255 dz22 = _mm_sub_ps(iz2,jz2);
257 /* Calculate squared distance and things based on it */
258 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
259 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
260 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
261 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
262 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
263 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
264 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
265 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
266 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
268 rinv00 = gmx_mm_invsqrt_ps(rsq00);
269 rinv01 = gmx_mm_invsqrt_ps(rsq01);
270 rinv02 = gmx_mm_invsqrt_ps(rsq02);
271 rinv10 = gmx_mm_invsqrt_ps(rsq10);
272 rinv11 = gmx_mm_invsqrt_ps(rsq11);
273 rinv12 = gmx_mm_invsqrt_ps(rsq12);
274 rinv20 = gmx_mm_invsqrt_ps(rsq20);
275 rinv21 = gmx_mm_invsqrt_ps(rsq21);
276 rinv22 = gmx_mm_invsqrt_ps(rsq22);
278 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
279 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
280 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
281 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
282 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
283 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
284 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
285 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
286 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
288 fjx0 = _mm_setzero_ps();
289 fjy0 = _mm_setzero_ps();
290 fjz0 = _mm_setzero_ps();
291 fjx1 = _mm_setzero_ps();
292 fjy1 = _mm_setzero_ps();
293 fjz1 = _mm_setzero_ps();
294 fjx2 = _mm_setzero_ps();
295 fjy2 = _mm_setzero_ps();
296 fjz2 = _mm_setzero_ps();
298 /**************************
299 * CALCULATE INTERACTIONS *
300 **************************/
302 if (gmx_mm_any_lt(rsq00,rcutoff2))
305 r00 = _mm_mul_ps(rsq00,rinv00);
307 /* EWALD ELECTROSTATICS */
309 /* Analytical PME correction */
310 zeta2 = _mm_mul_ps(beta2,rsq00);
311 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
312 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
313 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
314 felec = _mm_mul_ps(qq00,felec);
315 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
316 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
317 velec = _mm_mul_ps(qq00,velec);
319 /* LENNARD-JONES DISPERSION/REPULSION */
321 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
322 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
323 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
324 vvdw = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
325 _mm_mul_ps( _mm_nmacc_ps(c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
326 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
328 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
330 /* Update potential sum for this i atom from the interaction with this j atom. */
331 velec = _mm_and_ps(velec,cutoff_mask);
332 velecsum = _mm_add_ps(velecsum,velec);
333 vvdw = _mm_and_ps(vvdw,cutoff_mask);
334 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
336 fscal = _mm_add_ps(felec,fvdw);
338 fscal = _mm_and_ps(fscal,cutoff_mask);
340 /* Update vectorial force */
341 fix0 = _mm_macc_ps(dx00,fscal,fix0);
342 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
343 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
345 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
346 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
347 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
351 /**************************
352 * CALCULATE INTERACTIONS *
353 **************************/
355 if (gmx_mm_any_lt(rsq01,rcutoff2))
358 r01 = _mm_mul_ps(rsq01,rinv01);
360 /* EWALD ELECTROSTATICS */
362 /* Analytical PME correction */
363 zeta2 = _mm_mul_ps(beta2,rsq01);
364 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
365 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
366 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
367 felec = _mm_mul_ps(qq01,felec);
368 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
369 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv01,sh_ewald));
370 velec = _mm_mul_ps(qq01,velec);
372 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
374 /* Update potential sum for this i atom from the interaction with this j atom. */
375 velec = _mm_and_ps(velec,cutoff_mask);
376 velecsum = _mm_add_ps(velecsum,velec);
380 fscal = _mm_and_ps(fscal,cutoff_mask);
382 /* Update vectorial force */
383 fix0 = _mm_macc_ps(dx01,fscal,fix0);
384 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
385 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
387 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
388 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
389 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
393 /**************************
394 * CALCULATE INTERACTIONS *
395 **************************/
397 if (gmx_mm_any_lt(rsq02,rcutoff2))
400 r02 = _mm_mul_ps(rsq02,rinv02);
402 /* EWALD ELECTROSTATICS */
404 /* Analytical PME correction */
405 zeta2 = _mm_mul_ps(beta2,rsq02);
406 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
407 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
408 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
409 felec = _mm_mul_ps(qq02,felec);
410 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
411 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv02,sh_ewald));
412 velec = _mm_mul_ps(qq02,velec);
414 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
416 /* Update potential sum for this i atom from the interaction with this j atom. */
417 velec = _mm_and_ps(velec,cutoff_mask);
418 velecsum = _mm_add_ps(velecsum,velec);
422 fscal = _mm_and_ps(fscal,cutoff_mask);
424 /* Update vectorial force */
425 fix0 = _mm_macc_ps(dx02,fscal,fix0);
426 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
427 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
429 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
430 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
431 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
435 /**************************
436 * CALCULATE INTERACTIONS *
437 **************************/
439 if (gmx_mm_any_lt(rsq10,rcutoff2))
442 r10 = _mm_mul_ps(rsq10,rinv10);
444 /* EWALD ELECTROSTATICS */
446 /* Analytical PME correction */
447 zeta2 = _mm_mul_ps(beta2,rsq10);
448 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
449 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
450 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
451 felec = _mm_mul_ps(qq10,felec);
452 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
453 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
454 velec = _mm_mul_ps(qq10,velec);
456 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velec = _mm_and_ps(velec,cutoff_mask);
460 velecsum = _mm_add_ps(velecsum,velec);
464 fscal = _mm_and_ps(fscal,cutoff_mask);
466 /* Update vectorial force */
467 fix1 = _mm_macc_ps(dx10,fscal,fix1);
468 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
469 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
471 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
472 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
473 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
477 /**************************
478 * CALCULATE INTERACTIONS *
479 **************************/
481 if (gmx_mm_any_lt(rsq11,rcutoff2))
484 r11 = _mm_mul_ps(rsq11,rinv11);
486 /* EWALD ELECTROSTATICS */
488 /* Analytical PME correction */
489 zeta2 = _mm_mul_ps(beta2,rsq11);
490 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
491 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
492 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
493 felec = _mm_mul_ps(qq11,felec);
494 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
495 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
496 velec = _mm_mul_ps(qq11,velec);
498 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
500 /* Update potential sum for this i atom from the interaction with this j atom. */
501 velec = _mm_and_ps(velec,cutoff_mask);
502 velecsum = _mm_add_ps(velecsum,velec);
506 fscal = _mm_and_ps(fscal,cutoff_mask);
508 /* Update vectorial force */
509 fix1 = _mm_macc_ps(dx11,fscal,fix1);
510 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
511 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
513 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
514 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
515 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 if (gmx_mm_any_lt(rsq12,rcutoff2))
526 r12 = _mm_mul_ps(rsq12,rinv12);
528 /* EWALD ELECTROSTATICS */
530 /* Analytical PME correction */
531 zeta2 = _mm_mul_ps(beta2,rsq12);
532 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
533 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
534 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
535 felec = _mm_mul_ps(qq12,felec);
536 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
537 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
538 velec = _mm_mul_ps(qq12,velec);
540 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 velec = _mm_and_ps(velec,cutoff_mask);
544 velecsum = _mm_add_ps(velecsum,velec);
548 fscal = _mm_and_ps(fscal,cutoff_mask);
550 /* Update vectorial force */
551 fix1 = _mm_macc_ps(dx12,fscal,fix1);
552 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
553 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
555 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
556 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
557 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
561 /**************************
562 * CALCULATE INTERACTIONS *
563 **************************/
565 if (gmx_mm_any_lt(rsq20,rcutoff2))
568 r20 = _mm_mul_ps(rsq20,rinv20);
570 /* EWALD ELECTROSTATICS */
572 /* Analytical PME correction */
573 zeta2 = _mm_mul_ps(beta2,rsq20);
574 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
575 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
576 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
577 felec = _mm_mul_ps(qq20,felec);
578 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
579 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
580 velec = _mm_mul_ps(qq20,velec);
582 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
584 /* Update potential sum for this i atom from the interaction with this j atom. */
585 velec = _mm_and_ps(velec,cutoff_mask);
586 velecsum = _mm_add_ps(velecsum,velec);
590 fscal = _mm_and_ps(fscal,cutoff_mask);
592 /* Update vectorial force */
593 fix2 = _mm_macc_ps(dx20,fscal,fix2);
594 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
595 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
597 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
598 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
599 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
603 /**************************
604 * CALCULATE INTERACTIONS *
605 **************************/
607 if (gmx_mm_any_lt(rsq21,rcutoff2))
610 r21 = _mm_mul_ps(rsq21,rinv21);
612 /* EWALD ELECTROSTATICS */
614 /* Analytical PME correction */
615 zeta2 = _mm_mul_ps(beta2,rsq21);
616 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
617 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
618 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
619 felec = _mm_mul_ps(qq21,felec);
620 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
621 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
622 velec = _mm_mul_ps(qq21,velec);
624 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
626 /* Update potential sum for this i atom from the interaction with this j atom. */
627 velec = _mm_and_ps(velec,cutoff_mask);
628 velecsum = _mm_add_ps(velecsum,velec);
632 fscal = _mm_and_ps(fscal,cutoff_mask);
634 /* Update vectorial force */
635 fix2 = _mm_macc_ps(dx21,fscal,fix2);
636 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
637 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
639 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
640 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
641 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
649 if (gmx_mm_any_lt(rsq22,rcutoff2))
652 r22 = _mm_mul_ps(rsq22,rinv22);
654 /* EWALD ELECTROSTATICS */
656 /* Analytical PME correction */
657 zeta2 = _mm_mul_ps(beta2,rsq22);
658 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
659 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
660 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
661 felec = _mm_mul_ps(qq22,felec);
662 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
663 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
664 velec = _mm_mul_ps(qq22,velec);
666 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
668 /* Update potential sum for this i atom from the interaction with this j atom. */
669 velec = _mm_and_ps(velec,cutoff_mask);
670 velecsum = _mm_add_ps(velecsum,velec);
674 fscal = _mm_and_ps(fscal,cutoff_mask);
676 /* Update vectorial force */
677 fix2 = _mm_macc_ps(dx22,fscal,fix2);
678 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
679 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
681 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
682 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
683 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
687 fjptrA = f+j_coord_offsetA;
688 fjptrB = f+j_coord_offsetB;
689 fjptrC = f+j_coord_offsetC;
690 fjptrD = f+j_coord_offsetD;
692 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
693 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
695 /* Inner loop uses 315 flops */
701 /* Get j neighbor index, and coordinate index */
702 jnrlistA = jjnr[jidx];
703 jnrlistB = jjnr[jidx+1];
704 jnrlistC = jjnr[jidx+2];
705 jnrlistD = jjnr[jidx+3];
706 /* Sign of each element will be negative for non-real atoms.
707 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
708 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
710 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
711 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
712 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
713 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
714 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
715 j_coord_offsetA = DIM*jnrA;
716 j_coord_offsetB = DIM*jnrB;
717 j_coord_offsetC = DIM*jnrC;
718 j_coord_offsetD = DIM*jnrD;
720 /* load j atom coordinates */
721 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
722 x+j_coord_offsetC,x+j_coord_offsetD,
723 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
725 /* Calculate displacement vector */
726 dx00 = _mm_sub_ps(ix0,jx0);
727 dy00 = _mm_sub_ps(iy0,jy0);
728 dz00 = _mm_sub_ps(iz0,jz0);
729 dx01 = _mm_sub_ps(ix0,jx1);
730 dy01 = _mm_sub_ps(iy0,jy1);
731 dz01 = _mm_sub_ps(iz0,jz1);
732 dx02 = _mm_sub_ps(ix0,jx2);
733 dy02 = _mm_sub_ps(iy0,jy2);
734 dz02 = _mm_sub_ps(iz0,jz2);
735 dx10 = _mm_sub_ps(ix1,jx0);
736 dy10 = _mm_sub_ps(iy1,jy0);
737 dz10 = _mm_sub_ps(iz1,jz0);
738 dx11 = _mm_sub_ps(ix1,jx1);
739 dy11 = _mm_sub_ps(iy1,jy1);
740 dz11 = _mm_sub_ps(iz1,jz1);
741 dx12 = _mm_sub_ps(ix1,jx2);
742 dy12 = _mm_sub_ps(iy1,jy2);
743 dz12 = _mm_sub_ps(iz1,jz2);
744 dx20 = _mm_sub_ps(ix2,jx0);
745 dy20 = _mm_sub_ps(iy2,jy0);
746 dz20 = _mm_sub_ps(iz2,jz0);
747 dx21 = _mm_sub_ps(ix2,jx1);
748 dy21 = _mm_sub_ps(iy2,jy1);
749 dz21 = _mm_sub_ps(iz2,jz1);
750 dx22 = _mm_sub_ps(ix2,jx2);
751 dy22 = _mm_sub_ps(iy2,jy2);
752 dz22 = _mm_sub_ps(iz2,jz2);
754 /* Calculate squared distance and things based on it */
755 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
756 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
757 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
758 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
759 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
760 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
761 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
762 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
763 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
765 rinv00 = gmx_mm_invsqrt_ps(rsq00);
766 rinv01 = gmx_mm_invsqrt_ps(rsq01);
767 rinv02 = gmx_mm_invsqrt_ps(rsq02);
768 rinv10 = gmx_mm_invsqrt_ps(rsq10);
769 rinv11 = gmx_mm_invsqrt_ps(rsq11);
770 rinv12 = gmx_mm_invsqrt_ps(rsq12);
771 rinv20 = gmx_mm_invsqrt_ps(rsq20);
772 rinv21 = gmx_mm_invsqrt_ps(rsq21);
773 rinv22 = gmx_mm_invsqrt_ps(rsq22);
775 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
776 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
777 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
778 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
779 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
780 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
781 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
782 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
783 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
785 fjx0 = _mm_setzero_ps();
786 fjy0 = _mm_setzero_ps();
787 fjz0 = _mm_setzero_ps();
788 fjx1 = _mm_setzero_ps();
789 fjy1 = _mm_setzero_ps();
790 fjz1 = _mm_setzero_ps();
791 fjx2 = _mm_setzero_ps();
792 fjy2 = _mm_setzero_ps();
793 fjz2 = _mm_setzero_ps();
795 /**************************
796 * CALCULATE INTERACTIONS *
797 **************************/
799 if (gmx_mm_any_lt(rsq00,rcutoff2))
802 r00 = _mm_mul_ps(rsq00,rinv00);
803 r00 = _mm_andnot_ps(dummy_mask,r00);
805 /* EWALD ELECTROSTATICS */
807 /* Analytical PME correction */
808 zeta2 = _mm_mul_ps(beta2,rsq00);
809 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
810 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
811 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
812 felec = _mm_mul_ps(qq00,felec);
813 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
814 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
815 velec = _mm_mul_ps(qq00,velec);
817 /* LENNARD-JONES DISPERSION/REPULSION */
819 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
820 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
821 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
822 vvdw = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
823 _mm_mul_ps( _mm_nmacc_ps(c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
824 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
826 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
828 /* Update potential sum for this i atom from the interaction with this j atom. */
829 velec = _mm_and_ps(velec,cutoff_mask);
830 velec = _mm_andnot_ps(dummy_mask,velec);
831 velecsum = _mm_add_ps(velecsum,velec);
832 vvdw = _mm_and_ps(vvdw,cutoff_mask);
833 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
834 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
836 fscal = _mm_add_ps(felec,fvdw);
838 fscal = _mm_and_ps(fscal,cutoff_mask);
840 fscal = _mm_andnot_ps(dummy_mask,fscal);
842 /* Update vectorial force */
843 fix0 = _mm_macc_ps(dx00,fscal,fix0);
844 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
845 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
847 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
848 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
849 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
853 /**************************
854 * CALCULATE INTERACTIONS *
855 **************************/
857 if (gmx_mm_any_lt(rsq01,rcutoff2))
860 r01 = _mm_mul_ps(rsq01,rinv01);
861 r01 = _mm_andnot_ps(dummy_mask,r01);
863 /* EWALD ELECTROSTATICS */
865 /* Analytical PME correction */
866 zeta2 = _mm_mul_ps(beta2,rsq01);
867 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
868 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
869 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
870 felec = _mm_mul_ps(qq01,felec);
871 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
872 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv01,sh_ewald));
873 velec = _mm_mul_ps(qq01,velec);
875 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
877 /* Update potential sum for this i atom from the interaction with this j atom. */
878 velec = _mm_and_ps(velec,cutoff_mask);
879 velec = _mm_andnot_ps(dummy_mask,velec);
880 velecsum = _mm_add_ps(velecsum,velec);
884 fscal = _mm_and_ps(fscal,cutoff_mask);
886 fscal = _mm_andnot_ps(dummy_mask,fscal);
888 /* Update vectorial force */
889 fix0 = _mm_macc_ps(dx01,fscal,fix0);
890 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
891 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
893 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
894 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
895 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
899 /**************************
900 * CALCULATE INTERACTIONS *
901 **************************/
903 if (gmx_mm_any_lt(rsq02,rcutoff2))
906 r02 = _mm_mul_ps(rsq02,rinv02);
907 r02 = _mm_andnot_ps(dummy_mask,r02);
909 /* EWALD ELECTROSTATICS */
911 /* Analytical PME correction */
912 zeta2 = _mm_mul_ps(beta2,rsq02);
913 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
914 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
915 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
916 felec = _mm_mul_ps(qq02,felec);
917 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
918 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv02,sh_ewald));
919 velec = _mm_mul_ps(qq02,velec);
921 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
923 /* Update potential sum for this i atom from the interaction with this j atom. */
924 velec = _mm_and_ps(velec,cutoff_mask);
925 velec = _mm_andnot_ps(dummy_mask,velec);
926 velecsum = _mm_add_ps(velecsum,velec);
930 fscal = _mm_and_ps(fscal,cutoff_mask);
932 fscal = _mm_andnot_ps(dummy_mask,fscal);
934 /* Update vectorial force */
935 fix0 = _mm_macc_ps(dx02,fscal,fix0);
936 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
937 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
939 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
940 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
941 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
945 /**************************
946 * CALCULATE INTERACTIONS *
947 **************************/
949 if (gmx_mm_any_lt(rsq10,rcutoff2))
952 r10 = _mm_mul_ps(rsq10,rinv10);
953 r10 = _mm_andnot_ps(dummy_mask,r10);
955 /* EWALD ELECTROSTATICS */
957 /* Analytical PME correction */
958 zeta2 = _mm_mul_ps(beta2,rsq10);
959 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
960 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
961 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
962 felec = _mm_mul_ps(qq10,felec);
963 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
964 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
965 velec = _mm_mul_ps(qq10,velec);
967 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
969 /* Update potential sum for this i atom from the interaction with this j atom. */
970 velec = _mm_and_ps(velec,cutoff_mask);
971 velec = _mm_andnot_ps(dummy_mask,velec);
972 velecsum = _mm_add_ps(velecsum,velec);
976 fscal = _mm_and_ps(fscal,cutoff_mask);
978 fscal = _mm_andnot_ps(dummy_mask,fscal);
980 /* Update vectorial force */
981 fix1 = _mm_macc_ps(dx10,fscal,fix1);
982 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
983 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
985 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
986 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
987 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 if (gmx_mm_any_lt(rsq11,rcutoff2))
998 r11 = _mm_mul_ps(rsq11,rinv11);
999 r11 = _mm_andnot_ps(dummy_mask,r11);
1001 /* EWALD ELECTROSTATICS */
1003 /* Analytical PME correction */
1004 zeta2 = _mm_mul_ps(beta2,rsq11);
1005 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1006 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1007 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1008 felec = _mm_mul_ps(qq11,felec);
1009 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1010 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv11,sh_ewald));
1011 velec = _mm_mul_ps(qq11,velec);
1013 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1015 /* Update potential sum for this i atom from the interaction with this j atom. */
1016 velec = _mm_and_ps(velec,cutoff_mask);
1017 velec = _mm_andnot_ps(dummy_mask,velec);
1018 velecsum = _mm_add_ps(velecsum,velec);
1022 fscal = _mm_and_ps(fscal,cutoff_mask);
1024 fscal = _mm_andnot_ps(dummy_mask,fscal);
1026 /* Update vectorial force */
1027 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1028 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1029 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1031 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1032 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1033 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1037 /**************************
1038 * CALCULATE INTERACTIONS *
1039 **************************/
1041 if (gmx_mm_any_lt(rsq12,rcutoff2))
1044 r12 = _mm_mul_ps(rsq12,rinv12);
1045 r12 = _mm_andnot_ps(dummy_mask,r12);
1047 /* EWALD ELECTROSTATICS */
1049 /* Analytical PME correction */
1050 zeta2 = _mm_mul_ps(beta2,rsq12);
1051 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1052 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1053 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1054 felec = _mm_mul_ps(qq12,felec);
1055 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1056 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv12,sh_ewald));
1057 velec = _mm_mul_ps(qq12,velec);
1059 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1061 /* Update potential sum for this i atom from the interaction with this j atom. */
1062 velec = _mm_and_ps(velec,cutoff_mask);
1063 velec = _mm_andnot_ps(dummy_mask,velec);
1064 velecsum = _mm_add_ps(velecsum,velec);
1068 fscal = _mm_and_ps(fscal,cutoff_mask);
1070 fscal = _mm_andnot_ps(dummy_mask,fscal);
1072 /* Update vectorial force */
1073 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1074 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1075 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1077 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1078 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1079 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1083 /**************************
1084 * CALCULATE INTERACTIONS *
1085 **************************/
1087 if (gmx_mm_any_lt(rsq20,rcutoff2))
1090 r20 = _mm_mul_ps(rsq20,rinv20);
1091 r20 = _mm_andnot_ps(dummy_mask,r20);
1093 /* EWALD ELECTROSTATICS */
1095 /* Analytical PME correction */
1096 zeta2 = _mm_mul_ps(beta2,rsq20);
1097 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1098 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1099 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1100 felec = _mm_mul_ps(qq20,felec);
1101 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1102 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
1103 velec = _mm_mul_ps(qq20,velec);
1105 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1107 /* Update potential sum for this i atom from the interaction with this j atom. */
1108 velec = _mm_and_ps(velec,cutoff_mask);
1109 velec = _mm_andnot_ps(dummy_mask,velec);
1110 velecsum = _mm_add_ps(velecsum,velec);
1114 fscal = _mm_and_ps(fscal,cutoff_mask);
1116 fscal = _mm_andnot_ps(dummy_mask,fscal);
1118 /* Update vectorial force */
1119 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1120 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1121 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1123 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1124 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1125 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1129 /**************************
1130 * CALCULATE INTERACTIONS *
1131 **************************/
1133 if (gmx_mm_any_lt(rsq21,rcutoff2))
1136 r21 = _mm_mul_ps(rsq21,rinv21);
1137 r21 = _mm_andnot_ps(dummy_mask,r21);
1139 /* EWALD ELECTROSTATICS */
1141 /* Analytical PME correction */
1142 zeta2 = _mm_mul_ps(beta2,rsq21);
1143 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1144 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1145 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1146 felec = _mm_mul_ps(qq21,felec);
1147 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1148 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv21,sh_ewald));
1149 velec = _mm_mul_ps(qq21,velec);
1151 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1153 /* Update potential sum for this i atom from the interaction with this j atom. */
1154 velec = _mm_and_ps(velec,cutoff_mask);
1155 velec = _mm_andnot_ps(dummy_mask,velec);
1156 velecsum = _mm_add_ps(velecsum,velec);
1160 fscal = _mm_and_ps(fscal,cutoff_mask);
1162 fscal = _mm_andnot_ps(dummy_mask,fscal);
1164 /* Update vectorial force */
1165 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1166 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1167 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1169 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1170 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1171 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1175 /**************************
1176 * CALCULATE INTERACTIONS *
1177 **************************/
1179 if (gmx_mm_any_lt(rsq22,rcutoff2))
1182 r22 = _mm_mul_ps(rsq22,rinv22);
1183 r22 = _mm_andnot_ps(dummy_mask,r22);
1185 /* EWALD ELECTROSTATICS */
1187 /* Analytical PME correction */
1188 zeta2 = _mm_mul_ps(beta2,rsq22);
1189 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1190 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1191 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1192 felec = _mm_mul_ps(qq22,felec);
1193 pmecorrV = gmx_mm_pmecorrV_ps(zeta2);
1194 velec = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv22,sh_ewald));
1195 velec = _mm_mul_ps(qq22,velec);
1197 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1199 /* Update potential sum for this i atom from the interaction with this j atom. */
1200 velec = _mm_and_ps(velec,cutoff_mask);
1201 velec = _mm_andnot_ps(dummy_mask,velec);
1202 velecsum = _mm_add_ps(velecsum,velec);
1206 fscal = _mm_and_ps(fscal,cutoff_mask);
1208 fscal = _mm_andnot_ps(dummy_mask,fscal);
1210 /* Update vectorial force */
1211 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1212 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1213 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1215 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1216 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1217 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1221 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1222 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1223 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1224 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1226 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1227 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1229 /* Inner loop uses 324 flops */
1232 /* End of innermost loop */
1234 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1235 f+i_coord_offset,fshift+i_shift_offset);
1238 /* Update potential energies */
1239 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1240 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1242 /* Increment number of inner iterations */
1243 inneriter += j_index_end - j_index_start;
1245 /* Outer loop uses 20 flops */
1248 /* Increment number of outer iterations */
1251 /* Update outer/inner flops */
1253 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*324);
1256 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_single
1257 * Electrostatics interaction: Ewald
1258 * VdW interaction: LennardJones
1259 * Geometry: Water3-Water3
1260 * Calculate force/pot: Force
1263 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_single
1264 (t_nblist * gmx_restrict nlist,
1265 rvec * gmx_restrict xx,
1266 rvec * gmx_restrict ff,
1267 t_forcerec * gmx_restrict fr,
1268 t_mdatoms * gmx_restrict mdatoms,
1269 nb_kernel_data_t * gmx_restrict kernel_data,
1270 t_nrnb * gmx_restrict nrnb)
1272 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1273 * just 0 for non-waters.
1274 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1275 * jnr indices corresponding to data put in the four positions in the SIMD register.
1277 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1278 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1279 int jnrA,jnrB,jnrC,jnrD;
1280 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1281 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1282 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1283 real rcutoff_scalar;
1284 real *shiftvec,*fshift,*x,*f;
1285 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1286 real scratch[4*DIM];
1287 __m128 fscal,rcutoff,rcutoff2,jidxall;
1289 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1291 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1293 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1294 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1295 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1296 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1297 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1298 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1299 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1300 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1301 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1302 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1303 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1304 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1305 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1306 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1307 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1308 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1309 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1312 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1315 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1316 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1318 __m128 ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1319 __m128 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1321 __m128 dummy_mask,cutoff_mask;
1322 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1323 __m128 one = _mm_set1_ps(1.0);
1324 __m128 two = _mm_set1_ps(2.0);
1330 jindex = nlist->jindex;
1332 shiftidx = nlist->shift;
1334 shiftvec = fr->shift_vec[0];
1335 fshift = fr->fshift[0];
1336 facel = _mm_set1_ps(fr->epsfac);
1337 charge = mdatoms->chargeA;
1338 nvdwtype = fr->ntype;
1339 vdwparam = fr->nbfp;
1340 vdwtype = mdatoms->typeA;
1342 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1343 beta = _mm_set1_ps(fr->ic->ewaldcoeff);
1344 beta2 = _mm_mul_ps(beta,beta);
1345 beta3 = _mm_mul_ps(beta,beta2);
1346 ewtab = fr->ic->tabq_coul_F;
1347 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1348 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1350 /* Setup water-specific parameters */
1351 inr = nlist->iinr[0];
1352 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1353 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1354 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1355 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1357 jq0 = _mm_set1_ps(charge[inr+0]);
1358 jq1 = _mm_set1_ps(charge[inr+1]);
1359 jq2 = _mm_set1_ps(charge[inr+2]);
1360 vdwjidx0A = 2*vdwtype[inr+0];
1361 qq00 = _mm_mul_ps(iq0,jq0);
1362 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1363 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1364 qq01 = _mm_mul_ps(iq0,jq1);
1365 qq02 = _mm_mul_ps(iq0,jq2);
1366 qq10 = _mm_mul_ps(iq1,jq0);
1367 qq11 = _mm_mul_ps(iq1,jq1);
1368 qq12 = _mm_mul_ps(iq1,jq2);
1369 qq20 = _mm_mul_ps(iq2,jq0);
1370 qq21 = _mm_mul_ps(iq2,jq1);
1371 qq22 = _mm_mul_ps(iq2,jq2);
1373 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1374 rcutoff_scalar = fr->rcoulomb;
1375 rcutoff = _mm_set1_ps(rcutoff_scalar);
1376 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1378 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1379 rvdw = _mm_set1_ps(fr->rvdw);
1381 /* Avoid stupid compiler warnings */
1382 jnrA = jnrB = jnrC = jnrD = 0;
1383 j_coord_offsetA = 0;
1384 j_coord_offsetB = 0;
1385 j_coord_offsetC = 0;
1386 j_coord_offsetD = 0;
1391 for(iidx=0;iidx<4*DIM;iidx++)
1393 scratch[iidx] = 0.0;
1396 /* Start outer loop over neighborlists */
1397 for(iidx=0; iidx<nri; iidx++)
1399 /* Load shift vector for this list */
1400 i_shift_offset = DIM*shiftidx[iidx];
1402 /* Load limits for loop over neighbors */
1403 j_index_start = jindex[iidx];
1404 j_index_end = jindex[iidx+1];
1406 /* Get outer coordinate index */
1408 i_coord_offset = DIM*inr;
1410 /* Load i particle coords and add shift vector */
1411 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1412 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1414 fix0 = _mm_setzero_ps();
1415 fiy0 = _mm_setzero_ps();
1416 fiz0 = _mm_setzero_ps();
1417 fix1 = _mm_setzero_ps();
1418 fiy1 = _mm_setzero_ps();
1419 fiz1 = _mm_setzero_ps();
1420 fix2 = _mm_setzero_ps();
1421 fiy2 = _mm_setzero_ps();
1422 fiz2 = _mm_setzero_ps();
1424 /* Start inner kernel loop */
1425 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1428 /* Get j neighbor index, and coordinate index */
1430 jnrB = jjnr[jidx+1];
1431 jnrC = jjnr[jidx+2];
1432 jnrD = jjnr[jidx+3];
1433 j_coord_offsetA = DIM*jnrA;
1434 j_coord_offsetB = DIM*jnrB;
1435 j_coord_offsetC = DIM*jnrC;
1436 j_coord_offsetD = DIM*jnrD;
1438 /* load j atom coordinates */
1439 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1440 x+j_coord_offsetC,x+j_coord_offsetD,
1441 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1443 /* Calculate displacement vector */
1444 dx00 = _mm_sub_ps(ix0,jx0);
1445 dy00 = _mm_sub_ps(iy0,jy0);
1446 dz00 = _mm_sub_ps(iz0,jz0);
1447 dx01 = _mm_sub_ps(ix0,jx1);
1448 dy01 = _mm_sub_ps(iy0,jy1);
1449 dz01 = _mm_sub_ps(iz0,jz1);
1450 dx02 = _mm_sub_ps(ix0,jx2);
1451 dy02 = _mm_sub_ps(iy0,jy2);
1452 dz02 = _mm_sub_ps(iz0,jz2);
1453 dx10 = _mm_sub_ps(ix1,jx0);
1454 dy10 = _mm_sub_ps(iy1,jy0);
1455 dz10 = _mm_sub_ps(iz1,jz0);
1456 dx11 = _mm_sub_ps(ix1,jx1);
1457 dy11 = _mm_sub_ps(iy1,jy1);
1458 dz11 = _mm_sub_ps(iz1,jz1);
1459 dx12 = _mm_sub_ps(ix1,jx2);
1460 dy12 = _mm_sub_ps(iy1,jy2);
1461 dz12 = _mm_sub_ps(iz1,jz2);
1462 dx20 = _mm_sub_ps(ix2,jx0);
1463 dy20 = _mm_sub_ps(iy2,jy0);
1464 dz20 = _mm_sub_ps(iz2,jz0);
1465 dx21 = _mm_sub_ps(ix2,jx1);
1466 dy21 = _mm_sub_ps(iy2,jy1);
1467 dz21 = _mm_sub_ps(iz2,jz1);
1468 dx22 = _mm_sub_ps(ix2,jx2);
1469 dy22 = _mm_sub_ps(iy2,jy2);
1470 dz22 = _mm_sub_ps(iz2,jz2);
1472 /* Calculate squared distance and things based on it */
1473 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1474 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1475 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1476 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1477 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1478 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1479 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1480 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1481 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1483 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1484 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1485 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1486 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1487 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1488 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1489 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1490 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1491 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1493 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1494 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1495 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1496 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1497 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1498 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1499 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1500 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1501 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1503 fjx0 = _mm_setzero_ps();
1504 fjy0 = _mm_setzero_ps();
1505 fjz0 = _mm_setzero_ps();
1506 fjx1 = _mm_setzero_ps();
1507 fjy1 = _mm_setzero_ps();
1508 fjz1 = _mm_setzero_ps();
1509 fjx2 = _mm_setzero_ps();
1510 fjy2 = _mm_setzero_ps();
1511 fjz2 = _mm_setzero_ps();
1513 /**************************
1514 * CALCULATE INTERACTIONS *
1515 **************************/
1517 if (gmx_mm_any_lt(rsq00,rcutoff2))
1520 r00 = _mm_mul_ps(rsq00,rinv00);
1522 /* EWALD ELECTROSTATICS */
1524 /* Analytical PME correction */
1525 zeta2 = _mm_mul_ps(beta2,rsq00);
1526 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1527 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1528 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1529 felec = _mm_mul_ps(qq00,felec);
1531 /* LENNARD-JONES DISPERSION/REPULSION */
1533 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1534 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1536 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1538 fscal = _mm_add_ps(felec,fvdw);
1540 fscal = _mm_and_ps(fscal,cutoff_mask);
1542 /* Update vectorial force */
1543 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1544 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1545 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1547 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1548 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1549 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1553 /**************************
1554 * CALCULATE INTERACTIONS *
1555 **************************/
1557 if (gmx_mm_any_lt(rsq01,rcutoff2))
1560 r01 = _mm_mul_ps(rsq01,rinv01);
1562 /* EWALD ELECTROSTATICS */
1564 /* Analytical PME correction */
1565 zeta2 = _mm_mul_ps(beta2,rsq01);
1566 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1567 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1568 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1569 felec = _mm_mul_ps(qq01,felec);
1571 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1575 fscal = _mm_and_ps(fscal,cutoff_mask);
1577 /* Update vectorial force */
1578 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1579 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1580 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1582 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1583 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1584 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1588 /**************************
1589 * CALCULATE INTERACTIONS *
1590 **************************/
1592 if (gmx_mm_any_lt(rsq02,rcutoff2))
1595 r02 = _mm_mul_ps(rsq02,rinv02);
1597 /* EWALD ELECTROSTATICS */
1599 /* Analytical PME correction */
1600 zeta2 = _mm_mul_ps(beta2,rsq02);
1601 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
1602 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1603 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1604 felec = _mm_mul_ps(qq02,felec);
1606 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1610 fscal = _mm_and_ps(fscal,cutoff_mask);
1612 /* Update vectorial force */
1613 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1614 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1615 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1617 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1618 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1619 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1623 /**************************
1624 * CALCULATE INTERACTIONS *
1625 **************************/
1627 if (gmx_mm_any_lt(rsq10,rcutoff2))
1630 r10 = _mm_mul_ps(rsq10,rinv10);
1632 /* EWALD ELECTROSTATICS */
1634 /* Analytical PME correction */
1635 zeta2 = _mm_mul_ps(beta2,rsq10);
1636 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
1637 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1638 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1639 felec = _mm_mul_ps(qq10,felec);
1641 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1645 fscal = _mm_and_ps(fscal,cutoff_mask);
1647 /* Update vectorial force */
1648 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1649 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1650 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1652 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1653 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1654 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1658 /**************************
1659 * CALCULATE INTERACTIONS *
1660 **************************/
1662 if (gmx_mm_any_lt(rsq11,rcutoff2))
1665 r11 = _mm_mul_ps(rsq11,rinv11);
1667 /* EWALD ELECTROSTATICS */
1669 /* Analytical PME correction */
1670 zeta2 = _mm_mul_ps(beta2,rsq11);
1671 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
1672 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1673 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1674 felec = _mm_mul_ps(qq11,felec);
1676 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1680 fscal = _mm_and_ps(fscal,cutoff_mask);
1682 /* Update vectorial force */
1683 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1684 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1685 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1687 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1688 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1689 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1693 /**************************
1694 * CALCULATE INTERACTIONS *
1695 **************************/
1697 if (gmx_mm_any_lt(rsq12,rcutoff2))
1700 r12 = _mm_mul_ps(rsq12,rinv12);
1702 /* EWALD ELECTROSTATICS */
1704 /* Analytical PME correction */
1705 zeta2 = _mm_mul_ps(beta2,rsq12);
1706 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
1707 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1708 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1709 felec = _mm_mul_ps(qq12,felec);
1711 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1715 fscal = _mm_and_ps(fscal,cutoff_mask);
1717 /* Update vectorial force */
1718 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1719 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1720 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1722 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1723 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1724 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1728 /**************************
1729 * CALCULATE INTERACTIONS *
1730 **************************/
1732 if (gmx_mm_any_lt(rsq20,rcutoff2))
1735 r20 = _mm_mul_ps(rsq20,rinv20);
1737 /* EWALD ELECTROSTATICS */
1739 /* Analytical PME correction */
1740 zeta2 = _mm_mul_ps(beta2,rsq20);
1741 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
1742 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1743 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1744 felec = _mm_mul_ps(qq20,felec);
1746 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1750 fscal = _mm_and_ps(fscal,cutoff_mask);
1752 /* Update vectorial force */
1753 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1754 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1755 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1757 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1758 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1759 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1763 /**************************
1764 * CALCULATE INTERACTIONS *
1765 **************************/
1767 if (gmx_mm_any_lt(rsq21,rcutoff2))
1770 r21 = _mm_mul_ps(rsq21,rinv21);
1772 /* EWALD ELECTROSTATICS */
1774 /* Analytical PME correction */
1775 zeta2 = _mm_mul_ps(beta2,rsq21);
1776 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
1777 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1778 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1779 felec = _mm_mul_ps(qq21,felec);
1781 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1785 fscal = _mm_and_ps(fscal,cutoff_mask);
1787 /* Update vectorial force */
1788 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1789 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1790 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1792 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1793 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1794 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1798 /**************************
1799 * CALCULATE INTERACTIONS *
1800 **************************/
1802 if (gmx_mm_any_lt(rsq22,rcutoff2))
1805 r22 = _mm_mul_ps(rsq22,rinv22);
1807 /* EWALD ELECTROSTATICS */
1809 /* Analytical PME correction */
1810 zeta2 = _mm_mul_ps(beta2,rsq22);
1811 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
1812 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1813 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1814 felec = _mm_mul_ps(qq22,felec);
1816 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1820 fscal = _mm_and_ps(fscal,cutoff_mask);
1822 /* Update vectorial force */
1823 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1824 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1825 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1827 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1828 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1829 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1833 fjptrA = f+j_coord_offsetA;
1834 fjptrB = f+j_coord_offsetB;
1835 fjptrC = f+j_coord_offsetC;
1836 fjptrD = f+j_coord_offsetD;
1838 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1839 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1841 /* Inner loop uses 286 flops */
1844 if(jidx<j_index_end)
1847 /* Get j neighbor index, and coordinate index */
1848 jnrlistA = jjnr[jidx];
1849 jnrlistB = jjnr[jidx+1];
1850 jnrlistC = jjnr[jidx+2];
1851 jnrlistD = jjnr[jidx+3];
1852 /* Sign of each element will be negative for non-real atoms.
1853 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1854 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1856 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1857 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1858 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1859 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1860 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1861 j_coord_offsetA = DIM*jnrA;
1862 j_coord_offsetB = DIM*jnrB;
1863 j_coord_offsetC = DIM*jnrC;
1864 j_coord_offsetD = DIM*jnrD;
1866 /* load j atom coordinates */
1867 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1868 x+j_coord_offsetC,x+j_coord_offsetD,
1869 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1871 /* Calculate displacement vector */
1872 dx00 = _mm_sub_ps(ix0,jx0);
1873 dy00 = _mm_sub_ps(iy0,jy0);
1874 dz00 = _mm_sub_ps(iz0,jz0);
1875 dx01 = _mm_sub_ps(ix0,jx1);
1876 dy01 = _mm_sub_ps(iy0,jy1);
1877 dz01 = _mm_sub_ps(iz0,jz1);
1878 dx02 = _mm_sub_ps(ix0,jx2);
1879 dy02 = _mm_sub_ps(iy0,jy2);
1880 dz02 = _mm_sub_ps(iz0,jz2);
1881 dx10 = _mm_sub_ps(ix1,jx0);
1882 dy10 = _mm_sub_ps(iy1,jy0);
1883 dz10 = _mm_sub_ps(iz1,jz0);
1884 dx11 = _mm_sub_ps(ix1,jx1);
1885 dy11 = _mm_sub_ps(iy1,jy1);
1886 dz11 = _mm_sub_ps(iz1,jz1);
1887 dx12 = _mm_sub_ps(ix1,jx2);
1888 dy12 = _mm_sub_ps(iy1,jy2);
1889 dz12 = _mm_sub_ps(iz1,jz2);
1890 dx20 = _mm_sub_ps(ix2,jx0);
1891 dy20 = _mm_sub_ps(iy2,jy0);
1892 dz20 = _mm_sub_ps(iz2,jz0);
1893 dx21 = _mm_sub_ps(ix2,jx1);
1894 dy21 = _mm_sub_ps(iy2,jy1);
1895 dz21 = _mm_sub_ps(iz2,jz1);
1896 dx22 = _mm_sub_ps(ix2,jx2);
1897 dy22 = _mm_sub_ps(iy2,jy2);
1898 dz22 = _mm_sub_ps(iz2,jz2);
1900 /* Calculate squared distance and things based on it */
1901 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1902 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1903 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1904 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1905 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1906 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1907 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1908 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1909 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1911 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1912 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1913 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1914 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1915 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1916 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1917 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1918 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1919 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1921 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1922 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1923 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1924 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1925 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1926 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1927 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1928 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1929 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1931 fjx0 = _mm_setzero_ps();
1932 fjy0 = _mm_setzero_ps();
1933 fjz0 = _mm_setzero_ps();
1934 fjx1 = _mm_setzero_ps();
1935 fjy1 = _mm_setzero_ps();
1936 fjz1 = _mm_setzero_ps();
1937 fjx2 = _mm_setzero_ps();
1938 fjy2 = _mm_setzero_ps();
1939 fjz2 = _mm_setzero_ps();
1941 /**************************
1942 * CALCULATE INTERACTIONS *
1943 **************************/
1945 if (gmx_mm_any_lt(rsq00,rcutoff2))
1948 r00 = _mm_mul_ps(rsq00,rinv00);
1949 r00 = _mm_andnot_ps(dummy_mask,r00);
1951 /* EWALD ELECTROSTATICS */
1953 /* Analytical PME correction */
1954 zeta2 = _mm_mul_ps(beta2,rsq00);
1955 rinv3 = _mm_mul_ps(rinvsq00,rinv00);
1956 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
1957 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
1958 felec = _mm_mul_ps(qq00,felec);
1960 /* LENNARD-JONES DISPERSION/REPULSION */
1962 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1963 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1965 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1967 fscal = _mm_add_ps(felec,fvdw);
1969 fscal = _mm_and_ps(fscal,cutoff_mask);
1971 fscal = _mm_andnot_ps(dummy_mask,fscal);
1973 /* Update vectorial force */
1974 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1975 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1976 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1978 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1979 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1980 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 if (gmx_mm_any_lt(rsq01,rcutoff2))
1991 r01 = _mm_mul_ps(rsq01,rinv01);
1992 r01 = _mm_andnot_ps(dummy_mask,r01);
1994 /* EWALD ELECTROSTATICS */
1996 /* Analytical PME correction */
1997 zeta2 = _mm_mul_ps(beta2,rsq01);
1998 rinv3 = _mm_mul_ps(rinvsq01,rinv01);
1999 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2000 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2001 felec = _mm_mul_ps(qq01,felec);
2003 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2007 fscal = _mm_and_ps(fscal,cutoff_mask);
2009 fscal = _mm_andnot_ps(dummy_mask,fscal);
2011 /* Update vectorial force */
2012 fix0 = _mm_macc_ps(dx01,fscal,fix0);
2013 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
2014 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
2016 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
2017 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
2018 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
2022 /**************************
2023 * CALCULATE INTERACTIONS *
2024 **************************/
2026 if (gmx_mm_any_lt(rsq02,rcutoff2))
2029 r02 = _mm_mul_ps(rsq02,rinv02);
2030 r02 = _mm_andnot_ps(dummy_mask,r02);
2032 /* EWALD ELECTROSTATICS */
2034 /* Analytical PME correction */
2035 zeta2 = _mm_mul_ps(beta2,rsq02);
2036 rinv3 = _mm_mul_ps(rinvsq02,rinv02);
2037 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2038 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2039 felec = _mm_mul_ps(qq02,felec);
2041 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2045 fscal = _mm_and_ps(fscal,cutoff_mask);
2047 fscal = _mm_andnot_ps(dummy_mask,fscal);
2049 /* Update vectorial force */
2050 fix0 = _mm_macc_ps(dx02,fscal,fix0);
2051 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
2052 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
2054 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
2055 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
2056 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
2060 /**************************
2061 * CALCULATE INTERACTIONS *
2062 **************************/
2064 if (gmx_mm_any_lt(rsq10,rcutoff2))
2067 r10 = _mm_mul_ps(rsq10,rinv10);
2068 r10 = _mm_andnot_ps(dummy_mask,r10);
2070 /* EWALD ELECTROSTATICS */
2072 /* Analytical PME correction */
2073 zeta2 = _mm_mul_ps(beta2,rsq10);
2074 rinv3 = _mm_mul_ps(rinvsq10,rinv10);
2075 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2076 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2077 felec = _mm_mul_ps(qq10,felec);
2079 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2083 fscal = _mm_and_ps(fscal,cutoff_mask);
2085 fscal = _mm_andnot_ps(dummy_mask,fscal);
2087 /* Update vectorial force */
2088 fix1 = _mm_macc_ps(dx10,fscal,fix1);
2089 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
2090 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
2092 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
2093 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
2094 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
2098 /**************************
2099 * CALCULATE INTERACTIONS *
2100 **************************/
2102 if (gmx_mm_any_lt(rsq11,rcutoff2))
2105 r11 = _mm_mul_ps(rsq11,rinv11);
2106 r11 = _mm_andnot_ps(dummy_mask,r11);
2108 /* EWALD ELECTROSTATICS */
2110 /* Analytical PME correction */
2111 zeta2 = _mm_mul_ps(beta2,rsq11);
2112 rinv3 = _mm_mul_ps(rinvsq11,rinv11);
2113 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2114 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2115 felec = _mm_mul_ps(qq11,felec);
2117 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2121 fscal = _mm_and_ps(fscal,cutoff_mask);
2123 fscal = _mm_andnot_ps(dummy_mask,fscal);
2125 /* Update vectorial force */
2126 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2127 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2128 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2130 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2131 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2132 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2136 /**************************
2137 * CALCULATE INTERACTIONS *
2138 **************************/
2140 if (gmx_mm_any_lt(rsq12,rcutoff2))
2143 r12 = _mm_mul_ps(rsq12,rinv12);
2144 r12 = _mm_andnot_ps(dummy_mask,r12);
2146 /* EWALD ELECTROSTATICS */
2148 /* Analytical PME correction */
2149 zeta2 = _mm_mul_ps(beta2,rsq12);
2150 rinv3 = _mm_mul_ps(rinvsq12,rinv12);
2151 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2152 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2153 felec = _mm_mul_ps(qq12,felec);
2155 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2159 fscal = _mm_and_ps(fscal,cutoff_mask);
2161 fscal = _mm_andnot_ps(dummy_mask,fscal);
2163 /* Update vectorial force */
2164 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2165 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2166 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2168 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2169 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2170 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2174 /**************************
2175 * CALCULATE INTERACTIONS *
2176 **************************/
2178 if (gmx_mm_any_lt(rsq20,rcutoff2))
2181 r20 = _mm_mul_ps(rsq20,rinv20);
2182 r20 = _mm_andnot_ps(dummy_mask,r20);
2184 /* EWALD ELECTROSTATICS */
2186 /* Analytical PME correction */
2187 zeta2 = _mm_mul_ps(beta2,rsq20);
2188 rinv3 = _mm_mul_ps(rinvsq20,rinv20);
2189 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2190 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2191 felec = _mm_mul_ps(qq20,felec);
2193 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2197 fscal = _mm_and_ps(fscal,cutoff_mask);
2199 fscal = _mm_andnot_ps(dummy_mask,fscal);
2201 /* Update vectorial force */
2202 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2203 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2204 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2206 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2207 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2208 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2212 /**************************
2213 * CALCULATE INTERACTIONS *
2214 **************************/
2216 if (gmx_mm_any_lt(rsq21,rcutoff2))
2219 r21 = _mm_mul_ps(rsq21,rinv21);
2220 r21 = _mm_andnot_ps(dummy_mask,r21);
2222 /* EWALD ELECTROSTATICS */
2224 /* Analytical PME correction */
2225 zeta2 = _mm_mul_ps(beta2,rsq21);
2226 rinv3 = _mm_mul_ps(rinvsq21,rinv21);
2227 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2228 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2229 felec = _mm_mul_ps(qq21,felec);
2231 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2235 fscal = _mm_and_ps(fscal,cutoff_mask);
2237 fscal = _mm_andnot_ps(dummy_mask,fscal);
2239 /* Update vectorial force */
2240 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2241 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2242 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2244 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2245 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2246 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2250 /**************************
2251 * CALCULATE INTERACTIONS *
2252 **************************/
2254 if (gmx_mm_any_lt(rsq22,rcutoff2))
2257 r22 = _mm_mul_ps(rsq22,rinv22);
2258 r22 = _mm_andnot_ps(dummy_mask,r22);
2260 /* EWALD ELECTROSTATICS */
2262 /* Analytical PME correction */
2263 zeta2 = _mm_mul_ps(beta2,rsq22);
2264 rinv3 = _mm_mul_ps(rinvsq22,rinv22);
2265 pmecorrF = gmx_mm_pmecorrF_ps(zeta2);
2266 felec = _mm_macc_ps(pmecorrF,beta3,rinv3);
2267 felec = _mm_mul_ps(qq22,felec);
2269 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2273 fscal = _mm_and_ps(fscal,cutoff_mask);
2275 fscal = _mm_andnot_ps(dummy_mask,fscal);
2277 /* Update vectorial force */
2278 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2279 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2280 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2282 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2283 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2284 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2288 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2289 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2290 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2291 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2293 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2294 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2296 /* Inner loop uses 295 flops */
2299 /* End of innermost loop */
2301 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2302 f+i_coord_offset,fshift+i_shift_offset);
2304 /* Increment number of inner iterations */
2305 inneriter += j_index_end - j_index_start;
2307 /* Outer loop uses 18 flops */
2310 /* Increment number of outer iterations */
2313 /* Update outer/inner flops */
2315 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*295);