2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, i.e. to the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
94 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
97 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
98 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
99 __m256d dummy_mask,cutoff_mask;
100 __m128 tmpmask0,tmpmask1;
101 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
102 __m256d one = _mm256_set1_pd(1.0);
103 __m256d two = _mm256_set1_pd(2.0);
109 jindex = nlist->jindex;
111 shiftidx = nlist->shift;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm256_set1_pd(fr->epsfac);
116 charge = mdatoms->chargeA;
117 krf = _mm256_set1_pd(fr->ic->k_rf);
118 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
119 crf = _mm256_set1_pd(fr->ic->c_rf);
120 nvdwtype = fr->ntype;
122 vdwtype = mdatoms->typeA;
124 /* Setup water-specific parameters */
125 inr = nlist->iinr[0];
126 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
127 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
128 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
129 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
131 jq0 = _mm256_set1_pd(charge[inr+0]);
132 jq1 = _mm256_set1_pd(charge[inr+1]);
133 jq2 = _mm256_set1_pd(charge[inr+2]);
134 vdwjidx0A = 2*vdwtype[inr+0];
135 qq00 = _mm256_mul_pd(iq0,jq0);
136 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
137 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
138 qq01 = _mm256_mul_pd(iq0,jq1);
139 qq02 = _mm256_mul_pd(iq0,jq2);
140 qq10 = _mm256_mul_pd(iq1,jq0);
141 qq11 = _mm256_mul_pd(iq1,jq1);
142 qq12 = _mm256_mul_pd(iq1,jq2);
143 qq20 = _mm256_mul_pd(iq2,jq0);
144 qq21 = _mm256_mul_pd(iq2,jq1);
145 qq22 = _mm256_mul_pd(iq2,jq2);
147 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
148 rcutoff_scalar = fr->rcoulomb;
149 rcutoff = _mm256_set1_pd(rcutoff_scalar);
150 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
152 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
153 rvdw = _mm256_set1_pd(fr->rvdw);
155 /* Avoid stupid compiler warnings */
156 jnrA = jnrB = jnrC = jnrD = 0;
165 for(iidx=0;iidx<4*DIM;iidx++)
170 /* Start outer loop over neighborlists */
171 for(iidx=0; iidx<nri; iidx++)
173 /* Load shift vector for this list */
174 i_shift_offset = DIM*shiftidx[iidx];
176 /* Load limits for loop over neighbors */
177 j_index_start = jindex[iidx];
178 j_index_end = jindex[iidx+1];
180 /* Get outer coordinate index */
182 i_coord_offset = DIM*inr;
184 /* Load i particle coords and add shift vector */
185 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
186 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
188 fix0 = _mm256_setzero_pd();
189 fiy0 = _mm256_setzero_pd();
190 fiz0 = _mm256_setzero_pd();
191 fix1 = _mm256_setzero_pd();
192 fiy1 = _mm256_setzero_pd();
193 fiz1 = _mm256_setzero_pd();
194 fix2 = _mm256_setzero_pd();
195 fiy2 = _mm256_setzero_pd();
196 fiz2 = _mm256_setzero_pd();
198 /* Reset potential sums */
199 velecsum = _mm256_setzero_pd();
200 vvdwsum = _mm256_setzero_pd();
202 /* Start inner kernel loop */
203 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
206 /* Get j neighbor index, and coordinate index */
211 j_coord_offsetA = DIM*jnrA;
212 j_coord_offsetB = DIM*jnrB;
213 j_coord_offsetC = DIM*jnrC;
214 j_coord_offsetD = DIM*jnrD;
216 /* load j atom coordinates */
217 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
218 x+j_coord_offsetC,x+j_coord_offsetD,
219 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
221 /* Calculate displacement vector */
222 dx00 = _mm256_sub_pd(ix0,jx0);
223 dy00 = _mm256_sub_pd(iy0,jy0);
224 dz00 = _mm256_sub_pd(iz0,jz0);
225 dx01 = _mm256_sub_pd(ix0,jx1);
226 dy01 = _mm256_sub_pd(iy0,jy1);
227 dz01 = _mm256_sub_pd(iz0,jz1);
228 dx02 = _mm256_sub_pd(ix0,jx2);
229 dy02 = _mm256_sub_pd(iy0,jy2);
230 dz02 = _mm256_sub_pd(iz0,jz2);
231 dx10 = _mm256_sub_pd(ix1,jx0);
232 dy10 = _mm256_sub_pd(iy1,jy0);
233 dz10 = _mm256_sub_pd(iz1,jz0);
234 dx11 = _mm256_sub_pd(ix1,jx1);
235 dy11 = _mm256_sub_pd(iy1,jy1);
236 dz11 = _mm256_sub_pd(iz1,jz1);
237 dx12 = _mm256_sub_pd(ix1,jx2);
238 dy12 = _mm256_sub_pd(iy1,jy2);
239 dz12 = _mm256_sub_pd(iz1,jz2);
240 dx20 = _mm256_sub_pd(ix2,jx0);
241 dy20 = _mm256_sub_pd(iy2,jy0);
242 dz20 = _mm256_sub_pd(iz2,jz0);
243 dx21 = _mm256_sub_pd(ix2,jx1);
244 dy21 = _mm256_sub_pd(iy2,jy1);
245 dz21 = _mm256_sub_pd(iz2,jz1);
246 dx22 = _mm256_sub_pd(ix2,jx2);
247 dy22 = _mm256_sub_pd(iy2,jy2);
248 dz22 = _mm256_sub_pd(iz2,jz2);
250 /* Calculate squared distance and things based on it */
251 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
252 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
253 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
254 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
255 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
256 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
257 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
258 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
259 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
261 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
262 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
263 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
264 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
265 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
266 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
267 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
268 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
269 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
271 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
272 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
273 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
274 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
275 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
276 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
277 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
278 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
279 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
281 fjx0 = _mm256_setzero_pd();
282 fjy0 = _mm256_setzero_pd();
283 fjz0 = _mm256_setzero_pd();
284 fjx1 = _mm256_setzero_pd();
285 fjy1 = _mm256_setzero_pd();
286 fjz1 = _mm256_setzero_pd();
287 fjx2 = _mm256_setzero_pd();
288 fjy2 = _mm256_setzero_pd();
289 fjz2 = _mm256_setzero_pd();
291 /**************************
292 * CALCULATE INTERACTIONS *
293 **************************/
295 if (gmx_mm256_any_lt(rsq00,rcutoff2))
298 /* REACTION-FIELD ELECTROSTATICS */
299 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
300 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
302 /* LENNARD-JONES DISPERSION/REPULSION */
304 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
305 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
306 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
307 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
308 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
309 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
311 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
313 /* Update potential sum for this i atom from the interaction with this j atom. */
314 velec = _mm256_and_pd(velec,cutoff_mask);
315 velecsum = _mm256_add_pd(velecsum,velec);
316 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
317 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
319 fscal = _mm256_add_pd(felec,fvdw);
321 fscal = _mm256_and_pd(fscal,cutoff_mask);
323 /* Calculate temporary vectorial force */
324 tx = _mm256_mul_pd(fscal,dx00);
325 ty = _mm256_mul_pd(fscal,dy00);
326 tz = _mm256_mul_pd(fscal,dz00);
328 /* Update vectorial force */
329 fix0 = _mm256_add_pd(fix0,tx);
330 fiy0 = _mm256_add_pd(fiy0,ty);
331 fiz0 = _mm256_add_pd(fiz0,tz);
333 fjx0 = _mm256_add_pd(fjx0,tx);
334 fjy0 = _mm256_add_pd(fjy0,ty);
335 fjz0 = _mm256_add_pd(fjz0,tz);
339 /**************************
340 * CALCULATE INTERACTIONS *
341 **************************/
343 if (gmx_mm256_any_lt(rsq01,rcutoff2))
346 /* REACTION-FIELD ELECTROSTATICS */
347 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
348 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
350 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 velec = _mm256_and_pd(velec,cutoff_mask);
354 velecsum = _mm256_add_pd(velecsum,velec);
358 fscal = _mm256_and_pd(fscal,cutoff_mask);
360 /* Calculate temporary vectorial force */
361 tx = _mm256_mul_pd(fscal,dx01);
362 ty = _mm256_mul_pd(fscal,dy01);
363 tz = _mm256_mul_pd(fscal,dz01);
365 /* Update vectorial force */
366 fix0 = _mm256_add_pd(fix0,tx);
367 fiy0 = _mm256_add_pd(fiy0,ty);
368 fiz0 = _mm256_add_pd(fiz0,tz);
370 fjx1 = _mm256_add_pd(fjx1,tx);
371 fjy1 = _mm256_add_pd(fjy1,ty);
372 fjz1 = _mm256_add_pd(fjz1,tz);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 if (gmx_mm256_any_lt(rsq02,rcutoff2))
383 /* REACTION-FIELD ELECTROSTATICS */
384 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
385 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
387 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
389 /* Update potential sum for this i atom from the interaction with this j atom. */
390 velec = _mm256_and_pd(velec,cutoff_mask);
391 velecsum = _mm256_add_pd(velecsum,velec);
395 fscal = _mm256_and_pd(fscal,cutoff_mask);
397 /* Calculate temporary vectorial force */
398 tx = _mm256_mul_pd(fscal,dx02);
399 ty = _mm256_mul_pd(fscal,dy02);
400 tz = _mm256_mul_pd(fscal,dz02);
402 /* Update vectorial force */
403 fix0 = _mm256_add_pd(fix0,tx);
404 fiy0 = _mm256_add_pd(fiy0,ty);
405 fiz0 = _mm256_add_pd(fiz0,tz);
407 fjx2 = _mm256_add_pd(fjx2,tx);
408 fjy2 = _mm256_add_pd(fjy2,ty);
409 fjz2 = _mm256_add_pd(fjz2,tz);
413 /**************************
414 * CALCULATE INTERACTIONS *
415 **************************/
417 if (gmx_mm256_any_lt(rsq10,rcutoff2))
420 /* REACTION-FIELD ELECTROSTATICS */
421 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
422 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
424 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
426 /* Update potential sum for this i atom from the interaction with this j atom. */
427 velec = _mm256_and_pd(velec,cutoff_mask);
428 velecsum = _mm256_add_pd(velecsum,velec);
432 fscal = _mm256_and_pd(fscal,cutoff_mask);
434 /* Calculate temporary vectorial force */
435 tx = _mm256_mul_pd(fscal,dx10);
436 ty = _mm256_mul_pd(fscal,dy10);
437 tz = _mm256_mul_pd(fscal,dz10);
439 /* Update vectorial force */
440 fix1 = _mm256_add_pd(fix1,tx);
441 fiy1 = _mm256_add_pd(fiy1,ty);
442 fiz1 = _mm256_add_pd(fiz1,tz);
444 fjx0 = _mm256_add_pd(fjx0,tx);
445 fjy0 = _mm256_add_pd(fjy0,ty);
446 fjz0 = _mm256_add_pd(fjz0,tz);
450 /**************************
451 * CALCULATE INTERACTIONS *
452 **************************/
454 if (gmx_mm256_any_lt(rsq11,rcutoff2))
457 /* REACTION-FIELD ELECTROSTATICS */
458 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
459 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
461 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
463 /* Update potential sum for this i atom from the interaction with this j atom. */
464 velec = _mm256_and_pd(velec,cutoff_mask);
465 velecsum = _mm256_add_pd(velecsum,velec);
469 fscal = _mm256_and_pd(fscal,cutoff_mask);
471 /* Calculate temporary vectorial force */
472 tx = _mm256_mul_pd(fscal,dx11);
473 ty = _mm256_mul_pd(fscal,dy11);
474 tz = _mm256_mul_pd(fscal,dz11);
476 /* Update vectorial force */
477 fix1 = _mm256_add_pd(fix1,tx);
478 fiy1 = _mm256_add_pd(fiy1,ty);
479 fiz1 = _mm256_add_pd(fiz1,tz);
481 fjx1 = _mm256_add_pd(fjx1,tx);
482 fjy1 = _mm256_add_pd(fjy1,ty);
483 fjz1 = _mm256_add_pd(fjz1,tz);
487 /**************************
488 * CALCULATE INTERACTIONS *
489 **************************/
491 if (gmx_mm256_any_lt(rsq12,rcutoff2))
494 /* REACTION-FIELD ELECTROSTATICS */
495 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
496 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
498 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
500 /* Update potential sum for this i atom from the interaction with this j atom. */
501 velec = _mm256_and_pd(velec,cutoff_mask);
502 velecsum = _mm256_add_pd(velecsum,velec);
506 fscal = _mm256_and_pd(fscal,cutoff_mask);
508 /* Calculate temporary vectorial force */
509 tx = _mm256_mul_pd(fscal,dx12);
510 ty = _mm256_mul_pd(fscal,dy12);
511 tz = _mm256_mul_pd(fscal,dz12);
513 /* Update vectorial force */
514 fix1 = _mm256_add_pd(fix1,tx);
515 fiy1 = _mm256_add_pd(fiy1,ty);
516 fiz1 = _mm256_add_pd(fiz1,tz);
518 fjx2 = _mm256_add_pd(fjx2,tx);
519 fjy2 = _mm256_add_pd(fjy2,ty);
520 fjz2 = _mm256_add_pd(fjz2,tz);
524 /**************************
525 * CALCULATE INTERACTIONS *
526 **************************/
528 if (gmx_mm256_any_lt(rsq20,rcutoff2))
531 /* REACTION-FIELD ELECTROSTATICS */
532 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
533 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
535 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
537 /* Update potential sum for this i atom from the interaction with this j atom. */
538 velec = _mm256_and_pd(velec,cutoff_mask);
539 velecsum = _mm256_add_pd(velecsum,velec);
543 fscal = _mm256_and_pd(fscal,cutoff_mask);
545 /* Calculate temporary vectorial force */
546 tx = _mm256_mul_pd(fscal,dx20);
547 ty = _mm256_mul_pd(fscal,dy20);
548 tz = _mm256_mul_pd(fscal,dz20);
550 /* Update vectorial force */
551 fix2 = _mm256_add_pd(fix2,tx);
552 fiy2 = _mm256_add_pd(fiy2,ty);
553 fiz2 = _mm256_add_pd(fiz2,tz);
555 fjx0 = _mm256_add_pd(fjx0,tx);
556 fjy0 = _mm256_add_pd(fjy0,ty);
557 fjz0 = _mm256_add_pd(fjz0,tz);
561 /**************************
562 * CALCULATE INTERACTIONS *
563 **************************/
565 if (gmx_mm256_any_lt(rsq21,rcutoff2))
568 /* REACTION-FIELD ELECTROSTATICS */
569 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
570 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
572 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
574 /* Update potential sum for this i atom from the interaction with this j atom. */
575 velec = _mm256_and_pd(velec,cutoff_mask);
576 velecsum = _mm256_add_pd(velecsum,velec);
580 fscal = _mm256_and_pd(fscal,cutoff_mask);
582 /* Calculate temporary vectorial force */
583 tx = _mm256_mul_pd(fscal,dx21);
584 ty = _mm256_mul_pd(fscal,dy21);
585 tz = _mm256_mul_pd(fscal,dz21);
587 /* Update vectorial force */
588 fix2 = _mm256_add_pd(fix2,tx);
589 fiy2 = _mm256_add_pd(fiy2,ty);
590 fiz2 = _mm256_add_pd(fiz2,tz);
592 fjx1 = _mm256_add_pd(fjx1,tx);
593 fjy1 = _mm256_add_pd(fjy1,ty);
594 fjz1 = _mm256_add_pd(fjz1,tz);
598 /**************************
599 * CALCULATE INTERACTIONS *
600 **************************/
602 if (gmx_mm256_any_lt(rsq22,rcutoff2))
605 /* REACTION-FIELD ELECTROSTATICS */
606 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
607 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
609 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velec = _mm256_and_pd(velec,cutoff_mask);
613 velecsum = _mm256_add_pd(velecsum,velec);
617 fscal = _mm256_and_pd(fscal,cutoff_mask);
619 /* Calculate temporary vectorial force */
620 tx = _mm256_mul_pd(fscal,dx22);
621 ty = _mm256_mul_pd(fscal,dy22);
622 tz = _mm256_mul_pd(fscal,dz22);
624 /* Update vectorial force */
625 fix2 = _mm256_add_pd(fix2,tx);
626 fiy2 = _mm256_add_pd(fiy2,ty);
627 fiz2 = _mm256_add_pd(fiz2,tz);
629 fjx2 = _mm256_add_pd(fjx2,tx);
630 fjy2 = _mm256_add_pd(fjy2,ty);
631 fjz2 = _mm256_add_pd(fjz2,tz);
635 fjptrA = f+j_coord_offsetA;
636 fjptrB = f+j_coord_offsetB;
637 fjptrC = f+j_coord_offsetC;
638 fjptrD = f+j_coord_offsetD;
640 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
641 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
643 /* Inner loop uses 342 flops */
649 /* Get j neighbor index, and coordinate index */
650 jnrlistA = jjnr[jidx];
651 jnrlistB = jjnr[jidx+1];
652 jnrlistC = jjnr[jidx+2];
653 jnrlistD = jjnr[jidx+3];
654 /* Sign of each element will be negative for non-real atoms.
655 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
656 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
658 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
660 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
661 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
662 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
664 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
665 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
666 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
667 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
668 j_coord_offsetA = DIM*jnrA;
669 j_coord_offsetB = DIM*jnrB;
670 j_coord_offsetC = DIM*jnrC;
671 j_coord_offsetD = DIM*jnrD;
673 /* load j atom coordinates */
674 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
675 x+j_coord_offsetC,x+j_coord_offsetD,
676 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
678 /* Calculate displacement vector */
679 dx00 = _mm256_sub_pd(ix0,jx0);
680 dy00 = _mm256_sub_pd(iy0,jy0);
681 dz00 = _mm256_sub_pd(iz0,jz0);
682 dx01 = _mm256_sub_pd(ix0,jx1);
683 dy01 = _mm256_sub_pd(iy0,jy1);
684 dz01 = _mm256_sub_pd(iz0,jz1);
685 dx02 = _mm256_sub_pd(ix0,jx2);
686 dy02 = _mm256_sub_pd(iy0,jy2);
687 dz02 = _mm256_sub_pd(iz0,jz2);
688 dx10 = _mm256_sub_pd(ix1,jx0);
689 dy10 = _mm256_sub_pd(iy1,jy0);
690 dz10 = _mm256_sub_pd(iz1,jz0);
691 dx11 = _mm256_sub_pd(ix1,jx1);
692 dy11 = _mm256_sub_pd(iy1,jy1);
693 dz11 = _mm256_sub_pd(iz1,jz1);
694 dx12 = _mm256_sub_pd(ix1,jx2);
695 dy12 = _mm256_sub_pd(iy1,jy2);
696 dz12 = _mm256_sub_pd(iz1,jz2);
697 dx20 = _mm256_sub_pd(ix2,jx0);
698 dy20 = _mm256_sub_pd(iy2,jy0);
699 dz20 = _mm256_sub_pd(iz2,jz0);
700 dx21 = _mm256_sub_pd(ix2,jx1);
701 dy21 = _mm256_sub_pd(iy2,jy1);
702 dz21 = _mm256_sub_pd(iz2,jz1);
703 dx22 = _mm256_sub_pd(ix2,jx2);
704 dy22 = _mm256_sub_pd(iy2,jy2);
705 dz22 = _mm256_sub_pd(iz2,jz2);
707 /* Calculate squared distance and things based on it */
708 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
709 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
710 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
711 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
712 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
713 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
714 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
715 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
716 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
718 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
719 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
720 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
721 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
722 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
723 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
724 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
725 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
726 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
728 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
729 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
730 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
731 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
732 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
733 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
734 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
735 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
736 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
738 fjx0 = _mm256_setzero_pd();
739 fjy0 = _mm256_setzero_pd();
740 fjz0 = _mm256_setzero_pd();
741 fjx1 = _mm256_setzero_pd();
742 fjy1 = _mm256_setzero_pd();
743 fjz1 = _mm256_setzero_pd();
744 fjx2 = _mm256_setzero_pd();
745 fjy2 = _mm256_setzero_pd();
746 fjz2 = _mm256_setzero_pd();
748 /**************************
749 * CALCULATE INTERACTIONS *
750 **************************/
752 if (gmx_mm256_any_lt(rsq00,rcutoff2))
755 /* REACTION-FIELD ELECTROSTATICS */
756 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
757 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
759 /* LENNARD-JONES DISPERSION/REPULSION */
761 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
762 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
763 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
764 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
765 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
766 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
768 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
770 /* Update potential sum for this i atom from the interaction with this j atom. */
771 velec = _mm256_and_pd(velec,cutoff_mask);
772 velec = _mm256_andnot_pd(dummy_mask,velec);
773 velecsum = _mm256_add_pd(velecsum,velec);
774 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
775 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
776 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
778 fscal = _mm256_add_pd(felec,fvdw);
780 fscal = _mm256_and_pd(fscal,cutoff_mask);
782 fscal = _mm256_andnot_pd(dummy_mask,fscal);
784 /* Calculate temporary vectorial force */
785 tx = _mm256_mul_pd(fscal,dx00);
786 ty = _mm256_mul_pd(fscal,dy00);
787 tz = _mm256_mul_pd(fscal,dz00);
789 /* Update vectorial force */
790 fix0 = _mm256_add_pd(fix0,tx);
791 fiy0 = _mm256_add_pd(fiy0,ty);
792 fiz0 = _mm256_add_pd(fiz0,tz);
794 fjx0 = _mm256_add_pd(fjx0,tx);
795 fjy0 = _mm256_add_pd(fjy0,ty);
796 fjz0 = _mm256_add_pd(fjz0,tz);
800 /**************************
801 * CALCULATE INTERACTIONS *
802 **************************/
804 if (gmx_mm256_any_lt(rsq01,rcutoff2))
807 /* REACTION-FIELD ELECTROSTATICS */
808 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
809 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
811 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
813 /* Update potential sum for this i atom from the interaction with this j atom. */
814 velec = _mm256_and_pd(velec,cutoff_mask);
815 velec = _mm256_andnot_pd(dummy_mask,velec);
816 velecsum = _mm256_add_pd(velecsum,velec);
820 fscal = _mm256_and_pd(fscal,cutoff_mask);
822 fscal = _mm256_andnot_pd(dummy_mask,fscal);
824 /* Calculate temporary vectorial force */
825 tx = _mm256_mul_pd(fscal,dx01);
826 ty = _mm256_mul_pd(fscal,dy01);
827 tz = _mm256_mul_pd(fscal,dz01);
829 /* Update vectorial force */
830 fix0 = _mm256_add_pd(fix0,tx);
831 fiy0 = _mm256_add_pd(fiy0,ty);
832 fiz0 = _mm256_add_pd(fiz0,tz);
834 fjx1 = _mm256_add_pd(fjx1,tx);
835 fjy1 = _mm256_add_pd(fjy1,ty);
836 fjz1 = _mm256_add_pd(fjz1,tz);
840 /**************************
841 * CALCULATE INTERACTIONS *
842 **************************/
844 if (gmx_mm256_any_lt(rsq02,rcutoff2))
847 /* REACTION-FIELD ELECTROSTATICS */
848 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
849 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
851 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
853 /* Update potential sum for this i atom from the interaction with this j atom. */
854 velec = _mm256_and_pd(velec,cutoff_mask);
855 velec = _mm256_andnot_pd(dummy_mask,velec);
856 velecsum = _mm256_add_pd(velecsum,velec);
860 fscal = _mm256_and_pd(fscal,cutoff_mask);
862 fscal = _mm256_andnot_pd(dummy_mask,fscal);
864 /* Calculate temporary vectorial force */
865 tx = _mm256_mul_pd(fscal,dx02);
866 ty = _mm256_mul_pd(fscal,dy02);
867 tz = _mm256_mul_pd(fscal,dz02);
869 /* Update vectorial force */
870 fix0 = _mm256_add_pd(fix0,tx);
871 fiy0 = _mm256_add_pd(fiy0,ty);
872 fiz0 = _mm256_add_pd(fiz0,tz);
874 fjx2 = _mm256_add_pd(fjx2,tx);
875 fjy2 = _mm256_add_pd(fjy2,ty);
876 fjz2 = _mm256_add_pd(fjz2,tz);
880 /**************************
881 * CALCULATE INTERACTIONS *
882 **************************/
884 if (gmx_mm256_any_lt(rsq10,rcutoff2))
887 /* REACTION-FIELD ELECTROSTATICS */
888 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
889 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
891 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
893 /* Update potential sum for this i atom from the interaction with this j atom. */
894 velec = _mm256_and_pd(velec,cutoff_mask);
895 velec = _mm256_andnot_pd(dummy_mask,velec);
896 velecsum = _mm256_add_pd(velecsum,velec);
900 fscal = _mm256_and_pd(fscal,cutoff_mask);
902 fscal = _mm256_andnot_pd(dummy_mask,fscal);
904 /* Calculate temporary vectorial force */
905 tx = _mm256_mul_pd(fscal,dx10);
906 ty = _mm256_mul_pd(fscal,dy10);
907 tz = _mm256_mul_pd(fscal,dz10);
909 /* Update vectorial force */
910 fix1 = _mm256_add_pd(fix1,tx);
911 fiy1 = _mm256_add_pd(fiy1,ty);
912 fiz1 = _mm256_add_pd(fiz1,tz);
914 fjx0 = _mm256_add_pd(fjx0,tx);
915 fjy0 = _mm256_add_pd(fjy0,ty);
916 fjz0 = _mm256_add_pd(fjz0,tz);
920 /**************************
921 * CALCULATE INTERACTIONS *
922 **************************/
924 if (gmx_mm256_any_lt(rsq11,rcutoff2))
927 /* REACTION-FIELD ELECTROSTATICS */
928 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
929 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
931 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
933 /* Update potential sum for this i atom from the interaction with this j atom. */
934 velec = _mm256_and_pd(velec,cutoff_mask);
935 velec = _mm256_andnot_pd(dummy_mask,velec);
936 velecsum = _mm256_add_pd(velecsum,velec);
940 fscal = _mm256_and_pd(fscal,cutoff_mask);
942 fscal = _mm256_andnot_pd(dummy_mask,fscal);
944 /* Calculate temporary vectorial force */
945 tx = _mm256_mul_pd(fscal,dx11);
946 ty = _mm256_mul_pd(fscal,dy11);
947 tz = _mm256_mul_pd(fscal,dz11);
949 /* Update vectorial force */
950 fix1 = _mm256_add_pd(fix1,tx);
951 fiy1 = _mm256_add_pd(fiy1,ty);
952 fiz1 = _mm256_add_pd(fiz1,tz);
954 fjx1 = _mm256_add_pd(fjx1,tx);
955 fjy1 = _mm256_add_pd(fjy1,ty);
956 fjz1 = _mm256_add_pd(fjz1,tz);
960 /**************************
961 * CALCULATE INTERACTIONS *
962 **************************/
964 if (gmx_mm256_any_lt(rsq12,rcutoff2))
967 /* REACTION-FIELD ELECTROSTATICS */
968 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
969 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
971 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
973 /* Update potential sum for this i atom from the interaction with this j atom. */
974 velec = _mm256_and_pd(velec,cutoff_mask);
975 velec = _mm256_andnot_pd(dummy_mask,velec);
976 velecsum = _mm256_add_pd(velecsum,velec);
980 fscal = _mm256_and_pd(fscal,cutoff_mask);
982 fscal = _mm256_andnot_pd(dummy_mask,fscal);
984 /* Calculate temporary vectorial force */
985 tx = _mm256_mul_pd(fscal,dx12);
986 ty = _mm256_mul_pd(fscal,dy12);
987 tz = _mm256_mul_pd(fscal,dz12);
989 /* Update vectorial force */
990 fix1 = _mm256_add_pd(fix1,tx);
991 fiy1 = _mm256_add_pd(fiy1,ty);
992 fiz1 = _mm256_add_pd(fiz1,tz);
994 fjx2 = _mm256_add_pd(fjx2,tx);
995 fjy2 = _mm256_add_pd(fjy2,ty);
996 fjz2 = _mm256_add_pd(fjz2,tz);
1000 /**************************
1001 * CALCULATE INTERACTIONS *
1002 **************************/
1004 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1007 /* REACTION-FIELD ELECTROSTATICS */
1008 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
1009 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1011 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1013 /* Update potential sum for this i atom from the interaction with this j atom. */
1014 velec = _mm256_and_pd(velec,cutoff_mask);
1015 velec = _mm256_andnot_pd(dummy_mask,velec);
1016 velecsum = _mm256_add_pd(velecsum,velec);
1020 fscal = _mm256_and_pd(fscal,cutoff_mask);
1022 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1024 /* Calculate temporary vectorial force */
1025 tx = _mm256_mul_pd(fscal,dx20);
1026 ty = _mm256_mul_pd(fscal,dy20);
1027 tz = _mm256_mul_pd(fscal,dz20);
1029 /* Update vectorial force */
1030 fix2 = _mm256_add_pd(fix2,tx);
1031 fiy2 = _mm256_add_pd(fiy2,ty);
1032 fiz2 = _mm256_add_pd(fiz2,tz);
1034 fjx0 = _mm256_add_pd(fjx0,tx);
1035 fjy0 = _mm256_add_pd(fjy0,ty);
1036 fjz0 = _mm256_add_pd(fjz0,tz);
1040 /**************************
1041 * CALCULATE INTERACTIONS *
1042 **************************/
1044 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1047 /* REACTION-FIELD ELECTROSTATICS */
1048 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
1049 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1051 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1053 /* Update potential sum for this i atom from the interaction with this j atom. */
1054 velec = _mm256_and_pd(velec,cutoff_mask);
1055 velec = _mm256_andnot_pd(dummy_mask,velec);
1056 velecsum = _mm256_add_pd(velecsum,velec);
1060 fscal = _mm256_and_pd(fscal,cutoff_mask);
1062 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1064 /* Calculate temporary vectorial force */
1065 tx = _mm256_mul_pd(fscal,dx21);
1066 ty = _mm256_mul_pd(fscal,dy21);
1067 tz = _mm256_mul_pd(fscal,dz21);
1069 /* Update vectorial force */
1070 fix2 = _mm256_add_pd(fix2,tx);
1071 fiy2 = _mm256_add_pd(fiy2,ty);
1072 fiz2 = _mm256_add_pd(fiz2,tz);
1074 fjx1 = _mm256_add_pd(fjx1,tx);
1075 fjy1 = _mm256_add_pd(fjy1,ty);
1076 fjz1 = _mm256_add_pd(fjz1,tz);
1080 /**************************
1081 * CALCULATE INTERACTIONS *
1082 **************************/
1084 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1087 /* REACTION-FIELD ELECTROSTATICS */
1088 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1089 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1091 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1093 /* Update potential sum for this i atom from the interaction with this j atom. */
1094 velec = _mm256_and_pd(velec,cutoff_mask);
1095 velec = _mm256_andnot_pd(dummy_mask,velec);
1096 velecsum = _mm256_add_pd(velecsum,velec);
1100 fscal = _mm256_and_pd(fscal,cutoff_mask);
1102 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1104 /* Calculate temporary vectorial force */
1105 tx = _mm256_mul_pd(fscal,dx22);
1106 ty = _mm256_mul_pd(fscal,dy22);
1107 tz = _mm256_mul_pd(fscal,dz22);
1109 /* Update vectorial force */
1110 fix2 = _mm256_add_pd(fix2,tx);
1111 fiy2 = _mm256_add_pd(fiy2,ty);
1112 fiz2 = _mm256_add_pd(fiz2,tz);
1114 fjx2 = _mm256_add_pd(fjx2,tx);
1115 fjy2 = _mm256_add_pd(fjy2,ty);
1116 fjz2 = _mm256_add_pd(fjz2,tz);
1120 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1121 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1122 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1123 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1125 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1126 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1128 /* Inner loop uses 342 flops */
1131 /* End of innermost loop */
1133 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1134 f+i_coord_offset,fshift+i_shift_offset);
1137 /* Update potential energies */
1138 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1139 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1141 /* Increment number of inner iterations */
1142 inneriter += j_index_end - j_index_start;
1144 /* Outer loop uses 20 flops */
1147 /* Increment number of outer iterations */
1150 /* Update outer/inner flops */
1152 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*342);
1155 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_double
1156 * Electrostatics interaction: ReactionField
1157 * VdW interaction: LennardJones
1158 * Geometry: Water3-Water3
1159 * Calculate force/pot: Force
/*
 * Force-only Water3-Water3 kernel: reaction-field electrostatics with a
 * cut-off on all nine atom pairs, Lennard-Jones on the 0-0 pair only,
 * evaluated for four j waters at a time in __m256d registers.
 *
 * Parameters, as used in the lines visible here:
 *   nlist       - neighbour list; jindex, shift and iinr are read from it.
 *   xx, ff      - not referenced directly in the visible lines; the x and f
 *                 pointers used below are presumably derived from them
 *                 (TODO confirm against the complete file).
 *   fr          - supplies shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb,
 *                 rvdw and the ic->k_rf, ic->c_rf, ic->sh_invrc6 constants.
 *   mdatoms     - supplies chargeA and typeA.
 *   kernel_data - not referenced in the visible lines of this variant.
 *   nrnb        - receives the flop estimate through inc_nrnb() at the end.
 *
 * No potential is accumulated. Each pair only produces a scalar factor
 * (fscal) that is multiplied by the displacement vector and added to both
 * the i-atom and the j-atom force accumulators.
 *
 * NOTE(review): this listing omits some source lines (the embedded numbering
 * has gaps). In particular, for the eight reaction-field-only pairs no
 * assignment of fscal is visible between computing felec and masking fscal;
 * presumably it sits on one of the omitted lines - confirm against the
 * complete file.
 */
1162 nb_kernel_ElecRFCut_VdwLJSh_GeomW3W3_F_avx_256_double
1163 (t_nblist * gmx_restrict nlist,
1164 rvec * gmx_restrict xx,
1165 rvec * gmx_restrict ff,
1166 t_forcerec * gmx_restrict fr,
1167 t_mdatoms * gmx_restrict mdatoms,
1168 nb_kernel_data_t * gmx_restrict kernel_data,
1169 t_nrnb * gmx_restrict nrnb)
/* NOTE(review): several locals declared below (for example jnrlistE..H,
 * jidxall, isai0..2, isaj0..2, signbit, one, two, one_sixth, one_twelfth)
 * are not referenced anywhere else in the visible body of this variant. */
1171 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1172 * just 0 for non-waters.
1173 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1174 * jnr indices corresponding to data put in the four positions in the SIMD register.
1176 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1177 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1178 int jnrA,jnrB,jnrC,jnrD;
1179 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1180 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1181 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1182 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1183 real rcutoff_scalar;
1184 real *shiftvec,*fshift,*x,*f;
1185 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1186 real scratch[4*DIM];
1187 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1188 real * vdwioffsetptr0;
1189 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1190 real * vdwioffsetptr1;
1191 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1192 real * vdwioffsetptr2;
1193 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1194 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1195 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1196 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1197 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1198 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1199 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1200 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1201 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1202 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1203 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1204 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1205 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1206 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1207 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1208 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1209 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1212 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1215 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1216 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1217 __m256d dummy_mask,cutoff_mask;
1218 __m128 tmpmask0,tmpmask1;
1219 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1220 __m256d one = _mm256_set1_pd(1.0);
1221 __m256d two = _mm256_set1_pd(2.0);
1227 jindex = nlist->jindex;
1229 shiftidx = nlist->shift;
1231 shiftvec = fr->shift_vec[0];
1232 fshift = fr->fshift[0];
1233 facel = _mm256_set1_pd(fr->epsfac);
1234 charge = mdatoms->chargeA;
1235 krf = _mm256_set1_pd(fr->ic->k_rf);
1236 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
1237 crf = _mm256_set1_pd(fr->ic->c_rf);
1238 nvdwtype = fr->ntype;
1239 vdwparam = fr->nbfp;
1240 vdwtype = mdatoms->typeA;
1242 /* Setup water-specific parameters */
/* Charges and LJ parameters are read once, from the atoms of the first i
 * entry (nlist->iinr[0]), before the outer loop starts. The same values are
 * used for the j side (jq0..jq2, vdwjidx0A), so every water in the list is
 * treated as having identical parameters. The i charges are pre-multiplied
 * by facel (fr->epsfac), which makes qqXY the complete charge prefactor. */
1243 inr = nlist->iinr[0];
1244 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1245 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1246 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1247 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1249 jq0 = _mm256_set1_pd(charge[inr+0]);
1250 jq1 = _mm256_set1_pd(charge[inr+1]);
1251 jq2 = _mm256_set1_pd(charge[inr+2]);
/* Lennard-Jones c6/c12 are looked up for the atom 0 - atom 0 pair only. */
1252 vdwjidx0A = 2*vdwtype[inr+0];
1253 qq00 = _mm256_mul_pd(iq0,jq0);
1254 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1255 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1256 qq01 = _mm256_mul_pd(iq0,jq1);
1257 qq02 = _mm256_mul_pd(iq0,jq2);
1258 qq10 = _mm256_mul_pd(iq1,jq0);
1259 qq11 = _mm256_mul_pd(iq1,jq1);
1260 qq12 = _mm256_mul_pd(iq1,jq2);
1261 qq20 = _mm256_mul_pd(iq2,jq0);
1262 qq21 = _mm256_mul_pd(iq2,jq1);
1263 qq22 = _mm256_mul_pd(iq2,jq2);
1265 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1266 rcutoff_scalar = fr->rcoulomb;
1267 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1268 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
/* sh_vdw_invrcut6 and rvdw are initialised here but are not referenced
 * again in the visible lines of this force-only variant. */
1270 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
1271 rvdw = _mm256_set1_pd(fr->rvdw);
1273 /* Avoid stupid compiler warnings */
1274 jnrA = jnrB = jnrC = jnrD = 0;
1275 j_coord_offsetA = 0;
1276 j_coord_offsetB = 0;
1277 j_coord_offsetC = 0;
1278 j_coord_offsetD = 0;
/* Zero the scratch buffer. It serves as the force destination for padded
 * j entries in the remainder block further down. */
1283 for(iidx=0;iidx<4*DIM;iidx++)
1285 scratch[iidx] = 0.0;
1288 /* Start outer loop over neighborlists */
1289 for(iidx=0; iidx<nri; iidx++)
1291 /* Load shift vector for this list */
1292 i_shift_offset = DIM*shiftidx[iidx];
1294 /* Load limits for loop over neighbors */
1295 j_index_start = jindex[iidx];
1296 j_index_end = jindex[iidx+1];
1298 /* Get outer coordinate index */
1300 i_coord_offset = DIM*inr;
1302 /* Load i particle coords and add shift vector */
1303 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1304 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* i-atom force accumulators for this water, one x/y/z triple per atom. */
1306 fix0 = _mm256_setzero_pd();
1307 fiy0 = _mm256_setzero_pd();
1308 fiz0 = _mm256_setzero_pd();
1309 fix1 = _mm256_setzero_pd();
1310 fiy1 = _mm256_setzero_pd();
1311 fiz1 = _mm256_setzero_pd();
1312 fix2 = _mm256_setzero_pd();
1313 fiy2 = _mm256_setzero_pd();
1314 fiz2 = _mm256_setzero_pd();
1316 /* Start inner kernel loop */
/* Full groups of four: the loop continues only while the fourth entry of
 * the group (jjnr[jidx+3]) is non-negative, and no dummy mask is applied
 * inside it - presumably because negative padding entries can only occur
 * at the end of a list (TODO confirm with the neighbour-list builder). */
1317 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1320 /* Get j neighbor index, and coordinate index */
1322 jnrB = jjnr[jidx+1];
1323 jnrC = jjnr[jidx+2];
1324 jnrD = jjnr[jidx+3];
1325 j_coord_offsetA = DIM*jnrA;
1326 j_coord_offsetB = DIM*jnrB;
1327 j_coord_offsetC = DIM*jnrC;
1328 j_coord_offsetD = DIM*jnrD;
1330 /* load j atom coordinates */
1331 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1332 x+j_coord_offsetC,x+j_coord_offsetD,
1333 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1335 /* Calculate displacement vector */
1336 dx00 = _mm256_sub_pd(ix0,jx0);
1337 dy00 = _mm256_sub_pd(iy0,jy0);
1338 dz00 = _mm256_sub_pd(iz0,jz0);
1339 dx01 = _mm256_sub_pd(ix0,jx1);
1340 dy01 = _mm256_sub_pd(iy0,jy1);
1341 dz01 = _mm256_sub_pd(iz0,jz1);
1342 dx02 = _mm256_sub_pd(ix0,jx2);
1343 dy02 = _mm256_sub_pd(iy0,jy2);
1344 dz02 = _mm256_sub_pd(iz0,jz2);
1345 dx10 = _mm256_sub_pd(ix1,jx0);
1346 dy10 = _mm256_sub_pd(iy1,jy0);
1347 dz10 = _mm256_sub_pd(iz1,jz0);
1348 dx11 = _mm256_sub_pd(ix1,jx1);
1349 dy11 = _mm256_sub_pd(iy1,jy1);
1350 dz11 = _mm256_sub_pd(iz1,jz1);
1351 dx12 = _mm256_sub_pd(ix1,jx2);
1352 dy12 = _mm256_sub_pd(iy1,jy2);
1353 dz12 = _mm256_sub_pd(iz1,jz2);
1354 dx20 = _mm256_sub_pd(ix2,jx0);
1355 dy20 = _mm256_sub_pd(iy2,jy0);
1356 dz20 = _mm256_sub_pd(iz2,jz0);
1357 dx21 = _mm256_sub_pd(ix2,jx1);
1358 dy21 = _mm256_sub_pd(iy2,jy1);
1359 dz21 = _mm256_sub_pd(iz2,jz1);
1360 dx22 = _mm256_sub_pd(ix2,jx2);
1361 dy22 = _mm256_sub_pd(iy2,jy2);
1362 dz22 = _mm256_sub_pd(iz2,jz2);
1364 /* Calculate squared distance and things based on it */
1365 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1366 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1367 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1368 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1369 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1370 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1371 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1372 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1373 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1375 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1376 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1377 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1378 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1379 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1380 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1381 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1382 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1383 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1385 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1386 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1387 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1388 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1389 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1390 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1391 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1392 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1393 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
/* j-atom force accumulators, reset for every group of four j waters and
 * passed to the decrement helper after the nine pairs. */
1395 fjx0 = _mm256_setzero_pd();
1396 fjy0 = _mm256_setzero_pd();
1397 fjz0 = _mm256_setzero_pd();
1398 fjx1 = _mm256_setzero_pd();
1399 fjy1 = _mm256_setzero_pd();
1400 fjz1 = _mm256_setzero_pd();
1401 fjx2 = _mm256_setzero_pd();
1402 fjy2 = _mm256_setzero_pd();
1403 fjz2 = _mm256_setzero_pd();
/* Pair i0-j0: reaction-field electrostatics plus Lennard-Jones.
 * gmx_mm256_any_lt presumably reports whether any lane has rsq below
 * rcutoff2 (going by its name); lanes outside the cut-off are zeroed
 * individually by the and with cutoff_mask. */
1405 /**************************
1406 * CALCULATE INTERACTIONS *
1407 **************************/
1409 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1412 /* REACTION-FIELD ELECTROSTATICS */
1413 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1415 /* LENNARD-JONES DISPERSION/REPULSION */
1417 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1418 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1420 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1422 fscal = _mm256_add_pd(felec,fvdw);
1424 fscal = _mm256_and_pd(fscal,cutoff_mask);
1426 /* Calculate temporary vectorial force */
1427 tx = _mm256_mul_pd(fscal,dx00);
1428 ty = _mm256_mul_pd(fscal,dy00);
1429 tz = _mm256_mul_pd(fscal,dz00);
1431 /* Update vectorial force */
1432 fix0 = _mm256_add_pd(fix0,tx);
1433 fiy0 = _mm256_add_pd(fiy0,ty);
1434 fiz0 = _mm256_add_pd(fiz0,tz);
1436 fjx0 = _mm256_add_pd(fjx0,tx);
1437 fjy0 = _mm256_add_pd(fjy0,ty);
1438 fjz0 = _mm256_add_pd(fjz0,tz);
/* Pair i0-j1: reaction-field electrostatics only. */
1442 /**************************
1443 * CALCULATE INTERACTIONS *
1444 **************************/
1446 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1449 /* REACTION-FIELD ELECTROSTATICS */
1450 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1452 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1456 fscal = _mm256_and_pd(fscal,cutoff_mask);
1458 /* Calculate temporary vectorial force */
1459 tx = _mm256_mul_pd(fscal,dx01);
1460 ty = _mm256_mul_pd(fscal,dy01);
1461 tz = _mm256_mul_pd(fscal,dz01);
1463 /* Update vectorial force */
1464 fix0 = _mm256_add_pd(fix0,tx);
1465 fiy0 = _mm256_add_pd(fiy0,ty);
1466 fiz0 = _mm256_add_pd(fiz0,tz);
1468 fjx1 = _mm256_add_pd(fjx1,tx);
1469 fjy1 = _mm256_add_pd(fjy1,ty);
1470 fjz1 = _mm256_add_pd(fjz1,tz);
/* Pair i0-j2: reaction-field electrostatics only. */
1474 /**************************
1475 * CALCULATE INTERACTIONS *
1476 **************************/
1478 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1481 /* REACTION-FIELD ELECTROSTATICS */
1482 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1484 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1488 fscal = _mm256_and_pd(fscal,cutoff_mask);
1490 /* Calculate temporary vectorial force */
1491 tx = _mm256_mul_pd(fscal,dx02);
1492 ty = _mm256_mul_pd(fscal,dy02);
1493 tz = _mm256_mul_pd(fscal,dz02);
1495 /* Update vectorial force */
1496 fix0 = _mm256_add_pd(fix0,tx);
1497 fiy0 = _mm256_add_pd(fiy0,ty);
1498 fiz0 = _mm256_add_pd(fiz0,tz);
1500 fjx2 = _mm256_add_pd(fjx2,tx);
1501 fjy2 = _mm256_add_pd(fjy2,ty);
1502 fjz2 = _mm256_add_pd(fjz2,tz);
/* Pair i1-j0: reaction-field electrostatics only. */
1506 /**************************
1507 * CALCULATE INTERACTIONS *
1508 **************************/
1510 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1513 /* REACTION-FIELD ELECTROSTATICS */
1514 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1516 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1520 fscal = _mm256_and_pd(fscal,cutoff_mask);
1522 /* Calculate temporary vectorial force */
1523 tx = _mm256_mul_pd(fscal,dx10);
1524 ty = _mm256_mul_pd(fscal,dy10);
1525 tz = _mm256_mul_pd(fscal,dz10);
1527 /* Update vectorial force */
1528 fix1 = _mm256_add_pd(fix1,tx);
1529 fiy1 = _mm256_add_pd(fiy1,ty);
1530 fiz1 = _mm256_add_pd(fiz1,tz);
1532 fjx0 = _mm256_add_pd(fjx0,tx);
1533 fjy0 = _mm256_add_pd(fjy0,ty);
1534 fjz0 = _mm256_add_pd(fjz0,tz);
/* Pair i1-j1: reaction-field electrostatics only. */
1538 /**************************
1539 * CALCULATE INTERACTIONS *
1540 **************************/
1542 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1545 /* REACTION-FIELD ELECTROSTATICS */
1546 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1548 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1552 fscal = _mm256_and_pd(fscal,cutoff_mask);
1554 /* Calculate temporary vectorial force */
1555 tx = _mm256_mul_pd(fscal,dx11);
1556 ty = _mm256_mul_pd(fscal,dy11);
1557 tz = _mm256_mul_pd(fscal,dz11);
1559 /* Update vectorial force */
1560 fix1 = _mm256_add_pd(fix1,tx);
1561 fiy1 = _mm256_add_pd(fiy1,ty);
1562 fiz1 = _mm256_add_pd(fiz1,tz);
1564 fjx1 = _mm256_add_pd(fjx1,tx);
1565 fjy1 = _mm256_add_pd(fjy1,ty);
1566 fjz1 = _mm256_add_pd(fjz1,tz);
/* Pair i1-j2: reaction-field electrostatics only. */
1570 /**************************
1571 * CALCULATE INTERACTIONS *
1572 **************************/
1574 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1577 /* REACTION-FIELD ELECTROSTATICS */
1578 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1580 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1584 fscal = _mm256_and_pd(fscal,cutoff_mask);
1586 /* Calculate temporary vectorial force */
1587 tx = _mm256_mul_pd(fscal,dx12);
1588 ty = _mm256_mul_pd(fscal,dy12);
1589 tz = _mm256_mul_pd(fscal,dz12);
1591 /* Update vectorial force */
1592 fix1 = _mm256_add_pd(fix1,tx);
1593 fiy1 = _mm256_add_pd(fiy1,ty);
1594 fiz1 = _mm256_add_pd(fiz1,tz);
1596 fjx2 = _mm256_add_pd(fjx2,tx);
1597 fjy2 = _mm256_add_pd(fjy2,ty);
1598 fjz2 = _mm256_add_pd(fjz2,tz);
/* Pair i2-j0: reaction-field electrostatics only. */
1602 /**************************
1603 * CALCULATE INTERACTIONS *
1604 **************************/
1606 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1609 /* REACTION-FIELD ELECTROSTATICS */
1610 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1612 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1616 fscal = _mm256_and_pd(fscal,cutoff_mask);
1618 /* Calculate temporary vectorial force */
1619 tx = _mm256_mul_pd(fscal,dx20);
1620 ty = _mm256_mul_pd(fscal,dy20);
1621 tz = _mm256_mul_pd(fscal,dz20);
1623 /* Update vectorial force */
1624 fix2 = _mm256_add_pd(fix2,tx);
1625 fiy2 = _mm256_add_pd(fiy2,ty);
1626 fiz2 = _mm256_add_pd(fiz2,tz);
1628 fjx0 = _mm256_add_pd(fjx0,tx);
1629 fjy0 = _mm256_add_pd(fjy0,ty);
1630 fjz0 = _mm256_add_pd(fjz0,tz);
/* Pair i2-j1: reaction-field electrostatics only. */
1634 /**************************
1635 * CALCULATE INTERACTIONS *
1636 **************************/
1638 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1641 /* REACTION-FIELD ELECTROSTATICS */
1642 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1644 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1648 fscal = _mm256_and_pd(fscal,cutoff_mask);
1650 /* Calculate temporary vectorial force */
1651 tx = _mm256_mul_pd(fscal,dx21);
1652 ty = _mm256_mul_pd(fscal,dy21);
1653 tz = _mm256_mul_pd(fscal,dz21);
1655 /* Update vectorial force */
1656 fix2 = _mm256_add_pd(fix2,tx);
1657 fiy2 = _mm256_add_pd(fiy2,ty);
1658 fiz2 = _mm256_add_pd(fiz2,tz);
1660 fjx1 = _mm256_add_pd(fjx1,tx);
1661 fjy1 = _mm256_add_pd(fjy1,ty);
1662 fjz1 = _mm256_add_pd(fjz1,tz);
/* Pair i2-j2: reaction-field electrostatics only. */
1666 /**************************
1667 * CALCULATE INTERACTIONS *
1668 **************************/
1670 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1673 /* REACTION-FIELD ELECTROSTATICS */
1674 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1676 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1680 fscal = _mm256_and_pd(fscal,cutoff_mask);
1682 /* Calculate temporary vectorial force */
1683 tx = _mm256_mul_pd(fscal,dx22);
1684 ty = _mm256_mul_pd(fscal,dy22);
1685 tz = _mm256_mul_pd(fscal,dz22);
1687 /* Update vectorial force */
1688 fix2 = _mm256_add_pd(fix2,tx);
1689 fiy2 = _mm256_add_pd(fiy2,ty);
1690 fiz2 = _mm256_add_pd(fiz2,tz);
1692 fjx2 = _mm256_add_pd(fjx2,tx);
1693 fjy2 = _mm256_add_pd(fjy2,ty);
1694 fjz2 = _mm256_add_pd(fjz2,tz);
/* Pass the accumulated forces on the four j waters to the helper; going by
 * its name it subtracts them from the j atoms' entries in f. */
1698 fjptrA = f+j_coord_offsetA;
1699 fjptrB = f+j_coord_offsetB;
1700 fjptrC = f+j_coord_offsetC;
1701 fjptrD = f+j_coord_offsetD;
1703 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1704 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1706 /* Inner loop uses 277 flops */
/* Remainder block: handles one further group of four list entries in which
 * some entries may be negative (padding). It always reads jjnr[jidx] to
 * jjnr[jidx+3], so it relies on the list storage extending that far. */
1709 if(jidx<j_index_end)
1712 /* Get j neighbor index, and coordinate index */
1713 jnrlistA = jjnr[jidx];
1714 jnrlistB = jjnr[jidx+1];
1715 jnrlistC = jjnr[jidx+2];
1716 jnrlistD = jjnr[jidx+3];
/* Build dummy_mask from the four list entries: the 32-bit integer compare
 * marks negative entries, the two permutes duplicate each 32-bit result so
 * that it fills a 64-bit lane (assuming _GMX_MM_PERMUTE follows the usual
 * shuffle argument order), and the halves are joined into one 256-bit mask.
 * Negative entries are then redirected to atom index 0 for the coordinate
 * load (jnrA..jnrD), and their results are cleared again with dummy_mask.
 * NOTE(review): tmpmask0 is declared __m128 but is assigned through
 * gmx_mm_castsi128_pd and then fed to _mm_permute_ps; a _ps cast looks like
 * the intended one - confirm against the macro definition.
 * NOTE(review): the existing comment below mentions _mm_andnot_pd, while
 * the code in this kernel applies the mask with _mm256_andnot_pd. */
1717 /* Sign of each element will be negative for non-real atoms.
1718 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1719 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1721 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1723 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1724 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1725 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1727 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1728 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1729 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1730 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1731 j_coord_offsetA = DIM*jnrA;
1732 j_coord_offsetB = DIM*jnrB;
1733 j_coord_offsetC = DIM*jnrC;
1734 j_coord_offsetD = DIM*jnrD;
1736 /* load j atom coordinates */
1737 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1738 x+j_coord_offsetC,x+j_coord_offsetD,
1739 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1741 /* Calculate displacement vector */
1742 dx00 = _mm256_sub_pd(ix0,jx0);
1743 dy00 = _mm256_sub_pd(iy0,jy0);
1744 dz00 = _mm256_sub_pd(iz0,jz0);
1745 dx01 = _mm256_sub_pd(ix0,jx1);
1746 dy01 = _mm256_sub_pd(iy0,jy1);
1747 dz01 = _mm256_sub_pd(iz0,jz1);
1748 dx02 = _mm256_sub_pd(ix0,jx2);
1749 dy02 = _mm256_sub_pd(iy0,jy2);
1750 dz02 = _mm256_sub_pd(iz0,jz2);
1751 dx10 = _mm256_sub_pd(ix1,jx0);
1752 dy10 = _mm256_sub_pd(iy1,jy0);
1753 dz10 = _mm256_sub_pd(iz1,jz0);
1754 dx11 = _mm256_sub_pd(ix1,jx1);
1755 dy11 = _mm256_sub_pd(iy1,jy1);
1756 dz11 = _mm256_sub_pd(iz1,jz1);
1757 dx12 = _mm256_sub_pd(ix1,jx2);
1758 dy12 = _mm256_sub_pd(iy1,jy2);
1759 dz12 = _mm256_sub_pd(iz1,jz2);
1760 dx20 = _mm256_sub_pd(ix2,jx0);
1761 dy20 = _mm256_sub_pd(iy2,jy0);
1762 dz20 = _mm256_sub_pd(iz2,jz0);
1763 dx21 = _mm256_sub_pd(ix2,jx1);
1764 dy21 = _mm256_sub_pd(iy2,jy1);
1765 dz21 = _mm256_sub_pd(iz2,jz1);
1766 dx22 = _mm256_sub_pd(ix2,jx2);
1767 dy22 = _mm256_sub_pd(iy2,jy2);
1768 dz22 = _mm256_sub_pd(iz2,jz2);
1770 /* Calculate squared distance and things based on it */
1771 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1772 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1773 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1774 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1775 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1776 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1777 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1778 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1779 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1781 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1782 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1783 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1784 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1785 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1786 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1787 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1788 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1789 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1791 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1792 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1793 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1794 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1795 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1796 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1797 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1798 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1799 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
/* j-atom force accumulators for the remainder group. */
1801 fjx0 = _mm256_setzero_pd();
1802 fjy0 = _mm256_setzero_pd();
1803 fjz0 = _mm256_setzero_pd();
1804 fjx1 = _mm256_setzero_pd();
1805 fjy1 = _mm256_setzero_pd();
1806 fjz1 = _mm256_setzero_pd();
1807 fjx2 = _mm256_setzero_pd();
1808 fjy2 = _mm256_setzero_pd();
1809 fjz2 = _mm256_setzero_pd();
/* Remainder pair i0-j0: reaction-field plus Lennard-Jones, same arithmetic
 * as in the main loop, followed by an andnot with dummy_mask that clears
 * the lanes belonging to padded j entries. */
1811 /**************************
1812 * CALCULATE INTERACTIONS *
1813 **************************/
1815 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1818 /* REACTION-FIELD ELECTROSTATICS */
1819 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1821 /* LENNARD-JONES DISPERSION/REPULSION */
1823 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1824 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1826 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1828 fscal = _mm256_add_pd(felec,fvdw);
1830 fscal = _mm256_and_pd(fscal,cutoff_mask);
1832 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1834 /* Calculate temporary vectorial force */
1835 tx = _mm256_mul_pd(fscal,dx00);
1836 ty = _mm256_mul_pd(fscal,dy00);
1837 tz = _mm256_mul_pd(fscal,dz00);
1839 /* Update vectorial force */
1840 fix0 = _mm256_add_pd(fix0,tx);
1841 fiy0 = _mm256_add_pd(fiy0,ty);
1842 fiz0 = _mm256_add_pd(fiz0,tz);
1844 fjx0 = _mm256_add_pd(fjx0,tx);
1845 fjy0 = _mm256_add_pd(fjy0,ty);
1846 fjz0 = _mm256_add_pd(fjz0,tz);
/* Remainder pair i0-j1: reaction-field only, padded lanes cleared. */
1850 /**************************
1851 * CALCULATE INTERACTIONS *
1852 **************************/
1854 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1857 /* REACTION-FIELD ELECTROSTATICS */
1858 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1860 cutoff_mask = _mm256_cmp_pd(rsq01,rcutoff2,_CMP_LT_OQ);
1864 fscal = _mm256_and_pd(fscal,cutoff_mask);
1866 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1868 /* Calculate temporary vectorial force */
1869 tx = _mm256_mul_pd(fscal,dx01);
1870 ty = _mm256_mul_pd(fscal,dy01);
1871 tz = _mm256_mul_pd(fscal,dz01);
1873 /* Update vectorial force */
1874 fix0 = _mm256_add_pd(fix0,tx);
1875 fiy0 = _mm256_add_pd(fiy0,ty);
1876 fiz0 = _mm256_add_pd(fiz0,tz);
1878 fjx1 = _mm256_add_pd(fjx1,tx);
1879 fjy1 = _mm256_add_pd(fjy1,ty);
1880 fjz1 = _mm256_add_pd(fjz1,tz);
/* Remainder pair i0-j2: reaction-field only, padded lanes cleared. */
1884 /**************************
1885 * CALCULATE INTERACTIONS *
1886 **************************/
1888 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1891 /* REACTION-FIELD ELECTROSTATICS */
1892 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1894 cutoff_mask = _mm256_cmp_pd(rsq02,rcutoff2,_CMP_LT_OQ);
1898 fscal = _mm256_and_pd(fscal,cutoff_mask);
1900 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1902 /* Calculate temporary vectorial force */
1903 tx = _mm256_mul_pd(fscal,dx02);
1904 ty = _mm256_mul_pd(fscal,dy02);
1905 tz = _mm256_mul_pd(fscal,dz02);
1907 /* Update vectorial force */
1908 fix0 = _mm256_add_pd(fix0,tx);
1909 fiy0 = _mm256_add_pd(fiy0,ty);
1910 fiz0 = _mm256_add_pd(fiz0,tz);
1912 fjx2 = _mm256_add_pd(fjx2,tx);
1913 fjy2 = _mm256_add_pd(fjy2,ty);
1914 fjz2 = _mm256_add_pd(fjz2,tz);
/* Remainder pair i1-j0: reaction-field only, padded lanes cleared. */
1918 /**************************
1919 * CALCULATE INTERACTIONS *
1920 **************************/
1922 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1925 /* REACTION-FIELD ELECTROSTATICS */
1926 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1928 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1932 fscal = _mm256_and_pd(fscal,cutoff_mask);
1934 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1936 /* Calculate temporary vectorial force */
1937 tx = _mm256_mul_pd(fscal,dx10);
1938 ty = _mm256_mul_pd(fscal,dy10);
1939 tz = _mm256_mul_pd(fscal,dz10);
1941 /* Update vectorial force */
1942 fix1 = _mm256_add_pd(fix1,tx);
1943 fiy1 = _mm256_add_pd(fiy1,ty);
1944 fiz1 = _mm256_add_pd(fiz1,tz);
1946 fjx0 = _mm256_add_pd(fjx0,tx);
1947 fjy0 = _mm256_add_pd(fjy0,ty);
1948 fjz0 = _mm256_add_pd(fjz0,tz);
/* Remainder pair i1-j1: reaction-field only, padded lanes cleared. */
1952 /**************************
1953 * CALCULATE INTERACTIONS *
1954 **************************/
1956 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1959 /* REACTION-FIELD ELECTROSTATICS */
1960 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1962 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1966 fscal = _mm256_and_pd(fscal,cutoff_mask);
1968 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1970 /* Calculate temporary vectorial force */
1971 tx = _mm256_mul_pd(fscal,dx11);
1972 ty = _mm256_mul_pd(fscal,dy11);
1973 tz = _mm256_mul_pd(fscal,dz11);
1975 /* Update vectorial force */
1976 fix1 = _mm256_add_pd(fix1,tx);
1977 fiy1 = _mm256_add_pd(fiy1,ty);
1978 fiz1 = _mm256_add_pd(fiz1,tz);
1980 fjx1 = _mm256_add_pd(fjx1,tx);
1981 fjy1 = _mm256_add_pd(fjy1,ty);
1982 fjz1 = _mm256_add_pd(fjz1,tz);
/* Remainder pair i1-j2: reaction-field only, padded lanes cleared. */
1986 /**************************
1987 * CALCULATE INTERACTIONS *
1988 **************************/
1990 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1993 /* REACTION-FIELD ELECTROSTATICS */
1994 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1996 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2000 fscal = _mm256_and_pd(fscal,cutoff_mask);
2002 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2004 /* Calculate temporary vectorial force */
2005 tx = _mm256_mul_pd(fscal,dx12);
2006 ty = _mm256_mul_pd(fscal,dy12);
2007 tz = _mm256_mul_pd(fscal,dz12);
2009 /* Update vectorial force */
2010 fix1 = _mm256_add_pd(fix1,tx);
2011 fiy1 = _mm256_add_pd(fiy1,ty);
2012 fiz1 = _mm256_add_pd(fiz1,tz);
2014 fjx2 = _mm256_add_pd(fjx2,tx);
2015 fjy2 = _mm256_add_pd(fjy2,ty);
2016 fjz2 = _mm256_add_pd(fjz2,tz);
/* Remainder pair i2-j0: reaction-field only, padded lanes cleared. */
2020 /**************************
2021 * CALCULATE INTERACTIONS *
2022 **************************/
2024 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2027 /* REACTION-FIELD ELECTROSTATICS */
2028 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
2030 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
2034 fscal = _mm256_and_pd(fscal,cutoff_mask);
2036 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2038 /* Calculate temporary vectorial force */
2039 tx = _mm256_mul_pd(fscal,dx20);
2040 ty = _mm256_mul_pd(fscal,dy20);
2041 tz = _mm256_mul_pd(fscal,dz20);
2043 /* Update vectorial force */
2044 fix2 = _mm256_add_pd(fix2,tx);
2045 fiy2 = _mm256_add_pd(fiy2,ty);
2046 fiz2 = _mm256_add_pd(fiz2,tz);
2048 fjx0 = _mm256_add_pd(fjx0,tx);
2049 fjy0 = _mm256_add_pd(fjy0,ty);
2050 fjz0 = _mm256_add_pd(fjz0,tz);
/* Remainder pair i2-j1: reaction-field only, padded lanes cleared. */
2054 /**************************
2055 * CALCULATE INTERACTIONS *
2056 **************************/
2058 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2061 /* REACTION-FIELD ELECTROSTATICS */
2062 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2064 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2068 fscal = _mm256_and_pd(fscal,cutoff_mask);
2070 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2072 /* Calculate temporary vectorial force */
2073 tx = _mm256_mul_pd(fscal,dx21);
2074 ty = _mm256_mul_pd(fscal,dy21);
2075 tz = _mm256_mul_pd(fscal,dz21);
2077 /* Update vectorial force */
2078 fix2 = _mm256_add_pd(fix2,tx);
2079 fiy2 = _mm256_add_pd(fiy2,ty);
2080 fiz2 = _mm256_add_pd(fiz2,tz);
2082 fjx1 = _mm256_add_pd(fjx1,tx);
2083 fjy1 = _mm256_add_pd(fjy1,ty);
2084 fjz1 = _mm256_add_pd(fjz1,tz);
/* Remainder pair i2-j2: reaction-field only, padded lanes cleared. */
2088 /**************************
2089 * CALCULATE INTERACTIONS *
2090 **************************/
2092 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2095 /* REACTION-FIELD ELECTROSTATICS */
2096 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2098 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2102 fscal = _mm256_and_pd(fscal,cutoff_mask);
2104 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2106 /* Calculate temporary vectorial force */
2107 tx = _mm256_mul_pd(fscal,dx22);
2108 ty = _mm256_mul_pd(fscal,dy22);
2109 tz = _mm256_mul_pd(fscal,dz22);
2111 /* Update vectorial force */
2112 fix2 = _mm256_add_pd(fix2,tx);
2113 fiy2 = _mm256_add_pd(fiy2,ty);
2114 fiz2 = _mm256_add_pd(fiz2,tz);
2116 fjx2 = _mm256_add_pd(fjx2,tx);
2117 fjy2 = _mm256_add_pd(fjy2,ty);
2118 fjz2 = _mm256_add_pd(fjz2,tz);
/* Real j entries point into f; padded entries are pointed at scratch, so
 * whatever the helper writes for those lanes never reaches f. Their force
 * values are already zero because fscal was cleared with dummy_mask. */
2122 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2123 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2124 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2125 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2127 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2128 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2130 /* Inner loop uses 277 flops */
2133 /* End of innermost loop */
/* Hand the nine i-atom force accumulators to the helper together with the
 * f and fshift destinations for this i water; going by its name it updates
 * both (the exact reduction is defined in the helper, not shown here). */
2135 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2136 f+i_coord_offset,fshift+i_shift_offset);
2138 /* Increment number of inner iterations */
2139 inneriter += j_index_end - j_index_start;
2141 /* Outer loop uses 18 flops */
2144 /* Increment number of outer iterations */
2147 /* Update outer/inner flops */
/* 18 and 277 are the per-outer-iteration and per-inner-iteration flop
 * figures quoted in the generator comments above. */
2149 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);