2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
81 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
83 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
85 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
86 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
87 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
88 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
89 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
90 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
91 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
92 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
93 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
94 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
95 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
96 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
103 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
104 __m256d dummy_mask,cutoff_mask;
105 __m128 tmpmask0,tmpmask1;
106 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
107 __m256d one = _mm256_set1_pd(1.0);
108 __m256d two = _mm256_set1_pd(2.0);
114 jindex = nlist->jindex;
116 shiftidx = nlist->shift;
118 shiftvec = fr->shift_vec[0];
119 fshift = fr->fshift[0];
120 facel = _mm256_set1_pd(fr->epsfac);
121 charge = mdatoms->chargeA;
122 krf = _mm256_set1_pd(fr->ic->k_rf);
123 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
124 crf = _mm256_set1_pd(fr->ic->c_rf);
125 nvdwtype = fr->ntype;
127 vdwtype = mdatoms->typeA;
129 /* Setup water-specific parameters */
130 inr = nlist->iinr[0];
131 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
132 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
133 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
134 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
136 jq1 = _mm256_set1_pd(charge[inr+1]);
137 jq2 = _mm256_set1_pd(charge[inr+2]);
138 jq3 = _mm256_set1_pd(charge[inr+3]);
139 vdwjidx0A = 2*vdwtype[inr+0];
140 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
141 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
142 qq11 = _mm256_mul_pd(iq1,jq1);
143 qq12 = _mm256_mul_pd(iq1,jq2);
144 qq13 = _mm256_mul_pd(iq1,jq3);
145 qq21 = _mm256_mul_pd(iq2,jq1);
146 qq22 = _mm256_mul_pd(iq2,jq2);
147 qq23 = _mm256_mul_pd(iq2,jq3);
148 qq31 = _mm256_mul_pd(iq3,jq1);
149 qq32 = _mm256_mul_pd(iq3,jq2);
150 qq33 = _mm256_mul_pd(iq3,jq3);
152 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
153 rcutoff_scalar = fr->rcoulomb;
154 rcutoff = _mm256_set1_pd(rcutoff_scalar);
155 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
157 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
158 rvdw = _mm256_set1_pd(fr->rvdw);
160 /* Avoid stupid compiler warnings */
161 jnrA = jnrB = jnrC = jnrD = 0;
170 for(iidx=0;iidx<4*DIM;iidx++)
175 /* Start outer loop over neighborlists */
176 for(iidx=0; iidx<nri; iidx++)
178 /* Load shift vector for this list */
179 i_shift_offset = DIM*shiftidx[iidx];
181 /* Load limits for loop over neighbors */
182 j_index_start = jindex[iidx];
183 j_index_end = jindex[iidx+1];
185 /* Get outer coordinate index */
187 i_coord_offset = DIM*inr;
189 /* Load i particle coords and add shift vector */
190 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
191 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
193 fix0 = _mm256_setzero_pd();
194 fiy0 = _mm256_setzero_pd();
195 fiz0 = _mm256_setzero_pd();
196 fix1 = _mm256_setzero_pd();
197 fiy1 = _mm256_setzero_pd();
198 fiz1 = _mm256_setzero_pd();
199 fix2 = _mm256_setzero_pd();
200 fiy2 = _mm256_setzero_pd();
201 fiz2 = _mm256_setzero_pd();
202 fix3 = _mm256_setzero_pd();
203 fiy3 = _mm256_setzero_pd();
204 fiz3 = _mm256_setzero_pd();
206 /* Reset potential sums */
207 velecsum = _mm256_setzero_pd();
208 vvdwsum = _mm256_setzero_pd();
210 /* Start inner kernel loop */
211 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
214 /* Get j neighbor index, and coordinate index */
219 j_coord_offsetA = DIM*jnrA;
220 j_coord_offsetB = DIM*jnrB;
221 j_coord_offsetC = DIM*jnrC;
222 j_coord_offsetD = DIM*jnrD;
224 /* load j atom coordinates */
225 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
226 x+j_coord_offsetC,x+j_coord_offsetD,
227 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
228 &jy2,&jz2,&jx3,&jy3,&jz3);
230 /* Calculate displacement vector */
231 dx00 = _mm256_sub_pd(ix0,jx0);
232 dy00 = _mm256_sub_pd(iy0,jy0);
233 dz00 = _mm256_sub_pd(iz0,jz0);
234 dx11 = _mm256_sub_pd(ix1,jx1);
235 dy11 = _mm256_sub_pd(iy1,jy1);
236 dz11 = _mm256_sub_pd(iz1,jz1);
237 dx12 = _mm256_sub_pd(ix1,jx2);
238 dy12 = _mm256_sub_pd(iy1,jy2);
239 dz12 = _mm256_sub_pd(iz1,jz2);
240 dx13 = _mm256_sub_pd(ix1,jx3);
241 dy13 = _mm256_sub_pd(iy1,jy3);
242 dz13 = _mm256_sub_pd(iz1,jz3);
243 dx21 = _mm256_sub_pd(ix2,jx1);
244 dy21 = _mm256_sub_pd(iy2,jy1);
245 dz21 = _mm256_sub_pd(iz2,jz1);
246 dx22 = _mm256_sub_pd(ix2,jx2);
247 dy22 = _mm256_sub_pd(iy2,jy2);
248 dz22 = _mm256_sub_pd(iz2,jz2);
249 dx23 = _mm256_sub_pd(ix2,jx3);
250 dy23 = _mm256_sub_pd(iy2,jy3);
251 dz23 = _mm256_sub_pd(iz2,jz3);
252 dx31 = _mm256_sub_pd(ix3,jx1);
253 dy31 = _mm256_sub_pd(iy3,jy1);
254 dz31 = _mm256_sub_pd(iz3,jz1);
255 dx32 = _mm256_sub_pd(ix3,jx2);
256 dy32 = _mm256_sub_pd(iy3,jy2);
257 dz32 = _mm256_sub_pd(iz3,jz2);
258 dx33 = _mm256_sub_pd(ix3,jx3);
259 dy33 = _mm256_sub_pd(iy3,jy3);
260 dz33 = _mm256_sub_pd(iz3,jz3);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
264 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
265 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
266 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
267 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
268 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
269 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
270 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
271 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
272 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
274 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
275 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
276 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
277 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
278 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
279 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
280 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
281 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
282 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
284 rinvsq00 = gmx_mm256_inv_pd(rsq00);
285 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
286 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
287 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
288 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
289 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
290 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
291 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
292 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
293 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
295 fjx0 = _mm256_setzero_pd();
296 fjy0 = _mm256_setzero_pd();
297 fjz0 = _mm256_setzero_pd();
298 fjx1 = _mm256_setzero_pd();
299 fjy1 = _mm256_setzero_pd();
300 fjz1 = _mm256_setzero_pd();
301 fjx2 = _mm256_setzero_pd();
302 fjy2 = _mm256_setzero_pd();
303 fjz2 = _mm256_setzero_pd();
304 fjx3 = _mm256_setzero_pd();
305 fjy3 = _mm256_setzero_pd();
306 fjz3 = _mm256_setzero_pd();
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 if (gmx_mm256_any_lt(rsq00,rcutoff2))
315 /* LENNARD-JONES DISPERSION/REPULSION */
317 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
318 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
319 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
320 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
321 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
322 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
324 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
326 /* Update potential sum for this i atom from the interaction with this j atom. */
327 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
328 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
332 fscal = _mm256_and_pd(fscal,cutoff_mask);
334 /* Calculate temporary vectorial force */
335 tx = _mm256_mul_pd(fscal,dx00);
336 ty = _mm256_mul_pd(fscal,dy00);
337 tz = _mm256_mul_pd(fscal,dz00);
339 /* Update vectorial force */
340 fix0 = _mm256_add_pd(fix0,tx);
341 fiy0 = _mm256_add_pd(fiy0,ty);
342 fiz0 = _mm256_add_pd(fiz0,tz);
344 fjx0 = _mm256_add_pd(fjx0,tx);
345 fjy0 = _mm256_add_pd(fjy0,ty);
346 fjz0 = _mm256_add_pd(fjz0,tz);
350 /**************************
351 * CALCULATE INTERACTIONS *
352 **************************/
354 if (gmx_mm256_any_lt(rsq11,rcutoff2))
357 /* REACTION-FIELD ELECTROSTATICS */
358 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
359 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
361 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
363 /* Update potential sum for this i atom from the interaction with this j atom. */
364 velec = _mm256_and_pd(velec,cutoff_mask);
365 velecsum = _mm256_add_pd(velecsum,velec);
369 fscal = _mm256_and_pd(fscal,cutoff_mask);
371 /* Calculate temporary vectorial force */
372 tx = _mm256_mul_pd(fscal,dx11);
373 ty = _mm256_mul_pd(fscal,dy11);
374 tz = _mm256_mul_pd(fscal,dz11);
376 /* Update vectorial force */
377 fix1 = _mm256_add_pd(fix1,tx);
378 fiy1 = _mm256_add_pd(fiy1,ty);
379 fiz1 = _mm256_add_pd(fiz1,tz);
381 fjx1 = _mm256_add_pd(fjx1,tx);
382 fjy1 = _mm256_add_pd(fjy1,ty);
383 fjz1 = _mm256_add_pd(fjz1,tz);
387 /**************************
388 * CALCULATE INTERACTIONS *
389 **************************/
391 if (gmx_mm256_any_lt(rsq12,rcutoff2))
394 /* REACTION-FIELD ELECTROSTATICS */
395 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
396 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
398 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velec = _mm256_and_pd(velec,cutoff_mask);
402 velecsum = _mm256_add_pd(velecsum,velec);
406 fscal = _mm256_and_pd(fscal,cutoff_mask);
408 /* Calculate temporary vectorial force */
409 tx = _mm256_mul_pd(fscal,dx12);
410 ty = _mm256_mul_pd(fscal,dy12);
411 tz = _mm256_mul_pd(fscal,dz12);
413 /* Update vectorial force */
414 fix1 = _mm256_add_pd(fix1,tx);
415 fiy1 = _mm256_add_pd(fiy1,ty);
416 fiz1 = _mm256_add_pd(fiz1,tz);
418 fjx2 = _mm256_add_pd(fjx2,tx);
419 fjy2 = _mm256_add_pd(fjy2,ty);
420 fjz2 = _mm256_add_pd(fjz2,tz);
424 /**************************
425 * CALCULATE INTERACTIONS *
426 **************************/
428 if (gmx_mm256_any_lt(rsq13,rcutoff2))
431 /* REACTION-FIELD ELECTROSTATICS */
432 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
433 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
435 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
437 /* Update potential sum for this i atom from the interaction with this j atom. */
438 velec = _mm256_and_pd(velec,cutoff_mask);
439 velecsum = _mm256_add_pd(velecsum,velec);
443 fscal = _mm256_and_pd(fscal,cutoff_mask);
445 /* Calculate temporary vectorial force */
446 tx = _mm256_mul_pd(fscal,dx13);
447 ty = _mm256_mul_pd(fscal,dy13);
448 tz = _mm256_mul_pd(fscal,dz13);
450 /* Update vectorial force */
451 fix1 = _mm256_add_pd(fix1,tx);
452 fiy1 = _mm256_add_pd(fiy1,ty);
453 fiz1 = _mm256_add_pd(fiz1,tz);
455 fjx3 = _mm256_add_pd(fjx3,tx);
456 fjy3 = _mm256_add_pd(fjy3,ty);
457 fjz3 = _mm256_add_pd(fjz3,tz);
461 /**************************
462 * CALCULATE INTERACTIONS *
463 **************************/
465 if (gmx_mm256_any_lt(rsq21,rcutoff2))
468 /* REACTION-FIELD ELECTROSTATICS */
469 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
470 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
472 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
474 /* Update potential sum for this i atom from the interaction with this j atom. */
475 velec = _mm256_and_pd(velec,cutoff_mask);
476 velecsum = _mm256_add_pd(velecsum,velec);
480 fscal = _mm256_and_pd(fscal,cutoff_mask);
482 /* Calculate temporary vectorial force */
483 tx = _mm256_mul_pd(fscal,dx21);
484 ty = _mm256_mul_pd(fscal,dy21);
485 tz = _mm256_mul_pd(fscal,dz21);
487 /* Update vectorial force */
488 fix2 = _mm256_add_pd(fix2,tx);
489 fiy2 = _mm256_add_pd(fiy2,ty);
490 fiz2 = _mm256_add_pd(fiz2,tz);
492 fjx1 = _mm256_add_pd(fjx1,tx);
493 fjy1 = _mm256_add_pd(fjy1,ty);
494 fjz1 = _mm256_add_pd(fjz1,tz);
498 /**************************
499 * CALCULATE INTERACTIONS *
500 **************************/
502 if (gmx_mm256_any_lt(rsq22,rcutoff2))
505 /* REACTION-FIELD ELECTROSTATICS */
506 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
507 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
509 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
511 /* Update potential sum for this i atom from the interaction with this j atom. */
512 velec = _mm256_and_pd(velec,cutoff_mask);
513 velecsum = _mm256_add_pd(velecsum,velec);
517 fscal = _mm256_and_pd(fscal,cutoff_mask);
519 /* Calculate temporary vectorial force */
520 tx = _mm256_mul_pd(fscal,dx22);
521 ty = _mm256_mul_pd(fscal,dy22);
522 tz = _mm256_mul_pd(fscal,dz22);
524 /* Update vectorial force */
525 fix2 = _mm256_add_pd(fix2,tx);
526 fiy2 = _mm256_add_pd(fiy2,ty);
527 fiz2 = _mm256_add_pd(fiz2,tz);
529 fjx2 = _mm256_add_pd(fjx2,tx);
530 fjy2 = _mm256_add_pd(fjy2,ty);
531 fjz2 = _mm256_add_pd(fjz2,tz);
535 /**************************
536 * CALCULATE INTERACTIONS *
537 **************************/
539 if (gmx_mm256_any_lt(rsq23,rcutoff2))
542 /* REACTION-FIELD ELECTROSTATICS */
543 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
544 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
546 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
548 /* Update potential sum for this i atom from the interaction with this j atom. */
549 velec = _mm256_and_pd(velec,cutoff_mask);
550 velecsum = _mm256_add_pd(velecsum,velec);
554 fscal = _mm256_and_pd(fscal,cutoff_mask);
556 /* Calculate temporary vectorial force */
557 tx = _mm256_mul_pd(fscal,dx23);
558 ty = _mm256_mul_pd(fscal,dy23);
559 tz = _mm256_mul_pd(fscal,dz23);
561 /* Update vectorial force */
562 fix2 = _mm256_add_pd(fix2,tx);
563 fiy2 = _mm256_add_pd(fiy2,ty);
564 fiz2 = _mm256_add_pd(fiz2,tz);
566 fjx3 = _mm256_add_pd(fjx3,tx);
567 fjy3 = _mm256_add_pd(fjy3,ty);
568 fjz3 = _mm256_add_pd(fjz3,tz);
572 /**************************
573 * CALCULATE INTERACTIONS *
574 **************************/
576 if (gmx_mm256_any_lt(rsq31,rcutoff2))
579 /* REACTION-FIELD ELECTROSTATICS */
580 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
581 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
583 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
585 /* Update potential sum for this i atom from the interaction with this j atom. */
586 velec = _mm256_and_pd(velec,cutoff_mask);
587 velecsum = _mm256_add_pd(velecsum,velec);
591 fscal = _mm256_and_pd(fscal,cutoff_mask);
593 /* Calculate temporary vectorial force */
594 tx = _mm256_mul_pd(fscal,dx31);
595 ty = _mm256_mul_pd(fscal,dy31);
596 tz = _mm256_mul_pd(fscal,dz31);
598 /* Update vectorial force */
599 fix3 = _mm256_add_pd(fix3,tx);
600 fiy3 = _mm256_add_pd(fiy3,ty);
601 fiz3 = _mm256_add_pd(fiz3,tz);
603 fjx1 = _mm256_add_pd(fjx1,tx);
604 fjy1 = _mm256_add_pd(fjy1,ty);
605 fjz1 = _mm256_add_pd(fjz1,tz);
609 /**************************
610 * CALCULATE INTERACTIONS *
611 **************************/
613 if (gmx_mm256_any_lt(rsq32,rcutoff2))
616 /* REACTION-FIELD ELECTROSTATICS */
617 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
618 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
620 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
622 /* Update potential sum for this i atom from the interaction with this j atom. */
623 velec = _mm256_and_pd(velec,cutoff_mask);
624 velecsum = _mm256_add_pd(velecsum,velec);
628 fscal = _mm256_and_pd(fscal,cutoff_mask);
630 /* Calculate temporary vectorial force */
631 tx = _mm256_mul_pd(fscal,dx32);
632 ty = _mm256_mul_pd(fscal,dy32);
633 tz = _mm256_mul_pd(fscal,dz32);
635 /* Update vectorial force */
636 fix3 = _mm256_add_pd(fix3,tx);
637 fiy3 = _mm256_add_pd(fiy3,ty);
638 fiz3 = _mm256_add_pd(fiz3,tz);
640 fjx2 = _mm256_add_pd(fjx2,tx);
641 fjy2 = _mm256_add_pd(fjy2,ty);
642 fjz2 = _mm256_add_pd(fjz2,tz);
646 /**************************
647 * CALCULATE INTERACTIONS *
648 **************************/
650 if (gmx_mm256_any_lt(rsq33,rcutoff2))
653 /* REACTION-FIELD ELECTROSTATICS */
654 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
655 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
657 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
659 /* Update potential sum for this i atom from the interaction with this j atom. */
660 velec = _mm256_and_pd(velec,cutoff_mask);
661 velecsum = _mm256_add_pd(velecsum,velec);
665 fscal = _mm256_and_pd(fscal,cutoff_mask);
667 /* Calculate temporary vectorial force */
668 tx = _mm256_mul_pd(fscal,dx33);
669 ty = _mm256_mul_pd(fscal,dy33);
670 tz = _mm256_mul_pd(fscal,dz33);
672 /* Update vectorial force */
673 fix3 = _mm256_add_pd(fix3,tx);
674 fiy3 = _mm256_add_pd(fiy3,ty);
675 fiz3 = _mm256_add_pd(fiz3,tz);
677 fjx3 = _mm256_add_pd(fjx3,tx);
678 fjy3 = _mm256_add_pd(fjy3,ty);
679 fjz3 = _mm256_add_pd(fjz3,tz);
683 fjptrA = f+j_coord_offsetA;
684 fjptrB = f+j_coord_offsetB;
685 fjptrC = f+j_coord_offsetC;
686 fjptrD = f+j_coord_offsetD;
688 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
689 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
690 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
692 /* Inner loop uses 368 flops */
698 /* Get j neighbor index, and coordinate index */
699 jnrlistA = jjnr[jidx];
700 jnrlistB = jjnr[jidx+1];
701 jnrlistC = jjnr[jidx+2];
702 jnrlistD = jjnr[jidx+3];
703 /* Sign of each element will be negative for non-real atoms.
704 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
705 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
707 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
709 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
710 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
711 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
713 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
714 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
715 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
716 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
717 j_coord_offsetA = DIM*jnrA;
718 j_coord_offsetB = DIM*jnrB;
719 j_coord_offsetC = DIM*jnrC;
720 j_coord_offsetD = DIM*jnrD;
722 /* load j atom coordinates */
723 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
724 x+j_coord_offsetC,x+j_coord_offsetD,
725 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
726 &jy2,&jz2,&jx3,&jy3,&jz3);
728 /* Calculate displacement vector */
729 dx00 = _mm256_sub_pd(ix0,jx0);
730 dy00 = _mm256_sub_pd(iy0,jy0);
731 dz00 = _mm256_sub_pd(iz0,jz0);
732 dx11 = _mm256_sub_pd(ix1,jx1);
733 dy11 = _mm256_sub_pd(iy1,jy1);
734 dz11 = _mm256_sub_pd(iz1,jz1);
735 dx12 = _mm256_sub_pd(ix1,jx2);
736 dy12 = _mm256_sub_pd(iy1,jy2);
737 dz12 = _mm256_sub_pd(iz1,jz2);
738 dx13 = _mm256_sub_pd(ix1,jx3);
739 dy13 = _mm256_sub_pd(iy1,jy3);
740 dz13 = _mm256_sub_pd(iz1,jz3);
741 dx21 = _mm256_sub_pd(ix2,jx1);
742 dy21 = _mm256_sub_pd(iy2,jy1);
743 dz21 = _mm256_sub_pd(iz2,jz1);
744 dx22 = _mm256_sub_pd(ix2,jx2);
745 dy22 = _mm256_sub_pd(iy2,jy2);
746 dz22 = _mm256_sub_pd(iz2,jz2);
747 dx23 = _mm256_sub_pd(ix2,jx3);
748 dy23 = _mm256_sub_pd(iy2,jy3);
749 dz23 = _mm256_sub_pd(iz2,jz3);
750 dx31 = _mm256_sub_pd(ix3,jx1);
751 dy31 = _mm256_sub_pd(iy3,jy1);
752 dz31 = _mm256_sub_pd(iz3,jz1);
753 dx32 = _mm256_sub_pd(ix3,jx2);
754 dy32 = _mm256_sub_pd(iy3,jy2);
755 dz32 = _mm256_sub_pd(iz3,jz2);
756 dx33 = _mm256_sub_pd(ix3,jx3);
757 dy33 = _mm256_sub_pd(iy3,jy3);
758 dz33 = _mm256_sub_pd(iz3,jz3);
760 /* Calculate squared distance and things based on it */
761 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
762 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
763 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
764 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
765 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
766 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
767 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
768 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
769 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
770 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
772 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
773 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
774 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
775 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
776 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
777 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
778 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
779 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
780 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
782 rinvsq00 = gmx_mm256_inv_pd(rsq00);
783 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
784 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
785 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
786 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
787 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
788 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
789 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
790 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
791 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
793 fjx0 = _mm256_setzero_pd();
794 fjy0 = _mm256_setzero_pd();
795 fjz0 = _mm256_setzero_pd();
796 fjx1 = _mm256_setzero_pd();
797 fjy1 = _mm256_setzero_pd();
798 fjz1 = _mm256_setzero_pd();
799 fjx2 = _mm256_setzero_pd();
800 fjy2 = _mm256_setzero_pd();
801 fjz2 = _mm256_setzero_pd();
802 fjx3 = _mm256_setzero_pd();
803 fjy3 = _mm256_setzero_pd();
804 fjz3 = _mm256_setzero_pd();
806 /**************************
807 * CALCULATE INTERACTIONS *
808 **************************/
810 if (gmx_mm256_any_lt(rsq00,rcutoff2))
813 /* LENNARD-JONES DISPERSION/REPULSION */
815 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
816 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
817 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
818 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
819 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
820 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
822 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
824 /* Update potential sum for this i atom from the interaction with this j atom. */
825 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
826 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
827 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
831 fscal = _mm256_and_pd(fscal,cutoff_mask);
833 fscal = _mm256_andnot_pd(dummy_mask,fscal);
835 /* Calculate temporary vectorial force */
836 tx = _mm256_mul_pd(fscal,dx00);
837 ty = _mm256_mul_pd(fscal,dy00);
838 tz = _mm256_mul_pd(fscal,dz00);
840 /* Update vectorial force */
841 fix0 = _mm256_add_pd(fix0,tx);
842 fiy0 = _mm256_add_pd(fiy0,ty);
843 fiz0 = _mm256_add_pd(fiz0,tz);
845 fjx0 = _mm256_add_pd(fjx0,tx);
846 fjy0 = _mm256_add_pd(fjy0,ty);
847 fjz0 = _mm256_add_pd(fjz0,tz);
851 /**************************
852 * CALCULATE INTERACTIONS *
853 **************************/
855 if (gmx_mm256_any_lt(rsq11,rcutoff2))
858 /* REACTION-FIELD ELECTROSTATICS */
859 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
860 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
862 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
864 /* Update potential sum for this i atom from the interaction with this j atom. */
865 velec = _mm256_and_pd(velec,cutoff_mask);
866 velec = _mm256_andnot_pd(dummy_mask,velec);
867 velecsum = _mm256_add_pd(velecsum,velec);
871 fscal = _mm256_and_pd(fscal,cutoff_mask);
873 fscal = _mm256_andnot_pd(dummy_mask,fscal);
875 /* Calculate temporary vectorial force */
876 tx = _mm256_mul_pd(fscal,dx11);
877 ty = _mm256_mul_pd(fscal,dy11);
878 tz = _mm256_mul_pd(fscal,dz11);
880 /* Update vectorial force */
881 fix1 = _mm256_add_pd(fix1,tx);
882 fiy1 = _mm256_add_pd(fiy1,ty);
883 fiz1 = _mm256_add_pd(fiz1,tz);
885 fjx1 = _mm256_add_pd(fjx1,tx);
886 fjy1 = _mm256_add_pd(fjy1,ty);
887 fjz1 = _mm256_add_pd(fjz1,tz);
891 /**************************
892 * CALCULATE INTERACTIONS *
893 **************************/
895 if (gmx_mm256_any_lt(rsq12,rcutoff2))
898 /* REACTION-FIELD ELECTROSTATICS */
899 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
900 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
902 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
904 /* Update potential sum for this i atom from the interaction with this j atom. */
905 velec = _mm256_and_pd(velec,cutoff_mask);
906 velec = _mm256_andnot_pd(dummy_mask,velec);
907 velecsum = _mm256_add_pd(velecsum,velec);
911 fscal = _mm256_and_pd(fscal,cutoff_mask);
913 fscal = _mm256_andnot_pd(dummy_mask,fscal);
915 /* Calculate temporary vectorial force */
916 tx = _mm256_mul_pd(fscal,dx12);
917 ty = _mm256_mul_pd(fscal,dy12);
918 tz = _mm256_mul_pd(fscal,dz12);
920 /* Update vectorial force */
921 fix1 = _mm256_add_pd(fix1,tx);
922 fiy1 = _mm256_add_pd(fiy1,ty);
923 fiz1 = _mm256_add_pd(fiz1,tz);
925 fjx2 = _mm256_add_pd(fjx2,tx);
926 fjy2 = _mm256_add_pd(fjy2,ty);
927 fjz2 = _mm256_add_pd(fjz2,tz);
931 /**************************
932 * CALCULATE INTERACTIONS *
933 **************************/
935 if (gmx_mm256_any_lt(rsq13,rcutoff2))
938 /* REACTION-FIELD ELECTROSTATICS */
939 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
940 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
942 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
944 /* Update potential sum for this i atom from the interaction with this j atom. */
945 velec = _mm256_and_pd(velec,cutoff_mask);
946 velec = _mm256_andnot_pd(dummy_mask,velec);
947 velecsum = _mm256_add_pd(velecsum,velec);
951 fscal = _mm256_and_pd(fscal,cutoff_mask);
953 fscal = _mm256_andnot_pd(dummy_mask,fscal);
955 /* Calculate temporary vectorial force */
956 tx = _mm256_mul_pd(fscal,dx13);
957 ty = _mm256_mul_pd(fscal,dy13);
958 tz = _mm256_mul_pd(fscal,dz13);
960 /* Update vectorial force */
961 fix1 = _mm256_add_pd(fix1,tx);
962 fiy1 = _mm256_add_pd(fiy1,ty);
963 fiz1 = _mm256_add_pd(fiz1,tz);
965 fjx3 = _mm256_add_pd(fjx3,tx);
966 fjy3 = _mm256_add_pd(fjy3,ty);
967 fjz3 = _mm256_add_pd(fjz3,tz);
971 /**************************
972 * CALCULATE INTERACTIONS *
973 **************************/
975 if (gmx_mm256_any_lt(rsq21,rcutoff2))
978 /* REACTION-FIELD ELECTROSTATICS */
979 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
980 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
982 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
984 /* Update potential sum for this i atom from the interaction with this j atom. */
985 velec = _mm256_and_pd(velec,cutoff_mask);
986 velec = _mm256_andnot_pd(dummy_mask,velec);
987 velecsum = _mm256_add_pd(velecsum,velec);
991 fscal = _mm256_and_pd(fscal,cutoff_mask);
993 fscal = _mm256_andnot_pd(dummy_mask,fscal);
995 /* Calculate temporary vectorial force */
996 tx = _mm256_mul_pd(fscal,dx21);
997 ty = _mm256_mul_pd(fscal,dy21);
998 tz = _mm256_mul_pd(fscal,dz21);
1000 /* Update vectorial force */
1001 fix2 = _mm256_add_pd(fix2,tx);
1002 fiy2 = _mm256_add_pd(fiy2,ty);
1003 fiz2 = _mm256_add_pd(fiz2,tz);
1005 fjx1 = _mm256_add_pd(fjx1,tx);
1006 fjy1 = _mm256_add_pd(fjy1,ty);
1007 fjz1 = _mm256_add_pd(fjz1,tz);
1011 /**************************
1012 * CALCULATE INTERACTIONS *
1013 **************************/
1015 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1018 /* REACTION-FIELD ELECTROSTATICS */
1019 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1020 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1022 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1024 /* Update potential sum for this i atom from the interaction with this j atom. */
1025 velec = _mm256_and_pd(velec,cutoff_mask);
1026 velec = _mm256_andnot_pd(dummy_mask,velec);
1027 velecsum = _mm256_add_pd(velecsum,velec);
1031 fscal = _mm256_and_pd(fscal,cutoff_mask);
1033 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1035 /* Calculate temporary vectorial force */
1036 tx = _mm256_mul_pd(fscal,dx22);
1037 ty = _mm256_mul_pd(fscal,dy22);
1038 tz = _mm256_mul_pd(fscal,dz22);
1040 /* Update vectorial force */
1041 fix2 = _mm256_add_pd(fix2,tx);
1042 fiy2 = _mm256_add_pd(fiy2,ty);
1043 fiz2 = _mm256_add_pd(fiz2,tz);
1045 fjx2 = _mm256_add_pd(fjx2,tx);
1046 fjy2 = _mm256_add_pd(fjy2,ty);
1047 fjz2 = _mm256_add_pd(fjz2,tz);
1051 /**************************
1052 * CALCULATE INTERACTIONS *
1053 **************************/
1055 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1058 /* REACTION-FIELD ELECTROSTATICS */
1059 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
1060 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1062 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1064 /* Update potential sum for this i atom from the interaction with this j atom. */
1065 velec = _mm256_and_pd(velec,cutoff_mask);
1066 velec = _mm256_andnot_pd(dummy_mask,velec);
1067 velecsum = _mm256_add_pd(velecsum,velec);
1071 fscal = _mm256_and_pd(fscal,cutoff_mask);
1073 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1075 /* Calculate temporary vectorial force */
1076 tx = _mm256_mul_pd(fscal,dx23);
1077 ty = _mm256_mul_pd(fscal,dy23);
1078 tz = _mm256_mul_pd(fscal,dz23);
1080 /* Update vectorial force */
1081 fix2 = _mm256_add_pd(fix2,tx);
1082 fiy2 = _mm256_add_pd(fiy2,ty);
1083 fiz2 = _mm256_add_pd(fiz2,tz);
1085 fjx3 = _mm256_add_pd(fjx3,tx);
1086 fjy3 = _mm256_add_pd(fjy3,ty);
1087 fjz3 = _mm256_add_pd(fjz3,tz);
1091 /**************************
1092 * CALCULATE INTERACTIONS *
1093 **************************/
1095 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1098 /* REACTION-FIELD ELECTROSTATICS */
1099 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
1100 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1102 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1104 /* Update potential sum for this i atom from the interaction with this j atom. */
1105 velec = _mm256_and_pd(velec,cutoff_mask);
1106 velec = _mm256_andnot_pd(dummy_mask,velec);
1107 velecsum = _mm256_add_pd(velecsum,velec);
1111 fscal = _mm256_and_pd(fscal,cutoff_mask);
1113 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1115 /* Calculate temporary vectorial force */
1116 tx = _mm256_mul_pd(fscal,dx31);
1117 ty = _mm256_mul_pd(fscal,dy31);
1118 tz = _mm256_mul_pd(fscal,dz31);
1120 /* Update vectorial force */
1121 fix3 = _mm256_add_pd(fix3,tx);
1122 fiy3 = _mm256_add_pd(fiy3,ty);
1123 fiz3 = _mm256_add_pd(fiz3,tz);
1125 fjx1 = _mm256_add_pd(fjx1,tx);
1126 fjy1 = _mm256_add_pd(fjy1,ty);
1127 fjz1 = _mm256_add_pd(fjz1,tz);
1131 /**************************
1132 * CALCULATE INTERACTIONS *
1133 **************************/
1135 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1138 /* REACTION-FIELD ELECTROSTATICS */
1139 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
1140 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1142 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1144 /* Update potential sum for this i atom from the interaction with this j atom. */
1145 velec = _mm256_and_pd(velec,cutoff_mask);
1146 velec = _mm256_andnot_pd(dummy_mask,velec);
1147 velecsum = _mm256_add_pd(velecsum,velec);
1151 fscal = _mm256_and_pd(fscal,cutoff_mask);
1153 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1155 /* Calculate temporary vectorial force */
1156 tx = _mm256_mul_pd(fscal,dx32);
1157 ty = _mm256_mul_pd(fscal,dy32);
1158 tz = _mm256_mul_pd(fscal,dz32);
1160 /* Update vectorial force */
1161 fix3 = _mm256_add_pd(fix3,tx);
1162 fiy3 = _mm256_add_pd(fiy3,ty);
1163 fiz3 = _mm256_add_pd(fiz3,tz);
1165 fjx2 = _mm256_add_pd(fjx2,tx);
1166 fjy2 = _mm256_add_pd(fjy2,ty);
1167 fjz2 = _mm256_add_pd(fjz2,tz);
1171 /**************************
1172 * CALCULATE INTERACTIONS *
1173 **************************/
1175 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1178 /* REACTION-FIELD ELECTROSTATICS */
1179 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
1180 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1182 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1184 /* Update potential sum for this i atom from the interaction with this j atom. */
1185 velec = _mm256_and_pd(velec,cutoff_mask);
1186 velec = _mm256_andnot_pd(dummy_mask,velec);
1187 velecsum = _mm256_add_pd(velecsum,velec);
1191 fscal = _mm256_and_pd(fscal,cutoff_mask);
1193 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1195 /* Calculate temporary vectorial force */
1196 tx = _mm256_mul_pd(fscal,dx33);
1197 ty = _mm256_mul_pd(fscal,dy33);
1198 tz = _mm256_mul_pd(fscal,dz33);
1200 /* Update vectorial force */
1201 fix3 = _mm256_add_pd(fix3,tx);
1202 fiy3 = _mm256_add_pd(fiy3,ty);
1203 fiz3 = _mm256_add_pd(fiz3,tz);
1205 fjx3 = _mm256_add_pd(fjx3,tx);
1206 fjy3 = _mm256_add_pd(fjy3,ty);
1207 fjz3 = _mm256_add_pd(fjz3,tz);
1211 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1212 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1213 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1214 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1216 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1217 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1218 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1220 /* Inner loop uses 368 flops */
1223 /* End of innermost loop */
1225 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1226 f+i_coord_offset,fshift+i_shift_offset);
1229 /* Update potential energies */
1230 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1231 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1233 /* Increment number of inner iterations */
1234 inneriter += j_index_end - j_index_start;
1236 /* Outer loop uses 26 flops */
1239 /* Increment number of outer iterations */
1242 /* Update outer/inner flops */
1244 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*368);
1247 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double
1248 * Electrostatics interaction: ReactionField
1249 * VdW interaction: LennardJones
1250 * Geometry: Water4-Water4
1251 * Calculate force/pot: Force
1254 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double
1255 (t_nblist * gmx_restrict nlist,
1256 rvec * gmx_restrict xx,
1257 rvec * gmx_restrict ff,
1258 t_forcerec * gmx_restrict fr,
1259 t_mdatoms * gmx_restrict mdatoms,
1260 nb_kernel_data_t * gmx_restrict kernel_data,
1261 t_nrnb * gmx_restrict nrnb)
1263 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1264 * just 0 for non-waters.
1265 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1266 * jnr indices corresponding to data put in the four positions in the SIMD register.
1268 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1269 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1270 int jnrA,jnrB,jnrC,jnrD;
1271 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1272 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1273 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1274 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1275 real rcutoff_scalar;
1276 real *shiftvec,*fshift,*x,*f;
1277 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1278 real scratch[4*DIM];
1279 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1280 real * vdwioffsetptr0;
1281 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1282 real * vdwioffsetptr1;
1283 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1284 real * vdwioffsetptr2;
1285 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1286 real * vdwioffsetptr3;
1287 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1288 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1289 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1290 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1291 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1292 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1293 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1294 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1295 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1296 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1297 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1298 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1299 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1300 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1301 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1302 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1303 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1304 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1305 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1306 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1309 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1312 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1313 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1314 __m256d dummy_mask,cutoff_mask;
1315 __m128 tmpmask0,tmpmask1;
1316 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1317 __m256d one = _mm256_set1_pd(1.0);
1318 __m256d two = _mm256_set1_pd(2.0);
1324 jindex = nlist->jindex;
1326 shiftidx = nlist->shift;
1328 shiftvec = fr->shift_vec[0];
1329 fshift = fr->fshift[0];
1330 facel = _mm256_set1_pd(fr->epsfac);
1331 charge = mdatoms->chargeA;
1332 krf = _mm256_set1_pd(fr->ic->k_rf);
1333 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
1334 crf = _mm256_set1_pd(fr->ic->c_rf);
1335 nvdwtype = fr->ntype;
1336 vdwparam = fr->nbfp;
1337 vdwtype = mdatoms->typeA;
1339 /* Setup water-specific parameters */
1340 inr = nlist->iinr[0];
1341 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1342 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1343 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1344 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1346 jq1 = _mm256_set1_pd(charge[inr+1]);
1347 jq2 = _mm256_set1_pd(charge[inr+2]);
1348 jq3 = _mm256_set1_pd(charge[inr+3]);
1349 vdwjidx0A = 2*vdwtype[inr+0];
1350 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1351 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1352 qq11 = _mm256_mul_pd(iq1,jq1);
1353 qq12 = _mm256_mul_pd(iq1,jq2);
1354 qq13 = _mm256_mul_pd(iq1,jq3);
1355 qq21 = _mm256_mul_pd(iq2,jq1);
1356 qq22 = _mm256_mul_pd(iq2,jq2);
1357 qq23 = _mm256_mul_pd(iq2,jq3);
1358 qq31 = _mm256_mul_pd(iq3,jq1);
1359 qq32 = _mm256_mul_pd(iq3,jq2);
1360 qq33 = _mm256_mul_pd(iq3,jq3);
1362 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1363 rcutoff_scalar = fr->rcoulomb;
1364 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1365 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1367 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
1368 rvdw = _mm256_set1_pd(fr->rvdw);
1370 /* Avoid stupid compiler warnings */
1371 jnrA = jnrB = jnrC = jnrD = 0;
1372 j_coord_offsetA = 0;
1373 j_coord_offsetB = 0;
1374 j_coord_offsetC = 0;
1375 j_coord_offsetD = 0;
1380 for(iidx=0;iidx<4*DIM;iidx++)
1382 scratch[iidx] = 0.0;
1385 /* Start outer loop over neighborlists */
1386 for(iidx=0; iidx<nri; iidx++)
1388 /* Load shift vector for this list */
1389 i_shift_offset = DIM*shiftidx[iidx];
1391 /* Load limits for loop over neighbors */
1392 j_index_start = jindex[iidx];
1393 j_index_end = jindex[iidx+1];
1395 /* Get outer coordinate index */
1397 i_coord_offset = DIM*inr;
1399 /* Load i particle coords and add shift vector */
1400 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1401 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1403 fix0 = _mm256_setzero_pd();
1404 fiy0 = _mm256_setzero_pd();
1405 fiz0 = _mm256_setzero_pd();
1406 fix1 = _mm256_setzero_pd();
1407 fiy1 = _mm256_setzero_pd();
1408 fiz1 = _mm256_setzero_pd();
1409 fix2 = _mm256_setzero_pd();
1410 fiy2 = _mm256_setzero_pd();
1411 fiz2 = _mm256_setzero_pd();
1412 fix3 = _mm256_setzero_pd();
1413 fiy3 = _mm256_setzero_pd();
1414 fiz3 = _mm256_setzero_pd();
1416 /* Start inner kernel loop */
1417 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1420 /* Get j neighbor index, and coordinate index */
1422 jnrB = jjnr[jidx+1];
1423 jnrC = jjnr[jidx+2];
1424 jnrD = jjnr[jidx+3];
1425 j_coord_offsetA = DIM*jnrA;
1426 j_coord_offsetB = DIM*jnrB;
1427 j_coord_offsetC = DIM*jnrC;
1428 j_coord_offsetD = DIM*jnrD;
1430 /* load j atom coordinates */
1431 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1432 x+j_coord_offsetC,x+j_coord_offsetD,
1433 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1434 &jy2,&jz2,&jx3,&jy3,&jz3);
1436 /* Calculate displacement vector */
1437 dx00 = _mm256_sub_pd(ix0,jx0);
1438 dy00 = _mm256_sub_pd(iy0,jy0);
1439 dz00 = _mm256_sub_pd(iz0,jz0);
1440 dx11 = _mm256_sub_pd(ix1,jx1);
1441 dy11 = _mm256_sub_pd(iy1,jy1);
1442 dz11 = _mm256_sub_pd(iz1,jz1);
1443 dx12 = _mm256_sub_pd(ix1,jx2);
1444 dy12 = _mm256_sub_pd(iy1,jy2);
1445 dz12 = _mm256_sub_pd(iz1,jz2);
1446 dx13 = _mm256_sub_pd(ix1,jx3);
1447 dy13 = _mm256_sub_pd(iy1,jy3);
1448 dz13 = _mm256_sub_pd(iz1,jz3);
1449 dx21 = _mm256_sub_pd(ix2,jx1);
1450 dy21 = _mm256_sub_pd(iy2,jy1);
1451 dz21 = _mm256_sub_pd(iz2,jz1);
1452 dx22 = _mm256_sub_pd(ix2,jx2);
1453 dy22 = _mm256_sub_pd(iy2,jy2);
1454 dz22 = _mm256_sub_pd(iz2,jz2);
1455 dx23 = _mm256_sub_pd(ix2,jx3);
1456 dy23 = _mm256_sub_pd(iy2,jy3);
1457 dz23 = _mm256_sub_pd(iz2,jz3);
1458 dx31 = _mm256_sub_pd(ix3,jx1);
1459 dy31 = _mm256_sub_pd(iy3,jy1);
1460 dz31 = _mm256_sub_pd(iz3,jz1);
1461 dx32 = _mm256_sub_pd(ix3,jx2);
1462 dy32 = _mm256_sub_pd(iy3,jy2);
1463 dz32 = _mm256_sub_pd(iz3,jz2);
1464 dx33 = _mm256_sub_pd(ix3,jx3);
1465 dy33 = _mm256_sub_pd(iy3,jy3);
1466 dz33 = _mm256_sub_pd(iz3,jz3);
1468 /* Calculate squared distance and things based on it */
1469 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1470 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1471 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1472 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1473 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1474 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1475 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1476 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1477 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1478 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1480 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1481 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1482 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1483 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1484 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1485 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1486 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1487 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1488 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1490 rinvsq00 = gmx_mm256_inv_pd(rsq00);
1491 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1492 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1493 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1494 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1495 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1496 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1497 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1498 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1499 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1501 fjx0 = _mm256_setzero_pd();
1502 fjy0 = _mm256_setzero_pd();
1503 fjz0 = _mm256_setzero_pd();
1504 fjx1 = _mm256_setzero_pd();
1505 fjy1 = _mm256_setzero_pd();
1506 fjz1 = _mm256_setzero_pd();
1507 fjx2 = _mm256_setzero_pd();
1508 fjy2 = _mm256_setzero_pd();
1509 fjz2 = _mm256_setzero_pd();
1510 fjx3 = _mm256_setzero_pd();
1511 fjy3 = _mm256_setzero_pd();
1512 fjz3 = _mm256_setzero_pd();
1514 /**************************
1515 * CALCULATE INTERACTIONS *
1516 **************************/
1518 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1521 /* LENNARD-JONES DISPERSION/REPULSION */
1523 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1524 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1526 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1530 fscal = _mm256_and_pd(fscal,cutoff_mask);
1532 /* Calculate temporary vectorial force */
1533 tx = _mm256_mul_pd(fscal,dx00);
1534 ty = _mm256_mul_pd(fscal,dy00);
1535 tz = _mm256_mul_pd(fscal,dz00);
1537 /* Update vectorial force */
1538 fix0 = _mm256_add_pd(fix0,tx);
1539 fiy0 = _mm256_add_pd(fiy0,ty);
1540 fiz0 = _mm256_add_pd(fiz0,tz);
1542 fjx0 = _mm256_add_pd(fjx0,tx);
1543 fjy0 = _mm256_add_pd(fjy0,ty);
1544 fjz0 = _mm256_add_pd(fjz0,tz);
1548 /**************************
1549 * CALCULATE INTERACTIONS *
1550 **************************/
1552 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1555 /* REACTION-FIELD ELECTROSTATICS */
1556 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1558 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1562 fscal = _mm256_and_pd(fscal,cutoff_mask);
1564 /* Calculate temporary vectorial force */
1565 tx = _mm256_mul_pd(fscal,dx11);
1566 ty = _mm256_mul_pd(fscal,dy11);
1567 tz = _mm256_mul_pd(fscal,dz11);
1569 /* Update vectorial force */
1570 fix1 = _mm256_add_pd(fix1,tx);
1571 fiy1 = _mm256_add_pd(fiy1,ty);
1572 fiz1 = _mm256_add_pd(fiz1,tz);
1574 fjx1 = _mm256_add_pd(fjx1,tx);
1575 fjy1 = _mm256_add_pd(fjy1,ty);
1576 fjz1 = _mm256_add_pd(fjz1,tz);
1580 /**************************
1581 * CALCULATE INTERACTIONS *
1582 **************************/
1584 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1587 /* REACTION-FIELD ELECTROSTATICS */
1588 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1590 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1594 fscal = _mm256_and_pd(fscal,cutoff_mask);
1596 /* Calculate temporary vectorial force */
1597 tx = _mm256_mul_pd(fscal,dx12);
1598 ty = _mm256_mul_pd(fscal,dy12);
1599 tz = _mm256_mul_pd(fscal,dz12);
1601 /* Update vectorial force */
1602 fix1 = _mm256_add_pd(fix1,tx);
1603 fiy1 = _mm256_add_pd(fiy1,ty);
1604 fiz1 = _mm256_add_pd(fiz1,tz);
1606 fjx2 = _mm256_add_pd(fjx2,tx);
1607 fjy2 = _mm256_add_pd(fjy2,ty);
1608 fjz2 = _mm256_add_pd(fjz2,tz);
1612 /**************************
1613 * CALCULATE INTERACTIONS *
1614 **************************/
1616 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1619 /* REACTION-FIELD ELECTROSTATICS */
1620 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1622 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1626 fscal = _mm256_and_pd(fscal,cutoff_mask);
1628 /* Calculate temporary vectorial force */
1629 tx = _mm256_mul_pd(fscal,dx13);
1630 ty = _mm256_mul_pd(fscal,dy13);
1631 tz = _mm256_mul_pd(fscal,dz13);
1633 /* Update vectorial force */
1634 fix1 = _mm256_add_pd(fix1,tx);
1635 fiy1 = _mm256_add_pd(fiy1,ty);
1636 fiz1 = _mm256_add_pd(fiz1,tz);
1638 fjx3 = _mm256_add_pd(fjx3,tx);
1639 fjy3 = _mm256_add_pd(fjy3,ty);
1640 fjz3 = _mm256_add_pd(fjz3,tz);
1644 /**************************
1645 * CALCULATE INTERACTIONS *
1646 **************************/
1648 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1651 /* REACTION-FIELD ELECTROSTATICS */
1652 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1654 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1658 fscal = _mm256_and_pd(fscal,cutoff_mask);
1660 /* Calculate temporary vectorial force */
1661 tx = _mm256_mul_pd(fscal,dx21);
1662 ty = _mm256_mul_pd(fscal,dy21);
1663 tz = _mm256_mul_pd(fscal,dz21);
1665 /* Update vectorial force */
1666 fix2 = _mm256_add_pd(fix2,tx);
1667 fiy2 = _mm256_add_pd(fiy2,ty);
1668 fiz2 = _mm256_add_pd(fiz2,tz);
1670 fjx1 = _mm256_add_pd(fjx1,tx);
1671 fjy1 = _mm256_add_pd(fjy1,ty);
1672 fjz1 = _mm256_add_pd(fjz1,tz);
1676 /**************************
1677 * CALCULATE INTERACTIONS *
1678 **************************/
1680 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1683 /* REACTION-FIELD ELECTROSTATICS */
1684 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1686 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1690 fscal = _mm256_and_pd(fscal,cutoff_mask);
1692 /* Calculate temporary vectorial force */
1693 tx = _mm256_mul_pd(fscal,dx22);
1694 ty = _mm256_mul_pd(fscal,dy22);
1695 tz = _mm256_mul_pd(fscal,dz22);
1697 /* Update vectorial force */
1698 fix2 = _mm256_add_pd(fix2,tx);
1699 fiy2 = _mm256_add_pd(fiy2,ty);
1700 fiz2 = _mm256_add_pd(fiz2,tz);
1702 fjx2 = _mm256_add_pd(fjx2,tx);
1703 fjy2 = _mm256_add_pd(fjy2,ty);
1704 fjz2 = _mm256_add_pd(fjz2,tz);
1708 /**************************
1709 * CALCULATE INTERACTIONS *
1710 **************************/
1712 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1715 /* REACTION-FIELD ELECTROSTATICS */
1716 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1718 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1722 fscal = _mm256_and_pd(fscal,cutoff_mask);
1724 /* Calculate temporary vectorial force */
1725 tx = _mm256_mul_pd(fscal,dx23);
1726 ty = _mm256_mul_pd(fscal,dy23);
1727 tz = _mm256_mul_pd(fscal,dz23);
1729 /* Update vectorial force */
1730 fix2 = _mm256_add_pd(fix2,tx);
1731 fiy2 = _mm256_add_pd(fiy2,ty);
1732 fiz2 = _mm256_add_pd(fiz2,tz);
1734 fjx3 = _mm256_add_pd(fjx3,tx);
1735 fjy3 = _mm256_add_pd(fjy3,ty);
1736 fjz3 = _mm256_add_pd(fjz3,tz);
1740 /**************************
1741 * CALCULATE INTERACTIONS *
1742 **************************/
1744 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1747 /* REACTION-FIELD ELECTROSTATICS */
1748 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1750 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1754 fscal = _mm256_and_pd(fscal,cutoff_mask);
1756 /* Calculate temporary vectorial force */
1757 tx = _mm256_mul_pd(fscal,dx31);
1758 ty = _mm256_mul_pd(fscal,dy31);
1759 tz = _mm256_mul_pd(fscal,dz31);
1761 /* Update vectorial force */
1762 fix3 = _mm256_add_pd(fix3,tx);
1763 fiy3 = _mm256_add_pd(fiy3,ty);
1764 fiz3 = _mm256_add_pd(fiz3,tz);
1766 fjx1 = _mm256_add_pd(fjx1,tx);
1767 fjy1 = _mm256_add_pd(fjy1,ty);
1768 fjz1 = _mm256_add_pd(fjz1,tz);
1772 /**************************
1773 * CALCULATE INTERACTIONS *
1774 **************************/
1776 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1779 /* REACTION-FIELD ELECTROSTATICS */
1780 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1782 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1786 fscal = _mm256_and_pd(fscal,cutoff_mask);
1788 /* Calculate temporary vectorial force */
1789 tx = _mm256_mul_pd(fscal,dx32);
1790 ty = _mm256_mul_pd(fscal,dy32);
1791 tz = _mm256_mul_pd(fscal,dz32);
1793 /* Update vectorial force */
1794 fix3 = _mm256_add_pd(fix3,tx);
1795 fiy3 = _mm256_add_pd(fiy3,ty);
1796 fiz3 = _mm256_add_pd(fiz3,tz);
1798 fjx2 = _mm256_add_pd(fjx2,tx);
1799 fjy2 = _mm256_add_pd(fjy2,ty);
1800 fjz2 = _mm256_add_pd(fjz2,tz);
1804 /**************************
1805 * CALCULATE INTERACTIONS *
1806 **************************/
1808 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1811 /* REACTION-FIELD ELECTROSTATICS */
1812 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1814 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1818 fscal = _mm256_and_pd(fscal,cutoff_mask);
1820 /* Calculate temporary vectorial force */
1821 tx = _mm256_mul_pd(fscal,dx33);
1822 ty = _mm256_mul_pd(fscal,dy33);
1823 tz = _mm256_mul_pd(fscal,dz33);
1825 /* Update vectorial force */
1826 fix3 = _mm256_add_pd(fix3,tx);
1827 fiy3 = _mm256_add_pd(fiy3,ty);
1828 fiz3 = _mm256_add_pd(fiz3,tz);
1830 fjx3 = _mm256_add_pd(fjx3,tx);
1831 fjy3 = _mm256_add_pd(fjy3,ty);
1832 fjz3 = _mm256_add_pd(fjz3,tz);
1836 fjptrA = f+j_coord_offsetA;
1837 fjptrB = f+j_coord_offsetB;
1838 fjptrC = f+j_coord_offsetC;
1839 fjptrD = f+j_coord_offsetD;
1841 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1842 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1843 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1845 /* Inner loop uses 303 flops */
1848 if(jidx<j_index_end)
1851 /* Get j neighbor index, and coordinate index */
1852 jnrlistA = jjnr[jidx];
1853 jnrlistB = jjnr[jidx+1];
1854 jnrlistC = jjnr[jidx+2];
1855 jnrlistD = jjnr[jidx+3];
1856 /* Sign of each element will be negative for non-real atoms.
1857 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1858 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1860 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1862 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1863 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1864 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1866 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1867 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1868 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1869 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1870 j_coord_offsetA = DIM*jnrA;
1871 j_coord_offsetB = DIM*jnrB;
1872 j_coord_offsetC = DIM*jnrC;
1873 j_coord_offsetD = DIM*jnrD;
1875 /* load j atom coordinates */
1876 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1877 x+j_coord_offsetC,x+j_coord_offsetD,
1878 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1879 &jy2,&jz2,&jx3,&jy3,&jz3);
1881 /* Calculate displacement vector */
1882 dx00 = _mm256_sub_pd(ix0,jx0);
1883 dy00 = _mm256_sub_pd(iy0,jy0);
1884 dz00 = _mm256_sub_pd(iz0,jz0);
1885 dx11 = _mm256_sub_pd(ix1,jx1);
1886 dy11 = _mm256_sub_pd(iy1,jy1);
1887 dz11 = _mm256_sub_pd(iz1,jz1);
1888 dx12 = _mm256_sub_pd(ix1,jx2);
1889 dy12 = _mm256_sub_pd(iy1,jy2);
1890 dz12 = _mm256_sub_pd(iz1,jz2);
1891 dx13 = _mm256_sub_pd(ix1,jx3);
1892 dy13 = _mm256_sub_pd(iy1,jy3);
1893 dz13 = _mm256_sub_pd(iz1,jz3);
1894 dx21 = _mm256_sub_pd(ix2,jx1);
1895 dy21 = _mm256_sub_pd(iy2,jy1);
1896 dz21 = _mm256_sub_pd(iz2,jz1);
1897 dx22 = _mm256_sub_pd(ix2,jx2);
1898 dy22 = _mm256_sub_pd(iy2,jy2);
1899 dz22 = _mm256_sub_pd(iz2,jz2);
1900 dx23 = _mm256_sub_pd(ix2,jx3);
1901 dy23 = _mm256_sub_pd(iy2,jy3);
1902 dz23 = _mm256_sub_pd(iz2,jz3);
1903 dx31 = _mm256_sub_pd(ix3,jx1);
1904 dy31 = _mm256_sub_pd(iy3,jy1);
1905 dz31 = _mm256_sub_pd(iz3,jz1);
1906 dx32 = _mm256_sub_pd(ix3,jx2);
1907 dy32 = _mm256_sub_pd(iy3,jy2);
1908 dz32 = _mm256_sub_pd(iz3,jz2);
1909 dx33 = _mm256_sub_pd(ix3,jx3);
1910 dy33 = _mm256_sub_pd(iy3,jy3);
1911 dz33 = _mm256_sub_pd(iz3,jz3);
1913 /* Calculate squared distance and things based on it */
1914 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1915 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1916 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1917 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1918 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1919 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1920 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1921 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1922 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1923 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1925 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1926 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1927 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1928 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1929 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1930 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1931 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1932 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1933 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1935 rinvsq00 = gmx_mm256_inv_pd(rsq00);
1936 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1937 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1938 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1939 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1940 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1941 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1942 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1943 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1944 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1946 fjx0 = _mm256_setzero_pd();
1947 fjy0 = _mm256_setzero_pd();
1948 fjz0 = _mm256_setzero_pd();
1949 fjx1 = _mm256_setzero_pd();
1950 fjy1 = _mm256_setzero_pd();
1951 fjz1 = _mm256_setzero_pd();
1952 fjx2 = _mm256_setzero_pd();
1953 fjy2 = _mm256_setzero_pd();
1954 fjz2 = _mm256_setzero_pd();
1955 fjx3 = _mm256_setzero_pd();
1956 fjy3 = _mm256_setzero_pd();
1957 fjz3 = _mm256_setzero_pd();
1959 /**************************
1960 * CALCULATE INTERACTIONS *
1961 **************************/
1963 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1966 /* LENNARD-JONES DISPERSION/REPULSION */
1968 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1969 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1971 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1975 fscal = _mm256_and_pd(fscal,cutoff_mask);
1977 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1979 /* Calculate temporary vectorial force */
1980 tx = _mm256_mul_pd(fscal,dx00);
1981 ty = _mm256_mul_pd(fscal,dy00);
1982 tz = _mm256_mul_pd(fscal,dz00);
1984 /* Update vectorial force */
1985 fix0 = _mm256_add_pd(fix0,tx);
1986 fiy0 = _mm256_add_pd(fiy0,ty);
1987 fiz0 = _mm256_add_pd(fiz0,tz);
1989 fjx0 = _mm256_add_pd(fjx0,tx);
1990 fjy0 = _mm256_add_pd(fjy0,ty);
1991 fjz0 = _mm256_add_pd(fjz0,tz);
1995 /**************************
1996 * CALCULATE INTERACTIONS *
1997 **************************/
1999 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2002 /* REACTION-FIELD ELECTROSTATICS */
2003 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2005 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2009 fscal = _mm256_and_pd(fscal,cutoff_mask);
2011 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2013 /* Calculate temporary vectorial force */
2014 tx = _mm256_mul_pd(fscal,dx11);
2015 ty = _mm256_mul_pd(fscal,dy11);
2016 tz = _mm256_mul_pd(fscal,dz11);
2018 /* Update vectorial force */
2019 fix1 = _mm256_add_pd(fix1,tx);
2020 fiy1 = _mm256_add_pd(fiy1,ty);
2021 fiz1 = _mm256_add_pd(fiz1,tz);
2023 fjx1 = _mm256_add_pd(fjx1,tx);
2024 fjy1 = _mm256_add_pd(fjy1,ty);
2025 fjz1 = _mm256_add_pd(fjz1,tz);
2029 /**************************
2030 * CALCULATE INTERACTIONS *
2031 **************************/
2033 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2036 /* REACTION-FIELD ELECTROSTATICS */
2037 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2039 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2043 fscal = _mm256_and_pd(fscal,cutoff_mask);
2045 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2047 /* Calculate temporary vectorial force */
2048 tx = _mm256_mul_pd(fscal,dx12);
2049 ty = _mm256_mul_pd(fscal,dy12);
2050 tz = _mm256_mul_pd(fscal,dz12);
2052 /* Update vectorial force */
2053 fix1 = _mm256_add_pd(fix1,tx);
2054 fiy1 = _mm256_add_pd(fiy1,ty);
2055 fiz1 = _mm256_add_pd(fiz1,tz);
2057 fjx2 = _mm256_add_pd(fjx2,tx);
2058 fjy2 = _mm256_add_pd(fjy2,ty);
2059 fjz2 = _mm256_add_pd(fjz2,tz);
2063 /**************************
2064 * CALCULATE INTERACTIONS *
2065 **************************/
2067 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2070 /* REACTION-FIELD ELECTROSTATICS */
2071 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
2073 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2077 fscal = _mm256_and_pd(fscal,cutoff_mask);
2079 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2081 /* Calculate temporary vectorial force */
2082 tx = _mm256_mul_pd(fscal,dx13);
2083 ty = _mm256_mul_pd(fscal,dy13);
2084 tz = _mm256_mul_pd(fscal,dz13);
2086 /* Update vectorial force */
2087 fix1 = _mm256_add_pd(fix1,tx);
2088 fiy1 = _mm256_add_pd(fiy1,ty);
2089 fiz1 = _mm256_add_pd(fiz1,tz);
2091 fjx3 = _mm256_add_pd(fjx3,tx);
2092 fjy3 = _mm256_add_pd(fjy3,ty);
2093 fjz3 = _mm256_add_pd(fjz3,tz);
2097 /**************************
2098 * CALCULATE INTERACTIONS *
2099 **************************/
2101 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2104 /* REACTION-FIELD ELECTROSTATICS */
2105 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2107 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2111 fscal = _mm256_and_pd(fscal,cutoff_mask);
2113 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2115 /* Calculate temporary vectorial force */
2116 tx = _mm256_mul_pd(fscal,dx21);
2117 ty = _mm256_mul_pd(fscal,dy21);
2118 tz = _mm256_mul_pd(fscal,dz21);
2120 /* Update vectorial force */
2121 fix2 = _mm256_add_pd(fix2,tx);
2122 fiy2 = _mm256_add_pd(fiy2,ty);
2123 fiz2 = _mm256_add_pd(fiz2,tz);
2125 fjx1 = _mm256_add_pd(fjx1,tx);
2126 fjy1 = _mm256_add_pd(fjy1,ty);
2127 fjz1 = _mm256_add_pd(fjz1,tz);
2131 /**************************
2132 * CALCULATE INTERACTIONS *
2133 **************************/
2135 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2138 /* REACTION-FIELD ELECTROSTATICS */
2139 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2141 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2145 fscal = _mm256_and_pd(fscal,cutoff_mask);
2147 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2149 /* Calculate temporary vectorial force */
2150 tx = _mm256_mul_pd(fscal,dx22);
2151 ty = _mm256_mul_pd(fscal,dy22);
2152 tz = _mm256_mul_pd(fscal,dz22);
2154 /* Update vectorial force */
2155 fix2 = _mm256_add_pd(fix2,tx);
2156 fiy2 = _mm256_add_pd(fiy2,ty);
2157 fiz2 = _mm256_add_pd(fiz2,tz);
2159 fjx2 = _mm256_add_pd(fjx2,tx);
2160 fjy2 = _mm256_add_pd(fjy2,ty);
2161 fjz2 = _mm256_add_pd(fjz2,tz);
2165 /**************************
2166 * CALCULATE INTERACTIONS *
2167 **************************/
2169 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2172 /* REACTION-FIELD ELECTROSTATICS */
2173 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
2175 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2179 fscal = _mm256_and_pd(fscal,cutoff_mask);
2181 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2183 /* Calculate temporary vectorial force */
2184 tx = _mm256_mul_pd(fscal,dx23);
2185 ty = _mm256_mul_pd(fscal,dy23);
2186 tz = _mm256_mul_pd(fscal,dz23);
2188 /* Update vectorial force */
2189 fix2 = _mm256_add_pd(fix2,tx);
2190 fiy2 = _mm256_add_pd(fiy2,ty);
2191 fiz2 = _mm256_add_pd(fiz2,tz);
2193 fjx3 = _mm256_add_pd(fjx3,tx);
2194 fjy3 = _mm256_add_pd(fjy3,ty);
2195 fjz3 = _mm256_add_pd(fjz3,tz);
2199 /**************************
2200 * CALCULATE INTERACTIONS *
2201 **************************/
2203 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2206 /* REACTION-FIELD ELECTROSTATICS */
2207 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
2209 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2213 fscal = _mm256_and_pd(fscal,cutoff_mask);
2215 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2217 /* Calculate temporary vectorial force */
2218 tx = _mm256_mul_pd(fscal,dx31);
2219 ty = _mm256_mul_pd(fscal,dy31);
2220 tz = _mm256_mul_pd(fscal,dz31);
2222 /* Update vectorial force */
2223 fix3 = _mm256_add_pd(fix3,tx);
2224 fiy3 = _mm256_add_pd(fiy3,ty);
2225 fiz3 = _mm256_add_pd(fiz3,tz);
2227 fjx1 = _mm256_add_pd(fjx1,tx);
2228 fjy1 = _mm256_add_pd(fjy1,ty);
2229 fjz1 = _mm256_add_pd(fjz1,tz);
2233 /**************************
2234 * CALCULATE INTERACTIONS *
2235 **************************/
2237 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2240 /* REACTION-FIELD ELECTROSTATICS */
2241 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
2243 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2247 fscal = _mm256_and_pd(fscal,cutoff_mask);
2249 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2251 /* Calculate temporary vectorial force */
2252 tx = _mm256_mul_pd(fscal,dx32);
2253 ty = _mm256_mul_pd(fscal,dy32);
2254 tz = _mm256_mul_pd(fscal,dz32);
2256 /* Update vectorial force */
2257 fix3 = _mm256_add_pd(fix3,tx);
2258 fiy3 = _mm256_add_pd(fiy3,ty);
2259 fiz3 = _mm256_add_pd(fiz3,tz);
2261 fjx2 = _mm256_add_pd(fjx2,tx);
2262 fjy2 = _mm256_add_pd(fjy2,ty);
2263 fjz2 = _mm256_add_pd(fjz2,tz);
2267 /**************************
2268 * CALCULATE INTERACTIONS *
2269 **************************/
2271 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2274 /* REACTION-FIELD ELECTROSTATICS */
2275 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
2277 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2281 fscal = _mm256_and_pd(fscal,cutoff_mask);
2283 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2285 /* Calculate temporary vectorial force */
2286 tx = _mm256_mul_pd(fscal,dx33);
2287 ty = _mm256_mul_pd(fscal,dy33);
2288 tz = _mm256_mul_pd(fscal,dz33);
2290 /* Update vectorial force */
2291 fix3 = _mm256_add_pd(fix3,tx);
2292 fiy3 = _mm256_add_pd(fiy3,ty);
2293 fiz3 = _mm256_add_pd(fiz3,tz);
2295 fjx3 = _mm256_add_pd(fjx3,tx);
2296 fjy3 = _mm256_add_pd(fjy3,ty);
2297 fjz3 = _mm256_add_pd(fjz3,tz);
2301 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2302 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2303 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2304 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2306 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2307 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2308 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2310 /* Inner loop uses 303 flops */
2313 /* End of innermost loop */
2315 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2316 f+i_coord_offset,fshift+i_shift_offset);
2318 /* Increment number of inner iterations */
2319 inneriter += j_index_end - j_index_start;
2321 /* Outer loop uses 24 flops */
2324 /* Increment number of outer iterations */
2327 /* Update outer/inner flops */
2329 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);