2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
81 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
83 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
85 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
86 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
87 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
88 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
89 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
90 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
91 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
92 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
93 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
94 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
95 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
96 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
103 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
104 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
105 real rswitch_scalar,d_scalar;
106 __m256d dummy_mask,cutoff_mask;
107 __m128 tmpmask0,tmpmask1;
108 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
109 __m256d one = _mm256_set1_pd(1.0);
110 __m256d two = _mm256_set1_pd(2.0);
116 jindex = nlist->jindex;
118 shiftidx = nlist->shift;
120 shiftvec = fr->shift_vec[0];
121 fshift = fr->fshift[0];
122 facel = _mm256_set1_pd(fr->epsfac);
123 charge = mdatoms->chargeA;
124 krf = _mm256_set1_pd(fr->ic->k_rf);
125 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
126 crf = _mm256_set1_pd(fr->ic->c_rf);
127 nvdwtype = fr->ntype;
129 vdwtype = mdatoms->typeA;
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
134 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
135 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
136 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
138 jq1 = _mm256_set1_pd(charge[inr+1]);
139 jq2 = _mm256_set1_pd(charge[inr+2]);
140 jq3 = _mm256_set1_pd(charge[inr+3]);
141 vdwjidx0A = 2*vdwtype[inr+0];
142 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
143 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
144 qq11 = _mm256_mul_pd(iq1,jq1);
145 qq12 = _mm256_mul_pd(iq1,jq2);
146 qq13 = _mm256_mul_pd(iq1,jq3);
147 qq21 = _mm256_mul_pd(iq2,jq1);
148 qq22 = _mm256_mul_pd(iq2,jq2);
149 qq23 = _mm256_mul_pd(iq2,jq3);
150 qq31 = _mm256_mul_pd(iq3,jq1);
151 qq32 = _mm256_mul_pd(iq3,jq2);
152 qq33 = _mm256_mul_pd(iq3,jq3);
154 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155 rcutoff_scalar = fr->rcoulomb;
156 rcutoff = _mm256_set1_pd(rcutoff_scalar);
157 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
159 rswitch_scalar = fr->rvdw_switch;
160 rswitch = _mm256_set1_pd(rswitch_scalar);
161 /* Setup switch parameters */
162 d_scalar = rcutoff_scalar-rswitch_scalar;
163 d = _mm256_set1_pd(d_scalar);
164 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
165 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
166 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
167 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
168 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
169 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
171 /* Avoid stupid compiler warnings */
172 jnrA = jnrB = jnrC = jnrD = 0;
181 for(iidx=0;iidx<4*DIM;iidx++)
186 /* Start outer loop over neighborlists */
187 for(iidx=0; iidx<nri; iidx++)
189 /* Load shift vector for this list */
190 i_shift_offset = DIM*shiftidx[iidx];
192 /* Load limits for loop over neighbors */
193 j_index_start = jindex[iidx];
194 j_index_end = jindex[iidx+1];
196 /* Get outer coordinate index */
198 i_coord_offset = DIM*inr;
200 /* Load i particle coords and add shift vector */
201 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
202 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
204 fix0 = _mm256_setzero_pd();
205 fiy0 = _mm256_setzero_pd();
206 fiz0 = _mm256_setzero_pd();
207 fix1 = _mm256_setzero_pd();
208 fiy1 = _mm256_setzero_pd();
209 fiz1 = _mm256_setzero_pd();
210 fix2 = _mm256_setzero_pd();
211 fiy2 = _mm256_setzero_pd();
212 fiz2 = _mm256_setzero_pd();
213 fix3 = _mm256_setzero_pd();
214 fiy3 = _mm256_setzero_pd();
215 fiz3 = _mm256_setzero_pd();
217 /* Reset potential sums */
218 velecsum = _mm256_setzero_pd();
219 vvdwsum = _mm256_setzero_pd();
221 /* Start inner kernel loop */
222 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
225 /* Get j neighbor index, and coordinate index */
230 j_coord_offsetA = DIM*jnrA;
231 j_coord_offsetB = DIM*jnrB;
232 j_coord_offsetC = DIM*jnrC;
233 j_coord_offsetD = DIM*jnrD;
235 /* load j atom coordinates */
236 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
237 x+j_coord_offsetC,x+j_coord_offsetD,
238 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
239 &jy2,&jz2,&jx3,&jy3,&jz3);
241 /* Calculate displacement vector */
242 dx00 = _mm256_sub_pd(ix0,jx0);
243 dy00 = _mm256_sub_pd(iy0,jy0);
244 dz00 = _mm256_sub_pd(iz0,jz0);
245 dx11 = _mm256_sub_pd(ix1,jx1);
246 dy11 = _mm256_sub_pd(iy1,jy1);
247 dz11 = _mm256_sub_pd(iz1,jz1);
248 dx12 = _mm256_sub_pd(ix1,jx2);
249 dy12 = _mm256_sub_pd(iy1,jy2);
250 dz12 = _mm256_sub_pd(iz1,jz2);
251 dx13 = _mm256_sub_pd(ix1,jx3);
252 dy13 = _mm256_sub_pd(iy1,jy3);
253 dz13 = _mm256_sub_pd(iz1,jz3);
254 dx21 = _mm256_sub_pd(ix2,jx1);
255 dy21 = _mm256_sub_pd(iy2,jy1);
256 dz21 = _mm256_sub_pd(iz2,jz1);
257 dx22 = _mm256_sub_pd(ix2,jx2);
258 dy22 = _mm256_sub_pd(iy2,jy2);
259 dz22 = _mm256_sub_pd(iz2,jz2);
260 dx23 = _mm256_sub_pd(ix2,jx3);
261 dy23 = _mm256_sub_pd(iy2,jy3);
262 dz23 = _mm256_sub_pd(iz2,jz3);
263 dx31 = _mm256_sub_pd(ix3,jx1);
264 dy31 = _mm256_sub_pd(iy3,jy1);
265 dz31 = _mm256_sub_pd(iz3,jz1);
266 dx32 = _mm256_sub_pd(ix3,jx2);
267 dy32 = _mm256_sub_pd(iy3,jy2);
268 dz32 = _mm256_sub_pd(iz3,jz2);
269 dx33 = _mm256_sub_pd(ix3,jx3);
270 dy33 = _mm256_sub_pd(iy3,jy3);
271 dz33 = _mm256_sub_pd(iz3,jz3);
273 /* Calculate squared distance and things based on it */
274 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
275 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
276 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
277 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
278 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
279 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
280 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
281 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
282 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
283 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
285 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
286 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
287 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
288 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
289 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
290 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
291 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
292 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
293 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
294 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
296 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
297 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
298 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
299 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
300 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
301 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
302 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
303 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
304 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
305 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
307 fjx0 = _mm256_setzero_pd();
308 fjy0 = _mm256_setzero_pd();
309 fjz0 = _mm256_setzero_pd();
310 fjx1 = _mm256_setzero_pd();
311 fjy1 = _mm256_setzero_pd();
312 fjz1 = _mm256_setzero_pd();
313 fjx2 = _mm256_setzero_pd();
314 fjy2 = _mm256_setzero_pd();
315 fjz2 = _mm256_setzero_pd();
316 fjx3 = _mm256_setzero_pd();
317 fjy3 = _mm256_setzero_pd();
318 fjz3 = _mm256_setzero_pd();
320 /**************************
321 * CALCULATE INTERACTIONS *
322 **************************/
324 if (gmx_mm256_any_lt(rsq00,rcutoff2))
327 r00 = _mm256_mul_pd(rsq00,rinv00);
329 /* LENNARD-JONES DISPERSION/REPULSION */
331 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
332 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
333 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
334 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
335 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
337 d = _mm256_sub_pd(r00,rswitch);
338 d = _mm256_max_pd(d,_mm256_setzero_pd());
339 d2 = _mm256_mul_pd(d,d);
340 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
342 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
344 /* Evaluate switch function */
345 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
346 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
347 vvdw = _mm256_mul_pd(vvdw,sw);
348 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
350 /* Update potential sum for this i atom from the interaction with this j atom. */
351 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
352 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
356 fscal = _mm256_and_pd(fscal,cutoff_mask);
358 /* Calculate temporary vectorial force */
359 tx = _mm256_mul_pd(fscal,dx00);
360 ty = _mm256_mul_pd(fscal,dy00);
361 tz = _mm256_mul_pd(fscal,dz00);
363 /* Update vectorial force */
364 fix0 = _mm256_add_pd(fix0,tx);
365 fiy0 = _mm256_add_pd(fiy0,ty);
366 fiz0 = _mm256_add_pd(fiz0,tz);
368 fjx0 = _mm256_add_pd(fjx0,tx);
369 fjy0 = _mm256_add_pd(fjy0,ty);
370 fjz0 = _mm256_add_pd(fjz0,tz);
374 /**************************
375 * CALCULATE INTERACTIONS *
376 **************************/
378 if (gmx_mm256_any_lt(rsq11,rcutoff2))
381 /* REACTION-FIELD ELECTROSTATICS */
382 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
383 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
385 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
387 /* Update potential sum for this i atom from the interaction with this j atom. */
388 velec = _mm256_and_pd(velec,cutoff_mask);
389 velecsum = _mm256_add_pd(velecsum,velec);
393 fscal = _mm256_and_pd(fscal,cutoff_mask);
395 /* Calculate temporary vectorial force */
396 tx = _mm256_mul_pd(fscal,dx11);
397 ty = _mm256_mul_pd(fscal,dy11);
398 tz = _mm256_mul_pd(fscal,dz11);
400 /* Update vectorial force */
401 fix1 = _mm256_add_pd(fix1,tx);
402 fiy1 = _mm256_add_pd(fiy1,ty);
403 fiz1 = _mm256_add_pd(fiz1,tz);
405 fjx1 = _mm256_add_pd(fjx1,tx);
406 fjy1 = _mm256_add_pd(fjy1,ty);
407 fjz1 = _mm256_add_pd(fjz1,tz);
411 /**************************
412 * CALCULATE INTERACTIONS *
413 **************************/
415 if (gmx_mm256_any_lt(rsq12,rcutoff2))
418 /* REACTION-FIELD ELECTROSTATICS */
419 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
420 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
422 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
424 /* Update potential sum for this i atom from the interaction with this j atom. */
425 velec = _mm256_and_pd(velec,cutoff_mask);
426 velecsum = _mm256_add_pd(velecsum,velec);
430 fscal = _mm256_and_pd(fscal,cutoff_mask);
432 /* Calculate temporary vectorial force */
433 tx = _mm256_mul_pd(fscal,dx12);
434 ty = _mm256_mul_pd(fscal,dy12);
435 tz = _mm256_mul_pd(fscal,dz12);
437 /* Update vectorial force */
438 fix1 = _mm256_add_pd(fix1,tx);
439 fiy1 = _mm256_add_pd(fiy1,ty);
440 fiz1 = _mm256_add_pd(fiz1,tz);
442 fjx2 = _mm256_add_pd(fjx2,tx);
443 fjy2 = _mm256_add_pd(fjy2,ty);
444 fjz2 = _mm256_add_pd(fjz2,tz);
448 /**************************
449 * CALCULATE INTERACTIONS *
450 **************************/
452 if (gmx_mm256_any_lt(rsq13,rcutoff2))
455 /* REACTION-FIELD ELECTROSTATICS */
456 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
457 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
459 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
461 /* Update potential sum for this i atom from the interaction with this j atom. */
462 velec = _mm256_and_pd(velec,cutoff_mask);
463 velecsum = _mm256_add_pd(velecsum,velec);
467 fscal = _mm256_and_pd(fscal,cutoff_mask);
469 /* Calculate temporary vectorial force */
470 tx = _mm256_mul_pd(fscal,dx13);
471 ty = _mm256_mul_pd(fscal,dy13);
472 tz = _mm256_mul_pd(fscal,dz13);
474 /* Update vectorial force */
475 fix1 = _mm256_add_pd(fix1,tx);
476 fiy1 = _mm256_add_pd(fiy1,ty);
477 fiz1 = _mm256_add_pd(fiz1,tz);
479 fjx3 = _mm256_add_pd(fjx3,tx);
480 fjy3 = _mm256_add_pd(fjy3,ty);
481 fjz3 = _mm256_add_pd(fjz3,tz);
485 /**************************
486 * CALCULATE INTERACTIONS *
487 **************************/
489 if (gmx_mm256_any_lt(rsq21,rcutoff2))
492 /* REACTION-FIELD ELECTROSTATICS */
493 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
494 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
496 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
498 /* Update potential sum for this i atom from the interaction with this j atom. */
499 velec = _mm256_and_pd(velec,cutoff_mask);
500 velecsum = _mm256_add_pd(velecsum,velec);
504 fscal = _mm256_and_pd(fscal,cutoff_mask);
506 /* Calculate temporary vectorial force */
507 tx = _mm256_mul_pd(fscal,dx21);
508 ty = _mm256_mul_pd(fscal,dy21);
509 tz = _mm256_mul_pd(fscal,dz21);
511 /* Update vectorial force */
512 fix2 = _mm256_add_pd(fix2,tx);
513 fiy2 = _mm256_add_pd(fiy2,ty);
514 fiz2 = _mm256_add_pd(fiz2,tz);
516 fjx1 = _mm256_add_pd(fjx1,tx);
517 fjy1 = _mm256_add_pd(fjy1,ty);
518 fjz1 = _mm256_add_pd(fjz1,tz);
522 /**************************
523 * CALCULATE INTERACTIONS *
524 **************************/
526 if (gmx_mm256_any_lt(rsq22,rcutoff2))
529 /* REACTION-FIELD ELECTROSTATICS */
530 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
531 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
533 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
535 /* Update potential sum for this i atom from the interaction with this j atom. */
536 velec = _mm256_and_pd(velec,cutoff_mask);
537 velecsum = _mm256_add_pd(velecsum,velec);
541 fscal = _mm256_and_pd(fscal,cutoff_mask);
543 /* Calculate temporary vectorial force */
544 tx = _mm256_mul_pd(fscal,dx22);
545 ty = _mm256_mul_pd(fscal,dy22);
546 tz = _mm256_mul_pd(fscal,dz22);
548 /* Update vectorial force */
549 fix2 = _mm256_add_pd(fix2,tx);
550 fiy2 = _mm256_add_pd(fiy2,ty);
551 fiz2 = _mm256_add_pd(fiz2,tz);
553 fjx2 = _mm256_add_pd(fjx2,tx);
554 fjy2 = _mm256_add_pd(fjy2,ty);
555 fjz2 = _mm256_add_pd(fjz2,tz);
559 /**************************
560 * CALCULATE INTERACTIONS *
561 **************************/
563 if (gmx_mm256_any_lt(rsq23,rcutoff2))
566 /* REACTION-FIELD ELECTROSTATICS */
567 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
568 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
570 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
572 /* Update potential sum for this i atom from the interaction with this j atom. */
573 velec = _mm256_and_pd(velec,cutoff_mask);
574 velecsum = _mm256_add_pd(velecsum,velec);
578 fscal = _mm256_and_pd(fscal,cutoff_mask);
580 /* Calculate temporary vectorial force */
581 tx = _mm256_mul_pd(fscal,dx23);
582 ty = _mm256_mul_pd(fscal,dy23);
583 tz = _mm256_mul_pd(fscal,dz23);
585 /* Update vectorial force */
586 fix2 = _mm256_add_pd(fix2,tx);
587 fiy2 = _mm256_add_pd(fiy2,ty);
588 fiz2 = _mm256_add_pd(fiz2,tz);
590 fjx3 = _mm256_add_pd(fjx3,tx);
591 fjy3 = _mm256_add_pd(fjy3,ty);
592 fjz3 = _mm256_add_pd(fjz3,tz);
596 /**************************
597 * CALCULATE INTERACTIONS *
598 **************************/
600 if (gmx_mm256_any_lt(rsq31,rcutoff2))
603 /* REACTION-FIELD ELECTROSTATICS */
604 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
605 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
607 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velec = _mm256_and_pd(velec,cutoff_mask);
611 velecsum = _mm256_add_pd(velecsum,velec);
615 fscal = _mm256_and_pd(fscal,cutoff_mask);
617 /* Calculate temporary vectorial force */
618 tx = _mm256_mul_pd(fscal,dx31);
619 ty = _mm256_mul_pd(fscal,dy31);
620 tz = _mm256_mul_pd(fscal,dz31);
622 /* Update vectorial force */
623 fix3 = _mm256_add_pd(fix3,tx);
624 fiy3 = _mm256_add_pd(fiy3,ty);
625 fiz3 = _mm256_add_pd(fiz3,tz);
627 fjx1 = _mm256_add_pd(fjx1,tx);
628 fjy1 = _mm256_add_pd(fjy1,ty);
629 fjz1 = _mm256_add_pd(fjz1,tz);
633 /**************************
634 * CALCULATE INTERACTIONS *
635 **************************/
637 if (gmx_mm256_any_lt(rsq32,rcutoff2))
640 /* REACTION-FIELD ELECTROSTATICS */
641 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
642 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
644 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
646 /* Update potential sum for this i atom from the interaction with this j atom. */
647 velec = _mm256_and_pd(velec,cutoff_mask);
648 velecsum = _mm256_add_pd(velecsum,velec);
652 fscal = _mm256_and_pd(fscal,cutoff_mask);
654 /* Calculate temporary vectorial force */
655 tx = _mm256_mul_pd(fscal,dx32);
656 ty = _mm256_mul_pd(fscal,dy32);
657 tz = _mm256_mul_pd(fscal,dz32);
659 /* Update vectorial force */
660 fix3 = _mm256_add_pd(fix3,tx);
661 fiy3 = _mm256_add_pd(fiy3,ty);
662 fiz3 = _mm256_add_pd(fiz3,tz);
664 fjx2 = _mm256_add_pd(fjx2,tx);
665 fjy2 = _mm256_add_pd(fjy2,ty);
666 fjz2 = _mm256_add_pd(fjz2,tz);
670 /**************************
671 * CALCULATE INTERACTIONS *
672 **************************/
674 if (gmx_mm256_any_lt(rsq33,rcutoff2))
677 /* REACTION-FIELD ELECTROSTATICS */
678 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
679 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
681 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
683 /* Update potential sum for this i atom from the interaction with this j atom. */
684 velec = _mm256_and_pd(velec,cutoff_mask);
685 velecsum = _mm256_add_pd(velecsum,velec);
689 fscal = _mm256_and_pd(fscal,cutoff_mask);
691 /* Calculate temporary vectorial force */
692 tx = _mm256_mul_pd(fscal,dx33);
693 ty = _mm256_mul_pd(fscal,dy33);
694 tz = _mm256_mul_pd(fscal,dz33);
696 /* Update vectorial force */
697 fix3 = _mm256_add_pd(fix3,tx);
698 fiy3 = _mm256_add_pd(fiy3,ty);
699 fiz3 = _mm256_add_pd(fiz3,tz);
701 fjx3 = _mm256_add_pd(fjx3,tx);
702 fjy3 = _mm256_add_pd(fjy3,ty);
703 fjz3 = _mm256_add_pd(fjz3,tz);
707 fjptrA = f+j_coord_offsetA;
708 fjptrB = f+j_coord_offsetB;
709 fjptrC = f+j_coord_offsetC;
710 fjptrD = f+j_coord_offsetD;
712 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
713 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
714 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
716 /* Inner loop uses 386 flops */
722 /* Get j neighbor index, and coordinate index */
723 jnrlistA = jjnr[jidx];
724 jnrlistB = jjnr[jidx+1];
725 jnrlistC = jjnr[jidx+2];
726 jnrlistD = jjnr[jidx+3];
727 /* Sign of each element will be negative for non-real atoms.
728 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
729 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
731 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
733 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
734 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
735 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
737 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
738 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
739 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
740 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
741 j_coord_offsetA = DIM*jnrA;
742 j_coord_offsetB = DIM*jnrB;
743 j_coord_offsetC = DIM*jnrC;
744 j_coord_offsetD = DIM*jnrD;
746 /* load j atom coordinates */
747 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
748 x+j_coord_offsetC,x+j_coord_offsetD,
749 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
750 &jy2,&jz2,&jx3,&jy3,&jz3);
752 /* Calculate displacement vector */
753 dx00 = _mm256_sub_pd(ix0,jx0);
754 dy00 = _mm256_sub_pd(iy0,jy0);
755 dz00 = _mm256_sub_pd(iz0,jz0);
756 dx11 = _mm256_sub_pd(ix1,jx1);
757 dy11 = _mm256_sub_pd(iy1,jy1);
758 dz11 = _mm256_sub_pd(iz1,jz1);
759 dx12 = _mm256_sub_pd(ix1,jx2);
760 dy12 = _mm256_sub_pd(iy1,jy2);
761 dz12 = _mm256_sub_pd(iz1,jz2);
762 dx13 = _mm256_sub_pd(ix1,jx3);
763 dy13 = _mm256_sub_pd(iy1,jy3);
764 dz13 = _mm256_sub_pd(iz1,jz3);
765 dx21 = _mm256_sub_pd(ix2,jx1);
766 dy21 = _mm256_sub_pd(iy2,jy1);
767 dz21 = _mm256_sub_pd(iz2,jz1);
768 dx22 = _mm256_sub_pd(ix2,jx2);
769 dy22 = _mm256_sub_pd(iy2,jy2);
770 dz22 = _mm256_sub_pd(iz2,jz2);
771 dx23 = _mm256_sub_pd(ix2,jx3);
772 dy23 = _mm256_sub_pd(iy2,jy3);
773 dz23 = _mm256_sub_pd(iz2,jz3);
774 dx31 = _mm256_sub_pd(ix3,jx1);
775 dy31 = _mm256_sub_pd(iy3,jy1);
776 dz31 = _mm256_sub_pd(iz3,jz1);
777 dx32 = _mm256_sub_pd(ix3,jx2);
778 dy32 = _mm256_sub_pd(iy3,jy2);
779 dz32 = _mm256_sub_pd(iz3,jz2);
780 dx33 = _mm256_sub_pd(ix3,jx3);
781 dy33 = _mm256_sub_pd(iy3,jy3);
782 dz33 = _mm256_sub_pd(iz3,jz3);
784 /* Calculate squared distance and things based on it */
785 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
786 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
787 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
788 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
789 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
790 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
791 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
792 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
793 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
794 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
796 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
797 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
798 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
799 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
800 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
801 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
802 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
803 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
804 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
805 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
807 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
808 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
809 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
810 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
811 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
812 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
813 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
814 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
815 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
816 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
818 fjx0 = _mm256_setzero_pd();
819 fjy0 = _mm256_setzero_pd();
820 fjz0 = _mm256_setzero_pd();
821 fjx1 = _mm256_setzero_pd();
822 fjy1 = _mm256_setzero_pd();
823 fjz1 = _mm256_setzero_pd();
824 fjx2 = _mm256_setzero_pd();
825 fjy2 = _mm256_setzero_pd();
826 fjz2 = _mm256_setzero_pd();
827 fjx3 = _mm256_setzero_pd();
828 fjy3 = _mm256_setzero_pd();
829 fjz3 = _mm256_setzero_pd();
831 /**************************
832 * CALCULATE INTERACTIONS *
833 **************************/
835 if (gmx_mm256_any_lt(rsq00,rcutoff2))
838 r00 = _mm256_mul_pd(rsq00,rinv00);
839 r00 = _mm256_andnot_pd(dummy_mask,r00);
841 /* LENNARD-JONES DISPERSION/REPULSION */
843 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
844 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
845 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
846 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
847 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
849 d = _mm256_sub_pd(r00,rswitch);
850 d = _mm256_max_pd(d,_mm256_setzero_pd());
851 d2 = _mm256_mul_pd(d,d);
852 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
854 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
856 /* Evaluate switch function */
857 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
858 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
859 vvdw = _mm256_mul_pd(vvdw,sw);
860 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
862 /* Update potential sum for this i atom from the interaction with this j atom. */
863 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
864 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
865 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
869 fscal = _mm256_and_pd(fscal,cutoff_mask);
871 fscal = _mm256_andnot_pd(dummy_mask,fscal);
873 /* Calculate temporary vectorial force */
874 tx = _mm256_mul_pd(fscal,dx00);
875 ty = _mm256_mul_pd(fscal,dy00);
876 tz = _mm256_mul_pd(fscal,dz00);
878 /* Update vectorial force */
879 fix0 = _mm256_add_pd(fix0,tx);
880 fiy0 = _mm256_add_pd(fiy0,ty);
881 fiz0 = _mm256_add_pd(fiz0,tz);
883 fjx0 = _mm256_add_pd(fjx0,tx);
884 fjy0 = _mm256_add_pd(fjy0,ty);
885 fjz0 = _mm256_add_pd(fjz0,tz);
889 /**************************
890 * CALCULATE INTERACTIONS *
891 **************************/
893 if (gmx_mm256_any_lt(rsq11,rcutoff2))
896 /* REACTION-FIELD ELECTROSTATICS */
897 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
898 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
900 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
902 /* Update potential sum for this i atom from the interaction with this j atom. */
903 velec = _mm256_and_pd(velec,cutoff_mask);
904 velec = _mm256_andnot_pd(dummy_mask,velec);
905 velecsum = _mm256_add_pd(velecsum,velec);
909 fscal = _mm256_and_pd(fscal,cutoff_mask);
911 fscal = _mm256_andnot_pd(dummy_mask,fscal);
913 /* Calculate temporary vectorial force */
914 tx = _mm256_mul_pd(fscal,dx11);
915 ty = _mm256_mul_pd(fscal,dy11);
916 tz = _mm256_mul_pd(fscal,dz11);
918 /* Update vectorial force */
919 fix1 = _mm256_add_pd(fix1,tx);
920 fiy1 = _mm256_add_pd(fiy1,ty);
921 fiz1 = _mm256_add_pd(fiz1,tz);
923 fjx1 = _mm256_add_pd(fjx1,tx);
924 fjy1 = _mm256_add_pd(fjy1,ty);
925 fjz1 = _mm256_add_pd(fjz1,tz);
929 /**************************
930 * CALCULATE INTERACTIONS *
931 **************************/
933 if (gmx_mm256_any_lt(rsq12,rcutoff2))
936 /* REACTION-FIELD ELECTROSTATICS */
937 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
938 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
940 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
942 /* Update potential sum for this i atom from the interaction with this j atom. */
943 velec = _mm256_and_pd(velec,cutoff_mask);
944 velec = _mm256_andnot_pd(dummy_mask,velec);
945 velecsum = _mm256_add_pd(velecsum,velec);
949 fscal = _mm256_and_pd(fscal,cutoff_mask);
951 fscal = _mm256_andnot_pd(dummy_mask,fscal);
953 /* Calculate temporary vectorial force */
954 tx = _mm256_mul_pd(fscal,dx12);
955 ty = _mm256_mul_pd(fscal,dy12);
956 tz = _mm256_mul_pd(fscal,dz12);
958 /* Update vectorial force */
959 fix1 = _mm256_add_pd(fix1,tx);
960 fiy1 = _mm256_add_pd(fiy1,ty);
961 fiz1 = _mm256_add_pd(fiz1,tz);
963 fjx2 = _mm256_add_pd(fjx2,tx);
964 fjy2 = _mm256_add_pd(fjy2,ty);
965 fjz2 = _mm256_add_pd(fjz2,tz);
969 /**************************
970 * CALCULATE INTERACTIONS *
971 **************************/
973 if (gmx_mm256_any_lt(rsq13,rcutoff2))
976 /* REACTION-FIELD ELECTROSTATICS */
977 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
978 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
980 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
982 /* Update potential sum for this i atom from the interaction with this j atom. */
983 velec = _mm256_and_pd(velec,cutoff_mask);
984 velec = _mm256_andnot_pd(dummy_mask,velec);
985 velecsum = _mm256_add_pd(velecsum,velec);
989 fscal = _mm256_and_pd(fscal,cutoff_mask);
991 fscal = _mm256_andnot_pd(dummy_mask,fscal);
993 /* Calculate temporary vectorial force */
994 tx = _mm256_mul_pd(fscal,dx13);
995 ty = _mm256_mul_pd(fscal,dy13);
996 tz = _mm256_mul_pd(fscal,dz13);
998 /* Update vectorial force */
999 fix1 = _mm256_add_pd(fix1,tx);
1000 fiy1 = _mm256_add_pd(fiy1,ty);
1001 fiz1 = _mm256_add_pd(fiz1,tz);
1003 fjx3 = _mm256_add_pd(fjx3,tx);
1004 fjy3 = _mm256_add_pd(fjy3,ty);
1005 fjz3 = _mm256_add_pd(fjz3,tz);
1009 /**************************
1010 * CALCULATE INTERACTIONS *
1011 **************************/
1013 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1016 /* REACTION-FIELD ELECTROSTATICS */
1017 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
1018 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1020 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1022 /* Update potential sum for this i atom from the interaction with this j atom. */
1023 velec = _mm256_and_pd(velec,cutoff_mask);
1024 velec = _mm256_andnot_pd(dummy_mask,velec);
1025 velecsum = _mm256_add_pd(velecsum,velec);
1029 fscal = _mm256_and_pd(fscal,cutoff_mask);
1031 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1033 /* Calculate temporary vectorial force */
1034 tx = _mm256_mul_pd(fscal,dx21);
1035 ty = _mm256_mul_pd(fscal,dy21);
1036 tz = _mm256_mul_pd(fscal,dz21);
1038 /* Update vectorial force */
1039 fix2 = _mm256_add_pd(fix2,tx);
1040 fiy2 = _mm256_add_pd(fiy2,ty);
1041 fiz2 = _mm256_add_pd(fiz2,tz);
1043 fjx1 = _mm256_add_pd(fjx1,tx);
1044 fjy1 = _mm256_add_pd(fjy1,ty);
1045 fjz1 = _mm256_add_pd(fjz1,tz);
1049 /**************************
1050 * CALCULATE INTERACTIONS *
1051 **************************/
1053 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1056 /* REACTION-FIELD ELECTROSTATICS */
1057 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1058 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1060 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1062 /* Update potential sum for this i atom from the interaction with this j atom. */
1063 velec = _mm256_and_pd(velec,cutoff_mask);
1064 velec = _mm256_andnot_pd(dummy_mask,velec);
1065 velecsum = _mm256_add_pd(velecsum,velec);
1069 fscal = _mm256_and_pd(fscal,cutoff_mask);
1071 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1073 /* Calculate temporary vectorial force */
1074 tx = _mm256_mul_pd(fscal,dx22);
1075 ty = _mm256_mul_pd(fscal,dy22);
1076 tz = _mm256_mul_pd(fscal,dz22);
1078 /* Update vectorial force */
1079 fix2 = _mm256_add_pd(fix2,tx);
1080 fiy2 = _mm256_add_pd(fiy2,ty);
1081 fiz2 = _mm256_add_pd(fiz2,tz);
1083 fjx2 = _mm256_add_pd(fjx2,tx);
1084 fjy2 = _mm256_add_pd(fjy2,ty);
1085 fjz2 = _mm256_add_pd(fjz2,tz);
1089 /**************************
1090 * CALCULATE INTERACTIONS *
1091 **************************/
1093 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1096 /* REACTION-FIELD ELECTROSTATICS */
1097 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
1098 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1100 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1102 /* Update potential sum for this i atom from the interaction with this j atom. */
1103 velec = _mm256_and_pd(velec,cutoff_mask);
1104 velec = _mm256_andnot_pd(dummy_mask,velec);
1105 velecsum = _mm256_add_pd(velecsum,velec);
1109 fscal = _mm256_and_pd(fscal,cutoff_mask);
1111 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1113 /* Calculate temporary vectorial force */
1114 tx = _mm256_mul_pd(fscal,dx23);
1115 ty = _mm256_mul_pd(fscal,dy23);
1116 tz = _mm256_mul_pd(fscal,dz23);
1118 /* Update vectorial force */
1119 fix2 = _mm256_add_pd(fix2,tx);
1120 fiy2 = _mm256_add_pd(fiy2,ty);
1121 fiz2 = _mm256_add_pd(fiz2,tz);
1123 fjx3 = _mm256_add_pd(fjx3,tx);
1124 fjy3 = _mm256_add_pd(fjy3,ty);
1125 fjz3 = _mm256_add_pd(fjz3,tz);
1129 /**************************
1130 * CALCULATE INTERACTIONS *
1131 **************************/
1133 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1136 /* REACTION-FIELD ELECTROSTATICS */
1137 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
1138 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1140 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1142 /* Update potential sum for this i atom from the interaction with this j atom. */
1143 velec = _mm256_and_pd(velec,cutoff_mask);
1144 velec = _mm256_andnot_pd(dummy_mask,velec);
1145 velecsum = _mm256_add_pd(velecsum,velec);
1149 fscal = _mm256_and_pd(fscal,cutoff_mask);
1151 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1153 /* Calculate temporary vectorial force */
1154 tx = _mm256_mul_pd(fscal,dx31);
1155 ty = _mm256_mul_pd(fscal,dy31);
1156 tz = _mm256_mul_pd(fscal,dz31);
1158 /* Update vectorial force */
1159 fix3 = _mm256_add_pd(fix3,tx);
1160 fiy3 = _mm256_add_pd(fiy3,ty);
1161 fiz3 = _mm256_add_pd(fiz3,tz);
1163 fjx1 = _mm256_add_pd(fjx1,tx);
1164 fjy1 = _mm256_add_pd(fjy1,ty);
1165 fjz1 = _mm256_add_pd(fjz1,tz);
1169 /**************************
1170 * CALCULATE INTERACTIONS *
1171 **************************/
1173 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1176 /* REACTION-FIELD ELECTROSTATICS */
1177 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
1178 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1180 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1182 /* Update potential sum for this i atom from the interaction with this j atom. */
1183 velec = _mm256_and_pd(velec,cutoff_mask);
1184 velec = _mm256_andnot_pd(dummy_mask,velec);
1185 velecsum = _mm256_add_pd(velecsum,velec);
1189 fscal = _mm256_and_pd(fscal,cutoff_mask);
1191 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1193 /* Calculate temporary vectorial force */
1194 tx = _mm256_mul_pd(fscal,dx32);
1195 ty = _mm256_mul_pd(fscal,dy32);
1196 tz = _mm256_mul_pd(fscal,dz32);
1198 /* Update vectorial force */
1199 fix3 = _mm256_add_pd(fix3,tx);
1200 fiy3 = _mm256_add_pd(fiy3,ty);
1201 fiz3 = _mm256_add_pd(fiz3,tz);
1203 fjx2 = _mm256_add_pd(fjx2,tx);
1204 fjy2 = _mm256_add_pd(fjy2,ty);
1205 fjz2 = _mm256_add_pd(fjz2,tz);
1209 /**************************
1210 * CALCULATE INTERACTIONS *
1211 **************************/
1213 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1216 /* REACTION-FIELD ELECTROSTATICS */
1217 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
1218 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1220 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1222 /* Update potential sum for this i atom from the interaction with this j atom. */
1223 velec = _mm256_and_pd(velec,cutoff_mask);
1224 velec = _mm256_andnot_pd(dummy_mask,velec);
1225 velecsum = _mm256_add_pd(velecsum,velec);
1229 fscal = _mm256_and_pd(fscal,cutoff_mask);
1231 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1233 /* Calculate temporary vectorial force */
1234 tx = _mm256_mul_pd(fscal,dx33);
1235 ty = _mm256_mul_pd(fscal,dy33);
1236 tz = _mm256_mul_pd(fscal,dz33);
1238 /* Update vectorial force */
1239 fix3 = _mm256_add_pd(fix3,tx);
1240 fiy3 = _mm256_add_pd(fiy3,ty);
1241 fiz3 = _mm256_add_pd(fiz3,tz);
1243 fjx3 = _mm256_add_pd(fjx3,tx);
1244 fjy3 = _mm256_add_pd(fjy3,ty);
1245 fjz3 = _mm256_add_pd(fjz3,tz);
1249 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1250 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1251 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1252 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1254 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1255 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1256 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1258 /* Inner loop uses 387 flops */
1261 /* End of innermost loop */
1263 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1264 f+i_coord_offset,fshift+i_shift_offset);
1267 /* Update potential energies */
1268 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1269 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1271 /* Increment number of inner iterations */
1272 inneriter += j_index_end - j_index_start;
1274 /* Outer loop uses 26 flops */
1277 /* Increment number of outer iterations */
1280 /* Update outer/inner flops */
1282 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*387);
1285 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double
1286 * Electrostatics interaction: ReactionField (with cut-off)
1287 * VdW interaction: LennardJones (with potential switch)
1288 * Geometry: Water4-Water4
1289 * Calculate force/pot: Force
1292 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double
1293 (t_nblist * gmx_restrict nlist,
1294 rvec * gmx_restrict xx,
1295 rvec * gmx_restrict ff,
1296 t_forcerec * gmx_restrict fr,
1297 t_mdatoms * gmx_restrict mdatoms,
1298 nb_kernel_data_t * gmx_restrict kernel_data,
1299 t_nrnb * gmx_restrict nrnb)
1301 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1302 * just 0 for non-waters.
1303 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1304 * jnr indices corresponding to data put in the four positions in the SIMD register.
1306 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1307 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1308 int jnrA,jnrB,jnrC,jnrD;
1309 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1310 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1311 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1312 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1313 real rcutoff_scalar;
1314 real *shiftvec,*fshift,*x,*f;
1315 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1316 real scratch[4*DIM];
1317 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1318 real * vdwioffsetptr0;
1319 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1320 real * vdwioffsetptr1;
1321 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1322 real * vdwioffsetptr2;
1323 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1324 real * vdwioffsetptr3;
1325 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1326 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1327 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1328 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1329 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1330 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1331 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1332 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1333 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1334 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1335 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1336 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1337 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1338 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1339 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1340 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1341 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1342 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1343 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1344 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1347 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1350 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1351 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1352 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1353 real rswitch_scalar,d_scalar;
1354 __m256d dummy_mask,cutoff_mask;
1355 __m128 tmpmask0,tmpmask1;
1356 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1357 __m256d one = _mm256_set1_pd(1.0);
1358 __m256d two = _mm256_set1_pd(2.0);
1364 jindex = nlist->jindex;
1366 shiftidx = nlist->shift;
1368 shiftvec = fr->shift_vec[0];
1369 fshift = fr->fshift[0];
1370 facel = _mm256_set1_pd(fr->epsfac);
1371 charge = mdatoms->chargeA;
1372 krf = _mm256_set1_pd(fr->ic->k_rf);
1373 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
1374 crf = _mm256_set1_pd(fr->ic->c_rf);
1375 nvdwtype = fr->ntype;
1376 vdwparam = fr->nbfp;
1377 vdwtype = mdatoms->typeA;
1379 /* Setup water-specific parameters */
1380 inr = nlist->iinr[0];
1381 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1382 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1383 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1384 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1386 jq1 = _mm256_set1_pd(charge[inr+1]);
1387 jq2 = _mm256_set1_pd(charge[inr+2]);
1388 jq3 = _mm256_set1_pd(charge[inr+3]);
1389 vdwjidx0A = 2*vdwtype[inr+0];
1390 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1391 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1392 qq11 = _mm256_mul_pd(iq1,jq1);
1393 qq12 = _mm256_mul_pd(iq1,jq2);
1394 qq13 = _mm256_mul_pd(iq1,jq3);
1395 qq21 = _mm256_mul_pd(iq2,jq1);
1396 qq22 = _mm256_mul_pd(iq2,jq2);
1397 qq23 = _mm256_mul_pd(iq2,jq3);
1398 qq31 = _mm256_mul_pd(iq3,jq1);
1399 qq32 = _mm256_mul_pd(iq3,jq2);
1400 qq33 = _mm256_mul_pd(iq3,jq3);
1402 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1403 rcutoff_scalar = fr->rcoulomb;
1404 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1405 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1407 rswitch_scalar = fr->rvdw_switch;
1408 rswitch = _mm256_set1_pd(rswitch_scalar);
1409 /* Setup switch parameters */
1410 d_scalar = rcutoff_scalar-rswitch_scalar;
1411 d = _mm256_set1_pd(d_scalar);
1412 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1413 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1414 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1415 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1416 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1417 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1419 /* Avoid stupid compiler warnings */
1420 jnrA = jnrB = jnrC = jnrD = 0;
1421 j_coord_offsetA = 0;
1422 j_coord_offsetB = 0;
1423 j_coord_offsetC = 0;
1424 j_coord_offsetD = 0;
1429 for(iidx=0;iidx<4*DIM;iidx++)
1431 scratch[iidx] = 0.0;
1434 /* Start outer loop over neighborlists */
1435 for(iidx=0; iidx<nri; iidx++)
1437 /* Load shift vector for this list */
1438 i_shift_offset = DIM*shiftidx[iidx];
1440 /* Load limits for loop over neighbors */
1441 j_index_start = jindex[iidx];
1442 j_index_end = jindex[iidx+1];
1444 /* Get outer coordinate index */
1446 i_coord_offset = DIM*inr;
1448 /* Load i particle coords and add shift vector */
1449 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1450 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1452 fix0 = _mm256_setzero_pd();
1453 fiy0 = _mm256_setzero_pd();
1454 fiz0 = _mm256_setzero_pd();
1455 fix1 = _mm256_setzero_pd();
1456 fiy1 = _mm256_setzero_pd();
1457 fiz1 = _mm256_setzero_pd();
1458 fix2 = _mm256_setzero_pd();
1459 fiy2 = _mm256_setzero_pd();
1460 fiz2 = _mm256_setzero_pd();
1461 fix3 = _mm256_setzero_pd();
1462 fiy3 = _mm256_setzero_pd();
1463 fiz3 = _mm256_setzero_pd();
1465 /* Start inner kernel loop */
1466 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1469 /* Get j neighbor index, and coordinate index */
1471 jnrB = jjnr[jidx+1];
1472 jnrC = jjnr[jidx+2];
1473 jnrD = jjnr[jidx+3];
1474 j_coord_offsetA = DIM*jnrA;
1475 j_coord_offsetB = DIM*jnrB;
1476 j_coord_offsetC = DIM*jnrC;
1477 j_coord_offsetD = DIM*jnrD;
1479 /* load j atom coordinates */
1480 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1481 x+j_coord_offsetC,x+j_coord_offsetD,
1482 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1483 &jy2,&jz2,&jx3,&jy3,&jz3);
1485 /* Calculate displacement vector */
1486 dx00 = _mm256_sub_pd(ix0,jx0);
1487 dy00 = _mm256_sub_pd(iy0,jy0);
1488 dz00 = _mm256_sub_pd(iz0,jz0);
1489 dx11 = _mm256_sub_pd(ix1,jx1);
1490 dy11 = _mm256_sub_pd(iy1,jy1);
1491 dz11 = _mm256_sub_pd(iz1,jz1);
1492 dx12 = _mm256_sub_pd(ix1,jx2);
1493 dy12 = _mm256_sub_pd(iy1,jy2);
1494 dz12 = _mm256_sub_pd(iz1,jz2);
1495 dx13 = _mm256_sub_pd(ix1,jx3);
1496 dy13 = _mm256_sub_pd(iy1,jy3);
1497 dz13 = _mm256_sub_pd(iz1,jz3);
1498 dx21 = _mm256_sub_pd(ix2,jx1);
1499 dy21 = _mm256_sub_pd(iy2,jy1);
1500 dz21 = _mm256_sub_pd(iz2,jz1);
1501 dx22 = _mm256_sub_pd(ix2,jx2);
1502 dy22 = _mm256_sub_pd(iy2,jy2);
1503 dz22 = _mm256_sub_pd(iz2,jz2);
1504 dx23 = _mm256_sub_pd(ix2,jx3);
1505 dy23 = _mm256_sub_pd(iy2,jy3);
1506 dz23 = _mm256_sub_pd(iz2,jz3);
1507 dx31 = _mm256_sub_pd(ix3,jx1);
1508 dy31 = _mm256_sub_pd(iy3,jy1);
1509 dz31 = _mm256_sub_pd(iz3,jz1);
1510 dx32 = _mm256_sub_pd(ix3,jx2);
1511 dy32 = _mm256_sub_pd(iy3,jy2);
1512 dz32 = _mm256_sub_pd(iz3,jz2);
1513 dx33 = _mm256_sub_pd(ix3,jx3);
1514 dy33 = _mm256_sub_pd(iy3,jy3);
1515 dz33 = _mm256_sub_pd(iz3,jz3);
1517 /* Calculate squared distance and things based on it */
1518 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1519 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1520 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1521 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1522 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1523 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1524 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1525 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1526 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1527 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1529 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1530 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1531 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1532 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1533 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1534 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1535 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1536 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1537 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1538 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1540 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1541 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1542 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1543 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1544 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1545 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1546 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1547 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1548 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1549 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1551 fjx0 = _mm256_setzero_pd();
1552 fjy0 = _mm256_setzero_pd();
1553 fjz0 = _mm256_setzero_pd();
1554 fjx1 = _mm256_setzero_pd();
1555 fjy1 = _mm256_setzero_pd();
1556 fjz1 = _mm256_setzero_pd();
1557 fjx2 = _mm256_setzero_pd();
1558 fjy2 = _mm256_setzero_pd();
1559 fjz2 = _mm256_setzero_pd();
1560 fjx3 = _mm256_setzero_pd();
1561 fjy3 = _mm256_setzero_pd();
1562 fjz3 = _mm256_setzero_pd();
1564 /**************************
1565 * CALCULATE INTERACTIONS *
1566 **************************/
1568 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1571 r00 = _mm256_mul_pd(rsq00,rinv00);
1573 /* LENNARD-JONES DISPERSION/REPULSION */
1575 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1576 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1577 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1578 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1579 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1581 d = _mm256_sub_pd(r00,rswitch);
1582 d = _mm256_max_pd(d,_mm256_setzero_pd());
1583 d2 = _mm256_mul_pd(d,d);
1584 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1586 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1588 /* Evaluate switch function */
1589 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1590 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1591 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1595 fscal = _mm256_and_pd(fscal,cutoff_mask);
1597 /* Calculate temporary vectorial force */
1598 tx = _mm256_mul_pd(fscal,dx00);
1599 ty = _mm256_mul_pd(fscal,dy00);
1600 tz = _mm256_mul_pd(fscal,dz00);
1602 /* Update vectorial force */
1603 fix0 = _mm256_add_pd(fix0,tx);
1604 fiy0 = _mm256_add_pd(fiy0,ty);
1605 fiz0 = _mm256_add_pd(fiz0,tz);
1607 fjx0 = _mm256_add_pd(fjx0,tx);
1608 fjy0 = _mm256_add_pd(fjy0,ty);
1609 fjz0 = _mm256_add_pd(fjz0,tz);
1613 /**************************
1614 * CALCULATE INTERACTIONS *
1615 **************************/
1617 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1620 /* REACTION-FIELD ELECTROSTATICS */
1621 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1623 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1627 fscal = _mm256_and_pd(fscal,cutoff_mask);
1629 /* Calculate temporary vectorial force */
1630 tx = _mm256_mul_pd(fscal,dx11);
1631 ty = _mm256_mul_pd(fscal,dy11);
1632 tz = _mm256_mul_pd(fscal,dz11);
1634 /* Update vectorial force */
1635 fix1 = _mm256_add_pd(fix1,tx);
1636 fiy1 = _mm256_add_pd(fiy1,ty);
1637 fiz1 = _mm256_add_pd(fiz1,tz);
1639 fjx1 = _mm256_add_pd(fjx1,tx);
1640 fjy1 = _mm256_add_pd(fjy1,ty);
1641 fjz1 = _mm256_add_pd(fjz1,tz);
1645 /**************************
1646 * CALCULATE INTERACTIONS *
1647 **************************/
1649 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1652 /* REACTION-FIELD ELECTROSTATICS */
1653 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1655 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1659 fscal = _mm256_and_pd(fscal,cutoff_mask);
1661 /* Calculate temporary vectorial force */
1662 tx = _mm256_mul_pd(fscal,dx12);
1663 ty = _mm256_mul_pd(fscal,dy12);
1664 tz = _mm256_mul_pd(fscal,dz12);
1666 /* Update vectorial force */
1667 fix1 = _mm256_add_pd(fix1,tx);
1668 fiy1 = _mm256_add_pd(fiy1,ty);
1669 fiz1 = _mm256_add_pd(fiz1,tz);
1671 fjx2 = _mm256_add_pd(fjx2,tx);
1672 fjy2 = _mm256_add_pd(fjy2,ty);
1673 fjz2 = _mm256_add_pd(fjz2,tz);
1677 /**************************
1678 * CALCULATE INTERACTIONS *
1679 **************************/
1681 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1684 /* REACTION-FIELD ELECTROSTATICS */
1685 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1687 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1691 fscal = _mm256_and_pd(fscal,cutoff_mask);
1693 /* Calculate temporary vectorial force */
1694 tx = _mm256_mul_pd(fscal,dx13);
1695 ty = _mm256_mul_pd(fscal,dy13);
1696 tz = _mm256_mul_pd(fscal,dz13);
1698 /* Update vectorial force */
1699 fix1 = _mm256_add_pd(fix1,tx);
1700 fiy1 = _mm256_add_pd(fiy1,ty);
1701 fiz1 = _mm256_add_pd(fiz1,tz);
1703 fjx3 = _mm256_add_pd(fjx3,tx);
1704 fjy3 = _mm256_add_pd(fjy3,ty);
1705 fjz3 = _mm256_add_pd(fjz3,tz);
1709 /**************************
1710 * CALCULATE INTERACTIONS *
1711 **************************/
1713 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1716 /* REACTION-FIELD ELECTROSTATICS */
1717 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1719 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1723 fscal = _mm256_and_pd(fscal,cutoff_mask);
1725 /* Calculate temporary vectorial force */
1726 tx = _mm256_mul_pd(fscal,dx21);
1727 ty = _mm256_mul_pd(fscal,dy21);
1728 tz = _mm256_mul_pd(fscal,dz21);
1730 /* Update vectorial force */
1731 fix2 = _mm256_add_pd(fix2,tx);
1732 fiy2 = _mm256_add_pd(fiy2,ty);
1733 fiz2 = _mm256_add_pd(fiz2,tz);
1735 fjx1 = _mm256_add_pd(fjx1,tx);
1736 fjy1 = _mm256_add_pd(fjy1,ty);
1737 fjz1 = _mm256_add_pd(fjz1,tz);
1741 /**************************
1742 * CALCULATE INTERACTIONS *
1743 **************************/
1745 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1748 /* REACTION-FIELD ELECTROSTATICS */
1749 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1751 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1755 fscal = _mm256_and_pd(fscal,cutoff_mask);
1757 /* Calculate temporary vectorial force */
1758 tx = _mm256_mul_pd(fscal,dx22);
1759 ty = _mm256_mul_pd(fscal,dy22);
1760 tz = _mm256_mul_pd(fscal,dz22);
1762 /* Update vectorial force */
1763 fix2 = _mm256_add_pd(fix2,tx);
1764 fiy2 = _mm256_add_pd(fiy2,ty);
1765 fiz2 = _mm256_add_pd(fiz2,tz);
1767 fjx2 = _mm256_add_pd(fjx2,tx);
1768 fjy2 = _mm256_add_pd(fjy2,ty);
1769 fjz2 = _mm256_add_pd(fjz2,tz);
1773 /**************************
1774 * CALCULATE INTERACTIONS *
1775 **************************/
1777 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1780 /* REACTION-FIELD ELECTROSTATICS */
1781 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1783 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1787 fscal = _mm256_and_pd(fscal,cutoff_mask);
1789 /* Calculate temporary vectorial force */
1790 tx = _mm256_mul_pd(fscal,dx23);
1791 ty = _mm256_mul_pd(fscal,dy23);
1792 tz = _mm256_mul_pd(fscal,dz23);
1794 /* Update vectorial force */
1795 fix2 = _mm256_add_pd(fix2,tx);
1796 fiy2 = _mm256_add_pd(fiy2,ty);
1797 fiz2 = _mm256_add_pd(fiz2,tz);
1799 fjx3 = _mm256_add_pd(fjx3,tx);
1800 fjy3 = _mm256_add_pd(fjy3,ty);
1801 fjz3 = _mm256_add_pd(fjz3,tz);
1805 /**************************
1806 * CALCULATE INTERACTIONS *
1807 **************************/
1809 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1812 /* REACTION-FIELD ELECTROSTATICS */
1813 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1815 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1819 fscal = _mm256_and_pd(fscal,cutoff_mask);
1821 /* Calculate temporary vectorial force */
1822 tx = _mm256_mul_pd(fscal,dx31);
1823 ty = _mm256_mul_pd(fscal,dy31);
1824 tz = _mm256_mul_pd(fscal,dz31);
1826 /* Update vectorial force */
1827 fix3 = _mm256_add_pd(fix3,tx);
1828 fiy3 = _mm256_add_pd(fiy3,ty);
1829 fiz3 = _mm256_add_pd(fiz3,tz);
1831 fjx1 = _mm256_add_pd(fjx1,tx);
1832 fjy1 = _mm256_add_pd(fjy1,ty);
1833 fjz1 = _mm256_add_pd(fjz1,tz);
1837 /**************************
1838 * CALCULATE INTERACTIONS *
1839 **************************/
1841 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1844 /* REACTION-FIELD ELECTROSTATICS */
1845 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1847 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1851 fscal = _mm256_and_pd(fscal,cutoff_mask);
1853 /* Calculate temporary vectorial force */
1854 tx = _mm256_mul_pd(fscal,dx32);
1855 ty = _mm256_mul_pd(fscal,dy32);
1856 tz = _mm256_mul_pd(fscal,dz32);
1858 /* Update vectorial force */
1859 fix3 = _mm256_add_pd(fix3,tx);
1860 fiy3 = _mm256_add_pd(fiy3,ty);
1861 fiz3 = _mm256_add_pd(fiz3,tz);
1863 fjx2 = _mm256_add_pd(fjx2,tx);
1864 fjy2 = _mm256_add_pd(fjy2,ty);
1865 fjz2 = _mm256_add_pd(fjz2,tz);
1869 /**************************
1870 * CALCULATE INTERACTIONS *
1871 **************************/
1873 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1876 /* REACTION-FIELD ELECTROSTATICS */
1877 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1879 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1883 fscal = _mm256_and_pd(fscal,cutoff_mask);
1885 /* Calculate temporary vectorial force */
1886 tx = _mm256_mul_pd(fscal,dx33);
1887 ty = _mm256_mul_pd(fscal,dy33);
1888 tz = _mm256_mul_pd(fscal,dz33);
1890 /* Update vectorial force */
1891 fix3 = _mm256_add_pd(fix3,tx);
1892 fiy3 = _mm256_add_pd(fiy3,ty);
1893 fiz3 = _mm256_add_pd(fiz3,tz);
1895 fjx3 = _mm256_add_pd(fjx3,tx);
1896 fjy3 = _mm256_add_pd(fjy3,ty);
1897 fjz3 = _mm256_add_pd(fjz3,tz);
1901 fjptrA = f+j_coord_offsetA;
1902 fjptrB = f+j_coord_offsetB;
1903 fjptrC = f+j_coord_offsetC;
1904 fjptrD = f+j_coord_offsetD;
1906 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1907 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1908 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1910 /* Inner loop uses 329 flops */
1913 if(jidx<j_index_end)
1916 /* Get j neighbor index, and coordinate index */
1917 jnrlistA = jjnr[jidx];
1918 jnrlistB = jjnr[jidx+1];
1919 jnrlistC = jjnr[jidx+2];
1920 jnrlistD = jjnr[jidx+3];
1921 /* Sign of each element will be negative for non-real atoms.
1922 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1923 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1925 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1927 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1928 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1929 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1931 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1932 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1933 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1934 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1935 j_coord_offsetA = DIM*jnrA;
1936 j_coord_offsetB = DIM*jnrB;
1937 j_coord_offsetC = DIM*jnrC;
1938 j_coord_offsetD = DIM*jnrD;
1940 /* load j atom coordinates */
1941 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1942 x+j_coord_offsetC,x+j_coord_offsetD,
1943 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1944 &jy2,&jz2,&jx3,&jy3,&jz3);
1946 /* Calculate displacement vector */
1947 dx00 = _mm256_sub_pd(ix0,jx0);
1948 dy00 = _mm256_sub_pd(iy0,jy0);
1949 dz00 = _mm256_sub_pd(iz0,jz0);
1950 dx11 = _mm256_sub_pd(ix1,jx1);
1951 dy11 = _mm256_sub_pd(iy1,jy1);
1952 dz11 = _mm256_sub_pd(iz1,jz1);
1953 dx12 = _mm256_sub_pd(ix1,jx2);
1954 dy12 = _mm256_sub_pd(iy1,jy2);
1955 dz12 = _mm256_sub_pd(iz1,jz2);
1956 dx13 = _mm256_sub_pd(ix1,jx3);
1957 dy13 = _mm256_sub_pd(iy1,jy3);
1958 dz13 = _mm256_sub_pd(iz1,jz3);
1959 dx21 = _mm256_sub_pd(ix2,jx1);
1960 dy21 = _mm256_sub_pd(iy2,jy1);
1961 dz21 = _mm256_sub_pd(iz2,jz1);
1962 dx22 = _mm256_sub_pd(ix2,jx2);
1963 dy22 = _mm256_sub_pd(iy2,jy2);
1964 dz22 = _mm256_sub_pd(iz2,jz2);
1965 dx23 = _mm256_sub_pd(ix2,jx3);
1966 dy23 = _mm256_sub_pd(iy2,jy3);
1967 dz23 = _mm256_sub_pd(iz2,jz3);
1968 dx31 = _mm256_sub_pd(ix3,jx1);
1969 dy31 = _mm256_sub_pd(iy3,jy1);
1970 dz31 = _mm256_sub_pd(iz3,jz1);
1971 dx32 = _mm256_sub_pd(ix3,jx2);
1972 dy32 = _mm256_sub_pd(iy3,jy2);
1973 dz32 = _mm256_sub_pd(iz3,jz2);
1974 dx33 = _mm256_sub_pd(ix3,jx3);
1975 dy33 = _mm256_sub_pd(iy3,jy3);
1976 dz33 = _mm256_sub_pd(iz3,jz3);
1978 /* Calculate squared distance and things based on it */
1979 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1980 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1981 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1982 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1983 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1984 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1985 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1986 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1987 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1988 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1990 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1991 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1992 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1993 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1994 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1995 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1996 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1997 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1998 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1999 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2001 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2002 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2003 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2004 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2005 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2006 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2007 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2008 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2009 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2010 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2012 fjx0 = _mm256_setzero_pd();
2013 fjy0 = _mm256_setzero_pd();
2014 fjz0 = _mm256_setzero_pd();
2015 fjx1 = _mm256_setzero_pd();
2016 fjy1 = _mm256_setzero_pd();
2017 fjz1 = _mm256_setzero_pd();
2018 fjx2 = _mm256_setzero_pd();
2019 fjy2 = _mm256_setzero_pd();
2020 fjz2 = _mm256_setzero_pd();
2021 fjx3 = _mm256_setzero_pd();
2022 fjy3 = _mm256_setzero_pd();
2023 fjz3 = _mm256_setzero_pd();
2025 /**************************
2026 * CALCULATE INTERACTIONS *
2027 **************************/
2029 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2032 r00 = _mm256_mul_pd(rsq00,rinv00);
2033 r00 = _mm256_andnot_pd(dummy_mask,r00);
2035 /* LENNARD-JONES DISPERSION/REPULSION */
2037 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2038 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
2039 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2040 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2041 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2043 d = _mm256_sub_pd(r00,rswitch);
2044 d = _mm256_max_pd(d,_mm256_setzero_pd());
2045 d2 = _mm256_mul_pd(d,d);
2046 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2048 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2050 /* Evaluate switch function */
2051 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2052 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2053 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2057 fscal = _mm256_and_pd(fscal,cutoff_mask);
2059 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2061 /* Calculate temporary vectorial force */
2062 tx = _mm256_mul_pd(fscal,dx00);
2063 ty = _mm256_mul_pd(fscal,dy00);
2064 tz = _mm256_mul_pd(fscal,dz00);
2066 /* Update vectorial force */
2067 fix0 = _mm256_add_pd(fix0,tx);
2068 fiy0 = _mm256_add_pd(fiy0,ty);
2069 fiz0 = _mm256_add_pd(fiz0,tz);
2071 fjx0 = _mm256_add_pd(fjx0,tx);
2072 fjy0 = _mm256_add_pd(fjy0,ty);
2073 fjz0 = _mm256_add_pd(fjz0,tz);
2077 /**************************
2078 * CALCULATE INTERACTIONS *
2079 **************************/
2081 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2084 /* REACTION-FIELD ELECTROSTATICS */
2085 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2087 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2091 fscal = _mm256_and_pd(fscal,cutoff_mask);
2093 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2095 /* Calculate temporary vectorial force */
2096 tx = _mm256_mul_pd(fscal,dx11);
2097 ty = _mm256_mul_pd(fscal,dy11);
2098 tz = _mm256_mul_pd(fscal,dz11);
2100 /* Update vectorial force */
2101 fix1 = _mm256_add_pd(fix1,tx);
2102 fiy1 = _mm256_add_pd(fiy1,ty);
2103 fiz1 = _mm256_add_pd(fiz1,tz);
2105 fjx1 = _mm256_add_pd(fjx1,tx);
2106 fjy1 = _mm256_add_pd(fjy1,ty);
2107 fjz1 = _mm256_add_pd(fjz1,tz);
2111 /**************************
2112 * CALCULATE INTERACTIONS *
2113 **************************/
2115 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2118 /* REACTION-FIELD ELECTROSTATICS */
2119 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2121 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2125 fscal = _mm256_and_pd(fscal,cutoff_mask);
2127 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2129 /* Calculate temporary vectorial force */
2130 tx = _mm256_mul_pd(fscal,dx12);
2131 ty = _mm256_mul_pd(fscal,dy12);
2132 tz = _mm256_mul_pd(fscal,dz12);
2134 /* Update vectorial force */
2135 fix1 = _mm256_add_pd(fix1,tx);
2136 fiy1 = _mm256_add_pd(fiy1,ty);
2137 fiz1 = _mm256_add_pd(fiz1,tz);
2139 fjx2 = _mm256_add_pd(fjx2,tx);
2140 fjy2 = _mm256_add_pd(fjy2,ty);
2141 fjz2 = _mm256_add_pd(fjz2,tz);
2145 /**************************
2146 * CALCULATE INTERACTIONS *
2147 **************************/
2149 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2152 /* REACTION-FIELD ELECTROSTATICS */
2153 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
2155 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2159 fscal = _mm256_and_pd(fscal,cutoff_mask);
2161 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2163 /* Calculate temporary vectorial force */
2164 tx = _mm256_mul_pd(fscal,dx13);
2165 ty = _mm256_mul_pd(fscal,dy13);
2166 tz = _mm256_mul_pd(fscal,dz13);
2168 /* Update vectorial force */
2169 fix1 = _mm256_add_pd(fix1,tx);
2170 fiy1 = _mm256_add_pd(fiy1,ty);
2171 fiz1 = _mm256_add_pd(fiz1,tz);
2173 fjx3 = _mm256_add_pd(fjx3,tx);
2174 fjy3 = _mm256_add_pd(fjy3,ty);
2175 fjz3 = _mm256_add_pd(fjz3,tz);
2179 /**************************
2180 * CALCULATE INTERACTIONS *
2181 **************************/
2183 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2186 /* REACTION-FIELD ELECTROSTATICS */
2187 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2189 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2193 fscal = _mm256_and_pd(fscal,cutoff_mask);
2195 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2197 /* Calculate temporary vectorial force */
2198 tx = _mm256_mul_pd(fscal,dx21);
2199 ty = _mm256_mul_pd(fscal,dy21);
2200 tz = _mm256_mul_pd(fscal,dz21);
2202 /* Update vectorial force */
2203 fix2 = _mm256_add_pd(fix2,tx);
2204 fiy2 = _mm256_add_pd(fiy2,ty);
2205 fiz2 = _mm256_add_pd(fiz2,tz);
2207 fjx1 = _mm256_add_pd(fjx1,tx);
2208 fjy1 = _mm256_add_pd(fjy1,ty);
2209 fjz1 = _mm256_add_pd(fjz1,tz);
2213 /**************************
2214 * CALCULATE INTERACTIONS *
2215 **************************/
2217 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2220 /* REACTION-FIELD ELECTROSTATICS */
2221 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2223 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2227 fscal = _mm256_and_pd(fscal,cutoff_mask);
2229 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2231 /* Calculate temporary vectorial force */
2232 tx = _mm256_mul_pd(fscal,dx22);
2233 ty = _mm256_mul_pd(fscal,dy22);
2234 tz = _mm256_mul_pd(fscal,dz22);
2236 /* Update vectorial force */
2237 fix2 = _mm256_add_pd(fix2,tx);
2238 fiy2 = _mm256_add_pd(fiy2,ty);
2239 fiz2 = _mm256_add_pd(fiz2,tz);
2241 fjx2 = _mm256_add_pd(fjx2,tx);
2242 fjy2 = _mm256_add_pd(fjy2,ty);
2243 fjz2 = _mm256_add_pd(fjz2,tz);
2247 /**************************
2248 * CALCULATE INTERACTIONS *
2249 **************************/
2251 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2254 /* REACTION-FIELD ELECTROSTATICS */
2255 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
2257 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2261 fscal = _mm256_and_pd(fscal,cutoff_mask);
2263 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2265 /* Calculate temporary vectorial force */
2266 tx = _mm256_mul_pd(fscal,dx23);
2267 ty = _mm256_mul_pd(fscal,dy23);
2268 tz = _mm256_mul_pd(fscal,dz23);
2270 /* Update vectorial force */
2271 fix2 = _mm256_add_pd(fix2,tx);
2272 fiy2 = _mm256_add_pd(fiy2,ty);
2273 fiz2 = _mm256_add_pd(fiz2,tz);
2275 fjx3 = _mm256_add_pd(fjx3,tx);
2276 fjy3 = _mm256_add_pd(fjy3,ty);
2277 fjz3 = _mm256_add_pd(fjz3,tz);
2281 /**************************
2282 * CALCULATE INTERACTIONS *
2283 **************************/
2285 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2288 /* REACTION-FIELD ELECTROSTATICS */
2289 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
2291 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2295 fscal = _mm256_and_pd(fscal,cutoff_mask);
2297 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2299 /* Calculate temporary vectorial force */
2300 tx = _mm256_mul_pd(fscal,dx31);
2301 ty = _mm256_mul_pd(fscal,dy31);
2302 tz = _mm256_mul_pd(fscal,dz31);
2304 /* Update vectorial force */
2305 fix3 = _mm256_add_pd(fix3,tx);
2306 fiy3 = _mm256_add_pd(fiy3,ty);
2307 fiz3 = _mm256_add_pd(fiz3,tz);
2309 fjx1 = _mm256_add_pd(fjx1,tx);
2310 fjy1 = _mm256_add_pd(fjy1,ty);
2311 fjz1 = _mm256_add_pd(fjz1,tz);
2315 /**************************
2316 * CALCULATE INTERACTIONS *
2317 **************************/
2319 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2322 /* REACTION-FIELD ELECTROSTATICS */
2323 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
2325 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2329 fscal = _mm256_and_pd(fscal,cutoff_mask);
2331 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2333 /* Calculate temporary vectorial force */
2334 tx = _mm256_mul_pd(fscal,dx32);
2335 ty = _mm256_mul_pd(fscal,dy32);
2336 tz = _mm256_mul_pd(fscal,dz32);
2338 /* Update vectorial force */
2339 fix3 = _mm256_add_pd(fix3,tx);
2340 fiy3 = _mm256_add_pd(fiy3,ty);
2341 fiz3 = _mm256_add_pd(fiz3,tz);
2343 fjx2 = _mm256_add_pd(fjx2,tx);
2344 fjy2 = _mm256_add_pd(fjy2,ty);
2345 fjz2 = _mm256_add_pd(fjz2,tz);
2349 /**************************
2350 * CALCULATE INTERACTIONS *
2351 **************************/
2353 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2356 /* REACTION-FIELD ELECTROSTATICS */
2357 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
2359 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2363 fscal = _mm256_and_pd(fscal,cutoff_mask);
2365 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2367 /* Calculate temporary vectorial force */
2368 tx = _mm256_mul_pd(fscal,dx33);
2369 ty = _mm256_mul_pd(fscal,dy33);
2370 tz = _mm256_mul_pd(fscal,dz33);
2372 /* Update vectorial force */
2373 fix3 = _mm256_add_pd(fix3,tx);
2374 fiy3 = _mm256_add_pd(fiy3,ty);
2375 fiz3 = _mm256_add_pd(fiz3,tz);
2377 fjx3 = _mm256_add_pd(fjx3,tx);
2378 fjy3 = _mm256_add_pd(fjy3,ty);
2379 fjz3 = _mm256_add_pd(fjz3,tz);
2383 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2384 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2385 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2386 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2388 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2389 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2390 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2392 /* Inner loop uses 330 flops */
2395 /* End of innermost loop */
2397 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2398 f+i_coord_offset,fshift+i_shift_offset);
2400 /* Increment number of inner iterations */
2401 inneriter += j_index_end - j_index_start;
2403 /* Outer loop uses 24 flops */
2406 /* Increment number of outer iterations */
2409 /* Update outer/inner flops */
2411 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*330);