2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double
52 * Electrostatics interaction: ReactionField
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 real * vdwioffsetptr3;
91 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
99 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
100 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
104 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
107 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
108 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
109 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
110 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
113 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
116 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
117 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
118 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
119 real rswitch_scalar,d_scalar;
120 __m256d dummy_mask,cutoff_mask;
121 __m128 tmpmask0,tmpmask1;
122 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
123 __m256d one = _mm256_set1_pd(1.0);
124 __m256d two = _mm256_set1_pd(2.0);
130 jindex = nlist->jindex;
132 shiftidx = nlist->shift;
134 shiftvec = fr->shift_vec[0];
135 fshift = fr->fshift[0];
136 facel = _mm256_set1_pd(fr->epsfac);
137 charge = mdatoms->chargeA;
138 krf = _mm256_set1_pd(fr->ic->k_rf);
139 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
140 crf = _mm256_set1_pd(fr->ic->c_rf);
141 nvdwtype = fr->ntype;
143 vdwtype = mdatoms->typeA;
145 /* Setup water-specific parameters */
146 inr = nlist->iinr[0];
147 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
148 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
149 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
150 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
152 jq1 = _mm256_set1_pd(charge[inr+1]);
153 jq2 = _mm256_set1_pd(charge[inr+2]);
154 jq3 = _mm256_set1_pd(charge[inr+3]);
155 vdwjidx0A = 2*vdwtype[inr+0];
156 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
157 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
158 qq11 = _mm256_mul_pd(iq1,jq1);
159 qq12 = _mm256_mul_pd(iq1,jq2);
160 qq13 = _mm256_mul_pd(iq1,jq3);
161 qq21 = _mm256_mul_pd(iq2,jq1);
162 qq22 = _mm256_mul_pd(iq2,jq2);
163 qq23 = _mm256_mul_pd(iq2,jq3);
164 qq31 = _mm256_mul_pd(iq3,jq1);
165 qq32 = _mm256_mul_pd(iq3,jq2);
166 qq33 = _mm256_mul_pd(iq3,jq3);
168 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169 rcutoff_scalar = fr->rcoulomb;
170 rcutoff = _mm256_set1_pd(rcutoff_scalar);
171 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
173 rswitch_scalar = fr->rvdw_switch;
174 rswitch = _mm256_set1_pd(rswitch_scalar);
175 /* Setup switch parameters */
176 d_scalar = rcutoff_scalar-rswitch_scalar;
177 d = _mm256_set1_pd(d_scalar);
178 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
179 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
182 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
183 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
185 /* Avoid stupid compiler warnings */
186 jnrA = jnrB = jnrC = jnrD = 0;
195 for(iidx=0;iidx<4*DIM;iidx++)
200 /* Start outer loop over neighborlists */
201 for(iidx=0; iidx<nri; iidx++)
203 /* Load shift vector for this list */
204 i_shift_offset = DIM*shiftidx[iidx];
206 /* Load limits for loop over neighbors */
207 j_index_start = jindex[iidx];
208 j_index_end = jindex[iidx+1];
210 /* Get outer coordinate index */
212 i_coord_offset = DIM*inr;
214 /* Load i particle coords and add shift vector */
215 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
216 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
218 fix0 = _mm256_setzero_pd();
219 fiy0 = _mm256_setzero_pd();
220 fiz0 = _mm256_setzero_pd();
221 fix1 = _mm256_setzero_pd();
222 fiy1 = _mm256_setzero_pd();
223 fiz1 = _mm256_setzero_pd();
224 fix2 = _mm256_setzero_pd();
225 fiy2 = _mm256_setzero_pd();
226 fiz2 = _mm256_setzero_pd();
227 fix3 = _mm256_setzero_pd();
228 fiy3 = _mm256_setzero_pd();
229 fiz3 = _mm256_setzero_pd();
231 /* Reset potential sums */
232 velecsum = _mm256_setzero_pd();
233 vvdwsum = _mm256_setzero_pd();
235 /* Start inner kernel loop */
236 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
239 /* Get j neighbor index, and coordinate index */
244 j_coord_offsetA = DIM*jnrA;
245 j_coord_offsetB = DIM*jnrB;
246 j_coord_offsetC = DIM*jnrC;
247 j_coord_offsetD = DIM*jnrD;
249 /* load j atom coordinates */
250 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
251 x+j_coord_offsetC,x+j_coord_offsetD,
252 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
253 &jy2,&jz2,&jx3,&jy3,&jz3);
255 /* Calculate displacement vector */
256 dx00 = _mm256_sub_pd(ix0,jx0);
257 dy00 = _mm256_sub_pd(iy0,jy0);
258 dz00 = _mm256_sub_pd(iz0,jz0);
259 dx11 = _mm256_sub_pd(ix1,jx1);
260 dy11 = _mm256_sub_pd(iy1,jy1);
261 dz11 = _mm256_sub_pd(iz1,jz1);
262 dx12 = _mm256_sub_pd(ix1,jx2);
263 dy12 = _mm256_sub_pd(iy1,jy2);
264 dz12 = _mm256_sub_pd(iz1,jz2);
265 dx13 = _mm256_sub_pd(ix1,jx3);
266 dy13 = _mm256_sub_pd(iy1,jy3);
267 dz13 = _mm256_sub_pd(iz1,jz3);
268 dx21 = _mm256_sub_pd(ix2,jx1);
269 dy21 = _mm256_sub_pd(iy2,jy1);
270 dz21 = _mm256_sub_pd(iz2,jz1);
271 dx22 = _mm256_sub_pd(ix2,jx2);
272 dy22 = _mm256_sub_pd(iy2,jy2);
273 dz22 = _mm256_sub_pd(iz2,jz2);
274 dx23 = _mm256_sub_pd(ix2,jx3);
275 dy23 = _mm256_sub_pd(iy2,jy3);
276 dz23 = _mm256_sub_pd(iz2,jz3);
277 dx31 = _mm256_sub_pd(ix3,jx1);
278 dy31 = _mm256_sub_pd(iy3,jy1);
279 dz31 = _mm256_sub_pd(iz3,jz1);
280 dx32 = _mm256_sub_pd(ix3,jx2);
281 dy32 = _mm256_sub_pd(iy3,jy2);
282 dz32 = _mm256_sub_pd(iz3,jz2);
283 dx33 = _mm256_sub_pd(ix3,jx3);
284 dy33 = _mm256_sub_pd(iy3,jy3);
285 dz33 = _mm256_sub_pd(iz3,jz3);
287 /* Calculate squared distance and things based on it */
288 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
289 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
290 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
291 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
292 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
293 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
294 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
295 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
296 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
297 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
299 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
300 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
301 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
302 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
303 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
304 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
305 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
306 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
307 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
308 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
310 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
311 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
312 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
313 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
314 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
315 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
316 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
317 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
318 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
319 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
321 fjx0 = _mm256_setzero_pd();
322 fjy0 = _mm256_setzero_pd();
323 fjz0 = _mm256_setzero_pd();
324 fjx1 = _mm256_setzero_pd();
325 fjy1 = _mm256_setzero_pd();
326 fjz1 = _mm256_setzero_pd();
327 fjx2 = _mm256_setzero_pd();
328 fjy2 = _mm256_setzero_pd();
329 fjz2 = _mm256_setzero_pd();
330 fjx3 = _mm256_setzero_pd();
331 fjy3 = _mm256_setzero_pd();
332 fjz3 = _mm256_setzero_pd();
334 /**************************
335 * CALCULATE INTERACTIONS *
336 **************************/
338 if (gmx_mm256_any_lt(rsq00,rcutoff2))
341 r00 = _mm256_mul_pd(rsq00,rinv00);
343 /* LENNARD-JONES DISPERSION/REPULSION */
345 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
346 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
347 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
348 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
349 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
351 d = _mm256_sub_pd(r00,rswitch);
352 d = _mm256_max_pd(d,_mm256_setzero_pd());
353 d2 = _mm256_mul_pd(d,d);
354 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
356 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
358 /* Evaluate switch function */
359 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
360 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
361 vvdw = _mm256_mul_pd(vvdw,sw);
362 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
364 /* Update potential sum for this i atom from the interaction with this j atom. */
365 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
366 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
370 fscal = _mm256_and_pd(fscal,cutoff_mask);
372 /* Calculate temporary vectorial force */
373 tx = _mm256_mul_pd(fscal,dx00);
374 ty = _mm256_mul_pd(fscal,dy00);
375 tz = _mm256_mul_pd(fscal,dz00);
377 /* Update vectorial force */
378 fix0 = _mm256_add_pd(fix0,tx);
379 fiy0 = _mm256_add_pd(fiy0,ty);
380 fiz0 = _mm256_add_pd(fiz0,tz);
382 fjx0 = _mm256_add_pd(fjx0,tx);
383 fjy0 = _mm256_add_pd(fjy0,ty);
384 fjz0 = _mm256_add_pd(fjz0,tz);
388 /**************************
389 * CALCULATE INTERACTIONS *
390 **************************/
392 if (gmx_mm256_any_lt(rsq11,rcutoff2))
395 /* REACTION-FIELD ELECTROSTATICS */
396 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
397 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
399 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
401 /* Update potential sum for this i atom from the interaction with this j atom. */
402 velec = _mm256_and_pd(velec,cutoff_mask);
403 velecsum = _mm256_add_pd(velecsum,velec);
407 fscal = _mm256_and_pd(fscal,cutoff_mask);
409 /* Calculate temporary vectorial force */
410 tx = _mm256_mul_pd(fscal,dx11);
411 ty = _mm256_mul_pd(fscal,dy11);
412 tz = _mm256_mul_pd(fscal,dz11);
414 /* Update vectorial force */
415 fix1 = _mm256_add_pd(fix1,tx);
416 fiy1 = _mm256_add_pd(fiy1,ty);
417 fiz1 = _mm256_add_pd(fiz1,tz);
419 fjx1 = _mm256_add_pd(fjx1,tx);
420 fjy1 = _mm256_add_pd(fjy1,ty);
421 fjz1 = _mm256_add_pd(fjz1,tz);
425 /**************************
426 * CALCULATE INTERACTIONS *
427 **************************/
429 if (gmx_mm256_any_lt(rsq12,rcutoff2))
432 /* REACTION-FIELD ELECTROSTATICS */
433 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
434 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
436 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
438 /* Update potential sum for this i atom from the interaction with this j atom. */
439 velec = _mm256_and_pd(velec,cutoff_mask);
440 velecsum = _mm256_add_pd(velecsum,velec);
444 fscal = _mm256_and_pd(fscal,cutoff_mask);
446 /* Calculate temporary vectorial force */
447 tx = _mm256_mul_pd(fscal,dx12);
448 ty = _mm256_mul_pd(fscal,dy12);
449 tz = _mm256_mul_pd(fscal,dz12);
451 /* Update vectorial force */
452 fix1 = _mm256_add_pd(fix1,tx);
453 fiy1 = _mm256_add_pd(fiy1,ty);
454 fiz1 = _mm256_add_pd(fiz1,tz);
456 fjx2 = _mm256_add_pd(fjx2,tx);
457 fjy2 = _mm256_add_pd(fjy2,ty);
458 fjz2 = _mm256_add_pd(fjz2,tz);
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
466 if (gmx_mm256_any_lt(rsq13,rcutoff2))
469 /* REACTION-FIELD ELECTROSTATICS */
470 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
471 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
473 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
475 /* Update potential sum for this i atom from the interaction with this j atom. */
476 velec = _mm256_and_pd(velec,cutoff_mask);
477 velecsum = _mm256_add_pd(velecsum,velec);
481 fscal = _mm256_and_pd(fscal,cutoff_mask);
483 /* Calculate temporary vectorial force */
484 tx = _mm256_mul_pd(fscal,dx13);
485 ty = _mm256_mul_pd(fscal,dy13);
486 tz = _mm256_mul_pd(fscal,dz13);
488 /* Update vectorial force */
489 fix1 = _mm256_add_pd(fix1,tx);
490 fiy1 = _mm256_add_pd(fiy1,ty);
491 fiz1 = _mm256_add_pd(fiz1,tz);
493 fjx3 = _mm256_add_pd(fjx3,tx);
494 fjy3 = _mm256_add_pd(fjy3,ty);
495 fjz3 = _mm256_add_pd(fjz3,tz);
499 /**************************
500 * CALCULATE INTERACTIONS *
501 **************************/
503 if (gmx_mm256_any_lt(rsq21,rcutoff2))
506 /* REACTION-FIELD ELECTROSTATICS */
507 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
508 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
510 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
512 /* Update potential sum for this i atom from the interaction with this j atom. */
513 velec = _mm256_and_pd(velec,cutoff_mask);
514 velecsum = _mm256_add_pd(velecsum,velec);
518 fscal = _mm256_and_pd(fscal,cutoff_mask);
520 /* Calculate temporary vectorial force */
521 tx = _mm256_mul_pd(fscal,dx21);
522 ty = _mm256_mul_pd(fscal,dy21);
523 tz = _mm256_mul_pd(fscal,dz21);
525 /* Update vectorial force */
526 fix2 = _mm256_add_pd(fix2,tx);
527 fiy2 = _mm256_add_pd(fiy2,ty);
528 fiz2 = _mm256_add_pd(fiz2,tz);
530 fjx1 = _mm256_add_pd(fjx1,tx);
531 fjy1 = _mm256_add_pd(fjy1,ty);
532 fjz1 = _mm256_add_pd(fjz1,tz);
536 /**************************
537 * CALCULATE INTERACTIONS *
538 **************************/
540 if (gmx_mm256_any_lt(rsq22,rcutoff2))
543 /* REACTION-FIELD ELECTROSTATICS */
544 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
545 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
547 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
549 /* Update potential sum for this i atom from the interaction with this j atom. */
550 velec = _mm256_and_pd(velec,cutoff_mask);
551 velecsum = _mm256_add_pd(velecsum,velec);
555 fscal = _mm256_and_pd(fscal,cutoff_mask);
557 /* Calculate temporary vectorial force */
558 tx = _mm256_mul_pd(fscal,dx22);
559 ty = _mm256_mul_pd(fscal,dy22);
560 tz = _mm256_mul_pd(fscal,dz22);
562 /* Update vectorial force */
563 fix2 = _mm256_add_pd(fix2,tx);
564 fiy2 = _mm256_add_pd(fiy2,ty);
565 fiz2 = _mm256_add_pd(fiz2,tz);
567 fjx2 = _mm256_add_pd(fjx2,tx);
568 fjy2 = _mm256_add_pd(fjy2,ty);
569 fjz2 = _mm256_add_pd(fjz2,tz);
573 /**************************
574 * CALCULATE INTERACTIONS *
575 **************************/
577 if (gmx_mm256_any_lt(rsq23,rcutoff2))
580 /* REACTION-FIELD ELECTROSTATICS */
581 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
582 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
584 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
586 /* Update potential sum for this i atom from the interaction with this j atom. */
587 velec = _mm256_and_pd(velec,cutoff_mask);
588 velecsum = _mm256_add_pd(velecsum,velec);
592 fscal = _mm256_and_pd(fscal,cutoff_mask);
594 /* Calculate temporary vectorial force */
595 tx = _mm256_mul_pd(fscal,dx23);
596 ty = _mm256_mul_pd(fscal,dy23);
597 tz = _mm256_mul_pd(fscal,dz23);
599 /* Update vectorial force */
600 fix2 = _mm256_add_pd(fix2,tx);
601 fiy2 = _mm256_add_pd(fiy2,ty);
602 fiz2 = _mm256_add_pd(fiz2,tz);
604 fjx3 = _mm256_add_pd(fjx3,tx);
605 fjy3 = _mm256_add_pd(fjy3,ty);
606 fjz3 = _mm256_add_pd(fjz3,tz);
610 /**************************
611 * CALCULATE INTERACTIONS *
612 **************************/
614 if (gmx_mm256_any_lt(rsq31,rcutoff2))
617 /* REACTION-FIELD ELECTROSTATICS */
618 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
619 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
621 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
623 /* Update potential sum for this i atom from the interaction with this j atom. */
624 velec = _mm256_and_pd(velec,cutoff_mask);
625 velecsum = _mm256_add_pd(velecsum,velec);
629 fscal = _mm256_and_pd(fscal,cutoff_mask);
631 /* Calculate temporary vectorial force */
632 tx = _mm256_mul_pd(fscal,dx31);
633 ty = _mm256_mul_pd(fscal,dy31);
634 tz = _mm256_mul_pd(fscal,dz31);
636 /* Update vectorial force */
637 fix3 = _mm256_add_pd(fix3,tx);
638 fiy3 = _mm256_add_pd(fiy3,ty);
639 fiz3 = _mm256_add_pd(fiz3,tz);
641 fjx1 = _mm256_add_pd(fjx1,tx);
642 fjy1 = _mm256_add_pd(fjy1,ty);
643 fjz1 = _mm256_add_pd(fjz1,tz);
647 /**************************
648 * CALCULATE INTERACTIONS *
649 **************************/
651 if (gmx_mm256_any_lt(rsq32,rcutoff2))
654 /* REACTION-FIELD ELECTROSTATICS */
655 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
656 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
658 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
660 /* Update potential sum for this i atom from the interaction with this j atom. */
661 velec = _mm256_and_pd(velec,cutoff_mask);
662 velecsum = _mm256_add_pd(velecsum,velec);
666 fscal = _mm256_and_pd(fscal,cutoff_mask);
668 /* Calculate temporary vectorial force */
669 tx = _mm256_mul_pd(fscal,dx32);
670 ty = _mm256_mul_pd(fscal,dy32);
671 tz = _mm256_mul_pd(fscal,dz32);
673 /* Update vectorial force */
674 fix3 = _mm256_add_pd(fix3,tx);
675 fiy3 = _mm256_add_pd(fiy3,ty);
676 fiz3 = _mm256_add_pd(fiz3,tz);
678 fjx2 = _mm256_add_pd(fjx2,tx);
679 fjy2 = _mm256_add_pd(fjy2,ty);
680 fjz2 = _mm256_add_pd(fjz2,tz);
684 /**************************
685 * CALCULATE INTERACTIONS *
686 **************************/
688 if (gmx_mm256_any_lt(rsq33,rcutoff2))
691 /* REACTION-FIELD ELECTROSTATICS */
692 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
693 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
695 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
697 /* Update potential sum for this i atom from the interaction with this j atom. */
698 velec = _mm256_and_pd(velec,cutoff_mask);
699 velecsum = _mm256_add_pd(velecsum,velec);
703 fscal = _mm256_and_pd(fscal,cutoff_mask);
705 /* Calculate temporary vectorial force */
706 tx = _mm256_mul_pd(fscal,dx33);
707 ty = _mm256_mul_pd(fscal,dy33);
708 tz = _mm256_mul_pd(fscal,dz33);
710 /* Update vectorial force */
711 fix3 = _mm256_add_pd(fix3,tx);
712 fiy3 = _mm256_add_pd(fiy3,ty);
713 fiz3 = _mm256_add_pd(fiz3,tz);
715 fjx3 = _mm256_add_pd(fjx3,tx);
716 fjy3 = _mm256_add_pd(fjy3,ty);
717 fjz3 = _mm256_add_pd(fjz3,tz);
721 fjptrA = f+j_coord_offsetA;
722 fjptrB = f+j_coord_offsetB;
723 fjptrC = f+j_coord_offsetC;
724 fjptrD = f+j_coord_offsetD;
726 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
727 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
728 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
730 /* Inner loop uses 386 flops */
736 /* Get j neighbor index, and coordinate index */
737 jnrlistA = jjnr[jidx];
738 jnrlistB = jjnr[jidx+1];
739 jnrlistC = jjnr[jidx+2];
740 jnrlistD = jjnr[jidx+3];
741 /* Sign of each element will be negative for non-real atoms.
742 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
743 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
745 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
747 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
748 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
749 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
751 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
752 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
753 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
754 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
755 j_coord_offsetA = DIM*jnrA;
756 j_coord_offsetB = DIM*jnrB;
757 j_coord_offsetC = DIM*jnrC;
758 j_coord_offsetD = DIM*jnrD;
760 /* load j atom coordinates */
761 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
762 x+j_coord_offsetC,x+j_coord_offsetD,
763 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
764 &jy2,&jz2,&jx3,&jy3,&jz3);
766 /* Calculate displacement vector */
767 dx00 = _mm256_sub_pd(ix0,jx0);
768 dy00 = _mm256_sub_pd(iy0,jy0);
769 dz00 = _mm256_sub_pd(iz0,jz0);
770 dx11 = _mm256_sub_pd(ix1,jx1);
771 dy11 = _mm256_sub_pd(iy1,jy1);
772 dz11 = _mm256_sub_pd(iz1,jz1);
773 dx12 = _mm256_sub_pd(ix1,jx2);
774 dy12 = _mm256_sub_pd(iy1,jy2);
775 dz12 = _mm256_sub_pd(iz1,jz2);
776 dx13 = _mm256_sub_pd(ix1,jx3);
777 dy13 = _mm256_sub_pd(iy1,jy3);
778 dz13 = _mm256_sub_pd(iz1,jz3);
779 dx21 = _mm256_sub_pd(ix2,jx1);
780 dy21 = _mm256_sub_pd(iy2,jy1);
781 dz21 = _mm256_sub_pd(iz2,jz1);
782 dx22 = _mm256_sub_pd(ix2,jx2);
783 dy22 = _mm256_sub_pd(iy2,jy2);
784 dz22 = _mm256_sub_pd(iz2,jz2);
785 dx23 = _mm256_sub_pd(ix2,jx3);
786 dy23 = _mm256_sub_pd(iy2,jy3);
787 dz23 = _mm256_sub_pd(iz2,jz3);
788 dx31 = _mm256_sub_pd(ix3,jx1);
789 dy31 = _mm256_sub_pd(iy3,jy1);
790 dz31 = _mm256_sub_pd(iz3,jz1);
791 dx32 = _mm256_sub_pd(ix3,jx2);
792 dy32 = _mm256_sub_pd(iy3,jy2);
793 dz32 = _mm256_sub_pd(iz3,jz2);
794 dx33 = _mm256_sub_pd(ix3,jx3);
795 dy33 = _mm256_sub_pd(iy3,jy3);
796 dz33 = _mm256_sub_pd(iz3,jz3);
798 /* Calculate squared distance and things based on it */
799 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
800 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
801 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
802 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
803 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
804 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
805 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
806 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
807 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
808 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
810 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
811 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
812 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
813 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
814 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
815 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
816 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
817 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
818 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
819 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
821 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
822 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
823 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
824 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
825 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
826 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
827 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
828 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
829 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
830 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
832 fjx0 = _mm256_setzero_pd();
833 fjy0 = _mm256_setzero_pd();
834 fjz0 = _mm256_setzero_pd();
835 fjx1 = _mm256_setzero_pd();
836 fjy1 = _mm256_setzero_pd();
837 fjz1 = _mm256_setzero_pd();
838 fjx2 = _mm256_setzero_pd();
839 fjy2 = _mm256_setzero_pd();
840 fjz2 = _mm256_setzero_pd();
841 fjx3 = _mm256_setzero_pd();
842 fjy3 = _mm256_setzero_pd();
843 fjz3 = _mm256_setzero_pd();
845 /**************************
846 * CALCULATE INTERACTIONS *
847 **************************/
849 if (gmx_mm256_any_lt(rsq00,rcutoff2))
852 r00 = _mm256_mul_pd(rsq00,rinv00);
853 r00 = _mm256_andnot_pd(dummy_mask,r00);
855 /* LENNARD-JONES DISPERSION/REPULSION */
857 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
858 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
859 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
860 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
861 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
863 d = _mm256_sub_pd(r00,rswitch);
864 d = _mm256_max_pd(d,_mm256_setzero_pd());
865 d2 = _mm256_mul_pd(d,d);
866 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
868 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
870 /* Evaluate switch function */
871 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
872 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
873 vvdw = _mm256_mul_pd(vvdw,sw);
874 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
876 /* Update potential sum for this i atom from the interaction with this j atom. */
877 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
878 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
879 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
883 fscal = _mm256_and_pd(fscal,cutoff_mask);
885 fscal = _mm256_andnot_pd(dummy_mask,fscal);
887 /* Calculate temporary vectorial force */
888 tx = _mm256_mul_pd(fscal,dx00);
889 ty = _mm256_mul_pd(fscal,dy00);
890 tz = _mm256_mul_pd(fscal,dz00);
892 /* Update vectorial force */
893 fix0 = _mm256_add_pd(fix0,tx);
894 fiy0 = _mm256_add_pd(fiy0,ty);
895 fiz0 = _mm256_add_pd(fiz0,tz);
897 fjx0 = _mm256_add_pd(fjx0,tx);
898 fjy0 = _mm256_add_pd(fjy0,ty);
899 fjz0 = _mm256_add_pd(fjz0,tz);
903 /**************************
904 * CALCULATE INTERACTIONS *
905 **************************/
907 if (gmx_mm256_any_lt(rsq11,rcutoff2))
910 /* REACTION-FIELD ELECTROSTATICS */
911 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
912 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
914 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
916 /* Update potential sum for this i atom from the interaction with this j atom. */
917 velec = _mm256_and_pd(velec,cutoff_mask);
918 velec = _mm256_andnot_pd(dummy_mask,velec);
919 velecsum = _mm256_add_pd(velecsum,velec);
923 fscal = _mm256_and_pd(fscal,cutoff_mask);
925 fscal = _mm256_andnot_pd(dummy_mask,fscal);
927 /* Calculate temporary vectorial force */
928 tx = _mm256_mul_pd(fscal,dx11);
929 ty = _mm256_mul_pd(fscal,dy11);
930 tz = _mm256_mul_pd(fscal,dz11);
932 /* Update vectorial force */
933 fix1 = _mm256_add_pd(fix1,tx);
934 fiy1 = _mm256_add_pd(fiy1,ty);
935 fiz1 = _mm256_add_pd(fiz1,tz);
937 fjx1 = _mm256_add_pd(fjx1,tx);
938 fjy1 = _mm256_add_pd(fjy1,ty);
939 fjz1 = _mm256_add_pd(fjz1,tz);
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 if (gmx_mm256_any_lt(rsq12,rcutoff2))
950 /* REACTION-FIELD ELECTROSTATICS */
951 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
952 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
954 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
956 /* Update potential sum for this i atom from the interaction with this j atom. */
957 velec = _mm256_and_pd(velec,cutoff_mask);
958 velec = _mm256_andnot_pd(dummy_mask,velec);
959 velecsum = _mm256_add_pd(velecsum,velec);
963 fscal = _mm256_and_pd(fscal,cutoff_mask);
965 fscal = _mm256_andnot_pd(dummy_mask,fscal);
967 /* Calculate temporary vectorial force */
968 tx = _mm256_mul_pd(fscal,dx12);
969 ty = _mm256_mul_pd(fscal,dy12);
970 tz = _mm256_mul_pd(fscal,dz12);
972 /* Update vectorial force */
973 fix1 = _mm256_add_pd(fix1,tx);
974 fiy1 = _mm256_add_pd(fiy1,ty);
975 fiz1 = _mm256_add_pd(fiz1,tz);
977 fjx2 = _mm256_add_pd(fjx2,tx);
978 fjy2 = _mm256_add_pd(fjy2,ty);
979 fjz2 = _mm256_add_pd(fjz2,tz);
983 /**************************
984 * CALCULATE INTERACTIONS *
985 **************************/
987 if (gmx_mm256_any_lt(rsq13,rcutoff2))
990 /* REACTION-FIELD ELECTROSTATICS */
991 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
992 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
994 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
996 /* Update potential sum for this i atom from the interaction with this j atom. */
997 velec = _mm256_and_pd(velec,cutoff_mask);
998 velec = _mm256_andnot_pd(dummy_mask,velec);
999 velecsum = _mm256_add_pd(velecsum,velec);
1003 fscal = _mm256_and_pd(fscal,cutoff_mask);
1005 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1007 /* Calculate temporary vectorial force */
1008 tx = _mm256_mul_pd(fscal,dx13);
1009 ty = _mm256_mul_pd(fscal,dy13);
1010 tz = _mm256_mul_pd(fscal,dz13);
1012 /* Update vectorial force */
1013 fix1 = _mm256_add_pd(fix1,tx);
1014 fiy1 = _mm256_add_pd(fiy1,ty);
1015 fiz1 = _mm256_add_pd(fiz1,tz);
1017 fjx3 = _mm256_add_pd(fjx3,tx);
1018 fjy3 = _mm256_add_pd(fjy3,ty);
1019 fjz3 = _mm256_add_pd(fjz3,tz);
1023 /**************************
1024 * CALCULATE INTERACTIONS *
1025 **************************/
1027 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1030 /* REACTION-FIELD ELECTROSTATICS */
1031 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
1032 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1034 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1036 /* Update potential sum for this i atom from the interaction with this j atom. */
1037 velec = _mm256_and_pd(velec,cutoff_mask);
1038 velec = _mm256_andnot_pd(dummy_mask,velec);
1039 velecsum = _mm256_add_pd(velecsum,velec);
1043 fscal = _mm256_and_pd(fscal,cutoff_mask);
1045 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1047 /* Calculate temporary vectorial force */
1048 tx = _mm256_mul_pd(fscal,dx21);
1049 ty = _mm256_mul_pd(fscal,dy21);
1050 tz = _mm256_mul_pd(fscal,dz21);
1052 /* Update vectorial force */
1053 fix2 = _mm256_add_pd(fix2,tx);
1054 fiy2 = _mm256_add_pd(fiy2,ty);
1055 fiz2 = _mm256_add_pd(fiz2,tz);
1057 fjx1 = _mm256_add_pd(fjx1,tx);
1058 fjy1 = _mm256_add_pd(fjy1,ty);
1059 fjz1 = _mm256_add_pd(fjz1,tz);
1063 /**************************
1064 * CALCULATE INTERACTIONS *
1065 **************************/
1067 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1070 /* REACTION-FIELD ELECTROSTATICS */
1071 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1072 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1074 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1076 /* Update potential sum for this i atom from the interaction with this j atom. */
1077 velec = _mm256_and_pd(velec,cutoff_mask);
1078 velec = _mm256_andnot_pd(dummy_mask,velec);
1079 velecsum = _mm256_add_pd(velecsum,velec);
1083 fscal = _mm256_and_pd(fscal,cutoff_mask);
1085 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1087 /* Calculate temporary vectorial force */
1088 tx = _mm256_mul_pd(fscal,dx22);
1089 ty = _mm256_mul_pd(fscal,dy22);
1090 tz = _mm256_mul_pd(fscal,dz22);
1092 /* Update vectorial force */
1093 fix2 = _mm256_add_pd(fix2,tx);
1094 fiy2 = _mm256_add_pd(fiy2,ty);
1095 fiz2 = _mm256_add_pd(fiz2,tz);
1097 fjx2 = _mm256_add_pd(fjx2,tx);
1098 fjy2 = _mm256_add_pd(fjy2,ty);
1099 fjz2 = _mm256_add_pd(fjz2,tz);
1103 /**************************
1104 * CALCULATE INTERACTIONS *
1105 **************************/
1107 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1110 /* REACTION-FIELD ELECTROSTATICS */
1111 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
1112 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1114 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1116 /* Update potential sum for this i atom from the interaction with this j atom. */
1117 velec = _mm256_and_pd(velec,cutoff_mask);
1118 velec = _mm256_andnot_pd(dummy_mask,velec);
1119 velecsum = _mm256_add_pd(velecsum,velec);
1123 fscal = _mm256_and_pd(fscal,cutoff_mask);
1125 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1127 /* Calculate temporary vectorial force */
1128 tx = _mm256_mul_pd(fscal,dx23);
1129 ty = _mm256_mul_pd(fscal,dy23);
1130 tz = _mm256_mul_pd(fscal,dz23);
1132 /* Update vectorial force */
1133 fix2 = _mm256_add_pd(fix2,tx);
1134 fiy2 = _mm256_add_pd(fiy2,ty);
1135 fiz2 = _mm256_add_pd(fiz2,tz);
1137 fjx3 = _mm256_add_pd(fjx3,tx);
1138 fjy3 = _mm256_add_pd(fjy3,ty);
1139 fjz3 = _mm256_add_pd(fjz3,tz);
1143 /**************************
1144 * CALCULATE INTERACTIONS *
1145 **************************/
1147 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1150 /* REACTION-FIELD ELECTROSTATICS */
1151 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
1152 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1154 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1156 /* Update potential sum for this i atom from the interaction with this j atom. */
1157 velec = _mm256_and_pd(velec,cutoff_mask);
1158 velec = _mm256_andnot_pd(dummy_mask,velec);
1159 velecsum = _mm256_add_pd(velecsum,velec);
1163 fscal = _mm256_and_pd(fscal,cutoff_mask);
1165 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1167 /* Calculate temporary vectorial force */
1168 tx = _mm256_mul_pd(fscal,dx31);
1169 ty = _mm256_mul_pd(fscal,dy31);
1170 tz = _mm256_mul_pd(fscal,dz31);
1172 /* Update vectorial force */
1173 fix3 = _mm256_add_pd(fix3,tx);
1174 fiy3 = _mm256_add_pd(fiy3,ty);
1175 fiz3 = _mm256_add_pd(fiz3,tz);
1177 fjx1 = _mm256_add_pd(fjx1,tx);
1178 fjy1 = _mm256_add_pd(fjy1,ty);
1179 fjz1 = _mm256_add_pd(fjz1,tz);
1183 /**************************
1184 * CALCULATE INTERACTIONS *
1185 **************************/
1187 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1190 /* REACTION-FIELD ELECTROSTATICS */
1191 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
1192 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1194 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1196 /* Update potential sum for this i atom from the interaction with this j atom. */
1197 velec = _mm256_and_pd(velec,cutoff_mask);
1198 velec = _mm256_andnot_pd(dummy_mask,velec);
1199 velecsum = _mm256_add_pd(velecsum,velec);
1203 fscal = _mm256_and_pd(fscal,cutoff_mask);
1205 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1207 /* Calculate temporary vectorial force */
1208 tx = _mm256_mul_pd(fscal,dx32);
1209 ty = _mm256_mul_pd(fscal,dy32);
1210 tz = _mm256_mul_pd(fscal,dz32);
1212 /* Update vectorial force */
1213 fix3 = _mm256_add_pd(fix3,tx);
1214 fiy3 = _mm256_add_pd(fiy3,ty);
1215 fiz3 = _mm256_add_pd(fiz3,tz);
1217 fjx2 = _mm256_add_pd(fjx2,tx);
1218 fjy2 = _mm256_add_pd(fjy2,ty);
1219 fjz2 = _mm256_add_pd(fjz2,tz);
1223 /**************************
1224 * CALCULATE INTERACTIONS *
1225 **************************/
1227 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1230 /* REACTION-FIELD ELECTROSTATICS */
1231 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
1232 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1234 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1236 /* Update potential sum for this i atom from the interaction with this j atom. */
1237 velec = _mm256_and_pd(velec,cutoff_mask);
1238 velec = _mm256_andnot_pd(dummy_mask,velec);
1239 velecsum = _mm256_add_pd(velecsum,velec);
1243 fscal = _mm256_and_pd(fscal,cutoff_mask);
1245 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1247 /* Calculate temporary vectorial force */
1248 tx = _mm256_mul_pd(fscal,dx33);
1249 ty = _mm256_mul_pd(fscal,dy33);
1250 tz = _mm256_mul_pd(fscal,dz33);
1252 /* Update vectorial force */
1253 fix3 = _mm256_add_pd(fix3,tx);
1254 fiy3 = _mm256_add_pd(fiy3,ty);
1255 fiz3 = _mm256_add_pd(fiz3,tz);
1257 fjx3 = _mm256_add_pd(fjx3,tx);
1258 fjy3 = _mm256_add_pd(fjy3,ty);
1259 fjz3 = _mm256_add_pd(fjz3,tz);
1263 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1264 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1265 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1266 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1268 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1269 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1270 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1272 /* Inner loop uses 387 flops */
1275 /* End of innermost loop */
1277 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1278 f+i_coord_offset,fshift+i_shift_offset);
1281 /* Update potential energies */
1282 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1283 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1285 /* Increment number of inner iterations */
1286 inneriter += j_index_end - j_index_start;
1288 /* Outer loop uses 26 flops */
1291 /* Increment number of outer iterations */
1294 /* Update outer/inner flops */
1296 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*387);
1299 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double
1300 * Electrostatics interaction: ReactionField
1301 * VdW interaction: LennardJones
1302 * Geometry: Water4-Water4
1303 * Calculate force/pot: Force
1306 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double
1307 (t_nblist * gmx_restrict nlist,
1308 rvec * gmx_restrict xx,
1309 rvec * gmx_restrict ff,
1310 t_forcerec * gmx_restrict fr,
1311 t_mdatoms * gmx_restrict mdatoms,
1312 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1313 t_nrnb * gmx_restrict nrnb)
1315 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1316 * just 0 for non-waters.
1317 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1318 * jnr indices corresponding to data put in the four positions in the SIMD register.
1320 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1321 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1322 int jnrA,jnrB,jnrC,jnrD;
1323 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1324 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1325 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1326 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1327 real rcutoff_scalar;
1328 real *shiftvec,*fshift,*x,*f;
1329 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1330 real scratch[4*DIM];
1331 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1332 real * vdwioffsetptr0;
1333 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1334 real * vdwioffsetptr1;
1335 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1336 real * vdwioffsetptr2;
1337 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1338 real * vdwioffsetptr3;
1339 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1340 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1341 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1342 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1343 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1344 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1345 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1346 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1347 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1348 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1349 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1350 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1351 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1352 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1353 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1354 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1355 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1356 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1357 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1358 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1361 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1364 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1365 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1366 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1367 real rswitch_scalar,d_scalar;
1368 __m256d dummy_mask,cutoff_mask;
1369 __m128 tmpmask0,tmpmask1;
1370 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1371 __m256d one = _mm256_set1_pd(1.0);
1372 __m256d two = _mm256_set1_pd(2.0);
1378 jindex = nlist->jindex;
1380 shiftidx = nlist->shift;
1382 shiftvec = fr->shift_vec[0];
1383 fshift = fr->fshift[0];
1384 facel = _mm256_set1_pd(fr->epsfac);
1385 charge = mdatoms->chargeA;
1386 krf = _mm256_set1_pd(fr->ic->k_rf);
1387 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
1388 crf = _mm256_set1_pd(fr->ic->c_rf);
1389 nvdwtype = fr->ntype;
1390 vdwparam = fr->nbfp;
1391 vdwtype = mdatoms->typeA;
1393 /* Setup water-specific parameters */
1394 inr = nlist->iinr[0];
1395 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1396 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1397 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1398 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1400 jq1 = _mm256_set1_pd(charge[inr+1]);
1401 jq2 = _mm256_set1_pd(charge[inr+2]);
1402 jq3 = _mm256_set1_pd(charge[inr+3]);
1403 vdwjidx0A = 2*vdwtype[inr+0];
1404 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1405 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1406 qq11 = _mm256_mul_pd(iq1,jq1);
1407 qq12 = _mm256_mul_pd(iq1,jq2);
1408 qq13 = _mm256_mul_pd(iq1,jq3);
1409 qq21 = _mm256_mul_pd(iq2,jq1);
1410 qq22 = _mm256_mul_pd(iq2,jq2);
1411 qq23 = _mm256_mul_pd(iq2,jq3);
1412 qq31 = _mm256_mul_pd(iq3,jq1);
1413 qq32 = _mm256_mul_pd(iq3,jq2);
1414 qq33 = _mm256_mul_pd(iq3,jq3);
1416 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1417 rcutoff_scalar = fr->rcoulomb;
1418 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1419 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1421 rswitch_scalar = fr->rvdw_switch;
1422 rswitch = _mm256_set1_pd(rswitch_scalar);
1423 /* Setup switch parameters */
1424 d_scalar = rcutoff_scalar-rswitch_scalar;
1425 d = _mm256_set1_pd(d_scalar);
1426 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1427 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1428 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1429 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1430 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1431 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1433 /* Avoid stupid compiler warnings */
1434 jnrA = jnrB = jnrC = jnrD = 0;
1435 j_coord_offsetA = 0;
1436 j_coord_offsetB = 0;
1437 j_coord_offsetC = 0;
1438 j_coord_offsetD = 0;
1443 for(iidx=0;iidx<4*DIM;iidx++)
1445 scratch[iidx] = 0.0;
1448 /* Start outer loop over neighborlists */
1449 for(iidx=0; iidx<nri; iidx++)
1451 /* Load shift vector for this list */
1452 i_shift_offset = DIM*shiftidx[iidx];
1454 /* Load limits for loop over neighbors */
1455 j_index_start = jindex[iidx];
1456 j_index_end = jindex[iidx+1];
1458 /* Get outer coordinate index */
1460 i_coord_offset = DIM*inr;
1462 /* Load i particle coords and add shift vector */
1463 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1464 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1466 fix0 = _mm256_setzero_pd();
1467 fiy0 = _mm256_setzero_pd();
1468 fiz0 = _mm256_setzero_pd();
1469 fix1 = _mm256_setzero_pd();
1470 fiy1 = _mm256_setzero_pd();
1471 fiz1 = _mm256_setzero_pd();
1472 fix2 = _mm256_setzero_pd();
1473 fiy2 = _mm256_setzero_pd();
1474 fiz2 = _mm256_setzero_pd();
1475 fix3 = _mm256_setzero_pd();
1476 fiy3 = _mm256_setzero_pd();
1477 fiz3 = _mm256_setzero_pd();
1479 /* Start inner kernel loop */
1480 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1483 /* Get j neighbor index, and coordinate index */
1485 jnrB = jjnr[jidx+1];
1486 jnrC = jjnr[jidx+2];
1487 jnrD = jjnr[jidx+3];
1488 j_coord_offsetA = DIM*jnrA;
1489 j_coord_offsetB = DIM*jnrB;
1490 j_coord_offsetC = DIM*jnrC;
1491 j_coord_offsetD = DIM*jnrD;
1493 /* load j atom coordinates */
1494 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1495 x+j_coord_offsetC,x+j_coord_offsetD,
1496 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1497 &jy2,&jz2,&jx3,&jy3,&jz3);
1499 /* Calculate displacement vector */
1500 dx00 = _mm256_sub_pd(ix0,jx0);
1501 dy00 = _mm256_sub_pd(iy0,jy0);
1502 dz00 = _mm256_sub_pd(iz0,jz0);
1503 dx11 = _mm256_sub_pd(ix1,jx1);
1504 dy11 = _mm256_sub_pd(iy1,jy1);
1505 dz11 = _mm256_sub_pd(iz1,jz1);
1506 dx12 = _mm256_sub_pd(ix1,jx2);
1507 dy12 = _mm256_sub_pd(iy1,jy2);
1508 dz12 = _mm256_sub_pd(iz1,jz2);
1509 dx13 = _mm256_sub_pd(ix1,jx3);
1510 dy13 = _mm256_sub_pd(iy1,jy3);
1511 dz13 = _mm256_sub_pd(iz1,jz3);
1512 dx21 = _mm256_sub_pd(ix2,jx1);
1513 dy21 = _mm256_sub_pd(iy2,jy1);
1514 dz21 = _mm256_sub_pd(iz2,jz1);
1515 dx22 = _mm256_sub_pd(ix2,jx2);
1516 dy22 = _mm256_sub_pd(iy2,jy2);
1517 dz22 = _mm256_sub_pd(iz2,jz2);
1518 dx23 = _mm256_sub_pd(ix2,jx3);
1519 dy23 = _mm256_sub_pd(iy2,jy3);
1520 dz23 = _mm256_sub_pd(iz2,jz3);
1521 dx31 = _mm256_sub_pd(ix3,jx1);
1522 dy31 = _mm256_sub_pd(iy3,jy1);
1523 dz31 = _mm256_sub_pd(iz3,jz1);
1524 dx32 = _mm256_sub_pd(ix3,jx2);
1525 dy32 = _mm256_sub_pd(iy3,jy2);
1526 dz32 = _mm256_sub_pd(iz3,jz2);
1527 dx33 = _mm256_sub_pd(ix3,jx3);
1528 dy33 = _mm256_sub_pd(iy3,jy3);
1529 dz33 = _mm256_sub_pd(iz3,jz3);
1531 /* Calculate squared distance and things based on it */
1532 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1533 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1534 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1535 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1536 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1537 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1538 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1539 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1540 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1541 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1543 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1544 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1545 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1546 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1547 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1548 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1549 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1550 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1551 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1552 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1554 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1555 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1556 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1557 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1558 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1559 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1560 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1561 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1562 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1563 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1565 fjx0 = _mm256_setzero_pd();
1566 fjy0 = _mm256_setzero_pd();
1567 fjz0 = _mm256_setzero_pd();
1568 fjx1 = _mm256_setzero_pd();
1569 fjy1 = _mm256_setzero_pd();
1570 fjz1 = _mm256_setzero_pd();
1571 fjx2 = _mm256_setzero_pd();
1572 fjy2 = _mm256_setzero_pd();
1573 fjz2 = _mm256_setzero_pd();
1574 fjx3 = _mm256_setzero_pd();
1575 fjy3 = _mm256_setzero_pd();
1576 fjz3 = _mm256_setzero_pd();
1578 /**************************
1579 * CALCULATE INTERACTIONS *
1580 **************************/
1582 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1585 r00 = _mm256_mul_pd(rsq00,rinv00);
1587 /* LENNARD-JONES DISPERSION/REPULSION */
1589 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1590 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1591 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1592 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1593 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1595 d = _mm256_sub_pd(r00,rswitch);
1596 d = _mm256_max_pd(d,_mm256_setzero_pd());
1597 d2 = _mm256_mul_pd(d,d);
1598 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1600 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1602 /* Evaluate switch function */
1603 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1604 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1605 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1609 fscal = _mm256_and_pd(fscal,cutoff_mask);
1611 /* Calculate temporary vectorial force */
1612 tx = _mm256_mul_pd(fscal,dx00);
1613 ty = _mm256_mul_pd(fscal,dy00);
1614 tz = _mm256_mul_pd(fscal,dz00);
1616 /* Update vectorial force */
1617 fix0 = _mm256_add_pd(fix0,tx);
1618 fiy0 = _mm256_add_pd(fiy0,ty);
1619 fiz0 = _mm256_add_pd(fiz0,tz);
1621 fjx0 = _mm256_add_pd(fjx0,tx);
1622 fjy0 = _mm256_add_pd(fjy0,ty);
1623 fjz0 = _mm256_add_pd(fjz0,tz);
1627 /**************************
1628 * CALCULATE INTERACTIONS *
1629 **************************/
1631 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1634 /* REACTION-FIELD ELECTROSTATICS */
1635 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1637 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1641 fscal = _mm256_and_pd(fscal,cutoff_mask);
1643 /* Calculate temporary vectorial force */
1644 tx = _mm256_mul_pd(fscal,dx11);
1645 ty = _mm256_mul_pd(fscal,dy11);
1646 tz = _mm256_mul_pd(fscal,dz11);
1648 /* Update vectorial force */
1649 fix1 = _mm256_add_pd(fix1,tx);
1650 fiy1 = _mm256_add_pd(fiy1,ty);
1651 fiz1 = _mm256_add_pd(fiz1,tz);
1653 fjx1 = _mm256_add_pd(fjx1,tx);
1654 fjy1 = _mm256_add_pd(fjy1,ty);
1655 fjz1 = _mm256_add_pd(fjz1,tz);
1659 /**************************
1660 * CALCULATE INTERACTIONS *
1661 **************************/
1663 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1666 /* REACTION-FIELD ELECTROSTATICS */
1667 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1669 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1673 fscal = _mm256_and_pd(fscal,cutoff_mask);
1675 /* Calculate temporary vectorial force */
1676 tx = _mm256_mul_pd(fscal,dx12);
1677 ty = _mm256_mul_pd(fscal,dy12);
1678 tz = _mm256_mul_pd(fscal,dz12);
1680 /* Update vectorial force */
1681 fix1 = _mm256_add_pd(fix1,tx);
1682 fiy1 = _mm256_add_pd(fiy1,ty);
1683 fiz1 = _mm256_add_pd(fiz1,tz);
1685 fjx2 = _mm256_add_pd(fjx2,tx);
1686 fjy2 = _mm256_add_pd(fjy2,ty);
1687 fjz2 = _mm256_add_pd(fjz2,tz);
1691 /**************************
1692 * CALCULATE INTERACTIONS *
1693 **************************/
1695 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1698 /* REACTION-FIELD ELECTROSTATICS */
1699 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1701 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1705 fscal = _mm256_and_pd(fscal,cutoff_mask);
1707 /* Calculate temporary vectorial force */
1708 tx = _mm256_mul_pd(fscal,dx13);
1709 ty = _mm256_mul_pd(fscal,dy13);
1710 tz = _mm256_mul_pd(fscal,dz13);
1712 /* Update vectorial force */
1713 fix1 = _mm256_add_pd(fix1,tx);
1714 fiy1 = _mm256_add_pd(fiy1,ty);
1715 fiz1 = _mm256_add_pd(fiz1,tz);
1717 fjx3 = _mm256_add_pd(fjx3,tx);
1718 fjy3 = _mm256_add_pd(fjy3,ty);
1719 fjz3 = _mm256_add_pd(fjz3,tz);
1723 /**************************
1724 * CALCULATE INTERACTIONS *
1725 **************************/
1727 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1730 /* REACTION-FIELD ELECTROSTATICS */
1731 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1733 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1737 fscal = _mm256_and_pd(fscal,cutoff_mask);
1739 /* Calculate temporary vectorial force */
1740 tx = _mm256_mul_pd(fscal,dx21);
1741 ty = _mm256_mul_pd(fscal,dy21);
1742 tz = _mm256_mul_pd(fscal,dz21);
1744 /* Update vectorial force */
1745 fix2 = _mm256_add_pd(fix2,tx);
1746 fiy2 = _mm256_add_pd(fiy2,ty);
1747 fiz2 = _mm256_add_pd(fiz2,tz);
1749 fjx1 = _mm256_add_pd(fjx1,tx);
1750 fjy1 = _mm256_add_pd(fjy1,ty);
1751 fjz1 = _mm256_add_pd(fjz1,tz);
1755 /**************************
1756 * CALCULATE INTERACTIONS *
1757 **************************/
1759 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1762 /* REACTION-FIELD ELECTROSTATICS */
1763 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1765 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1769 fscal = _mm256_and_pd(fscal,cutoff_mask);
1771 /* Calculate temporary vectorial force */
1772 tx = _mm256_mul_pd(fscal,dx22);
1773 ty = _mm256_mul_pd(fscal,dy22);
1774 tz = _mm256_mul_pd(fscal,dz22);
1776 /* Update vectorial force */
1777 fix2 = _mm256_add_pd(fix2,tx);
1778 fiy2 = _mm256_add_pd(fiy2,ty);
1779 fiz2 = _mm256_add_pd(fiz2,tz);
1781 fjx2 = _mm256_add_pd(fjx2,tx);
1782 fjy2 = _mm256_add_pd(fjy2,ty);
1783 fjz2 = _mm256_add_pd(fjz2,tz);
1787 /**************************
1788 * CALCULATE INTERACTIONS *
1789 **************************/
1791 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1794 /* REACTION-FIELD ELECTROSTATICS */
1795 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1797 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1801 fscal = _mm256_and_pd(fscal,cutoff_mask);
1803 /* Calculate temporary vectorial force */
1804 tx = _mm256_mul_pd(fscal,dx23);
1805 ty = _mm256_mul_pd(fscal,dy23);
1806 tz = _mm256_mul_pd(fscal,dz23);
1808 /* Update vectorial force */
1809 fix2 = _mm256_add_pd(fix2,tx);
1810 fiy2 = _mm256_add_pd(fiy2,ty);
1811 fiz2 = _mm256_add_pd(fiz2,tz);
1813 fjx3 = _mm256_add_pd(fjx3,tx);
1814 fjy3 = _mm256_add_pd(fjy3,ty);
1815 fjz3 = _mm256_add_pd(fjz3,tz);
1819 /**************************
1820 * CALCULATE INTERACTIONS *
1821 **************************/
1823 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1826 /* REACTION-FIELD ELECTROSTATICS */
1827 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1829 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1833 fscal = _mm256_and_pd(fscal,cutoff_mask);
1835 /* Calculate temporary vectorial force */
1836 tx = _mm256_mul_pd(fscal,dx31);
1837 ty = _mm256_mul_pd(fscal,dy31);
1838 tz = _mm256_mul_pd(fscal,dz31);
1840 /* Update vectorial force */
1841 fix3 = _mm256_add_pd(fix3,tx);
1842 fiy3 = _mm256_add_pd(fiy3,ty);
1843 fiz3 = _mm256_add_pd(fiz3,tz);
1845 fjx1 = _mm256_add_pd(fjx1,tx);
1846 fjy1 = _mm256_add_pd(fjy1,ty);
1847 fjz1 = _mm256_add_pd(fjz1,tz);
1851 /**************************
1852 * CALCULATE INTERACTIONS *
1853 **************************/
1855 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1858 /* REACTION-FIELD ELECTROSTATICS */
1859 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1861 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1865 fscal = _mm256_and_pd(fscal,cutoff_mask);
1867 /* Calculate temporary vectorial force */
1868 tx = _mm256_mul_pd(fscal,dx32);
1869 ty = _mm256_mul_pd(fscal,dy32);
1870 tz = _mm256_mul_pd(fscal,dz32);
1872 /* Update vectorial force */
1873 fix3 = _mm256_add_pd(fix3,tx);
1874 fiy3 = _mm256_add_pd(fiy3,ty);
1875 fiz3 = _mm256_add_pd(fiz3,tz);
1877 fjx2 = _mm256_add_pd(fjx2,tx);
1878 fjy2 = _mm256_add_pd(fjy2,ty);
1879 fjz2 = _mm256_add_pd(fjz2,tz);
1883 /**************************
1884 * CALCULATE INTERACTIONS *
1885 **************************/
1887 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1890 /* REACTION-FIELD ELECTROSTATICS */
1891 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1893 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1897 fscal = _mm256_and_pd(fscal,cutoff_mask);
1899 /* Calculate temporary vectorial force */
1900 tx = _mm256_mul_pd(fscal,dx33);
1901 ty = _mm256_mul_pd(fscal,dy33);
1902 tz = _mm256_mul_pd(fscal,dz33);
1904 /* Update vectorial force */
1905 fix3 = _mm256_add_pd(fix3,tx);
1906 fiy3 = _mm256_add_pd(fiy3,ty);
1907 fiz3 = _mm256_add_pd(fiz3,tz);
1909 fjx3 = _mm256_add_pd(fjx3,tx);
1910 fjy3 = _mm256_add_pd(fjy3,ty);
1911 fjz3 = _mm256_add_pd(fjz3,tz);
1915 fjptrA = f+j_coord_offsetA;
1916 fjptrB = f+j_coord_offsetB;
1917 fjptrC = f+j_coord_offsetC;
1918 fjptrD = f+j_coord_offsetD;
1920 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1921 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1922 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1924 /* Inner loop uses 329 flops */
1927 if(jidx<j_index_end)
1930 /* Get j neighbor index, and coordinate index */
1931 jnrlistA = jjnr[jidx];
1932 jnrlistB = jjnr[jidx+1];
1933 jnrlistC = jjnr[jidx+2];
1934 jnrlistD = jjnr[jidx+3];
1935 /* Sign of each element will be negative for non-real atoms.
1936 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1937 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1939 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1941 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1942 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1943 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1945 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1946 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1947 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1948 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1949 j_coord_offsetA = DIM*jnrA;
1950 j_coord_offsetB = DIM*jnrB;
1951 j_coord_offsetC = DIM*jnrC;
1952 j_coord_offsetD = DIM*jnrD;
1954 /* load j atom coordinates */
1955 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1956 x+j_coord_offsetC,x+j_coord_offsetD,
1957 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1958 &jy2,&jz2,&jx3,&jy3,&jz3);
1960 /* Calculate displacement vector */
1961 dx00 = _mm256_sub_pd(ix0,jx0);
1962 dy00 = _mm256_sub_pd(iy0,jy0);
1963 dz00 = _mm256_sub_pd(iz0,jz0);
1964 dx11 = _mm256_sub_pd(ix1,jx1);
1965 dy11 = _mm256_sub_pd(iy1,jy1);
1966 dz11 = _mm256_sub_pd(iz1,jz1);
1967 dx12 = _mm256_sub_pd(ix1,jx2);
1968 dy12 = _mm256_sub_pd(iy1,jy2);
1969 dz12 = _mm256_sub_pd(iz1,jz2);
1970 dx13 = _mm256_sub_pd(ix1,jx3);
1971 dy13 = _mm256_sub_pd(iy1,jy3);
1972 dz13 = _mm256_sub_pd(iz1,jz3);
1973 dx21 = _mm256_sub_pd(ix2,jx1);
1974 dy21 = _mm256_sub_pd(iy2,jy1);
1975 dz21 = _mm256_sub_pd(iz2,jz1);
1976 dx22 = _mm256_sub_pd(ix2,jx2);
1977 dy22 = _mm256_sub_pd(iy2,jy2);
1978 dz22 = _mm256_sub_pd(iz2,jz2);
1979 dx23 = _mm256_sub_pd(ix2,jx3);
1980 dy23 = _mm256_sub_pd(iy2,jy3);
1981 dz23 = _mm256_sub_pd(iz2,jz3);
1982 dx31 = _mm256_sub_pd(ix3,jx1);
1983 dy31 = _mm256_sub_pd(iy3,jy1);
1984 dz31 = _mm256_sub_pd(iz3,jz1);
1985 dx32 = _mm256_sub_pd(ix3,jx2);
1986 dy32 = _mm256_sub_pd(iy3,jy2);
1987 dz32 = _mm256_sub_pd(iz3,jz2);
1988 dx33 = _mm256_sub_pd(ix3,jx3);
1989 dy33 = _mm256_sub_pd(iy3,jy3);
1990 dz33 = _mm256_sub_pd(iz3,jz3);
1992 /* Calculate squared distance and things based on it */
/* rsq = dx^2 + dy^2 + dz^2 for each of the ten interacting site pairs. */
1993 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1994 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1995 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1996 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1997 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1998 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1999 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2000 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2001 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2002 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
/* rinv = 1/sqrt(rsq) via the SIMD inverse-square-root helper. */
2004 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2005 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2006 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2007 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2008 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2009 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2010 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2011 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2012 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2013 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
/* rinvsq = rinv*rinv = 1/r^2, reused by both the LJ and RF force kernels. */
2015 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2016 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2017 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2018 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2019 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2020 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2021 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2022 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2023 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2024 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
/* Zero the per-iteration j-particle force accumulators; they are summed
 * over all ten site pairs below and scattered back to memory at the end
 * of this inner-loop iteration. */
2026 fjx0 = _mm256_setzero_pd();
2027 fjy0 = _mm256_setzero_pd();
2028 fjz0 = _mm256_setzero_pd();
2029 fjx1 = _mm256_setzero_pd();
2030 fjy1 = _mm256_setzero_pd();
2031 fjz1 = _mm256_setzero_pd();
2032 fjx2 = _mm256_setzero_pd();
2033 fjy2 = _mm256_setzero_pd();
2034 fjz2 = _mm256_setzero_pd();
2035 fjx3 = _mm256_setzero_pd();
2036 fjy3 = _mm256_setzero_pd();
2037 fjz3 = _mm256_setzero_pd();
2039 /**************************
2040 * CALCULATE INTERACTIONS *
2041 **************************/
/* Site pair 0-0: Lennard-Jones with a polynomial switch function
 * (VdwLJSw). The branch is skipped only when all four SIMD lanes are
 * beyond the cutoff. */
2043 if (gmx_mm256_any_lt(rsq00,rcutoff2))
/* r = rsq * (1/r); dummy (padding) lanes are zeroed via dummy_mask so
 * the switch polynomial below cannot produce NaN for them. */
2046 r00 = _mm256_mul_pd(rsq00,rinv00);
2047 r00 = _mm256_andnot_pd(dummy_mask,r00);
2049 /* LENNARD-JONES DISPERSION/REPULSION */
/* V = c12/r^12 - c6/r^6; F*r = 12*c12/r^12 - 6*c6/r^6, hence the
 * one_twelfth/one_sixth prefactors on the potential terms. */
2051 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2052 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
2053 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2054 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2055 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
/* Switch function: d = max(r - rswitch, 0); sw is the quintic switching
 * polynomial 1 + d^3*(swV3 + d*(swV4 + d*swV5)), dsw its derivative
 * d^2*(swF2 + d*(swF3 + d*swF4)). */
2057 d = _mm256_sub_pd(r00,rswitch);
2058 d = _mm256_max_pd(d,_mm256_setzero_pd());
2059 d2 = _mm256_mul_pd(d,d);
2060 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2062 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2064 /* Evaluate switch function */
2065 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2066 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
/* Zero the scalar force beyond the cutoff and in padding lanes. */
2067 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2071 fscal = _mm256_and_pd(fscal,cutoff_mask);
2073 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2075 /* Calculate temporary vectorial force */
2076 tx = _mm256_mul_pd(fscal,dx00);
2077 ty = _mm256_mul_pd(fscal,dy00);
2078 tz = _mm256_mul_pd(fscal,dz00);
2080 /* Update vectorial force */
/* Newton's third law: accumulate +t on i site 0, +t on the fj
 * accumulator (subtracted from j later by the decrement scatter). */
2081 fix0 = _mm256_add_pd(fix0,tx);
2082 fiy0 = _mm256_add_pd(fiy0,ty);
2083 fiz0 = _mm256_add_pd(fiz0,tz);
2085 fjx0 = _mm256_add_pd(fjx0,tx);
2086 fjy0 = _mm256_add_pd(fjy0,ty);
2087 fjz0 = _mm256_add_pd(fjz0,tz);
/* Site pairs (i,j) in {1,2,3}x{1,2,3}: reaction-field electrostatics with
 * a sharp cutoff (ElecRFCut). Each pair follows the same pattern:
 *   felec = qq * (1/r^3 - 2*krf)   (krf2 presumably holds 2*krf)
 * then the scalar force is masked by the cutoff and by dummy_mask
 * (padding lanes), converted to a vectorial force, and accumulated on the
 * matching i- and j-site force registers. Only the pair indices differ. */
2091 /**************************
2092 * CALCULATE INTERACTIONS *
2093 **************************/
2095 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2098 /* REACTION-FIELD ELECTROSTATICS */
2099 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2101 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2105 fscal = _mm256_and_pd(fscal,cutoff_mask);
2107 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2109 /* Calculate temporary vectorial force */
2110 tx = _mm256_mul_pd(fscal,dx11);
2111 ty = _mm256_mul_pd(fscal,dy11);
2112 tz = _mm256_mul_pd(fscal,dz11);
2114 /* Update vectorial force */
2115 fix1 = _mm256_add_pd(fix1,tx);
2116 fiy1 = _mm256_add_pd(fiy1,ty);
2117 fiz1 = _mm256_add_pd(fiz1,tz);
2119 fjx1 = _mm256_add_pd(fjx1,tx);
2120 fjy1 = _mm256_add_pd(fjy1,ty);
2121 fjz1 = _mm256_add_pd(fjz1,tz);
2125 /**************************
2126 * CALCULATE INTERACTIONS *
2127 **************************/
2129 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2132 /* REACTION-FIELD ELECTROSTATICS */
2133 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2135 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2139 fscal = _mm256_and_pd(fscal,cutoff_mask);
2141 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2143 /* Calculate temporary vectorial force */
2144 tx = _mm256_mul_pd(fscal,dx12);
2145 ty = _mm256_mul_pd(fscal,dy12);
2146 tz = _mm256_mul_pd(fscal,dz12);
2148 /* Update vectorial force */
2149 fix1 = _mm256_add_pd(fix1,tx);
2150 fiy1 = _mm256_add_pd(fiy1,ty);
2151 fiz1 = _mm256_add_pd(fiz1,tz);
2153 fjx2 = _mm256_add_pd(fjx2,tx);
2154 fjy2 = _mm256_add_pd(fjy2,ty);
2155 fjz2 = _mm256_add_pd(fjz2,tz);
2159 /**************************
2160 * CALCULATE INTERACTIONS *
2161 **************************/
2163 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2166 /* REACTION-FIELD ELECTROSTATICS */
2167 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
2169 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2173 fscal = _mm256_and_pd(fscal,cutoff_mask);
2175 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2177 /* Calculate temporary vectorial force */
2178 tx = _mm256_mul_pd(fscal,dx13);
2179 ty = _mm256_mul_pd(fscal,dy13);
2180 tz = _mm256_mul_pd(fscal,dz13);
2182 /* Update vectorial force */
2183 fix1 = _mm256_add_pd(fix1,tx);
2184 fiy1 = _mm256_add_pd(fiy1,ty);
2185 fiz1 = _mm256_add_pd(fiz1,tz);
2187 fjx3 = _mm256_add_pd(fjx3,tx);
2188 fjy3 = _mm256_add_pd(fjy3,ty);
2189 fjz3 = _mm256_add_pd(fjz3,tz);
2193 /**************************
2194 * CALCULATE INTERACTIONS *
2195 **************************/
2197 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2200 /* REACTION-FIELD ELECTROSTATICS */
2201 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2203 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2207 fscal = _mm256_and_pd(fscal,cutoff_mask);
2209 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2211 /* Calculate temporary vectorial force */
2212 tx = _mm256_mul_pd(fscal,dx21);
2213 ty = _mm256_mul_pd(fscal,dy21);
2214 tz = _mm256_mul_pd(fscal,dz21);
2216 /* Update vectorial force */
2217 fix2 = _mm256_add_pd(fix2,tx);
2218 fiy2 = _mm256_add_pd(fiy2,ty);
2219 fiz2 = _mm256_add_pd(fiz2,tz);
2221 fjx1 = _mm256_add_pd(fjx1,tx);
2222 fjy1 = _mm256_add_pd(fjy1,ty);
2223 fjz1 = _mm256_add_pd(fjz1,tz);
2227 /**************************
2228 * CALCULATE INTERACTIONS *
2229 **************************/
2231 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2234 /* REACTION-FIELD ELECTROSTATICS */
2235 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2237 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2241 fscal = _mm256_and_pd(fscal,cutoff_mask);
2243 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2245 /* Calculate temporary vectorial force */
2246 tx = _mm256_mul_pd(fscal,dx22);
2247 ty = _mm256_mul_pd(fscal,dy22);
2248 tz = _mm256_mul_pd(fscal,dz22);
2250 /* Update vectorial force */
2251 fix2 = _mm256_add_pd(fix2,tx);
2252 fiy2 = _mm256_add_pd(fiy2,ty);
2253 fiz2 = _mm256_add_pd(fiz2,tz);
2255 fjx2 = _mm256_add_pd(fjx2,tx);
2256 fjy2 = _mm256_add_pd(fjy2,ty);
2257 fjz2 = _mm256_add_pd(fjz2,tz);
2261 /**************************
2262 * CALCULATE INTERACTIONS *
2263 **************************/
2265 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2268 /* REACTION-FIELD ELECTROSTATICS */
2269 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
2271 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2275 fscal = _mm256_and_pd(fscal,cutoff_mask);
2277 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2279 /* Calculate temporary vectorial force */
2280 tx = _mm256_mul_pd(fscal,dx23);
2281 ty = _mm256_mul_pd(fscal,dy23);
2282 tz = _mm256_mul_pd(fscal,dz23);
2284 /* Update vectorial force */
2285 fix2 = _mm256_add_pd(fix2,tx);
2286 fiy2 = _mm256_add_pd(fiy2,ty);
2287 fiz2 = _mm256_add_pd(fiz2,tz);
2289 fjx3 = _mm256_add_pd(fjx3,tx);
2290 fjy3 = _mm256_add_pd(fjy3,ty);
2291 fjz3 = _mm256_add_pd(fjz3,tz);
2295 /**************************
2296 * CALCULATE INTERACTIONS *
2297 **************************/
2299 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2302 /* REACTION-FIELD ELECTROSTATICS */
2303 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
2305 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2309 fscal = _mm256_and_pd(fscal,cutoff_mask);
2311 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2313 /* Calculate temporary vectorial force */
2314 tx = _mm256_mul_pd(fscal,dx31);
2315 ty = _mm256_mul_pd(fscal,dy31);
2316 tz = _mm256_mul_pd(fscal,dz31);
2318 /* Update vectorial force */
2319 fix3 = _mm256_add_pd(fix3,tx);
2320 fiy3 = _mm256_add_pd(fiy3,ty);
2321 fiz3 = _mm256_add_pd(fiz3,tz);
2323 fjx1 = _mm256_add_pd(fjx1,tx);
2324 fjy1 = _mm256_add_pd(fjy1,ty);
2325 fjz1 = _mm256_add_pd(fjz1,tz);
2329 /**************************
2330 * CALCULATE INTERACTIONS *
2331 **************************/
2333 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2336 /* REACTION-FIELD ELECTROSTATICS */
2337 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
2339 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2343 fscal = _mm256_and_pd(fscal,cutoff_mask);
2345 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2347 /* Calculate temporary vectorial force */
2348 tx = _mm256_mul_pd(fscal,dx32);
2349 ty = _mm256_mul_pd(fscal,dy32);
2350 tz = _mm256_mul_pd(fscal,dz32);
2352 /* Update vectorial force */
2353 fix3 = _mm256_add_pd(fix3,tx);
2354 fiy3 = _mm256_add_pd(fiy3,ty);
2355 fiz3 = _mm256_add_pd(fiz3,tz);
2357 fjx2 = _mm256_add_pd(fjx2,tx);
2358 fjy2 = _mm256_add_pd(fjy2,ty);
2359 fjz2 = _mm256_add_pd(fjz2,tz);
2363 /**************************
2364 * CALCULATE INTERACTIONS *
2365 **************************/
2367 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2370 /* REACTION-FIELD ELECTROSTATICS */
2371 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
2373 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2377 fscal = _mm256_and_pd(fscal,cutoff_mask);
2379 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2381 /* Calculate temporary vectorial force */
2382 tx = _mm256_mul_pd(fscal,dx33);
2383 ty = _mm256_mul_pd(fscal,dy33);
2384 tz = _mm256_mul_pd(fscal,dz33);
2386 /* Update vectorial force */
2387 fix3 = _mm256_add_pd(fix3,tx);
2388 fiy3 = _mm256_add_pd(fiy3,ty);
2389 fiz3 = _mm256_add_pd(fiz3,tz);
2391 fjx3 = _mm256_add_pd(fjx3,tx);
2392 fjy3 = _mm256_add_pd(fjy3,ty);
2393 fjz3 = _mm256_add_pd(fjz3,tz);
/* Scatter the accumulated j-forces. Lanes whose neighbour-list entry is
 * negative are padding; their forces are redirected to the scratch buffer
 * so the real force array f[] is untouched for them. */
2397 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2398 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2399 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2400 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
/* Transpose the SIMD force registers back to AoS layout and subtract
 * them from the four j molecules (Newton's third law). */
2402 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2403 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2404 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2406 /* Inner loop uses 330 flops */
2409 /* End of innermost loop */
/* Reduce the i-force accumulators across lanes and add them to f[] and
 * to the shift-force array for virial computation. */
2411 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2412 f+i_coord_offset,fshift+i_shift_offset);
2414 /* Increment number of inner iterations */
2415 inneriter += j_index_end - j_index_start;
2417 /* Outer loop uses 24 flops */
2420 /* Increment number of outer iterations */
2423 /* Update outer/inner flops */
/* Flop accounting must match the per-loop counts noted above (24/330). */
2425 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*330);