2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double
54 * Electrostatics interaction: ReactionField
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_avx_256_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
85 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 real * vdwioffsetptr3;
93 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
95 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
97 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
99 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
101 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
102 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
103 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
106 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
109 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
110 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
111 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
112 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
115 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
118 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
119 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
120 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
121 real rswitch_scalar,d_scalar;
122 __m256d dummy_mask,cutoff_mask;
123 __m128 tmpmask0,tmpmask1;
124 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
125 __m256d one = _mm256_set1_pd(1.0);
126 __m256d two = _mm256_set1_pd(2.0);
132 jindex = nlist->jindex;
134 shiftidx = nlist->shift;
136 shiftvec = fr->shift_vec[0];
137 fshift = fr->fshift[0];
138 facel = _mm256_set1_pd(fr->epsfac);
139 charge = mdatoms->chargeA;
140 krf = _mm256_set1_pd(fr->ic->k_rf);
141 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
142 crf = _mm256_set1_pd(fr->ic->c_rf);
143 nvdwtype = fr->ntype;
145 vdwtype = mdatoms->typeA;
147 /* Setup water-specific parameters */
148 inr = nlist->iinr[0];
149 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
150 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
151 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
152 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
154 jq1 = _mm256_set1_pd(charge[inr+1]);
155 jq2 = _mm256_set1_pd(charge[inr+2]);
156 jq3 = _mm256_set1_pd(charge[inr+3]);
157 vdwjidx0A = 2*vdwtype[inr+0];
158 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
159 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
160 qq11 = _mm256_mul_pd(iq1,jq1);
161 qq12 = _mm256_mul_pd(iq1,jq2);
162 qq13 = _mm256_mul_pd(iq1,jq3);
163 qq21 = _mm256_mul_pd(iq2,jq1);
164 qq22 = _mm256_mul_pd(iq2,jq2);
165 qq23 = _mm256_mul_pd(iq2,jq3);
166 qq31 = _mm256_mul_pd(iq3,jq1);
167 qq32 = _mm256_mul_pd(iq3,jq2);
168 qq33 = _mm256_mul_pd(iq3,jq3);
170 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
171 rcutoff_scalar = fr->rcoulomb;
172 rcutoff = _mm256_set1_pd(rcutoff_scalar);
173 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
175 rswitch_scalar = fr->rvdw_switch;
176 rswitch = _mm256_set1_pd(rswitch_scalar);
177 /* Setup switch parameters */
178 d_scalar = rcutoff_scalar-rswitch_scalar;
179 d = _mm256_set1_pd(d_scalar);
180 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
181 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
182 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
183 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
184 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
185 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
187 /* Avoid stupid compiler warnings */
188 jnrA = jnrB = jnrC = jnrD = 0;
197 for(iidx=0;iidx<4*DIM;iidx++)
202 /* Start outer loop over neighborlists */
203 for(iidx=0; iidx<nri; iidx++)
205 /* Load shift vector for this list */
206 i_shift_offset = DIM*shiftidx[iidx];
208 /* Load limits for loop over neighbors */
209 j_index_start = jindex[iidx];
210 j_index_end = jindex[iidx+1];
212 /* Get outer coordinate index */
214 i_coord_offset = DIM*inr;
216 /* Load i particle coords and add shift vector */
217 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
218 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
220 fix0 = _mm256_setzero_pd();
221 fiy0 = _mm256_setzero_pd();
222 fiz0 = _mm256_setzero_pd();
223 fix1 = _mm256_setzero_pd();
224 fiy1 = _mm256_setzero_pd();
225 fiz1 = _mm256_setzero_pd();
226 fix2 = _mm256_setzero_pd();
227 fiy2 = _mm256_setzero_pd();
228 fiz2 = _mm256_setzero_pd();
229 fix3 = _mm256_setzero_pd();
230 fiy3 = _mm256_setzero_pd();
231 fiz3 = _mm256_setzero_pd();
233 /* Reset potential sums */
234 velecsum = _mm256_setzero_pd();
235 vvdwsum = _mm256_setzero_pd();
237 /* Start inner kernel loop */
238 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
241 /* Get j neighbor index, and coordinate index */
246 j_coord_offsetA = DIM*jnrA;
247 j_coord_offsetB = DIM*jnrB;
248 j_coord_offsetC = DIM*jnrC;
249 j_coord_offsetD = DIM*jnrD;
251 /* load j atom coordinates */
252 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
253 x+j_coord_offsetC,x+j_coord_offsetD,
254 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
255 &jy2,&jz2,&jx3,&jy3,&jz3);
257 /* Calculate displacement vector */
258 dx00 = _mm256_sub_pd(ix0,jx0);
259 dy00 = _mm256_sub_pd(iy0,jy0);
260 dz00 = _mm256_sub_pd(iz0,jz0);
261 dx11 = _mm256_sub_pd(ix1,jx1);
262 dy11 = _mm256_sub_pd(iy1,jy1);
263 dz11 = _mm256_sub_pd(iz1,jz1);
264 dx12 = _mm256_sub_pd(ix1,jx2);
265 dy12 = _mm256_sub_pd(iy1,jy2);
266 dz12 = _mm256_sub_pd(iz1,jz2);
267 dx13 = _mm256_sub_pd(ix1,jx3);
268 dy13 = _mm256_sub_pd(iy1,jy3);
269 dz13 = _mm256_sub_pd(iz1,jz3);
270 dx21 = _mm256_sub_pd(ix2,jx1);
271 dy21 = _mm256_sub_pd(iy2,jy1);
272 dz21 = _mm256_sub_pd(iz2,jz1);
273 dx22 = _mm256_sub_pd(ix2,jx2);
274 dy22 = _mm256_sub_pd(iy2,jy2);
275 dz22 = _mm256_sub_pd(iz2,jz2);
276 dx23 = _mm256_sub_pd(ix2,jx3);
277 dy23 = _mm256_sub_pd(iy2,jy3);
278 dz23 = _mm256_sub_pd(iz2,jz3);
279 dx31 = _mm256_sub_pd(ix3,jx1);
280 dy31 = _mm256_sub_pd(iy3,jy1);
281 dz31 = _mm256_sub_pd(iz3,jz1);
282 dx32 = _mm256_sub_pd(ix3,jx2);
283 dy32 = _mm256_sub_pd(iy3,jy2);
284 dz32 = _mm256_sub_pd(iz3,jz2);
285 dx33 = _mm256_sub_pd(ix3,jx3);
286 dy33 = _mm256_sub_pd(iy3,jy3);
287 dz33 = _mm256_sub_pd(iz3,jz3);
289 /* Calculate squared distance and things based on it */
290 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
291 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
292 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
293 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
294 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
295 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
296 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
297 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
298 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
299 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
301 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
302 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
303 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
304 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
305 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
306 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
307 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
308 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
309 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
310 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
312 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
313 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
314 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
315 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
316 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
317 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
318 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
319 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
320 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
321 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
323 fjx0 = _mm256_setzero_pd();
324 fjy0 = _mm256_setzero_pd();
325 fjz0 = _mm256_setzero_pd();
326 fjx1 = _mm256_setzero_pd();
327 fjy1 = _mm256_setzero_pd();
328 fjz1 = _mm256_setzero_pd();
329 fjx2 = _mm256_setzero_pd();
330 fjy2 = _mm256_setzero_pd();
331 fjz2 = _mm256_setzero_pd();
332 fjx3 = _mm256_setzero_pd();
333 fjy3 = _mm256_setzero_pd();
334 fjz3 = _mm256_setzero_pd();
336 /**************************
337 * CALCULATE INTERACTIONS *
338 **************************/
340 if (gmx_mm256_any_lt(rsq00,rcutoff2))
343 r00 = _mm256_mul_pd(rsq00,rinv00);
345 /* LENNARD-JONES DISPERSION/REPULSION */
347 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
348 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
349 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
350 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
351 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
353 d = _mm256_sub_pd(r00,rswitch);
354 d = _mm256_max_pd(d,_mm256_setzero_pd());
355 d2 = _mm256_mul_pd(d,d);
356 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
358 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
360 /* Evaluate switch function */
361 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
362 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
363 vvdw = _mm256_mul_pd(vvdw,sw);
364 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
366 /* Update potential sum for this i atom from the interaction with this j atom. */
367 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
368 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
372 fscal = _mm256_and_pd(fscal,cutoff_mask);
374 /* Calculate temporary vectorial force */
375 tx = _mm256_mul_pd(fscal,dx00);
376 ty = _mm256_mul_pd(fscal,dy00);
377 tz = _mm256_mul_pd(fscal,dz00);
379 /* Update vectorial force */
380 fix0 = _mm256_add_pd(fix0,tx);
381 fiy0 = _mm256_add_pd(fiy0,ty);
382 fiz0 = _mm256_add_pd(fiz0,tz);
384 fjx0 = _mm256_add_pd(fjx0,tx);
385 fjy0 = _mm256_add_pd(fjy0,ty);
386 fjz0 = _mm256_add_pd(fjz0,tz);
390 /**************************
391 * CALCULATE INTERACTIONS *
392 **************************/
394 if (gmx_mm256_any_lt(rsq11,rcutoff2))
397 /* REACTION-FIELD ELECTROSTATICS */
398 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
399 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
401 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
403 /* Update potential sum for this i atom from the interaction with this j atom. */
404 velec = _mm256_and_pd(velec,cutoff_mask);
405 velecsum = _mm256_add_pd(velecsum,velec);
409 fscal = _mm256_and_pd(fscal,cutoff_mask);
411 /* Calculate temporary vectorial force */
412 tx = _mm256_mul_pd(fscal,dx11);
413 ty = _mm256_mul_pd(fscal,dy11);
414 tz = _mm256_mul_pd(fscal,dz11);
416 /* Update vectorial force */
417 fix1 = _mm256_add_pd(fix1,tx);
418 fiy1 = _mm256_add_pd(fiy1,ty);
419 fiz1 = _mm256_add_pd(fiz1,tz);
421 fjx1 = _mm256_add_pd(fjx1,tx);
422 fjy1 = _mm256_add_pd(fjy1,ty);
423 fjz1 = _mm256_add_pd(fjz1,tz);
427 /**************************
428 * CALCULATE INTERACTIONS *
429 **************************/
431 if (gmx_mm256_any_lt(rsq12,rcutoff2))
434 /* REACTION-FIELD ELECTROSTATICS */
435 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
436 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
438 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
440 /* Update potential sum for this i atom from the interaction with this j atom. */
441 velec = _mm256_and_pd(velec,cutoff_mask);
442 velecsum = _mm256_add_pd(velecsum,velec);
446 fscal = _mm256_and_pd(fscal,cutoff_mask);
448 /* Calculate temporary vectorial force */
449 tx = _mm256_mul_pd(fscal,dx12);
450 ty = _mm256_mul_pd(fscal,dy12);
451 tz = _mm256_mul_pd(fscal,dz12);
453 /* Update vectorial force */
454 fix1 = _mm256_add_pd(fix1,tx);
455 fiy1 = _mm256_add_pd(fiy1,ty);
456 fiz1 = _mm256_add_pd(fiz1,tz);
458 fjx2 = _mm256_add_pd(fjx2,tx);
459 fjy2 = _mm256_add_pd(fjy2,ty);
460 fjz2 = _mm256_add_pd(fjz2,tz);
464 /**************************
465 * CALCULATE INTERACTIONS *
466 **************************/
468 if (gmx_mm256_any_lt(rsq13,rcutoff2))
471 /* REACTION-FIELD ELECTROSTATICS */
472 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
473 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
475 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
477 /* Update potential sum for this i atom from the interaction with this j atom. */
478 velec = _mm256_and_pd(velec,cutoff_mask);
479 velecsum = _mm256_add_pd(velecsum,velec);
483 fscal = _mm256_and_pd(fscal,cutoff_mask);
485 /* Calculate temporary vectorial force */
486 tx = _mm256_mul_pd(fscal,dx13);
487 ty = _mm256_mul_pd(fscal,dy13);
488 tz = _mm256_mul_pd(fscal,dz13);
490 /* Update vectorial force */
491 fix1 = _mm256_add_pd(fix1,tx);
492 fiy1 = _mm256_add_pd(fiy1,ty);
493 fiz1 = _mm256_add_pd(fiz1,tz);
495 fjx3 = _mm256_add_pd(fjx3,tx);
496 fjy3 = _mm256_add_pd(fjy3,ty);
497 fjz3 = _mm256_add_pd(fjz3,tz);
501 /**************************
502 * CALCULATE INTERACTIONS *
503 **************************/
505 if (gmx_mm256_any_lt(rsq21,rcutoff2))
508 /* REACTION-FIELD ELECTROSTATICS */
509 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
510 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
512 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
514 /* Update potential sum for this i atom from the interaction with this j atom. */
515 velec = _mm256_and_pd(velec,cutoff_mask);
516 velecsum = _mm256_add_pd(velecsum,velec);
520 fscal = _mm256_and_pd(fscal,cutoff_mask);
522 /* Calculate temporary vectorial force */
523 tx = _mm256_mul_pd(fscal,dx21);
524 ty = _mm256_mul_pd(fscal,dy21);
525 tz = _mm256_mul_pd(fscal,dz21);
527 /* Update vectorial force */
528 fix2 = _mm256_add_pd(fix2,tx);
529 fiy2 = _mm256_add_pd(fiy2,ty);
530 fiz2 = _mm256_add_pd(fiz2,tz);
532 fjx1 = _mm256_add_pd(fjx1,tx);
533 fjy1 = _mm256_add_pd(fjy1,ty);
534 fjz1 = _mm256_add_pd(fjz1,tz);
538 /**************************
539 * CALCULATE INTERACTIONS *
540 **************************/
542 if (gmx_mm256_any_lt(rsq22,rcutoff2))
545 /* REACTION-FIELD ELECTROSTATICS */
546 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
547 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
549 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
551 /* Update potential sum for this i atom from the interaction with this j atom. */
552 velec = _mm256_and_pd(velec,cutoff_mask);
553 velecsum = _mm256_add_pd(velecsum,velec);
557 fscal = _mm256_and_pd(fscal,cutoff_mask);
559 /* Calculate temporary vectorial force */
560 tx = _mm256_mul_pd(fscal,dx22);
561 ty = _mm256_mul_pd(fscal,dy22);
562 tz = _mm256_mul_pd(fscal,dz22);
564 /* Update vectorial force */
565 fix2 = _mm256_add_pd(fix2,tx);
566 fiy2 = _mm256_add_pd(fiy2,ty);
567 fiz2 = _mm256_add_pd(fiz2,tz);
569 fjx2 = _mm256_add_pd(fjx2,tx);
570 fjy2 = _mm256_add_pd(fjy2,ty);
571 fjz2 = _mm256_add_pd(fjz2,tz);
575 /**************************
576 * CALCULATE INTERACTIONS *
577 **************************/
579 if (gmx_mm256_any_lt(rsq23,rcutoff2))
582 /* REACTION-FIELD ELECTROSTATICS */
583 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
584 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
586 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
588 /* Update potential sum for this i atom from the interaction with this j atom. */
589 velec = _mm256_and_pd(velec,cutoff_mask);
590 velecsum = _mm256_add_pd(velecsum,velec);
594 fscal = _mm256_and_pd(fscal,cutoff_mask);
596 /* Calculate temporary vectorial force */
597 tx = _mm256_mul_pd(fscal,dx23);
598 ty = _mm256_mul_pd(fscal,dy23);
599 tz = _mm256_mul_pd(fscal,dz23);
601 /* Update vectorial force */
602 fix2 = _mm256_add_pd(fix2,tx);
603 fiy2 = _mm256_add_pd(fiy2,ty);
604 fiz2 = _mm256_add_pd(fiz2,tz);
606 fjx3 = _mm256_add_pd(fjx3,tx);
607 fjy3 = _mm256_add_pd(fjy3,ty);
608 fjz3 = _mm256_add_pd(fjz3,tz);
612 /**************************
613 * CALCULATE INTERACTIONS *
614 **************************/
616 if (gmx_mm256_any_lt(rsq31,rcutoff2))
619 /* REACTION-FIELD ELECTROSTATICS */
620 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
621 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
623 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
625 /* Update potential sum for this i atom from the interaction with this j atom. */
626 velec = _mm256_and_pd(velec,cutoff_mask);
627 velecsum = _mm256_add_pd(velecsum,velec);
631 fscal = _mm256_and_pd(fscal,cutoff_mask);
633 /* Calculate temporary vectorial force */
634 tx = _mm256_mul_pd(fscal,dx31);
635 ty = _mm256_mul_pd(fscal,dy31);
636 tz = _mm256_mul_pd(fscal,dz31);
638 /* Update vectorial force */
639 fix3 = _mm256_add_pd(fix3,tx);
640 fiy3 = _mm256_add_pd(fiy3,ty);
641 fiz3 = _mm256_add_pd(fiz3,tz);
643 fjx1 = _mm256_add_pd(fjx1,tx);
644 fjy1 = _mm256_add_pd(fjy1,ty);
645 fjz1 = _mm256_add_pd(fjz1,tz);
649 /**************************
650 * CALCULATE INTERACTIONS *
651 **************************/
653 if (gmx_mm256_any_lt(rsq32,rcutoff2))
656 /* REACTION-FIELD ELECTROSTATICS */
657 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
658 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
660 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
662 /* Update potential sum for this i atom from the interaction with this j atom. */
663 velec = _mm256_and_pd(velec,cutoff_mask);
664 velecsum = _mm256_add_pd(velecsum,velec);
668 fscal = _mm256_and_pd(fscal,cutoff_mask);
670 /* Calculate temporary vectorial force */
671 tx = _mm256_mul_pd(fscal,dx32);
672 ty = _mm256_mul_pd(fscal,dy32);
673 tz = _mm256_mul_pd(fscal,dz32);
675 /* Update vectorial force */
676 fix3 = _mm256_add_pd(fix3,tx);
677 fiy3 = _mm256_add_pd(fiy3,ty);
678 fiz3 = _mm256_add_pd(fiz3,tz);
680 fjx2 = _mm256_add_pd(fjx2,tx);
681 fjy2 = _mm256_add_pd(fjy2,ty);
682 fjz2 = _mm256_add_pd(fjz2,tz);
686 /**************************
687 * CALCULATE INTERACTIONS *
688 **************************/
690 if (gmx_mm256_any_lt(rsq33,rcutoff2))
693 /* REACTION-FIELD ELECTROSTATICS */
694 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
695 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
697 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
699 /* Update potential sum for this i atom from the interaction with this j atom. */
700 velec = _mm256_and_pd(velec,cutoff_mask);
701 velecsum = _mm256_add_pd(velecsum,velec);
705 fscal = _mm256_and_pd(fscal,cutoff_mask);
707 /* Calculate temporary vectorial force */
708 tx = _mm256_mul_pd(fscal,dx33);
709 ty = _mm256_mul_pd(fscal,dy33);
710 tz = _mm256_mul_pd(fscal,dz33);
712 /* Update vectorial force */
713 fix3 = _mm256_add_pd(fix3,tx);
714 fiy3 = _mm256_add_pd(fiy3,ty);
715 fiz3 = _mm256_add_pd(fiz3,tz);
717 fjx3 = _mm256_add_pd(fjx3,tx);
718 fjy3 = _mm256_add_pd(fjy3,ty);
719 fjz3 = _mm256_add_pd(fjz3,tz);
723 fjptrA = f+j_coord_offsetA;
724 fjptrB = f+j_coord_offsetB;
725 fjptrC = f+j_coord_offsetC;
726 fjptrD = f+j_coord_offsetD;
728 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
729 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
730 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
732 /* Inner loop uses 386 flops */
738 /* Get j neighbor index, and coordinate index */
739 jnrlistA = jjnr[jidx];
740 jnrlistB = jjnr[jidx+1];
741 jnrlistC = jjnr[jidx+2];
742 jnrlistD = jjnr[jidx+3];
743 /* Sign of each element will be negative for non-real atoms.
744 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
745 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
747 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
749 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
750 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
751 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
753 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
754 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
755 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
756 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
757 j_coord_offsetA = DIM*jnrA;
758 j_coord_offsetB = DIM*jnrB;
759 j_coord_offsetC = DIM*jnrC;
760 j_coord_offsetD = DIM*jnrD;
762 /* load j atom coordinates */
763 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
764 x+j_coord_offsetC,x+j_coord_offsetD,
765 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
766 &jy2,&jz2,&jx3,&jy3,&jz3);
768 /* Calculate displacement vector */
769 dx00 = _mm256_sub_pd(ix0,jx0);
770 dy00 = _mm256_sub_pd(iy0,jy0);
771 dz00 = _mm256_sub_pd(iz0,jz0);
772 dx11 = _mm256_sub_pd(ix1,jx1);
773 dy11 = _mm256_sub_pd(iy1,jy1);
774 dz11 = _mm256_sub_pd(iz1,jz1);
775 dx12 = _mm256_sub_pd(ix1,jx2);
776 dy12 = _mm256_sub_pd(iy1,jy2);
777 dz12 = _mm256_sub_pd(iz1,jz2);
778 dx13 = _mm256_sub_pd(ix1,jx3);
779 dy13 = _mm256_sub_pd(iy1,jy3);
780 dz13 = _mm256_sub_pd(iz1,jz3);
781 dx21 = _mm256_sub_pd(ix2,jx1);
782 dy21 = _mm256_sub_pd(iy2,jy1);
783 dz21 = _mm256_sub_pd(iz2,jz1);
784 dx22 = _mm256_sub_pd(ix2,jx2);
785 dy22 = _mm256_sub_pd(iy2,jy2);
786 dz22 = _mm256_sub_pd(iz2,jz2);
787 dx23 = _mm256_sub_pd(ix2,jx3);
788 dy23 = _mm256_sub_pd(iy2,jy3);
789 dz23 = _mm256_sub_pd(iz2,jz3);
790 dx31 = _mm256_sub_pd(ix3,jx1);
791 dy31 = _mm256_sub_pd(iy3,jy1);
792 dz31 = _mm256_sub_pd(iz3,jz1);
793 dx32 = _mm256_sub_pd(ix3,jx2);
794 dy32 = _mm256_sub_pd(iy3,jy2);
795 dz32 = _mm256_sub_pd(iz3,jz2);
796 dx33 = _mm256_sub_pd(ix3,jx3);
797 dy33 = _mm256_sub_pd(iy3,jy3);
798 dz33 = _mm256_sub_pd(iz3,jz3);
800 /* Calculate squared distance and things based on it */
801 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
802 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
803 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
804 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
805 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
806 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
807 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
808 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
809 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
810 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
812 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
813 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
814 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
815 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
816 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
817 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
818 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
819 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
820 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
821 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
823 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
824 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
825 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
826 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
827 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
828 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
829 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
830 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
831 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
832 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
834 fjx0 = _mm256_setzero_pd();
835 fjy0 = _mm256_setzero_pd();
836 fjz0 = _mm256_setzero_pd();
837 fjx1 = _mm256_setzero_pd();
838 fjy1 = _mm256_setzero_pd();
839 fjz1 = _mm256_setzero_pd();
840 fjx2 = _mm256_setzero_pd();
841 fjy2 = _mm256_setzero_pd();
842 fjz2 = _mm256_setzero_pd();
843 fjx3 = _mm256_setzero_pd();
844 fjy3 = _mm256_setzero_pd();
845 fjz3 = _mm256_setzero_pd();
847 /**************************
848 * CALCULATE INTERACTIONS *
849 **************************/
851 if (gmx_mm256_any_lt(rsq00,rcutoff2))
854 r00 = _mm256_mul_pd(rsq00,rinv00);
855 r00 = _mm256_andnot_pd(dummy_mask,r00);
857 /* LENNARD-JONES DISPERSION/REPULSION */
859 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
860 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
861 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
862 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
863 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
865 d = _mm256_sub_pd(r00,rswitch);
866 d = _mm256_max_pd(d,_mm256_setzero_pd());
867 d2 = _mm256_mul_pd(d,d);
868 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
870 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
872 /* Evaluate switch function */
873 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
874 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
875 vvdw = _mm256_mul_pd(vvdw,sw);
876 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
878 /* Update potential sum for this i atom from the interaction with this j atom. */
879 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
880 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
881 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
885 fscal = _mm256_and_pd(fscal,cutoff_mask);
887 fscal = _mm256_andnot_pd(dummy_mask,fscal);
889 /* Calculate temporary vectorial force */
890 tx = _mm256_mul_pd(fscal,dx00);
891 ty = _mm256_mul_pd(fscal,dy00);
892 tz = _mm256_mul_pd(fscal,dz00);
894 /* Update vectorial force */
895 fix0 = _mm256_add_pd(fix0,tx);
896 fiy0 = _mm256_add_pd(fiy0,ty);
897 fiz0 = _mm256_add_pd(fiz0,tz);
899 fjx0 = _mm256_add_pd(fjx0,tx);
900 fjy0 = _mm256_add_pd(fjy0,ty);
901 fjz0 = _mm256_add_pd(fjz0,tz);
905 /**************************
906 * CALCULATE INTERACTIONS *
907 **************************/
909 if (gmx_mm256_any_lt(rsq11,rcutoff2))
912 /* REACTION-FIELD ELECTROSTATICS */
913 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
914 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
916 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
918 /* Update potential sum for this i atom from the interaction with this j atom. */
919 velec = _mm256_and_pd(velec,cutoff_mask);
920 velec = _mm256_andnot_pd(dummy_mask,velec);
921 velecsum = _mm256_add_pd(velecsum,velec);
925 fscal = _mm256_and_pd(fscal,cutoff_mask);
927 fscal = _mm256_andnot_pd(dummy_mask,fscal);
929 /* Calculate temporary vectorial force */
930 tx = _mm256_mul_pd(fscal,dx11);
931 ty = _mm256_mul_pd(fscal,dy11);
932 tz = _mm256_mul_pd(fscal,dz11);
934 /* Update vectorial force */
935 fix1 = _mm256_add_pd(fix1,tx);
936 fiy1 = _mm256_add_pd(fiy1,ty);
937 fiz1 = _mm256_add_pd(fiz1,tz);
939 fjx1 = _mm256_add_pd(fjx1,tx);
940 fjy1 = _mm256_add_pd(fjy1,ty);
941 fjz1 = _mm256_add_pd(fjz1,tz);
945 /**************************
946 * CALCULATE INTERACTIONS *
947 **************************/
949 if (gmx_mm256_any_lt(rsq12,rcutoff2))
952 /* REACTION-FIELD ELECTROSTATICS */
953 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
954 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
956 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
958 /* Update potential sum for this i atom from the interaction with this j atom. */
959 velec = _mm256_and_pd(velec,cutoff_mask);
960 velec = _mm256_andnot_pd(dummy_mask,velec);
961 velecsum = _mm256_add_pd(velecsum,velec);
965 fscal = _mm256_and_pd(fscal,cutoff_mask);
967 fscal = _mm256_andnot_pd(dummy_mask,fscal);
969 /* Calculate temporary vectorial force */
970 tx = _mm256_mul_pd(fscal,dx12);
971 ty = _mm256_mul_pd(fscal,dy12);
972 tz = _mm256_mul_pd(fscal,dz12);
974 /* Update vectorial force */
975 fix1 = _mm256_add_pd(fix1,tx);
976 fiy1 = _mm256_add_pd(fiy1,ty);
977 fiz1 = _mm256_add_pd(fiz1,tz);
979 fjx2 = _mm256_add_pd(fjx2,tx);
980 fjy2 = _mm256_add_pd(fjy2,ty);
981 fjz2 = _mm256_add_pd(fjz2,tz);
985 /**************************
986 * CALCULATE INTERACTIONS *
987 **************************/
989 if (gmx_mm256_any_lt(rsq13,rcutoff2))
992 /* REACTION-FIELD ELECTROSTATICS */
993 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
994 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
996 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
998 /* Update potential sum for this i atom from the interaction with this j atom. */
999 velec = _mm256_and_pd(velec,cutoff_mask);
1000 velec = _mm256_andnot_pd(dummy_mask,velec);
1001 velecsum = _mm256_add_pd(velecsum,velec);
1005 fscal = _mm256_and_pd(fscal,cutoff_mask);
1007 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1009 /* Calculate temporary vectorial force */
1010 tx = _mm256_mul_pd(fscal,dx13);
1011 ty = _mm256_mul_pd(fscal,dy13);
1012 tz = _mm256_mul_pd(fscal,dz13);
1014 /* Update vectorial force */
1015 fix1 = _mm256_add_pd(fix1,tx);
1016 fiy1 = _mm256_add_pd(fiy1,ty);
1017 fiz1 = _mm256_add_pd(fiz1,tz);
1019 fjx3 = _mm256_add_pd(fjx3,tx);
1020 fjy3 = _mm256_add_pd(fjy3,ty);
1021 fjz3 = _mm256_add_pd(fjz3,tz);
1025 /**************************
1026 * CALCULATE INTERACTIONS *
1027 **************************/
1029 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1032 /* REACTION-FIELD ELECTROSTATICS */
1033 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
1034 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1036 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1038 /* Update potential sum for this i atom from the interaction with this j atom. */
1039 velec = _mm256_and_pd(velec,cutoff_mask);
1040 velec = _mm256_andnot_pd(dummy_mask,velec);
1041 velecsum = _mm256_add_pd(velecsum,velec);
1045 fscal = _mm256_and_pd(fscal,cutoff_mask);
1047 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1049 /* Calculate temporary vectorial force */
1050 tx = _mm256_mul_pd(fscal,dx21);
1051 ty = _mm256_mul_pd(fscal,dy21);
1052 tz = _mm256_mul_pd(fscal,dz21);
1054 /* Update vectorial force */
1055 fix2 = _mm256_add_pd(fix2,tx);
1056 fiy2 = _mm256_add_pd(fiy2,ty);
1057 fiz2 = _mm256_add_pd(fiz2,tz);
1059 fjx1 = _mm256_add_pd(fjx1,tx);
1060 fjy1 = _mm256_add_pd(fjy1,ty);
1061 fjz1 = _mm256_add_pd(fjz1,tz);
1065 /**************************
1066 * CALCULATE INTERACTIONS *
1067 **************************/
1069 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1072 /* REACTION-FIELD ELECTROSTATICS */
1073 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
1074 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1076 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1078 /* Update potential sum for this i atom from the interaction with this j atom. */
1079 velec = _mm256_and_pd(velec,cutoff_mask);
1080 velec = _mm256_andnot_pd(dummy_mask,velec);
1081 velecsum = _mm256_add_pd(velecsum,velec);
1085 fscal = _mm256_and_pd(fscal,cutoff_mask);
1087 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1089 /* Calculate temporary vectorial force */
1090 tx = _mm256_mul_pd(fscal,dx22);
1091 ty = _mm256_mul_pd(fscal,dy22);
1092 tz = _mm256_mul_pd(fscal,dz22);
1094 /* Update vectorial force */
1095 fix2 = _mm256_add_pd(fix2,tx);
1096 fiy2 = _mm256_add_pd(fiy2,ty);
1097 fiz2 = _mm256_add_pd(fiz2,tz);
1099 fjx2 = _mm256_add_pd(fjx2,tx);
1100 fjy2 = _mm256_add_pd(fjy2,ty);
1101 fjz2 = _mm256_add_pd(fjz2,tz);
1105 /**************************
1106 * CALCULATE INTERACTIONS *
1107 **************************/
1109 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1112 /* REACTION-FIELD ELECTROSTATICS */
1113 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
1114 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1116 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1118 /* Update potential sum for this i atom from the interaction with this j atom. */
1119 velec = _mm256_and_pd(velec,cutoff_mask);
1120 velec = _mm256_andnot_pd(dummy_mask,velec);
1121 velecsum = _mm256_add_pd(velecsum,velec);
1125 fscal = _mm256_and_pd(fscal,cutoff_mask);
1127 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1129 /* Calculate temporary vectorial force */
1130 tx = _mm256_mul_pd(fscal,dx23);
1131 ty = _mm256_mul_pd(fscal,dy23);
1132 tz = _mm256_mul_pd(fscal,dz23);
1134 /* Update vectorial force */
1135 fix2 = _mm256_add_pd(fix2,tx);
1136 fiy2 = _mm256_add_pd(fiy2,ty);
1137 fiz2 = _mm256_add_pd(fiz2,tz);
1139 fjx3 = _mm256_add_pd(fjx3,tx);
1140 fjy3 = _mm256_add_pd(fjy3,ty);
1141 fjz3 = _mm256_add_pd(fjz3,tz);
1145 /**************************
1146 * CALCULATE INTERACTIONS *
1147 **************************/
1149 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1152 /* REACTION-FIELD ELECTROSTATICS */
1153 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
1154 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1156 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1158 /* Update potential sum for this i atom from the interaction with this j atom. */
1159 velec = _mm256_and_pd(velec,cutoff_mask);
1160 velec = _mm256_andnot_pd(dummy_mask,velec);
1161 velecsum = _mm256_add_pd(velecsum,velec);
1165 fscal = _mm256_and_pd(fscal,cutoff_mask);
1167 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1169 /* Calculate temporary vectorial force */
1170 tx = _mm256_mul_pd(fscal,dx31);
1171 ty = _mm256_mul_pd(fscal,dy31);
1172 tz = _mm256_mul_pd(fscal,dz31);
1174 /* Update vectorial force */
1175 fix3 = _mm256_add_pd(fix3,tx);
1176 fiy3 = _mm256_add_pd(fiy3,ty);
1177 fiz3 = _mm256_add_pd(fiz3,tz);
1179 fjx1 = _mm256_add_pd(fjx1,tx);
1180 fjy1 = _mm256_add_pd(fjy1,ty);
1181 fjz1 = _mm256_add_pd(fjz1,tz);
1185 /**************************
1186 * CALCULATE INTERACTIONS *
1187 **************************/
1189 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1192 /* REACTION-FIELD ELECTROSTATICS */
1193 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
1194 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1196 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1198 /* Update potential sum for this i atom from the interaction with this j atom. */
1199 velec = _mm256_and_pd(velec,cutoff_mask);
1200 velec = _mm256_andnot_pd(dummy_mask,velec);
1201 velecsum = _mm256_add_pd(velecsum,velec);
1205 fscal = _mm256_and_pd(fscal,cutoff_mask);
1207 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1209 /* Calculate temporary vectorial force */
1210 tx = _mm256_mul_pd(fscal,dx32);
1211 ty = _mm256_mul_pd(fscal,dy32);
1212 tz = _mm256_mul_pd(fscal,dz32);
1214 /* Update vectorial force */
1215 fix3 = _mm256_add_pd(fix3,tx);
1216 fiy3 = _mm256_add_pd(fiy3,ty);
1217 fiz3 = _mm256_add_pd(fiz3,tz);
1219 fjx2 = _mm256_add_pd(fjx2,tx);
1220 fjy2 = _mm256_add_pd(fjy2,ty);
1221 fjz2 = _mm256_add_pd(fjz2,tz);
1225 /**************************
1226 * CALCULATE INTERACTIONS *
1227 **************************/
1229 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1232 /* REACTION-FIELD ELECTROSTATICS */
1233 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
1234 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1236 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1238 /* Update potential sum for this i atom from the interaction with this j atom. */
1239 velec = _mm256_and_pd(velec,cutoff_mask);
1240 velec = _mm256_andnot_pd(dummy_mask,velec);
1241 velecsum = _mm256_add_pd(velecsum,velec);
1245 fscal = _mm256_and_pd(fscal,cutoff_mask);
1247 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1249 /* Calculate temporary vectorial force */
1250 tx = _mm256_mul_pd(fscal,dx33);
1251 ty = _mm256_mul_pd(fscal,dy33);
1252 tz = _mm256_mul_pd(fscal,dz33);
1254 /* Update vectorial force */
1255 fix3 = _mm256_add_pd(fix3,tx);
1256 fiy3 = _mm256_add_pd(fiy3,ty);
1257 fiz3 = _mm256_add_pd(fiz3,tz);
1259 fjx3 = _mm256_add_pd(fjx3,tx);
1260 fjy3 = _mm256_add_pd(fjy3,ty);
1261 fjz3 = _mm256_add_pd(fjz3,tz);
1265 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1266 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1267 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1268 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1270 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1271 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1272 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1274 /* Inner loop uses 387 flops */
1277 /* End of innermost loop */
1279 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1280 f+i_coord_offset,fshift+i_shift_offset);
1283 /* Update potential energies */
1284 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1285 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1287 /* Increment number of inner iterations */
1288 inneriter += j_index_end - j_index_start;
1290 /* Outer loop uses 26 flops */
1293 /* Increment number of outer iterations */
1296 /* Update outer/inner flops */
1298 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*387);
1301 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double
1302 * Electrostatics interaction: ReactionField
1303 * VdW interaction: LennardJones
1304 * Geometry: Water4-Water4
1305 * Calculate force/pot: Force
1308 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_avx_256_double
1309 (t_nblist * gmx_restrict nlist,
1310 rvec * gmx_restrict xx,
1311 rvec * gmx_restrict ff,
1312 t_forcerec * gmx_restrict fr,
1313 t_mdatoms * gmx_restrict mdatoms,
1314 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1315 t_nrnb * gmx_restrict nrnb)
1317 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1318 * just 0 for non-waters.
1319 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1320 * jnr indices corresponding to data put in the four positions in the SIMD register.
1322 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1323 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1324 int jnrA,jnrB,jnrC,jnrD;
1325 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1326 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1327 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1328 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1329 real rcutoff_scalar;
1330 real *shiftvec,*fshift,*x,*f;
1331 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1332 real scratch[4*DIM];
1333 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1334 real * vdwioffsetptr0;
1335 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1336 real * vdwioffsetptr1;
1337 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1338 real * vdwioffsetptr2;
1339 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1340 real * vdwioffsetptr3;
1341 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1342 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1343 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1344 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1345 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1346 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1347 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1348 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1349 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1350 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1351 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1352 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1353 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1354 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1355 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1356 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1357 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1358 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1359 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1360 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1363 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1366 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1367 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1368 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1369 real rswitch_scalar,d_scalar;
1370 __m256d dummy_mask,cutoff_mask;
1371 __m128 tmpmask0,tmpmask1;
1372 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1373 __m256d one = _mm256_set1_pd(1.0);
1374 __m256d two = _mm256_set1_pd(2.0);
1380 jindex = nlist->jindex;
1382 shiftidx = nlist->shift;
1384 shiftvec = fr->shift_vec[0];
1385 fshift = fr->fshift[0];
1386 facel = _mm256_set1_pd(fr->epsfac);
1387 charge = mdatoms->chargeA;
1388 krf = _mm256_set1_pd(fr->ic->k_rf);
1389 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
1390 crf = _mm256_set1_pd(fr->ic->c_rf);
1391 nvdwtype = fr->ntype;
1392 vdwparam = fr->nbfp;
1393 vdwtype = mdatoms->typeA;
1395 /* Setup water-specific parameters */
1396 inr = nlist->iinr[0];
1397 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1398 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1399 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1400 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1402 jq1 = _mm256_set1_pd(charge[inr+1]);
1403 jq2 = _mm256_set1_pd(charge[inr+2]);
1404 jq3 = _mm256_set1_pd(charge[inr+3]);
1405 vdwjidx0A = 2*vdwtype[inr+0];
1406 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1407 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1408 qq11 = _mm256_mul_pd(iq1,jq1);
1409 qq12 = _mm256_mul_pd(iq1,jq2);
1410 qq13 = _mm256_mul_pd(iq1,jq3);
1411 qq21 = _mm256_mul_pd(iq2,jq1);
1412 qq22 = _mm256_mul_pd(iq2,jq2);
1413 qq23 = _mm256_mul_pd(iq2,jq3);
1414 qq31 = _mm256_mul_pd(iq3,jq1);
1415 qq32 = _mm256_mul_pd(iq3,jq2);
1416 qq33 = _mm256_mul_pd(iq3,jq3);
1418 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1419 rcutoff_scalar = fr->rcoulomb;
1420 rcutoff = _mm256_set1_pd(rcutoff_scalar);
1421 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
1423 rswitch_scalar = fr->rvdw_switch;
1424 rswitch = _mm256_set1_pd(rswitch_scalar);
1425 /* Setup switch parameters */
1426 d_scalar = rcutoff_scalar-rswitch_scalar;
1427 d = _mm256_set1_pd(d_scalar);
1428 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1429 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1430 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1431 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1432 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1433 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1435 /* Avoid stupid compiler warnings */
1436 jnrA = jnrB = jnrC = jnrD = 0;
1437 j_coord_offsetA = 0;
1438 j_coord_offsetB = 0;
1439 j_coord_offsetC = 0;
1440 j_coord_offsetD = 0;
1445 for(iidx=0;iidx<4*DIM;iidx++)
1447 scratch[iidx] = 0.0;
1450 /* Start outer loop over neighborlists */
1451 for(iidx=0; iidx<nri; iidx++)
1453 /* Load shift vector for this list */
1454 i_shift_offset = DIM*shiftidx[iidx];
1456 /* Load limits for loop over neighbors */
1457 j_index_start = jindex[iidx];
1458 j_index_end = jindex[iidx+1];
1460 /* Get outer coordinate index */
1462 i_coord_offset = DIM*inr;
1464 /* Load i particle coords and add shift vector */
1465 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1466 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1468 fix0 = _mm256_setzero_pd();
1469 fiy0 = _mm256_setzero_pd();
1470 fiz0 = _mm256_setzero_pd();
1471 fix1 = _mm256_setzero_pd();
1472 fiy1 = _mm256_setzero_pd();
1473 fiz1 = _mm256_setzero_pd();
1474 fix2 = _mm256_setzero_pd();
1475 fiy2 = _mm256_setzero_pd();
1476 fiz2 = _mm256_setzero_pd();
1477 fix3 = _mm256_setzero_pd();
1478 fiy3 = _mm256_setzero_pd();
1479 fiz3 = _mm256_setzero_pd();
1481 /* Start inner kernel loop */
1482 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1485 /* Get j neighbor index, and coordinate index */
1487 jnrB = jjnr[jidx+1];
1488 jnrC = jjnr[jidx+2];
1489 jnrD = jjnr[jidx+3];
1490 j_coord_offsetA = DIM*jnrA;
1491 j_coord_offsetB = DIM*jnrB;
1492 j_coord_offsetC = DIM*jnrC;
1493 j_coord_offsetD = DIM*jnrD;
1495 /* load j atom coordinates */
1496 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1497 x+j_coord_offsetC,x+j_coord_offsetD,
1498 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1499 &jy2,&jz2,&jx3,&jy3,&jz3);
1501 /* Calculate displacement vector */
1502 dx00 = _mm256_sub_pd(ix0,jx0);
1503 dy00 = _mm256_sub_pd(iy0,jy0);
1504 dz00 = _mm256_sub_pd(iz0,jz0);
1505 dx11 = _mm256_sub_pd(ix1,jx1);
1506 dy11 = _mm256_sub_pd(iy1,jy1);
1507 dz11 = _mm256_sub_pd(iz1,jz1);
1508 dx12 = _mm256_sub_pd(ix1,jx2);
1509 dy12 = _mm256_sub_pd(iy1,jy2);
1510 dz12 = _mm256_sub_pd(iz1,jz2);
1511 dx13 = _mm256_sub_pd(ix1,jx3);
1512 dy13 = _mm256_sub_pd(iy1,jy3);
1513 dz13 = _mm256_sub_pd(iz1,jz3);
1514 dx21 = _mm256_sub_pd(ix2,jx1);
1515 dy21 = _mm256_sub_pd(iy2,jy1);
1516 dz21 = _mm256_sub_pd(iz2,jz1);
1517 dx22 = _mm256_sub_pd(ix2,jx2);
1518 dy22 = _mm256_sub_pd(iy2,jy2);
1519 dz22 = _mm256_sub_pd(iz2,jz2);
1520 dx23 = _mm256_sub_pd(ix2,jx3);
1521 dy23 = _mm256_sub_pd(iy2,jy3);
1522 dz23 = _mm256_sub_pd(iz2,jz3);
1523 dx31 = _mm256_sub_pd(ix3,jx1);
1524 dy31 = _mm256_sub_pd(iy3,jy1);
1525 dz31 = _mm256_sub_pd(iz3,jz1);
1526 dx32 = _mm256_sub_pd(ix3,jx2);
1527 dy32 = _mm256_sub_pd(iy3,jy2);
1528 dz32 = _mm256_sub_pd(iz3,jz2);
1529 dx33 = _mm256_sub_pd(ix3,jx3);
1530 dy33 = _mm256_sub_pd(iy3,jy3);
1531 dz33 = _mm256_sub_pd(iz3,jz3);
1533 /* Calculate squared distance and things based on it */
1534 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1535 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1536 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1537 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1538 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1539 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1540 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1541 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1542 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1543 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1545 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1546 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1547 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1548 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1549 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1550 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1551 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1552 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1553 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1554 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1556 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1557 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1558 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1559 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1560 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1561 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1562 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1563 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1564 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1565 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1567 fjx0 = _mm256_setzero_pd();
1568 fjy0 = _mm256_setzero_pd();
1569 fjz0 = _mm256_setzero_pd();
1570 fjx1 = _mm256_setzero_pd();
1571 fjy1 = _mm256_setzero_pd();
1572 fjz1 = _mm256_setzero_pd();
1573 fjx2 = _mm256_setzero_pd();
1574 fjy2 = _mm256_setzero_pd();
1575 fjz2 = _mm256_setzero_pd();
1576 fjx3 = _mm256_setzero_pd();
1577 fjy3 = _mm256_setzero_pd();
1578 fjz3 = _mm256_setzero_pd();
1580 /**************************
1581 * CALCULATE INTERACTIONS *
1582 **************************/
1584 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1587 r00 = _mm256_mul_pd(rsq00,rinv00);
1589 /* LENNARD-JONES DISPERSION/REPULSION */
1591 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1592 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1593 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1594 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1595 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1597 d = _mm256_sub_pd(r00,rswitch);
1598 d = _mm256_max_pd(d,_mm256_setzero_pd());
1599 d2 = _mm256_mul_pd(d,d);
1600 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1602 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1604 /* Evaluate switch function */
1605 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1606 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1607 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1611 fscal = _mm256_and_pd(fscal,cutoff_mask);
1613 /* Calculate temporary vectorial force */
1614 tx = _mm256_mul_pd(fscal,dx00);
1615 ty = _mm256_mul_pd(fscal,dy00);
1616 tz = _mm256_mul_pd(fscal,dz00);
1618 /* Update vectorial force */
1619 fix0 = _mm256_add_pd(fix0,tx);
1620 fiy0 = _mm256_add_pd(fiy0,ty);
1621 fiz0 = _mm256_add_pd(fiz0,tz);
1623 fjx0 = _mm256_add_pd(fjx0,tx);
1624 fjy0 = _mm256_add_pd(fjy0,ty);
1625 fjz0 = _mm256_add_pd(fjz0,tz);
1629 /**************************
1630 * CALCULATE INTERACTIONS *
1631 **************************/
1633 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1636 /* REACTION-FIELD ELECTROSTATICS */
1637 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1639 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
1643 fscal = _mm256_and_pd(fscal,cutoff_mask);
1645 /* Calculate temporary vectorial force */
1646 tx = _mm256_mul_pd(fscal,dx11);
1647 ty = _mm256_mul_pd(fscal,dy11);
1648 tz = _mm256_mul_pd(fscal,dz11);
1650 /* Update vectorial force */
1651 fix1 = _mm256_add_pd(fix1,tx);
1652 fiy1 = _mm256_add_pd(fiy1,ty);
1653 fiz1 = _mm256_add_pd(fiz1,tz);
1655 fjx1 = _mm256_add_pd(fjx1,tx);
1656 fjy1 = _mm256_add_pd(fjy1,ty);
1657 fjz1 = _mm256_add_pd(fjz1,tz);
1661 /**************************
1662 * CALCULATE INTERACTIONS *
1663 **************************/
1665 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1668 /* REACTION-FIELD ELECTROSTATICS */
1669 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1671 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
1675 fscal = _mm256_and_pd(fscal,cutoff_mask);
1677 /* Calculate temporary vectorial force */
1678 tx = _mm256_mul_pd(fscal,dx12);
1679 ty = _mm256_mul_pd(fscal,dy12);
1680 tz = _mm256_mul_pd(fscal,dz12);
1682 /* Update vectorial force */
1683 fix1 = _mm256_add_pd(fix1,tx);
1684 fiy1 = _mm256_add_pd(fiy1,ty);
1685 fiz1 = _mm256_add_pd(fiz1,tz);
1687 fjx2 = _mm256_add_pd(fjx2,tx);
1688 fjy2 = _mm256_add_pd(fjy2,ty);
1689 fjz2 = _mm256_add_pd(fjz2,tz);
1693 /**************************
1694 * CALCULATE INTERACTIONS *
1695 **************************/
1697 if (gmx_mm256_any_lt(rsq13,rcutoff2))
1700 /* REACTION-FIELD ELECTROSTATICS */
1701 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1703 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
1707 fscal = _mm256_and_pd(fscal,cutoff_mask);
1709 /* Calculate temporary vectorial force */
1710 tx = _mm256_mul_pd(fscal,dx13);
1711 ty = _mm256_mul_pd(fscal,dy13);
1712 tz = _mm256_mul_pd(fscal,dz13);
1714 /* Update vectorial force */
1715 fix1 = _mm256_add_pd(fix1,tx);
1716 fiy1 = _mm256_add_pd(fiy1,ty);
1717 fiz1 = _mm256_add_pd(fiz1,tz);
1719 fjx3 = _mm256_add_pd(fjx3,tx);
1720 fjy3 = _mm256_add_pd(fjy3,ty);
1721 fjz3 = _mm256_add_pd(fjz3,tz);
1725 /**************************
1726 * CALCULATE INTERACTIONS *
1727 **************************/
1729 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1732 /* REACTION-FIELD ELECTROSTATICS */
1733 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1735 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
1739 fscal = _mm256_and_pd(fscal,cutoff_mask);
1741 /* Calculate temporary vectorial force */
1742 tx = _mm256_mul_pd(fscal,dx21);
1743 ty = _mm256_mul_pd(fscal,dy21);
1744 tz = _mm256_mul_pd(fscal,dz21);
1746 /* Update vectorial force */
1747 fix2 = _mm256_add_pd(fix2,tx);
1748 fiy2 = _mm256_add_pd(fiy2,ty);
1749 fiz2 = _mm256_add_pd(fiz2,tz);
1751 fjx1 = _mm256_add_pd(fjx1,tx);
1752 fjy1 = _mm256_add_pd(fjy1,ty);
1753 fjz1 = _mm256_add_pd(fjz1,tz);
1757 /**************************
1758 * CALCULATE INTERACTIONS *
1759 **************************/
1761 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1764 /* REACTION-FIELD ELECTROSTATICS */
1765 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1767 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
1771 fscal = _mm256_and_pd(fscal,cutoff_mask);
1773 /* Calculate temporary vectorial force */
1774 tx = _mm256_mul_pd(fscal,dx22);
1775 ty = _mm256_mul_pd(fscal,dy22);
1776 tz = _mm256_mul_pd(fscal,dz22);
1778 /* Update vectorial force */
1779 fix2 = _mm256_add_pd(fix2,tx);
1780 fiy2 = _mm256_add_pd(fiy2,ty);
1781 fiz2 = _mm256_add_pd(fiz2,tz);
1783 fjx2 = _mm256_add_pd(fjx2,tx);
1784 fjy2 = _mm256_add_pd(fjy2,ty);
1785 fjz2 = _mm256_add_pd(fjz2,tz);
1789 /**************************
1790 * CALCULATE INTERACTIONS *
1791 **************************/
1793 if (gmx_mm256_any_lt(rsq23,rcutoff2))
1796 /* REACTION-FIELD ELECTROSTATICS */
1797 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1799 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
1803 fscal = _mm256_and_pd(fscal,cutoff_mask);
1805 /* Calculate temporary vectorial force */
1806 tx = _mm256_mul_pd(fscal,dx23);
1807 ty = _mm256_mul_pd(fscal,dy23);
1808 tz = _mm256_mul_pd(fscal,dz23);
1810 /* Update vectorial force */
1811 fix2 = _mm256_add_pd(fix2,tx);
1812 fiy2 = _mm256_add_pd(fiy2,ty);
1813 fiz2 = _mm256_add_pd(fiz2,tz);
1815 fjx3 = _mm256_add_pd(fjx3,tx);
1816 fjy3 = _mm256_add_pd(fjy3,ty);
1817 fjz3 = _mm256_add_pd(fjz3,tz);
1821 /**************************
1822 * CALCULATE INTERACTIONS *
1823 **************************/
1825 if (gmx_mm256_any_lt(rsq31,rcutoff2))
1828 /* REACTION-FIELD ELECTROSTATICS */
1829 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1831 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
1835 fscal = _mm256_and_pd(fscal,cutoff_mask);
1837 /* Calculate temporary vectorial force */
1838 tx = _mm256_mul_pd(fscal,dx31);
1839 ty = _mm256_mul_pd(fscal,dy31);
1840 tz = _mm256_mul_pd(fscal,dz31);
1842 /* Update vectorial force */
1843 fix3 = _mm256_add_pd(fix3,tx);
1844 fiy3 = _mm256_add_pd(fiy3,ty);
1845 fiz3 = _mm256_add_pd(fiz3,tz);
1847 fjx1 = _mm256_add_pd(fjx1,tx);
1848 fjy1 = _mm256_add_pd(fjy1,ty);
1849 fjz1 = _mm256_add_pd(fjz1,tz);
1853 /**************************
1854 * CALCULATE INTERACTIONS *
1855 **************************/
1857 if (gmx_mm256_any_lt(rsq32,rcutoff2))
1860 /* REACTION-FIELD ELECTROSTATICS */
1861 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1863 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
1867 fscal = _mm256_and_pd(fscal,cutoff_mask);
1869 /* Calculate temporary vectorial force */
1870 tx = _mm256_mul_pd(fscal,dx32);
1871 ty = _mm256_mul_pd(fscal,dy32);
1872 tz = _mm256_mul_pd(fscal,dz32);
1874 /* Update vectorial force */
1875 fix3 = _mm256_add_pd(fix3,tx);
1876 fiy3 = _mm256_add_pd(fiy3,ty);
1877 fiz3 = _mm256_add_pd(fiz3,tz);
1879 fjx2 = _mm256_add_pd(fjx2,tx);
1880 fjy2 = _mm256_add_pd(fjy2,ty);
1881 fjz2 = _mm256_add_pd(fjz2,tz);
1885 /**************************
1886 * CALCULATE INTERACTIONS *
1887 **************************/
1889 if (gmx_mm256_any_lt(rsq33,rcutoff2))
1892 /* REACTION-FIELD ELECTROSTATICS */
1893 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1895 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
1899 fscal = _mm256_and_pd(fscal,cutoff_mask);
1901 /* Calculate temporary vectorial force */
1902 tx = _mm256_mul_pd(fscal,dx33);
1903 ty = _mm256_mul_pd(fscal,dy33);
1904 tz = _mm256_mul_pd(fscal,dz33);
1906 /* Update vectorial force */
1907 fix3 = _mm256_add_pd(fix3,tx);
1908 fiy3 = _mm256_add_pd(fiy3,ty);
1909 fiz3 = _mm256_add_pd(fiz3,tz);
1911 fjx3 = _mm256_add_pd(fjx3,tx);
1912 fjy3 = _mm256_add_pd(fjy3,ty);
1913 fjz3 = _mm256_add_pd(fjz3,tz);
1917 fjptrA = f+j_coord_offsetA;
1918 fjptrB = f+j_coord_offsetB;
1919 fjptrC = f+j_coord_offsetC;
1920 fjptrD = f+j_coord_offsetD;
1922 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1923 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1924 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1926 /* Inner loop uses 329 flops */
1929 if(jidx<j_index_end)
1932 /* Get j neighbor index, and coordinate index */
1933 jnrlistA = jjnr[jidx];
1934 jnrlistB = jjnr[jidx+1];
1935 jnrlistC = jjnr[jidx+2];
1936 jnrlistD = jjnr[jidx+3];
1937 /* Sign of each element will be negative for non-real atoms.
1938 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1939 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1941 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1943 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1944 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1945 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1947 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1948 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1949 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1950 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1951 j_coord_offsetA = DIM*jnrA;
1952 j_coord_offsetB = DIM*jnrB;
1953 j_coord_offsetC = DIM*jnrC;
1954 j_coord_offsetD = DIM*jnrD;
1956 /* load j atom coordinates */
1957 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1958 x+j_coord_offsetC,x+j_coord_offsetD,
1959 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1960 &jy2,&jz2,&jx3,&jy3,&jz3);
1962 /* Calculate displacement vector */
1963 dx00 = _mm256_sub_pd(ix0,jx0);
1964 dy00 = _mm256_sub_pd(iy0,jy0);
1965 dz00 = _mm256_sub_pd(iz0,jz0);
1966 dx11 = _mm256_sub_pd(ix1,jx1);
1967 dy11 = _mm256_sub_pd(iy1,jy1);
1968 dz11 = _mm256_sub_pd(iz1,jz1);
1969 dx12 = _mm256_sub_pd(ix1,jx2);
1970 dy12 = _mm256_sub_pd(iy1,jy2);
1971 dz12 = _mm256_sub_pd(iz1,jz2);
1972 dx13 = _mm256_sub_pd(ix1,jx3);
1973 dy13 = _mm256_sub_pd(iy1,jy3);
1974 dz13 = _mm256_sub_pd(iz1,jz3);
1975 dx21 = _mm256_sub_pd(ix2,jx1);
1976 dy21 = _mm256_sub_pd(iy2,jy1);
1977 dz21 = _mm256_sub_pd(iz2,jz1);
1978 dx22 = _mm256_sub_pd(ix2,jx2);
1979 dy22 = _mm256_sub_pd(iy2,jy2);
1980 dz22 = _mm256_sub_pd(iz2,jz2);
1981 dx23 = _mm256_sub_pd(ix2,jx3);
1982 dy23 = _mm256_sub_pd(iy2,jy3);
1983 dz23 = _mm256_sub_pd(iz2,jz3);
1984 dx31 = _mm256_sub_pd(ix3,jx1);
1985 dy31 = _mm256_sub_pd(iy3,jy1);
1986 dz31 = _mm256_sub_pd(iz3,jz1);
1987 dx32 = _mm256_sub_pd(ix3,jx2);
1988 dy32 = _mm256_sub_pd(iy3,jy2);
1989 dz32 = _mm256_sub_pd(iz3,jz2);
1990 dx33 = _mm256_sub_pd(ix3,jx3);
1991 dy33 = _mm256_sub_pd(iy3,jy3);
1992 dz33 = _mm256_sub_pd(iz3,jz3);
1994 /* Calculate squared distance and things based on it */
1995 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1996 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1997 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1998 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1999 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
2000 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
2001 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
2002 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
2003 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
2004 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
2006 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
2007 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
2008 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
2009 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
2010 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
2011 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
2012 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
2013 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
2014 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
2015 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
2017 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
2018 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
2019 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
2020 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
2021 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
2022 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
2023 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
2024 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
2025 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
2026 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
2028 fjx0 = _mm256_setzero_pd();
2029 fjy0 = _mm256_setzero_pd();
2030 fjz0 = _mm256_setzero_pd();
2031 fjx1 = _mm256_setzero_pd();
2032 fjy1 = _mm256_setzero_pd();
2033 fjz1 = _mm256_setzero_pd();
2034 fjx2 = _mm256_setzero_pd();
2035 fjy2 = _mm256_setzero_pd();
2036 fjz2 = _mm256_setzero_pd();
2037 fjx3 = _mm256_setzero_pd();
2038 fjy3 = _mm256_setzero_pd();
2039 fjz3 = _mm256_setzero_pd();
2041 /**************************
2042 * CALCULATE INTERACTIONS *
2043 **************************/
2045 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2048 r00 = _mm256_mul_pd(rsq00,rinv00);
2049 r00 = _mm256_andnot_pd(dummy_mask,r00);
2051 /* LENNARD-JONES DISPERSION/REPULSION */
2053 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2054 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
2055 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
2056 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
2057 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
2059 d = _mm256_sub_pd(r00,rswitch);
2060 d = _mm256_max_pd(d,_mm256_setzero_pd());
2061 d2 = _mm256_mul_pd(d,d);
2062 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
2064 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
2066 /* Evaluate switch function */
2067 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2068 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
2069 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
2073 fscal = _mm256_and_pd(fscal,cutoff_mask);
2075 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2077 /* Calculate temporary vectorial force */
2078 tx = _mm256_mul_pd(fscal,dx00);
2079 ty = _mm256_mul_pd(fscal,dy00);
2080 tz = _mm256_mul_pd(fscal,dz00);
2082 /* Update vectorial force */
2083 fix0 = _mm256_add_pd(fix0,tx);
2084 fiy0 = _mm256_add_pd(fiy0,ty);
2085 fiz0 = _mm256_add_pd(fiz0,tz);
2087 fjx0 = _mm256_add_pd(fjx0,tx);
2088 fjy0 = _mm256_add_pd(fjy0,ty);
2089 fjz0 = _mm256_add_pd(fjz0,tz);
2093 /**************************
2094 * CALCULATE INTERACTIONS *
2095 **************************/
2097 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2100 /* REACTION-FIELD ELECTROSTATICS */
2101 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
2103 cutoff_mask = _mm256_cmp_pd(rsq11,rcutoff2,_CMP_LT_OQ);
2107 fscal = _mm256_and_pd(fscal,cutoff_mask);
2109 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2111 /* Calculate temporary vectorial force */
2112 tx = _mm256_mul_pd(fscal,dx11);
2113 ty = _mm256_mul_pd(fscal,dy11);
2114 tz = _mm256_mul_pd(fscal,dz11);
2116 /* Update vectorial force */
2117 fix1 = _mm256_add_pd(fix1,tx);
2118 fiy1 = _mm256_add_pd(fiy1,ty);
2119 fiz1 = _mm256_add_pd(fiz1,tz);
2121 fjx1 = _mm256_add_pd(fjx1,tx);
2122 fjy1 = _mm256_add_pd(fjy1,ty);
2123 fjz1 = _mm256_add_pd(fjz1,tz);
2127 /**************************
2128 * CALCULATE INTERACTIONS *
2129 **************************/
2131 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2134 /* REACTION-FIELD ELECTROSTATICS */
2135 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
2137 cutoff_mask = _mm256_cmp_pd(rsq12,rcutoff2,_CMP_LT_OQ);
2141 fscal = _mm256_and_pd(fscal,cutoff_mask);
2143 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2145 /* Calculate temporary vectorial force */
2146 tx = _mm256_mul_pd(fscal,dx12);
2147 ty = _mm256_mul_pd(fscal,dy12);
2148 tz = _mm256_mul_pd(fscal,dz12);
2150 /* Update vectorial force */
2151 fix1 = _mm256_add_pd(fix1,tx);
2152 fiy1 = _mm256_add_pd(fiy1,ty);
2153 fiz1 = _mm256_add_pd(fiz1,tz);
2155 fjx2 = _mm256_add_pd(fjx2,tx);
2156 fjy2 = _mm256_add_pd(fjy2,ty);
2157 fjz2 = _mm256_add_pd(fjz2,tz);
2161 /**************************
2162 * CALCULATE INTERACTIONS *
2163 **************************/
2165 if (gmx_mm256_any_lt(rsq13,rcutoff2))
2168 /* REACTION-FIELD ELECTROSTATICS */
2169 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
2171 cutoff_mask = _mm256_cmp_pd(rsq13,rcutoff2,_CMP_LT_OQ);
2175 fscal = _mm256_and_pd(fscal,cutoff_mask);
2177 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2179 /* Calculate temporary vectorial force */
2180 tx = _mm256_mul_pd(fscal,dx13);
2181 ty = _mm256_mul_pd(fscal,dy13);
2182 tz = _mm256_mul_pd(fscal,dz13);
2184 /* Update vectorial force */
2185 fix1 = _mm256_add_pd(fix1,tx);
2186 fiy1 = _mm256_add_pd(fiy1,ty);
2187 fiz1 = _mm256_add_pd(fiz1,tz);
2189 fjx3 = _mm256_add_pd(fjx3,tx);
2190 fjy3 = _mm256_add_pd(fjy3,ty);
2191 fjz3 = _mm256_add_pd(fjz3,tz);
2195 /**************************
2196 * CALCULATE INTERACTIONS *
2197 **************************/
2199 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2202 /* REACTION-FIELD ELECTROSTATICS */
2203 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
2205 cutoff_mask = _mm256_cmp_pd(rsq21,rcutoff2,_CMP_LT_OQ);
2209 fscal = _mm256_and_pd(fscal,cutoff_mask);
2211 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2213 /* Calculate temporary vectorial force */
2214 tx = _mm256_mul_pd(fscal,dx21);
2215 ty = _mm256_mul_pd(fscal,dy21);
2216 tz = _mm256_mul_pd(fscal,dz21);
2218 /* Update vectorial force */
2219 fix2 = _mm256_add_pd(fix2,tx);
2220 fiy2 = _mm256_add_pd(fiy2,ty);
2221 fiz2 = _mm256_add_pd(fiz2,tz);
2223 fjx1 = _mm256_add_pd(fjx1,tx);
2224 fjy1 = _mm256_add_pd(fjy1,ty);
2225 fjz1 = _mm256_add_pd(fjz1,tz);
2229 /**************************
2230 * CALCULATE INTERACTIONS *
2231 **************************/
2233 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2236 /* REACTION-FIELD ELECTROSTATICS */
2237 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
2239 cutoff_mask = _mm256_cmp_pd(rsq22,rcutoff2,_CMP_LT_OQ);
2243 fscal = _mm256_and_pd(fscal,cutoff_mask);
2245 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2247 /* Calculate temporary vectorial force */
2248 tx = _mm256_mul_pd(fscal,dx22);
2249 ty = _mm256_mul_pd(fscal,dy22);
2250 tz = _mm256_mul_pd(fscal,dz22);
2252 /* Update vectorial force */
2253 fix2 = _mm256_add_pd(fix2,tx);
2254 fiy2 = _mm256_add_pd(fiy2,ty);
2255 fiz2 = _mm256_add_pd(fiz2,tz);
2257 fjx2 = _mm256_add_pd(fjx2,tx);
2258 fjy2 = _mm256_add_pd(fjy2,ty);
2259 fjz2 = _mm256_add_pd(fjz2,tz);
2263 /**************************
2264 * CALCULATE INTERACTIONS *
2265 **************************/
2267 if (gmx_mm256_any_lt(rsq23,rcutoff2))
2270 /* REACTION-FIELD ELECTROSTATICS */
2271 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
2273 cutoff_mask = _mm256_cmp_pd(rsq23,rcutoff2,_CMP_LT_OQ);
2277 fscal = _mm256_and_pd(fscal,cutoff_mask);
2279 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2281 /* Calculate temporary vectorial force */
2282 tx = _mm256_mul_pd(fscal,dx23);
2283 ty = _mm256_mul_pd(fscal,dy23);
2284 tz = _mm256_mul_pd(fscal,dz23);
2286 /* Update vectorial force */
2287 fix2 = _mm256_add_pd(fix2,tx);
2288 fiy2 = _mm256_add_pd(fiy2,ty);
2289 fiz2 = _mm256_add_pd(fiz2,tz);
2291 fjx3 = _mm256_add_pd(fjx3,tx);
2292 fjy3 = _mm256_add_pd(fjy3,ty);
2293 fjz3 = _mm256_add_pd(fjz3,tz);
2297 /**************************
2298 * CALCULATE INTERACTIONS *
2299 **************************/
2301 if (gmx_mm256_any_lt(rsq31,rcutoff2))
2304 /* REACTION-FIELD ELECTROSTATICS */
2305 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
2307 cutoff_mask = _mm256_cmp_pd(rsq31,rcutoff2,_CMP_LT_OQ);
2311 fscal = _mm256_and_pd(fscal,cutoff_mask);
2313 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2315 /* Calculate temporary vectorial force */
2316 tx = _mm256_mul_pd(fscal,dx31);
2317 ty = _mm256_mul_pd(fscal,dy31);
2318 tz = _mm256_mul_pd(fscal,dz31);
2320 /* Update vectorial force */
2321 fix3 = _mm256_add_pd(fix3,tx);
2322 fiy3 = _mm256_add_pd(fiy3,ty);
2323 fiz3 = _mm256_add_pd(fiz3,tz);
2325 fjx1 = _mm256_add_pd(fjx1,tx);
2326 fjy1 = _mm256_add_pd(fjy1,ty);
2327 fjz1 = _mm256_add_pd(fjz1,tz);
2331 /**************************
2332 * CALCULATE INTERACTIONS *
2333 **************************/
2335 if (gmx_mm256_any_lt(rsq32,rcutoff2))
2338 /* REACTION-FIELD ELECTROSTATICS */
2339 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
2341 cutoff_mask = _mm256_cmp_pd(rsq32,rcutoff2,_CMP_LT_OQ);
2345 fscal = _mm256_and_pd(fscal,cutoff_mask);
2347 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2349 /* Calculate temporary vectorial force */
2350 tx = _mm256_mul_pd(fscal,dx32);
2351 ty = _mm256_mul_pd(fscal,dy32);
2352 tz = _mm256_mul_pd(fscal,dz32);
2354 /* Update vectorial force */
2355 fix3 = _mm256_add_pd(fix3,tx);
2356 fiy3 = _mm256_add_pd(fiy3,ty);
2357 fiz3 = _mm256_add_pd(fiz3,tz);
2359 fjx2 = _mm256_add_pd(fjx2,tx);
2360 fjy2 = _mm256_add_pd(fjy2,ty);
2361 fjz2 = _mm256_add_pd(fjz2,tz);
2365 /**************************
2366 * CALCULATE INTERACTIONS *
2367 **************************/
2369 if (gmx_mm256_any_lt(rsq33,rcutoff2))
2372 /* REACTION-FIELD ELECTROSTATICS */
2373 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
2375 cutoff_mask = _mm256_cmp_pd(rsq33,rcutoff2,_CMP_LT_OQ);
2379 fscal = _mm256_and_pd(fscal,cutoff_mask);
2381 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2383 /* Calculate temporary vectorial force */
2384 tx = _mm256_mul_pd(fscal,dx33);
2385 ty = _mm256_mul_pd(fscal,dy33);
2386 tz = _mm256_mul_pd(fscal,dz33);
2388 /* Update vectorial force */
2389 fix3 = _mm256_add_pd(fix3,tx);
2390 fiy3 = _mm256_add_pd(fiy3,ty);
2391 fiz3 = _mm256_add_pd(fiz3,tz);
2393 fjx3 = _mm256_add_pd(fjx3,tx);
2394 fjy3 = _mm256_add_pd(fjy3,ty);
2395 fjz3 = _mm256_add_pd(fjz3,tz);
2399 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2400 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2401 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2402 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2404 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2405 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2406 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2408 /* Inner loop uses 330 flops */
2411 /* End of innermost loop */
2413 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2414 f+i_coord_offset,fshift+i_shift_offset);
2416 /* Increment number of inner iterations */
2417 inneriter += j_index_end - j_index_start;
2419 /* Outer loop uses 24 flops */
2422 /* Increment number of outer iterations */
2425 /* Update outer/inner flops */
2427 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*330);