2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_256_double
52 * Electrostatics interaction: ReactionField
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
98 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
99 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
103 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
108 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
112 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
114 __m128i ifour = _mm_set1_epi32(4);
115 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
117 __m256d dummy_mask,cutoff_mask;
118 __m128 tmpmask0,tmpmask1;
119 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
120 __m256d one = _mm256_set1_pd(1.0);
121 __m256d two = _mm256_set1_pd(2.0);
127 jindex = nlist->jindex;
129 shiftidx = nlist->shift;
131 shiftvec = fr->shift_vec[0];
132 fshift = fr->fshift[0];
133 facel = _mm256_set1_pd(fr->epsfac);
134 charge = mdatoms->chargeA;
135 krf = _mm256_set1_pd(fr->ic->k_rf);
136 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
137 crf = _mm256_set1_pd(fr->ic->c_rf);
138 nvdwtype = fr->ntype;
140 vdwtype = mdatoms->typeA;
142 vftab = kernel_data->table_vdw->data;
143 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
145 /* Setup water-specific parameters */
146 inr = nlist->iinr[0];
147 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
148 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
149 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
150 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
152 jq0 = _mm256_set1_pd(charge[inr+0]);
153 jq1 = _mm256_set1_pd(charge[inr+1]);
154 jq2 = _mm256_set1_pd(charge[inr+2]);
155 vdwjidx0A = 2*vdwtype[inr+0];
156 qq00 = _mm256_mul_pd(iq0,jq0);
157 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
158 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
159 qq01 = _mm256_mul_pd(iq0,jq1);
160 qq02 = _mm256_mul_pd(iq0,jq2);
161 qq10 = _mm256_mul_pd(iq1,jq0);
162 qq11 = _mm256_mul_pd(iq1,jq1);
163 qq12 = _mm256_mul_pd(iq1,jq2);
164 qq20 = _mm256_mul_pd(iq2,jq0);
165 qq21 = _mm256_mul_pd(iq2,jq1);
166 qq22 = _mm256_mul_pd(iq2,jq2);
168 /* Avoid stupid compiler warnings */
169 jnrA = jnrB = jnrC = jnrD = 0;
178 for(iidx=0;iidx<4*DIM;iidx++)
183 /* Start outer loop over neighborlists */
184 for(iidx=0; iidx<nri; iidx++)
186 /* Load shift vector for this list */
187 i_shift_offset = DIM*shiftidx[iidx];
189 /* Load limits for loop over neighbors */
190 j_index_start = jindex[iidx];
191 j_index_end = jindex[iidx+1];
193 /* Get outer coordinate index */
195 i_coord_offset = DIM*inr;
197 /* Load i particle coords and add shift vector */
198 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
199 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
201 fix0 = _mm256_setzero_pd();
202 fiy0 = _mm256_setzero_pd();
203 fiz0 = _mm256_setzero_pd();
204 fix1 = _mm256_setzero_pd();
205 fiy1 = _mm256_setzero_pd();
206 fiz1 = _mm256_setzero_pd();
207 fix2 = _mm256_setzero_pd();
208 fiy2 = _mm256_setzero_pd();
209 fiz2 = _mm256_setzero_pd();
211 /* Reset potential sums */
212 velecsum = _mm256_setzero_pd();
213 vvdwsum = _mm256_setzero_pd();
215 /* Start inner kernel loop */
216 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
219 /* Get j neighbor index, and coordinate index */
224 j_coord_offsetA = DIM*jnrA;
225 j_coord_offsetB = DIM*jnrB;
226 j_coord_offsetC = DIM*jnrC;
227 j_coord_offsetD = DIM*jnrD;
229 /* load j atom coordinates */
230 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
231 x+j_coord_offsetC,x+j_coord_offsetD,
232 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
234 /* Calculate displacement vector */
235 dx00 = _mm256_sub_pd(ix0,jx0);
236 dy00 = _mm256_sub_pd(iy0,jy0);
237 dz00 = _mm256_sub_pd(iz0,jz0);
238 dx01 = _mm256_sub_pd(ix0,jx1);
239 dy01 = _mm256_sub_pd(iy0,jy1);
240 dz01 = _mm256_sub_pd(iz0,jz1);
241 dx02 = _mm256_sub_pd(ix0,jx2);
242 dy02 = _mm256_sub_pd(iy0,jy2);
243 dz02 = _mm256_sub_pd(iz0,jz2);
244 dx10 = _mm256_sub_pd(ix1,jx0);
245 dy10 = _mm256_sub_pd(iy1,jy0);
246 dz10 = _mm256_sub_pd(iz1,jz0);
247 dx11 = _mm256_sub_pd(ix1,jx1);
248 dy11 = _mm256_sub_pd(iy1,jy1);
249 dz11 = _mm256_sub_pd(iz1,jz1);
250 dx12 = _mm256_sub_pd(ix1,jx2);
251 dy12 = _mm256_sub_pd(iy1,jy2);
252 dz12 = _mm256_sub_pd(iz1,jz2);
253 dx20 = _mm256_sub_pd(ix2,jx0);
254 dy20 = _mm256_sub_pd(iy2,jy0);
255 dz20 = _mm256_sub_pd(iz2,jz0);
256 dx21 = _mm256_sub_pd(ix2,jx1);
257 dy21 = _mm256_sub_pd(iy2,jy1);
258 dz21 = _mm256_sub_pd(iz2,jz1);
259 dx22 = _mm256_sub_pd(ix2,jx2);
260 dy22 = _mm256_sub_pd(iy2,jy2);
261 dz22 = _mm256_sub_pd(iz2,jz2);
263 /* Calculate squared distance and things based on it */
264 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
265 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
266 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
267 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
268 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
269 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
270 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
271 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
272 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
274 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
275 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
276 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
277 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
278 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
279 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
280 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
281 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
282 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
284 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
285 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
286 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
287 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
288 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
289 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
290 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
291 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
292 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
294 fjx0 = _mm256_setzero_pd();
295 fjy0 = _mm256_setzero_pd();
296 fjz0 = _mm256_setzero_pd();
297 fjx1 = _mm256_setzero_pd();
298 fjy1 = _mm256_setzero_pd();
299 fjz1 = _mm256_setzero_pd();
300 fjx2 = _mm256_setzero_pd();
301 fjy2 = _mm256_setzero_pd();
302 fjz2 = _mm256_setzero_pd();
304 /**************************
305 * CALCULATE INTERACTIONS *
306 **************************/
308 r00 = _mm256_mul_pd(rsq00,rinv00);
310 /* Calculate table index by multiplying r with table scale and truncate to integer */
311 rt = _mm256_mul_pd(r00,vftabscale);
312 vfitab = _mm256_cvttpd_epi32(rt);
313 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
314 vfitab = _mm_slli_epi32(vfitab,3);
316 /* REACTION-FIELD ELECTROSTATICS */
317 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
318 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
320 /* CUBIC SPLINE TABLE DISPERSION */
321 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
322 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
323 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
324 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
325 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
326 Heps = _mm256_mul_pd(vfeps,H);
327 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
328 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
329 vvdw6 = _mm256_mul_pd(c6_00,VV);
330 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
331 fvdw6 = _mm256_mul_pd(c6_00,FF);
333 /* CUBIC SPLINE TABLE REPULSION */
334 vfitab = _mm_add_epi32(vfitab,ifour);
335 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
336 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
337 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
338 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
339 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
340 Heps = _mm256_mul_pd(vfeps,H);
341 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
342 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
343 vvdw12 = _mm256_mul_pd(c12_00,VV);
344 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
345 fvdw12 = _mm256_mul_pd(c12_00,FF);
346 vvdw = _mm256_add_pd(vvdw12,vvdw6);
347 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 velecsum = _mm256_add_pd(velecsum,velec);
351 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
353 fscal = _mm256_add_pd(felec,fvdw);
355 /* Calculate temporary vectorial force */
356 tx = _mm256_mul_pd(fscal,dx00);
357 ty = _mm256_mul_pd(fscal,dy00);
358 tz = _mm256_mul_pd(fscal,dz00);
360 /* Update vectorial force */
361 fix0 = _mm256_add_pd(fix0,tx);
362 fiy0 = _mm256_add_pd(fiy0,ty);
363 fiz0 = _mm256_add_pd(fiz0,tz);
365 fjx0 = _mm256_add_pd(fjx0,tx);
366 fjy0 = _mm256_add_pd(fjy0,ty);
367 fjz0 = _mm256_add_pd(fjz0,tz);
369 /**************************
370 * CALCULATE INTERACTIONS *
371 **************************/
373 /* REACTION-FIELD ELECTROSTATICS */
374 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
375 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
377 /* Update potential sum for this i atom from the interaction with this j atom. */
378 velecsum = _mm256_add_pd(velecsum,velec);
382 /* Calculate temporary vectorial force */
383 tx = _mm256_mul_pd(fscal,dx01);
384 ty = _mm256_mul_pd(fscal,dy01);
385 tz = _mm256_mul_pd(fscal,dz01);
387 /* Update vectorial force */
388 fix0 = _mm256_add_pd(fix0,tx);
389 fiy0 = _mm256_add_pd(fiy0,ty);
390 fiz0 = _mm256_add_pd(fiz0,tz);
392 fjx1 = _mm256_add_pd(fjx1,tx);
393 fjy1 = _mm256_add_pd(fjy1,ty);
394 fjz1 = _mm256_add_pd(fjz1,tz);
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
400 /* REACTION-FIELD ELECTROSTATICS */
401 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
402 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
404 /* Update potential sum for this i atom from the interaction with this j atom. */
405 velecsum = _mm256_add_pd(velecsum,velec);
409 /* Calculate temporary vectorial force */
410 tx = _mm256_mul_pd(fscal,dx02);
411 ty = _mm256_mul_pd(fscal,dy02);
412 tz = _mm256_mul_pd(fscal,dz02);
414 /* Update vectorial force */
415 fix0 = _mm256_add_pd(fix0,tx);
416 fiy0 = _mm256_add_pd(fiy0,ty);
417 fiz0 = _mm256_add_pd(fiz0,tz);
419 fjx2 = _mm256_add_pd(fjx2,tx);
420 fjy2 = _mm256_add_pd(fjy2,ty);
421 fjz2 = _mm256_add_pd(fjz2,tz);
423 /**************************
424 * CALCULATE INTERACTIONS *
425 **************************/
427 /* REACTION-FIELD ELECTROSTATICS */
428 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
429 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
431 /* Update potential sum for this i atom from the interaction with this j atom. */
432 velecsum = _mm256_add_pd(velecsum,velec);
436 /* Calculate temporary vectorial force */
437 tx = _mm256_mul_pd(fscal,dx10);
438 ty = _mm256_mul_pd(fscal,dy10);
439 tz = _mm256_mul_pd(fscal,dz10);
441 /* Update vectorial force */
442 fix1 = _mm256_add_pd(fix1,tx);
443 fiy1 = _mm256_add_pd(fiy1,ty);
444 fiz1 = _mm256_add_pd(fiz1,tz);
446 fjx0 = _mm256_add_pd(fjx0,tx);
447 fjy0 = _mm256_add_pd(fjy0,ty);
448 fjz0 = _mm256_add_pd(fjz0,tz);
450 /**************************
451 * CALCULATE INTERACTIONS *
452 **************************/
454 /* REACTION-FIELD ELECTROSTATICS */
455 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
456 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velecsum = _mm256_add_pd(velecsum,velec);
463 /* Calculate temporary vectorial force */
464 tx = _mm256_mul_pd(fscal,dx11);
465 ty = _mm256_mul_pd(fscal,dy11);
466 tz = _mm256_mul_pd(fscal,dz11);
468 /* Update vectorial force */
469 fix1 = _mm256_add_pd(fix1,tx);
470 fiy1 = _mm256_add_pd(fiy1,ty);
471 fiz1 = _mm256_add_pd(fiz1,tz);
473 fjx1 = _mm256_add_pd(fjx1,tx);
474 fjy1 = _mm256_add_pd(fjy1,ty);
475 fjz1 = _mm256_add_pd(fjz1,tz);
477 /**************************
478 * CALCULATE INTERACTIONS *
479 **************************/
481 /* REACTION-FIELD ELECTROSTATICS */
482 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
483 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velecsum = _mm256_add_pd(velecsum,velec);
490 /* Calculate temporary vectorial force */
491 tx = _mm256_mul_pd(fscal,dx12);
492 ty = _mm256_mul_pd(fscal,dy12);
493 tz = _mm256_mul_pd(fscal,dz12);
495 /* Update vectorial force */
496 fix1 = _mm256_add_pd(fix1,tx);
497 fiy1 = _mm256_add_pd(fiy1,ty);
498 fiz1 = _mm256_add_pd(fiz1,tz);
500 fjx2 = _mm256_add_pd(fjx2,tx);
501 fjy2 = _mm256_add_pd(fjy2,ty);
502 fjz2 = _mm256_add_pd(fjz2,tz);
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
508 /* REACTION-FIELD ELECTROSTATICS */
509 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
510 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
512 /* Update potential sum for this i atom from the interaction with this j atom. */
513 velecsum = _mm256_add_pd(velecsum,velec);
517 /* Calculate temporary vectorial force */
518 tx = _mm256_mul_pd(fscal,dx20);
519 ty = _mm256_mul_pd(fscal,dy20);
520 tz = _mm256_mul_pd(fscal,dz20);
522 /* Update vectorial force */
523 fix2 = _mm256_add_pd(fix2,tx);
524 fiy2 = _mm256_add_pd(fiy2,ty);
525 fiz2 = _mm256_add_pd(fiz2,tz);
527 fjx0 = _mm256_add_pd(fjx0,tx);
528 fjy0 = _mm256_add_pd(fjy0,ty);
529 fjz0 = _mm256_add_pd(fjz0,tz);
531 /**************************
532 * CALCULATE INTERACTIONS *
533 **************************/
535 /* REACTION-FIELD ELECTROSTATICS */
536 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
537 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
539 /* Update potential sum for this i atom from the interaction with this j atom. */
540 velecsum = _mm256_add_pd(velecsum,velec);
544 /* Calculate temporary vectorial force */
545 tx = _mm256_mul_pd(fscal,dx21);
546 ty = _mm256_mul_pd(fscal,dy21);
547 tz = _mm256_mul_pd(fscal,dz21);
549 /* Update vectorial force */
550 fix2 = _mm256_add_pd(fix2,tx);
551 fiy2 = _mm256_add_pd(fiy2,ty);
552 fiz2 = _mm256_add_pd(fiz2,tz);
554 fjx1 = _mm256_add_pd(fjx1,tx);
555 fjy1 = _mm256_add_pd(fjy1,ty);
556 fjz1 = _mm256_add_pd(fjz1,tz);
558 /**************************
559 * CALCULATE INTERACTIONS *
560 **************************/
562 /* REACTION-FIELD ELECTROSTATICS */
563 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
564 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
566 /* Update potential sum for this i atom from the interaction with this j atom. */
567 velecsum = _mm256_add_pd(velecsum,velec);
571 /* Calculate temporary vectorial force */
572 tx = _mm256_mul_pd(fscal,dx22);
573 ty = _mm256_mul_pd(fscal,dy22);
574 tz = _mm256_mul_pd(fscal,dz22);
576 /* Update vectorial force */
577 fix2 = _mm256_add_pd(fix2,tx);
578 fiy2 = _mm256_add_pd(fiy2,ty);
579 fiz2 = _mm256_add_pd(fiz2,tz);
581 fjx2 = _mm256_add_pd(fjx2,tx);
582 fjy2 = _mm256_add_pd(fjy2,ty);
583 fjz2 = _mm256_add_pd(fjz2,tz);
585 fjptrA = f+j_coord_offsetA;
586 fjptrB = f+j_coord_offsetB;
587 fjptrC = f+j_coord_offsetC;
588 fjptrD = f+j_coord_offsetD;
590 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
591 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
593 /* Inner loop uses 323 flops */
599 /* Get j neighbor index, and coordinate index */
600 jnrlistA = jjnr[jidx];
601 jnrlistB = jjnr[jidx+1];
602 jnrlistC = jjnr[jidx+2];
603 jnrlistD = jjnr[jidx+3];
604 /* Sign of each element will be negative for non-real atoms.
605 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
606 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
608 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
610 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
611 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
612 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
614 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
615 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
616 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
617 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
618 j_coord_offsetA = DIM*jnrA;
619 j_coord_offsetB = DIM*jnrB;
620 j_coord_offsetC = DIM*jnrC;
621 j_coord_offsetD = DIM*jnrD;
623 /* load j atom coordinates */
624 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
625 x+j_coord_offsetC,x+j_coord_offsetD,
626 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
628 /* Calculate displacement vector */
629 dx00 = _mm256_sub_pd(ix0,jx0);
630 dy00 = _mm256_sub_pd(iy0,jy0);
631 dz00 = _mm256_sub_pd(iz0,jz0);
632 dx01 = _mm256_sub_pd(ix0,jx1);
633 dy01 = _mm256_sub_pd(iy0,jy1);
634 dz01 = _mm256_sub_pd(iz0,jz1);
635 dx02 = _mm256_sub_pd(ix0,jx2);
636 dy02 = _mm256_sub_pd(iy0,jy2);
637 dz02 = _mm256_sub_pd(iz0,jz2);
638 dx10 = _mm256_sub_pd(ix1,jx0);
639 dy10 = _mm256_sub_pd(iy1,jy0);
640 dz10 = _mm256_sub_pd(iz1,jz0);
641 dx11 = _mm256_sub_pd(ix1,jx1);
642 dy11 = _mm256_sub_pd(iy1,jy1);
643 dz11 = _mm256_sub_pd(iz1,jz1);
644 dx12 = _mm256_sub_pd(ix1,jx2);
645 dy12 = _mm256_sub_pd(iy1,jy2);
646 dz12 = _mm256_sub_pd(iz1,jz2);
647 dx20 = _mm256_sub_pd(ix2,jx0);
648 dy20 = _mm256_sub_pd(iy2,jy0);
649 dz20 = _mm256_sub_pd(iz2,jz0);
650 dx21 = _mm256_sub_pd(ix2,jx1);
651 dy21 = _mm256_sub_pd(iy2,jy1);
652 dz21 = _mm256_sub_pd(iz2,jz1);
653 dx22 = _mm256_sub_pd(ix2,jx2);
654 dy22 = _mm256_sub_pd(iy2,jy2);
655 dz22 = _mm256_sub_pd(iz2,jz2);
657 /* Calculate squared distance and things based on it */
658 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
659 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
660 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
661 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
662 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
663 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
664 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
665 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
666 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
668 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
669 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
670 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
671 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
672 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
673 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
674 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
675 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
676 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
678 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
679 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
680 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
681 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
682 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
683 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
684 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
685 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
686 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
688 fjx0 = _mm256_setzero_pd();
689 fjy0 = _mm256_setzero_pd();
690 fjz0 = _mm256_setzero_pd();
691 fjx1 = _mm256_setzero_pd();
692 fjy1 = _mm256_setzero_pd();
693 fjz1 = _mm256_setzero_pd();
694 fjx2 = _mm256_setzero_pd();
695 fjy2 = _mm256_setzero_pd();
696 fjz2 = _mm256_setzero_pd();
698 /**************************
699 * CALCULATE INTERACTIONS *
700 **************************/
702 r00 = _mm256_mul_pd(rsq00,rinv00);
703 r00 = _mm256_andnot_pd(dummy_mask,r00);
705 /* Calculate table index by multiplying r with table scale and truncate to integer */
706 rt = _mm256_mul_pd(r00,vftabscale);
707 vfitab = _mm256_cvttpd_epi32(rt);
708 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
709 vfitab = _mm_slli_epi32(vfitab,3);
711 /* REACTION-FIELD ELECTROSTATICS */
712 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
713 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
715 /* CUBIC SPLINE TABLE DISPERSION */
716 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
717 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
718 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
719 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
720 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
721 Heps = _mm256_mul_pd(vfeps,H);
722 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
723 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
724 vvdw6 = _mm256_mul_pd(c6_00,VV);
725 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
726 fvdw6 = _mm256_mul_pd(c6_00,FF);
728 /* CUBIC SPLINE TABLE REPULSION */
729 vfitab = _mm_add_epi32(vfitab,ifour);
730 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
731 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
732 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
733 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
734 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
735 Heps = _mm256_mul_pd(vfeps,H);
736 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
737 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
738 vvdw12 = _mm256_mul_pd(c12_00,VV);
739 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
740 fvdw12 = _mm256_mul_pd(c12_00,FF);
741 vvdw = _mm256_add_pd(vvdw12,vvdw6);
742 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
744 /* Update potential sum for this i atom from the interaction with this j atom. */
745 velec = _mm256_andnot_pd(dummy_mask,velec);
746 velecsum = _mm256_add_pd(velecsum,velec);
747 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
748 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
750 fscal = _mm256_add_pd(felec,fvdw);
752 fscal = _mm256_andnot_pd(dummy_mask,fscal);
754 /* Calculate temporary vectorial force */
755 tx = _mm256_mul_pd(fscal,dx00);
756 ty = _mm256_mul_pd(fscal,dy00);
757 tz = _mm256_mul_pd(fscal,dz00);
759 /* Update vectorial force */
760 fix0 = _mm256_add_pd(fix0,tx);
761 fiy0 = _mm256_add_pd(fiy0,ty);
762 fiz0 = _mm256_add_pd(fiz0,tz);
764 fjx0 = _mm256_add_pd(fjx0,tx);
765 fjy0 = _mm256_add_pd(fjy0,ty);
766 fjz0 = _mm256_add_pd(fjz0,tz);
768 /**************************
769 * CALCULATE INTERACTIONS *
770 **************************/
772 /* REACTION-FIELD ELECTROSTATICS */
773 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
774 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
776 /* Update potential sum for this i atom from the interaction with this j atom. */
777 velec = _mm256_andnot_pd(dummy_mask,velec);
778 velecsum = _mm256_add_pd(velecsum,velec);
782 fscal = _mm256_andnot_pd(dummy_mask,fscal);
784 /* Calculate temporary vectorial force */
785 tx = _mm256_mul_pd(fscal,dx01);
786 ty = _mm256_mul_pd(fscal,dy01);
787 tz = _mm256_mul_pd(fscal,dz01);
789 /* Update vectorial force */
790 fix0 = _mm256_add_pd(fix0,tx);
791 fiy0 = _mm256_add_pd(fiy0,ty);
792 fiz0 = _mm256_add_pd(fiz0,tz);
794 fjx1 = _mm256_add_pd(fjx1,tx);
795 fjy1 = _mm256_add_pd(fjy1,ty);
796 fjz1 = _mm256_add_pd(fjz1,tz);
798 /**************************
799 * CALCULATE INTERACTIONS *
800 **************************/
802 /* REACTION-FIELD ELECTROSTATICS */
803 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
804 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
806 /* Update potential sum for this i atom from the interaction with this j atom. */
807 velec = _mm256_andnot_pd(dummy_mask,velec);
808 velecsum = _mm256_add_pd(velecsum,velec);
812 fscal = _mm256_andnot_pd(dummy_mask,fscal);
814 /* Calculate temporary vectorial force */
815 tx = _mm256_mul_pd(fscal,dx02);
816 ty = _mm256_mul_pd(fscal,dy02);
817 tz = _mm256_mul_pd(fscal,dz02);
819 /* Update vectorial force */
820 fix0 = _mm256_add_pd(fix0,tx);
821 fiy0 = _mm256_add_pd(fiy0,ty);
822 fiz0 = _mm256_add_pd(fiz0,tz);
824 fjx2 = _mm256_add_pd(fjx2,tx);
825 fjy2 = _mm256_add_pd(fjy2,ty);
826 fjz2 = _mm256_add_pd(fjz2,tz);
828 /**************************
829 * CALCULATE INTERACTIONS *
830 **************************/
832 /* REACTION-FIELD ELECTROSTATICS */
833 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
834 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
836 /* Update potential sum for this i atom from the interaction with this j atom. */
837 velec = _mm256_andnot_pd(dummy_mask,velec);
838 velecsum = _mm256_add_pd(velecsum,velec);
842 fscal = _mm256_andnot_pd(dummy_mask,fscal);
844 /* Calculate temporary vectorial force */
845 tx = _mm256_mul_pd(fscal,dx10);
846 ty = _mm256_mul_pd(fscal,dy10);
847 tz = _mm256_mul_pd(fscal,dz10);
849 /* Update vectorial force */
850 fix1 = _mm256_add_pd(fix1,tx);
851 fiy1 = _mm256_add_pd(fiy1,ty);
852 fiz1 = _mm256_add_pd(fiz1,tz);
854 fjx0 = _mm256_add_pd(fjx0,tx);
855 fjy0 = _mm256_add_pd(fjy0,ty);
856 fjz0 = _mm256_add_pd(fjz0,tz);
858 /**************************
859 * CALCULATE INTERACTIONS *
860 **************************/
862 /* REACTION-FIELD ELECTROSTATICS */
863 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
864 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
866 /* Update potential sum for this i atom from the interaction with this j atom. */
867 velec = _mm256_andnot_pd(dummy_mask,velec);
868 velecsum = _mm256_add_pd(velecsum,velec);
872 fscal = _mm256_andnot_pd(dummy_mask,fscal);
874 /* Calculate temporary vectorial force */
875 tx = _mm256_mul_pd(fscal,dx11);
876 ty = _mm256_mul_pd(fscal,dy11);
877 tz = _mm256_mul_pd(fscal,dz11);
879 /* Update vectorial force */
880 fix1 = _mm256_add_pd(fix1,tx);
881 fiy1 = _mm256_add_pd(fiy1,ty);
882 fiz1 = _mm256_add_pd(fiz1,tz);
884 fjx1 = _mm256_add_pd(fjx1,tx);
885 fjy1 = _mm256_add_pd(fjy1,ty);
886 fjz1 = _mm256_add_pd(fjz1,tz);
888 /**************************
889 * CALCULATE INTERACTIONS *
890 **************************/
892 /* REACTION-FIELD ELECTROSTATICS */
893 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
894 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
896 /* Update potential sum for this i atom from the interaction with this j atom. */
897 velec = _mm256_andnot_pd(dummy_mask,velec);
898 velecsum = _mm256_add_pd(velecsum,velec);
902 fscal = _mm256_andnot_pd(dummy_mask,fscal);
904 /* Calculate temporary vectorial force */
905 tx = _mm256_mul_pd(fscal,dx12);
906 ty = _mm256_mul_pd(fscal,dy12);
907 tz = _mm256_mul_pd(fscal,dz12);
909 /* Update vectorial force */
910 fix1 = _mm256_add_pd(fix1,tx);
911 fiy1 = _mm256_add_pd(fiy1,ty);
912 fiz1 = _mm256_add_pd(fiz1,tz);
914 fjx2 = _mm256_add_pd(fjx2,tx);
915 fjy2 = _mm256_add_pd(fjy2,ty);
916 fjz2 = _mm256_add_pd(fjz2,tz);
918 /**************************
919 * CALCULATE INTERACTIONS *
920 **************************/
922 /* REACTION-FIELD ELECTROSTATICS */
923 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
924 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
926 /* Update potential sum for this i atom from the interaction with this j atom. */
927 velec = _mm256_andnot_pd(dummy_mask,velec);
928 velecsum = _mm256_add_pd(velecsum,velec);
932 fscal = _mm256_andnot_pd(dummy_mask,fscal);
934 /* Calculate temporary vectorial force */
935 tx = _mm256_mul_pd(fscal,dx20);
936 ty = _mm256_mul_pd(fscal,dy20);
937 tz = _mm256_mul_pd(fscal,dz20);
939 /* Update vectorial force */
940 fix2 = _mm256_add_pd(fix2,tx);
941 fiy2 = _mm256_add_pd(fiy2,ty);
942 fiz2 = _mm256_add_pd(fiz2,tz);
944 fjx0 = _mm256_add_pd(fjx0,tx);
945 fjy0 = _mm256_add_pd(fjy0,ty);
946 fjz0 = _mm256_add_pd(fjz0,tz);
948 /**************************
949 * CALCULATE INTERACTIONS *
950 **************************/
952 /* REACTION-FIELD ELECTROSTATICS */
953 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
954 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
956 /* Update potential sum for this i atom from the interaction with this j atom. */
957 velec = _mm256_andnot_pd(dummy_mask,velec);
958 velecsum = _mm256_add_pd(velecsum,velec);
962 fscal = _mm256_andnot_pd(dummy_mask,fscal);
964 /* Calculate temporary vectorial force */
965 tx = _mm256_mul_pd(fscal,dx21);
966 ty = _mm256_mul_pd(fscal,dy21);
967 tz = _mm256_mul_pd(fscal,dz21);
969 /* Update vectorial force */
970 fix2 = _mm256_add_pd(fix2,tx);
971 fiy2 = _mm256_add_pd(fiy2,ty);
972 fiz2 = _mm256_add_pd(fiz2,tz);
974 fjx1 = _mm256_add_pd(fjx1,tx);
975 fjy1 = _mm256_add_pd(fjy1,ty);
976 fjz1 = _mm256_add_pd(fjz1,tz);
978 /**************************
979 * CALCULATE INTERACTIONS *
980 **************************/
982 /* REACTION-FIELD ELECTROSTATICS */
983 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
984 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
986 /* Update potential sum for this i atom from the interaction with this j atom. */
987 velec = _mm256_andnot_pd(dummy_mask,velec);
988 velecsum = _mm256_add_pd(velecsum,velec);
992 fscal = _mm256_andnot_pd(dummy_mask,fscal);
994 /* Calculate temporary vectorial force */
995 tx = _mm256_mul_pd(fscal,dx22);
996 ty = _mm256_mul_pd(fscal,dy22);
997 tz = _mm256_mul_pd(fscal,dz22);
999 /* Update vectorial force */
1000 fix2 = _mm256_add_pd(fix2,tx);
1001 fiy2 = _mm256_add_pd(fiy2,ty);
1002 fiz2 = _mm256_add_pd(fiz2,tz);
1004 fjx2 = _mm256_add_pd(fjx2,tx);
1005 fjy2 = _mm256_add_pd(fjy2,ty);
1006 fjz2 = _mm256_add_pd(fjz2,tz);
1008 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1009 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1010 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1011 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1013 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1014 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1016 /* Inner loop uses 324 flops */
1019 /* End of innermost loop */
1021 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1022 f+i_coord_offset,fshift+i_shift_offset);
1025 /* Update potential energies */
1026 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1027 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1029 /* Increment number of inner iterations */
1030 inneriter += j_index_end - j_index_start;
1032 /* Outer loop uses 20 flops */
1035 /* Increment number of outer iterations */
1038 /* Update outer/inner flops */
1040 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*324);
1043 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_256_double
1044 * Electrostatics interaction: ReactionField
1045 * VdW interaction: CubicSplineTable
1046 * Geometry: Water3-Water3
1047 * Calculate force/pot: Force
1050 nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_256_double
1051 (t_nblist * gmx_restrict nlist,
1052 rvec * gmx_restrict xx,
1053 rvec * gmx_restrict ff,
1054 t_forcerec * gmx_restrict fr,
1055 t_mdatoms * gmx_restrict mdatoms,
1056 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1057 t_nrnb * gmx_restrict nrnb)
1059 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1060 * just 0 for non-waters.
1061 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1062 * jnr indices corresponding to data put in the four positions in the SIMD register.
1064 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1065 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1066 int jnrA,jnrB,jnrC,jnrD;
1067 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1068 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1069 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1070 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1071 real rcutoff_scalar;
1072 real *shiftvec,*fshift,*x,*f;
1073 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1074 real scratch[4*DIM];
1075 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1076 real * vdwioffsetptr0;
1077 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1078 real * vdwioffsetptr1;
1079 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1080 real * vdwioffsetptr2;
1081 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1082 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1083 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1084 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1085 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1086 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1087 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1088 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1089 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1090 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1091 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1092 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1093 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1094 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1095 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1096 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1097 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1100 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1103 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1104 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1106 __m128i ifour = _mm_set1_epi32(4);
1107 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1109 __m256d dummy_mask,cutoff_mask;
1110 __m128 tmpmask0,tmpmask1;
1111 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1112 __m256d one = _mm256_set1_pd(1.0);
1113 __m256d two = _mm256_set1_pd(2.0);
1119 jindex = nlist->jindex;
1121 shiftidx = nlist->shift;
1123 shiftvec = fr->shift_vec[0];
1124 fshift = fr->fshift[0];
1125 facel = _mm256_set1_pd(fr->epsfac);
1126 charge = mdatoms->chargeA;
1127 krf = _mm256_set1_pd(fr->ic->k_rf);
1128 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
1129 crf = _mm256_set1_pd(fr->ic->c_rf);
1130 nvdwtype = fr->ntype;
1131 vdwparam = fr->nbfp;
1132 vdwtype = mdatoms->typeA;
1134 vftab = kernel_data->table_vdw->data;
1135 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
1137 /* Setup water-specific parameters */
1138 inr = nlist->iinr[0];
1139 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1140 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1141 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1142 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1144 jq0 = _mm256_set1_pd(charge[inr+0]);
1145 jq1 = _mm256_set1_pd(charge[inr+1]);
1146 jq2 = _mm256_set1_pd(charge[inr+2]);
1147 vdwjidx0A = 2*vdwtype[inr+0];
1148 qq00 = _mm256_mul_pd(iq0,jq0);
1149 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1150 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1151 qq01 = _mm256_mul_pd(iq0,jq1);
1152 qq02 = _mm256_mul_pd(iq0,jq2);
1153 qq10 = _mm256_mul_pd(iq1,jq0);
1154 qq11 = _mm256_mul_pd(iq1,jq1);
1155 qq12 = _mm256_mul_pd(iq1,jq2);
1156 qq20 = _mm256_mul_pd(iq2,jq0);
1157 qq21 = _mm256_mul_pd(iq2,jq1);
1158 qq22 = _mm256_mul_pd(iq2,jq2);
1160 /* Zero-initialize the j indices and coordinate offsets to silence uninitialized-variable compiler warnings */
1161 jnrA = jnrB = jnrC = jnrD = 0;
1162 j_coord_offsetA = 0;
1163 j_coord_offsetB = 0;
1164 j_coord_offsetC = 0;
1165 j_coord_offsetD = 0;
1170 for(iidx=0;iidx<4*DIM;iidx++)
1172 scratch[iidx] = 0.0;
1175 /* Start outer loop over neighborlists */
1176 for(iidx=0; iidx<nri; iidx++)
1178 /* Load shift vector for this list */
1179 i_shift_offset = DIM*shiftidx[iidx];
1181 /* Load limits for loop over neighbors */
1182 j_index_start = jindex[iidx];
1183 j_index_end = jindex[iidx+1];
1185 /* Get outer coordinate index */
1187 i_coord_offset = DIM*inr;
1189 /* Load i particle coords and add shift vector */
1190 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1191 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1193 fix0 = _mm256_setzero_pd();
1194 fiy0 = _mm256_setzero_pd();
1195 fiz0 = _mm256_setzero_pd();
1196 fix1 = _mm256_setzero_pd();
1197 fiy1 = _mm256_setzero_pd();
1198 fiz1 = _mm256_setzero_pd();
1199 fix2 = _mm256_setzero_pd();
1200 fiy2 = _mm256_setzero_pd();
1201 fiz2 = _mm256_setzero_pd();
1203 /* Start inner kernel loop */
1204 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1207 /* Get j neighbor index, and coordinate index */
1209 jnrB = jjnr[jidx+1];
1210 jnrC = jjnr[jidx+2];
1211 jnrD = jjnr[jidx+3];
1212 j_coord_offsetA = DIM*jnrA;
1213 j_coord_offsetB = DIM*jnrB;
1214 j_coord_offsetC = DIM*jnrC;
1215 j_coord_offsetD = DIM*jnrD;
1217 /* load j atom coordinates */
1218 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1219 x+j_coord_offsetC,x+j_coord_offsetD,
1220 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1222 /* Calculate displacement vector */
1223 dx00 = _mm256_sub_pd(ix0,jx0);
1224 dy00 = _mm256_sub_pd(iy0,jy0);
1225 dz00 = _mm256_sub_pd(iz0,jz0);
1226 dx01 = _mm256_sub_pd(ix0,jx1);
1227 dy01 = _mm256_sub_pd(iy0,jy1);
1228 dz01 = _mm256_sub_pd(iz0,jz1);
1229 dx02 = _mm256_sub_pd(ix0,jx2);
1230 dy02 = _mm256_sub_pd(iy0,jy2);
1231 dz02 = _mm256_sub_pd(iz0,jz2);
1232 dx10 = _mm256_sub_pd(ix1,jx0);
1233 dy10 = _mm256_sub_pd(iy1,jy0);
1234 dz10 = _mm256_sub_pd(iz1,jz0);
1235 dx11 = _mm256_sub_pd(ix1,jx1);
1236 dy11 = _mm256_sub_pd(iy1,jy1);
1237 dz11 = _mm256_sub_pd(iz1,jz1);
1238 dx12 = _mm256_sub_pd(ix1,jx2);
1239 dy12 = _mm256_sub_pd(iy1,jy2);
1240 dz12 = _mm256_sub_pd(iz1,jz2);
1241 dx20 = _mm256_sub_pd(ix2,jx0);
1242 dy20 = _mm256_sub_pd(iy2,jy0);
1243 dz20 = _mm256_sub_pd(iz2,jz0);
1244 dx21 = _mm256_sub_pd(ix2,jx1);
1245 dy21 = _mm256_sub_pd(iy2,jy1);
1246 dz21 = _mm256_sub_pd(iz2,jz1);
1247 dx22 = _mm256_sub_pd(ix2,jx2);
1248 dy22 = _mm256_sub_pd(iy2,jy2);
1249 dz22 = _mm256_sub_pd(iz2,jz2);
1251 /* Calculate squared distance and things based on it */
1252 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1253 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1254 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1255 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1256 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1257 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1258 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1259 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1260 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1262 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1263 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1264 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1265 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1266 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1267 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1268 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1269 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1270 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1272 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1273 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1274 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1275 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1276 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1277 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1278 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1279 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1280 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1282 fjx0 = _mm256_setzero_pd();
1283 fjy0 = _mm256_setzero_pd();
1284 fjz0 = _mm256_setzero_pd();
1285 fjx1 = _mm256_setzero_pd();
1286 fjy1 = _mm256_setzero_pd();
1287 fjz1 = _mm256_setzero_pd();
1288 fjx2 = _mm256_setzero_pd();
1289 fjy2 = _mm256_setzero_pd();
1290 fjz2 = _mm256_setzero_pd();
1292 /**************************
1293 * CALCULATE INTERACTIONS *
1294 **************************/
1296 r00 = _mm256_mul_pd(rsq00,rinv00);
1298 /* Calculate table index by multiplying r with table scale and truncate to integer */
1299 rt = _mm256_mul_pd(r00,vftabscale);
1300 vfitab = _mm256_cvttpd_epi32(rt);
1301 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1302 vfitab = _mm_slli_epi32(vfitab,3);
1304 /* REACTION-FIELD ELECTROSTATICS */
1305 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1307 /* CUBIC SPLINE TABLE DISPERSION */
1308 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1309 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1310 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1311 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1312 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1313 Heps = _mm256_mul_pd(vfeps,H);
1314 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1315 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1316 fvdw6 = _mm256_mul_pd(c6_00,FF);
1318 /* CUBIC SPLINE TABLE REPULSION */
1319 vfitab = _mm_add_epi32(vfitab,ifour);
1320 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1321 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1322 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1323 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1324 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1325 Heps = _mm256_mul_pd(vfeps,H);
1326 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1327 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1328 fvdw12 = _mm256_mul_pd(c12_00,FF);
1329 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1331 fscal = _mm256_add_pd(felec,fvdw);
1333 /* Calculate temporary vectorial force */
1334 tx = _mm256_mul_pd(fscal,dx00);
1335 ty = _mm256_mul_pd(fscal,dy00);
1336 tz = _mm256_mul_pd(fscal,dz00);
1338 /* Update vectorial force */
1339 fix0 = _mm256_add_pd(fix0,tx);
1340 fiy0 = _mm256_add_pd(fiy0,ty);
1341 fiz0 = _mm256_add_pd(fiz0,tz);
1343 fjx0 = _mm256_add_pd(fjx0,tx);
1344 fjy0 = _mm256_add_pd(fjy0,ty);
1345 fjz0 = _mm256_add_pd(fjz0,tz);
1347 /**************************
1348 * CALCULATE INTERACTIONS *
1349 **************************/
1351 /* REACTION-FIELD ELECTROSTATICS */
1352 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1356 /* Calculate temporary vectorial force */
1357 tx = _mm256_mul_pd(fscal,dx01);
1358 ty = _mm256_mul_pd(fscal,dy01);
1359 tz = _mm256_mul_pd(fscal,dz01);
1361 /* Update vectorial force */
1362 fix0 = _mm256_add_pd(fix0,tx);
1363 fiy0 = _mm256_add_pd(fiy0,ty);
1364 fiz0 = _mm256_add_pd(fiz0,tz);
1366 fjx1 = _mm256_add_pd(fjx1,tx);
1367 fjy1 = _mm256_add_pd(fjy1,ty);
1368 fjz1 = _mm256_add_pd(fjz1,tz);
1370 /**************************
1371 * CALCULATE INTERACTIONS *
1372 **************************/
1374 /* REACTION-FIELD ELECTROSTATICS */
1375 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1379 /* Calculate temporary vectorial force */
1380 tx = _mm256_mul_pd(fscal,dx02);
1381 ty = _mm256_mul_pd(fscal,dy02);
1382 tz = _mm256_mul_pd(fscal,dz02);
1384 /* Update vectorial force */
1385 fix0 = _mm256_add_pd(fix0,tx);
1386 fiy0 = _mm256_add_pd(fiy0,ty);
1387 fiz0 = _mm256_add_pd(fiz0,tz);
1389 fjx2 = _mm256_add_pd(fjx2,tx);
1390 fjy2 = _mm256_add_pd(fjy2,ty);
1391 fjz2 = _mm256_add_pd(fjz2,tz);
1393 /**************************
1394 * CALCULATE INTERACTIONS *
1395 **************************/
1397 /* REACTION-FIELD ELECTROSTATICS */
1398 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1402 /* Calculate temporary vectorial force */
1403 tx = _mm256_mul_pd(fscal,dx10);
1404 ty = _mm256_mul_pd(fscal,dy10);
1405 tz = _mm256_mul_pd(fscal,dz10);
1407 /* Update vectorial force */
1408 fix1 = _mm256_add_pd(fix1,tx);
1409 fiy1 = _mm256_add_pd(fiy1,ty);
1410 fiz1 = _mm256_add_pd(fiz1,tz);
1412 fjx0 = _mm256_add_pd(fjx0,tx);
1413 fjy0 = _mm256_add_pd(fjy0,ty);
1414 fjz0 = _mm256_add_pd(fjz0,tz);
1416 /**************************
1417 * CALCULATE INTERACTIONS *
1418 **************************/
1420 /* REACTION-FIELD ELECTROSTATICS */
1421 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1425 /* Calculate temporary vectorial force */
1426 tx = _mm256_mul_pd(fscal,dx11);
1427 ty = _mm256_mul_pd(fscal,dy11);
1428 tz = _mm256_mul_pd(fscal,dz11);
1430 /* Update vectorial force */
1431 fix1 = _mm256_add_pd(fix1,tx);
1432 fiy1 = _mm256_add_pd(fiy1,ty);
1433 fiz1 = _mm256_add_pd(fiz1,tz);
1435 fjx1 = _mm256_add_pd(fjx1,tx);
1436 fjy1 = _mm256_add_pd(fjy1,ty);
1437 fjz1 = _mm256_add_pd(fjz1,tz);
1439 /**************************
1440 * CALCULATE INTERACTIONS *
1441 **************************/
1443 /* REACTION-FIELD ELECTROSTATICS */
1444 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1448 /* Calculate temporary vectorial force */
1449 tx = _mm256_mul_pd(fscal,dx12);
1450 ty = _mm256_mul_pd(fscal,dy12);
1451 tz = _mm256_mul_pd(fscal,dz12);
1453 /* Update vectorial force */
1454 fix1 = _mm256_add_pd(fix1,tx);
1455 fiy1 = _mm256_add_pd(fiy1,ty);
1456 fiz1 = _mm256_add_pd(fiz1,tz);
1458 fjx2 = _mm256_add_pd(fjx2,tx);
1459 fjy2 = _mm256_add_pd(fjy2,ty);
1460 fjz2 = _mm256_add_pd(fjz2,tz);
1462 /**************************
1463 * CALCULATE INTERACTIONS *
1464 **************************/
1466 /* REACTION-FIELD ELECTROSTATICS */
1467 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1471 /* Calculate temporary vectorial force */
1472 tx = _mm256_mul_pd(fscal,dx20);
1473 ty = _mm256_mul_pd(fscal,dy20);
1474 tz = _mm256_mul_pd(fscal,dz20);
1476 /* Update vectorial force */
1477 fix2 = _mm256_add_pd(fix2,tx);
1478 fiy2 = _mm256_add_pd(fiy2,ty);
1479 fiz2 = _mm256_add_pd(fiz2,tz);
1481 fjx0 = _mm256_add_pd(fjx0,tx);
1482 fjy0 = _mm256_add_pd(fjy0,ty);
1483 fjz0 = _mm256_add_pd(fjz0,tz);
1485 /**************************
1486 * CALCULATE INTERACTIONS *
1487 **************************/
1489 /* REACTION-FIELD ELECTROSTATICS */
1490 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1494 /* Calculate temporary vectorial force */
1495 tx = _mm256_mul_pd(fscal,dx21);
1496 ty = _mm256_mul_pd(fscal,dy21);
1497 tz = _mm256_mul_pd(fscal,dz21);
1499 /* Update vectorial force */
1500 fix2 = _mm256_add_pd(fix2,tx);
1501 fiy2 = _mm256_add_pd(fiy2,ty);
1502 fiz2 = _mm256_add_pd(fiz2,tz);
1504 fjx1 = _mm256_add_pd(fjx1,tx);
1505 fjy1 = _mm256_add_pd(fjy1,ty);
1506 fjz1 = _mm256_add_pd(fjz1,tz);
1508 /**************************
1509 * CALCULATE INTERACTIONS *
1510 **************************/
1512 /* REACTION-FIELD ELECTROSTATICS */
1513 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1517 /* Calculate temporary vectorial force */
1518 tx = _mm256_mul_pd(fscal,dx22);
1519 ty = _mm256_mul_pd(fscal,dy22);
1520 tz = _mm256_mul_pd(fscal,dz22);
1522 /* Update vectorial force */
1523 fix2 = _mm256_add_pd(fix2,tx);
1524 fiy2 = _mm256_add_pd(fiy2,ty);
1525 fiz2 = _mm256_add_pd(fiz2,tz);
1527 fjx2 = _mm256_add_pd(fjx2,tx);
1528 fjy2 = _mm256_add_pd(fjy2,ty);
1529 fjz2 = _mm256_add_pd(fjz2,tz);
1531 fjptrA = f+j_coord_offsetA;
1532 fjptrB = f+j_coord_offsetB;
1533 fjptrC = f+j_coord_offsetC;
1534 fjptrD = f+j_coord_offsetD;
1536 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1537 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1539 /* Inner loop uses 270 flops */
1542 if(jidx<j_index_end)
1545 /* Get j neighbor index, and coordinate index */
1546 jnrlistA = jjnr[jidx];
1547 jnrlistB = jjnr[jidx+1];
1548 jnrlistC = jjnr[jidx+2];
1549 jnrlistD = jjnr[jidx+3];
1550 /* Sign of each element will be negative for non-real atoms.
1551 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1552 * so use it as val = _mm256_andnot_pd(dummy_mask,val) to clear dummy entries.
1554 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1556 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1557 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1558 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1560 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1561 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1562 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1563 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1564 j_coord_offsetA = DIM*jnrA;
1565 j_coord_offsetB = DIM*jnrB;
1566 j_coord_offsetC = DIM*jnrC;
1567 j_coord_offsetD = DIM*jnrD;
1569 /* load j atom coordinates */
1570 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1571 x+j_coord_offsetC,x+j_coord_offsetD,
1572 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1574 /* Calculate displacement vector */
1575 dx00 = _mm256_sub_pd(ix0,jx0);
1576 dy00 = _mm256_sub_pd(iy0,jy0);
1577 dz00 = _mm256_sub_pd(iz0,jz0);
1578 dx01 = _mm256_sub_pd(ix0,jx1);
1579 dy01 = _mm256_sub_pd(iy0,jy1);
1580 dz01 = _mm256_sub_pd(iz0,jz1);
1581 dx02 = _mm256_sub_pd(ix0,jx2);
1582 dy02 = _mm256_sub_pd(iy0,jy2);
1583 dz02 = _mm256_sub_pd(iz0,jz2);
1584 dx10 = _mm256_sub_pd(ix1,jx0);
1585 dy10 = _mm256_sub_pd(iy1,jy0);
1586 dz10 = _mm256_sub_pd(iz1,jz0);
1587 dx11 = _mm256_sub_pd(ix1,jx1);
1588 dy11 = _mm256_sub_pd(iy1,jy1);
1589 dz11 = _mm256_sub_pd(iz1,jz1);
1590 dx12 = _mm256_sub_pd(ix1,jx2);
1591 dy12 = _mm256_sub_pd(iy1,jy2);
1592 dz12 = _mm256_sub_pd(iz1,jz2);
1593 dx20 = _mm256_sub_pd(ix2,jx0);
1594 dy20 = _mm256_sub_pd(iy2,jy0);
1595 dz20 = _mm256_sub_pd(iz2,jz0);
1596 dx21 = _mm256_sub_pd(ix2,jx1);
1597 dy21 = _mm256_sub_pd(iy2,jy1);
1598 dz21 = _mm256_sub_pd(iz2,jz1);
1599 dx22 = _mm256_sub_pd(ix2,jx2);
1600 dy22 = _mm256_sub_pd(iy2,jy2);
1601 dz22 = _mm256_sub_pd(iz2,jz2);
1603 /* Calculate squared distance and things based on it */
1604 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1605 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1606 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1607 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1608 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1609 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1610 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1611 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1612 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1614 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1615 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1616 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1617 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1618 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1619 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1620 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1621 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1622 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1624 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1625 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1626 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1627 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1628 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1629 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1630 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1631 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1632 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1634 fjx0 = _mm256_setzero_pd();
1635 fjy0 = _mm256_setzero_pd();
1636 fjz0 = _mm256_setzero_pd();
1637 fjx1 = _mm256_setzero_pd();
1638 fjy1 = _mm256_setzero_pd();
1639 fjz1 = _mm256_setzero_pd();
1640 fjx2 = _mm256_setzero_pd();
1641 fjy2 = _mm256_setzero_pd();
1642 fjz2 = _mm256_setzero_pd();
1644 /**************************
1645 * CALCULATE INTERACTIONS *
1646 **************************/
1648 r00 = _mm256_mul_pd(rsq00,rinv00);
1649 r00 = _mm256_andnot_pd(dummy_mask,r00);
1651 /* Calculate table index by multiplying r with table scale and truncate to integer */
1652 rt = _mm256_mul_pd(r00,vftabscale);
1653 vfitab = _mm256_cvttpd_epi32(rt);
1654 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1655 vfitab = _mm_slli_epi32(vfitab,3);
1657 /* REACTION-FIELD ELECTROSTATICS */
1658 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1660 /* CUBIC SPLINE TABLE DISPERSION */
1661 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1662 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1663 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1664 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1665 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1666 Heps = _mm256_mul_pd(vfeps,H);
1667 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1668 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1669 fvdw6 = _mm256_mul_pd(c6_00,FF);
1671 /* CUBIC SPLINE TABLE REPULSION */
1672 vfitab = _mm_add_epi32(vfitab,ifour);
1673 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1674 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1675 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1676 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1677 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1678 Heps = _mm256_mul_pd(vfeps,H);
1679 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1680 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1681 fvdw12 = _mm256_mul_pd(c12_00,FF);
1682 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1684 fscal = _mm256_add_pd(felec,fvdw);
1686 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1688 /* Calculate temporary vectorial force */
1689 tx = _mm256_mul_pd(fscal,dx00);
1690 ty = _mm256_mul_pd(fscal,dy00);
1691 tz = _mm256_mul_pd(fscal,dz00);
1693 /* Update vectorial force */
1694 fix0 = _mm256_add_pd(fix0,tx);
1695 fiy0 = _mm256_add_pd(fiy0,ty);
1696 fiz0 = _mm256_add_pd(fiz0,tz);
1698 fjx0 = _mm256_add_pd(fjx0,tx);
1699 fjy0 = _mm256_add_pd(fjy0,ty);
1700 fjz0 = _mm256_add_pd(fjz0,tz);
1702 /**************************
1703 * CALCULATE INTERACTIONS *
1704 **************************/
1706 /* REACTION-FIELD ELECTROSTATICS */
1707 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1711 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1713 /* Calculate temporary vectorial force */
1714 tx = _mm256_mul_pd(fscal,dx01);
1715 ty = _mm256_mul_pd(fscal,dy01);
1716 tz = _mm256_mul_pd(fscal,dz01);
1718 /* Update vectorial force */
1719 fix0 = _mm256_add_pd(fix0,tx);
1720 fiy0 = _mm256_add_pd(fiy0,ty);
1721 fiz0 = _mm256_add_pd(fiz0,tz);
1723 fjx1 = _mm256_add_pd(fjx1,tx);
1724 fjy1 = _mm256_add_pd(fjy1,ty);
1725 fjz1 = _mm256_add_pd(fjz1,tz);
1727 /**************************
1728 * CALCULATE INTERACTIONS *
1729 **************************/
1731 /* REACTION-FIELD ELECTROSTATICS */
1732 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1736 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1738 /* Calculate temporary vectorial force */
1739 tx = _mm256_mul_pd(fscal,dx02);
1740 ty = _mm256_mul_pd(fscal,dy02);
1741 tz = _mm256_mul_pd(fscal,dz02);
1743 /* Update vectorial force */
1744 fix0 = _mm256_add_pd(fix0,tx);
1745 fiy0 = _mm256_add_pd(fiy0,ty);
1746 fiz0 = _mm256_add_pd(fiz0,tz);
1748 fjx2 = _mm256_add_pd(fjx2,tx);
1749 fjy2 = _mm256_add_pd(fjy2,ty);
1750 fjz2 = _mm256_add_pd(fjz2,tz);
1752 /**************************
1753 * CALCULATE INTERACTIONS *
1754 **************************/
1756 /* REACTION-FIELD ELECTROSTATICS */
1757 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1761 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1763 /* Calculate temporary vectorial force */
1764 tx = _mm256_mul_pd(fscal,dx10);
1765 ty = _mm256_mul_pd(fscal,dy10);
1766 tz = _mm256_mul_pd(fscal,dz10);
1768 /* Update vectorial force */
1769 fix1 = _mm256_add_pd(fix1,tx);
1770 fiy1 = _mm256_add_pd(fiy1,ty);
1771 fiz1 = _mm256_add_pd(fiz1,tz);
1773 fjx0 = _mm256_add_pd(fjx0,tx);
1774 fjy0 = _mm256_add_pd(fjy0,ty);
1775 fjz0 = _mm256_add_pd(fjz0,tz);
1777 /**************************
1778 * CALCULATE INTERACTIONS *
1779 **************************/
1781 /* REACTION-FIELD ELECTROSTATICS */
1782 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1786 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1788 /* Calculate temporary vectorial force */
1789 tx = _mm256_mul_pd(fscal,dx11);
1790 ty = _mm256_mul_pd(fscal,dy11);
1791 tz = _mm256_mul_pd(fscal,dz11);
1793 /* Update vectorial force */
1794 fix1 = _mm256_add_pd(fix1,tx);
1795 fiy1 = _mm256_add_pd(fiy1,ty);
1796 fiz1 = _mm256_add_pd(fiz1,tz);
1798 fjx1 = _mm256_add_pd(fjx1,tx);
1799 fjy1 = _mm256_add_pd(fjy1,ty);
1800 fjz1 = _mm256_add_pd(fjz1,tz);
1802 /**************************
1803 * CALCULATE INTERACTIONS *
1804 **************************/
1806 /* REACTION-FIELD ELECTROSTATICS */
1807 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1811 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1813 /* Calculate temporary vectorial force */
1814 tx = _mm256_mul_pd(fscal,dx12);
1815 ty = _mm256_mul_pd(fscal,dy12);
1816 tz = _mm256_mul_pd(fscal,dz12);
1818 /* Update vectorial force */
1819 fix1 = _mm256_add_pd(fix1,tx);
1820 fiy1 = _mm256_add_pd(fiy1,ty);
1821 fiz1 = _mm256_add_pd(fiz1,tz);
1823 fjx2 = _mm256_add_pd(fjx2,tx);
1824 fjy2 = _mm256_add_pd(fjy2,ty);
1825 fjz2 = _mm256_add_pd(fjz2,tz);
1827 /**************************
1828 * CALCULATE INTERACTIONS *
1829 **************************/
1831 /* REACTION-FIELD ELECTROSTATICS */
1832 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1836 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1838 /* Calculate temporary vectorial force */
1839 tx = _mm256_mul_pd(fscal,dx20);
1840 ty = _mm256_mul_pd(fscal,dy20);
1841 tz = _mm256_mul_pd(fscal,dz20);
1843 /* Update vectorial force */
1844 fix2 = _mm256_add_pd(fix2,tx);
1845 fiy2 = _mm256_add_pd(fiy2,ty);
1846 fiz2 = _mm256_add_pd(fiz2,tz);
1848 fjx0 = _mm256_add_pd(fjx0,tx);
1849 fjy0 = _mm256_add_pd(fjy0,ty);
1850 fjz0 = _mm256_add_pd(fjz0,tz);
1852 /**************************
1853 * CALCULATE INTERACTIONS *
1854 **************************/
1856 /* REACTION-FIELD ELECTROSTATICS */
1857 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1861 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1863 /* Calculate temporary vectorial force */
1864 tx = _mm256_mul_pd(fscal,dx21);
1865 ty = _mm256_mul_pd(fscal,dy21);
1866 tz = _mm256_mul_pd(fscal,dz21);
1868 /* Update vectorial force */
1869 fix2 = _mm256_add_pd(fix2,tx);
1870 fiy2 = _mm256_add_pd(fiy2,ty);
1871 fiz2 = _mm256_add_pd(fiz2,tz);
1873 fjx1 = _mm256_add_pd(fjx1,tx);
1874 fjy1 = _mm256_add_pd(fjy1,ty);
1875 fjz1 = _mm256_add_pd(fjz1,tz);
1877 /**************************
1878 * CALCULATE INTERACTIONS *
1879 **************************/
1881 /* REACTION-FIELD ELECTROSTATICS */
1882 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1886 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1888 /* Calculate temporary vectorial force */
1889 tx = _mm256_mul_pd(fscal,dx22);
1890 ty = _mm256_mul_pd(fscal,dy22);
1891 tz = _mm256_mul_pd(fscal,dz22);
1893 /* Update vectorial force */
1894 fix2 = _mm256_add_pd(fix2,tx);
1895 fiy2 = _mm256_add_pd(fiy2,ty);
1896 fiz2 = _mm256_add_pd(fiz2,tz);
1898 fjx2 = _mm256_add_pd(fjx2,tx);
1899 fjy2 = _mm256_add_pd(fjy2,ty);
1900 fjz2 = _mm256_add_pd(fjz2,tz);
1902 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1903 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1904 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1905 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1907 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1908 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1910 /* Inner loop uses 271 flops */
1913 /* End of innermost loop */
1915 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1916 f+i_coord_offset,fshift+i_shift_offset);
1918 /* Increment number of inner iterations */
1919 inneriter += j_index_end - j_index_start;
1921 /* Outer loop uses 18 flops */
1924 /* Increment number of outer iterations */
1927 /* Update outer/inner flops */
1929 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*271);