2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_256_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRF_VdwCSTab_GeomW3W3_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
96 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
100 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
102 __m128i vfitab_lo,vfitab_hi;
103 __m128i ifour = _mm_set1_epi32(4);
104 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
106 __m256 dummy_mask,cutoff_mask;
107 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
108 __m256 one = _mm256_set1_ps(1.0);
109 __m256 two = _mm256_set1_ps(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm256_set1_ps(fr->epsfac);
122 charge = mdatoms->chargeA;
123 krf = _mm256_set1_ps(fr->ic->k_rf);
124 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
125 crf = _mm256_set1_ps(fr->ic->c_rf);
126 nvdwtype = fr->ntype;
128 vdwtype = mdatoms->typeA;
130 vftab = kernel_data->table_vdw->data;
131 vftabscale = _mm256_set1_ps(kernel_data->table_vdw->scale);
133 /* Setup water-specific parameters */
134 inr = nlist->iinr[0];
135 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
136 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
137 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
138 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
140 jq0 = _mm256_set1_ps(charge[inr+0]);
141 jq1 = _mm256_set1_ps(charge[inr+1]);
142 jq2 = _mm256_set1_ps(charge[inr+2]);
143 vdwjidx0A = 2*vdwtype[inr+0];
144 qq00 = _mm256_mul_ps(iq0,jq0);
145 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
146 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
147 qq01 = _mm256_mul_ps(iq0,jq1);
148 qq02 = _mm256_mul_ps(iq0,jq2);
149 qq10 = _mm256_mul_ps(iq1,jq0);
150 qq11 = _mm256_mul_ps(iq1,jq1);
151 qq12 = _mm256_mul_ps(iq1,jq2);
152 qq20 = _mm256_mul_ps(iq2,jq0);
153 qq21 = _mm256_mul_ps(iq2,jq1);
154 qq22 = _mm256_mul_ps(iq2,jq2);
156 /* Avoid stupid compiler warnings */
157 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
170 for(iidx=0;iidx<4*DIM;iidx++)
175 /* Start outer loop over neighborlists */
176 for(iidx=0; iidx<nri; iidx++)
178 /* Load shift vector for this list */
179 i_shift_offset = DIM*shiftidx[iidx];
181 /* Load limits for loop over neighbors */
182 j_index_start = jindex[iidx];
183 j_index_end = jindex[iidx+1];
185 /* Get outer coordinate index */
187 i_coord_offset = DIM*inr;
189 /* Load i particle coords and add shift vector */
190 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
191 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
193 fix0 = _mm256_setzero_ps();
194 fiy0 = _mm256_setzero_ps();
195 fiz0 = _mm256_setzero_ps();
196 fix1 = _mm256_setzero_ps();
197 fiy1 = _mm256_setzero_ps();
198 fiz1 = _mm256_setzero_ps();
199 fix2 = _mm256_setzero_ps();
200 fiy2 = _mm256_setzero_ps();
201 fiz2 = _mm256_setzero_ps();
203 /* Reset potential sums */
204 velecsum = _mm256_setzero_ps();
205 vvdwsum = _mm256_setzero_ps();
207 /* Start inner kernel loop */
208 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
211 /* Get j neighbor index, and coordinate index */
220 j_coord_offsetA = DIM*jnrA;
221 j_coord_offsetB = DIM*jnrB;
222 j_coord_offsetC = DIM*jnrC;
223 j_coord_offsetD = DIM*jnrD;
224 j_coord_offsetE = DIM*jnrE;
225 j_coord_offsetF = DIM*jnrF;
226 j_coord_offsetG = DIM*jnrG;
227 j_coord_offsetH = DIM*jnrH;
229 /* load j atom coordinates */
230 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
231 x+j_coord_offsetC,x+j_coord_offsetD,
232 x+j_coord_offsetE,x+j_coord_offsetF,
233 x+j_coord_offsetG,x+j_coord_offsetH,
234 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
236 /* Calculate displacement vector */
237 dx00 = _mm256_sub_ps(ix0,jx0);
238 dy00 = _mm256_sub_ps(iy0,jy0);
239 dz00 = _mm256_sub_ps(iz0,jz0);
240 dx01 = _mm256_sub_ps(ix0,jx1);
241 dy01 = _mm256_sub_ps(iy0,jy1);
242 dz01 = _mm256_sub_ps(iz0,jz1);
243 dx02 = _mm256_sub_ps(ix0,jx2);
244 dy02 = _mm256_sub_ps(iy0,jy2);
245 dz02 = _mm256_sub_ps(iz0,jz2);
246 dx10 = _mm256_sub_ps(ix1,jx0);
247 dy10 = _mm256_sub_ps(iy1,jy0);
248 dz10 = _mm256_sub_ps(iz1,jz0);
249 dx11 = _mm256_sub_ps(ix1,jx1);
250 dy11 = _mm256_sub_ps(iy1,jy1);
251 dz11 = _mm256_sub_ps(iz1,jz1);
252 dx12 = _mm256_sub_ps(ix1,jx2);
253 dy12 = _mm256_sub_ps(iy1,jy2);
254 dz12 = _mm256_sub_ps(iz1,jz2);
255 dx20 = _mm256_sub_ps(ix2,jx0);
256 dy20 = _mm256_sub_ps(iy2,jy0);
257 dz20 = _mm256_sub_ps(iz2,jz0);
258 dx21 = _mm256_sub_ps(ix2,jx1);
259 dy21 = _mm256_sub_ps(iy2,jy1);
260 dz21 = _mm256_sub_ps(iz2,jz1);
261 dx22 = _mm256_sub_ps(ix2,jx2);
262 dy22 = _mm256_sub_ps(iy2,jy2);
263 dz22 = _mm256_sub_ps(iz2,jz2);
265 /* Calculate squared distance and things based on it */
266 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
267 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
268 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
269 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
270 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
271 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
272 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
273 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
274 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
276 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
277 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
278 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
279 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
280 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
281 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
282 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
283 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
284 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
286 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
287 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
288 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
289 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
290 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
291 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
292 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
293 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
294 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
296 fjx0 = _mm256_setzero_ps();
297 fjy0 = _mm256_setzero_ps();
298 fjz0 = _mm256_setzero_ps();
299 fjx1 = _mm256_setzero_ps();
300 fjy1 = _mm256_setzero_ps();
301 fjz1 = _mm256_setzero_ps();
302 fjx2 = _mm256_setzero_ps();
303 fjy2 = _mm256_setzero_ps();
304 fjz2 = _mm256_setzero_ps();
306 /**************************
307 * CALCULATE INTERACTIONS *
308 **************************/
310 r00 = _mm256_mul_ps(rsq00,rinv00);
312 /* Calculate table index by multiplying r with the table scale and truncating to integer */
313 rt = _mm256_mul_ps(r00,vftabscale);
314 vfitab = _mm256_cvttps_epi32(rt);
315 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
316 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
317 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
318 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
319 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
320 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
322 /* REACTION-FIELD ELECTROSTATICS */
323 velec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
324 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
326 /* CUBIC SPLINE TABLE DISPERSION */
327 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
328 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
329 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
330 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
331 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
332 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
333 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
334 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
335 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
336 Heps = _mm256_mul_ps(vfeps,H);
337 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
338 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
339 vvdw6 = _mm256_mul_ps(c6_00,VV);
340 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
341 fvdw6 = _mm256_mul_ps(c6_00,FF);
343 /* CUBIC SPLINE TABLE REPULSION */
344 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
345 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
346 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
347 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
348 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
349 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
350 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
351 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
352 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
353 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
354 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
355 Heps = _mm256_mul_ps(vfeps,H);
356 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
357 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
358 vvdw12 = _mm256_mul_ps(c12_00,VV);
359 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
360 fvdw12 = _mm256_mul_ps(c12_00,FF);
361 vvdw = _mm256_add_ps(vvdw12,vvdw6);
362 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
364 /* Update potential sum for this i atom from the interaction with this j atom. */
365 velecsum = _mm256_add_ps(velecsum,velec);
366 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
368 fscal = _mm256_add_ps(felec,fvdw);
370 /* Calculate temporary vectorial force */
371 tx = _mm256_mul_ps(fscal,dx00);
372 ty = _mm256_mul_ps(fscal,dy00);
373 tz = _mm256_mul_ps(fscal,dz00);
375 /* Update vectorial force */
376 fix0 = _mm256_add_ps(fix0,tx);
377 fiy0 = _mm256_add_ps(fiy0,ty);
378 fiz0 = _mm256_add_ps(fiz0,tz);
380 fjx0 = _mm256_add_ps(fjx0,tx);
381 fjy0 = _mm256_add_ps(fjy0,ty);
382 fjz0 = _mm256_add_ps(fjz0,tz);
384 /**************************
385 * CALCULATE INTERACTIONS *
386 **************************/
388 /* REACTION-FIELD ELECTROSTATICS */
389 velec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
390 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
392 /* Update potential sum for this i atom from the interaction with this j atom. */
393 velecsum = _mm256_add_ps(velecsum,velec);
397 /* Calculate temporary vectorial force */
398 tx = _mm256_mul_ps(fscal,dx01);
399 ty = _mm256_mul_ps(fscal,dy01);
400 tz = _mm256_mul_ps(fscal,dz01);
402 /* Update vectorial force */
403 fix0 = _mm256_add_ps(fix0,tx);
404 fiy0 = _mm256_add_ps(fiy0,ty);
405 fiz0 = _mm256_add_ps(fiz0,tz);
407 fjx1 = _mm256_add_ps(fjx1,tx);
408 fjy1 = _mm256_add_ps(fjy1,ty);
409 fjz1 = _mm256_add_ps(fjz1,tz);
411 /**************************
412 * CALCULATE INTERACTIONS *
413 **************************/
415 /* REACTION-FIELD ELECTROSTATICS */
416 velec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
417 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
419 /* Update potential sum for this i atom from the interaction with this j atom. */
420 velecsum = _mm256_add_ps(velecsum,velec);
424 /* Calculate temporary vectorial force */
425 tx = _mm256_mul_ps(fscal,dx02);
426 ty = _mm256_mul_ps(fscal,dy02);
427 tz = _mm256_mul_ps(fscal,dz02);
429 /* Update vectorial force */
430 fix0 = _mm256_add_ps(fix0,tx);
431 fiy0 = _mm256_add_ps(fiy0,ty);
432 fiz0 = _mm256_add_ps(fiz0,tz);
434 fjx2 = _mm256_add_ps(fjx2,tx);
435 fjy2 = _mm256_add_ps(fjy2,ty);
436 fjz2 = _mm256_add_ps(fjz2,tz);
438 /**************************
439 * CALCULATE INTERACTIONS *
440 **************************/
442 /* REACTION-FIELD ELECTROSTATICS */
443 velec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
444 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
446 /* Update potential sum for this i atom from the interaction with this j atom. */
447 velecsum = _mm256_add_ps(velecsum,velec);
451 /* Calculate temporary vectorial force */
452 tx = _mm256_mul_ps(fscal,dx10);
453 ty = _mm256_mul_ps(fscal,dy10);
454 tz = _mm256_mul_ps(fscal,dz10);
456 /* Update vectorial force */
457 fix1 = _mm256_add_ps(fix1,tx);
458 fiy1 = _mm256_add_ps(fiy1,ty);
459 fiz1 = _mm256_add_ps(fiz1,tz);
461 fjx0 = _mm256_add_ps(fjx0,tx);
462 fjy0 = _mm256_add_ps(fjy0,ty);
463 fjz0 = _mm256_add_ps(fjz0,tz);
465 /**************************
466 * CALCULATE INTERACTIONS *
467 **************************/
469 /* REACTION-FIELD ELECTROSTATICS */
470 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
471 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
473 /* Update potential sum for this i atom from the interaction with this j atom. */
474 velecsum = _mm256_add_ps(velecsum,velec);
478 /* Calculate temporary vectorial force */
479 tx = _mm256_mul_ps(fscal,dx11);
480 ty = _mm256_mul_ps(fscal,dy11);
481 tz = _mm256_mul_ps(fscal,dz11);
483 /* Update vectorial force */
484 fix1 = _mm256_add_ps(fix1,tx);
485 fiy1 = _mm256_add_ps(fiy1,ty);
486 fiz1 = _mm256_add_ps(fiz1,tz);
488 fjx1 = _mm256_add_ps(fjx1,tx);
489 fjy1 = _mm256_add_ps(fjy1,ty);
490 fjz1 = _mm256_add_ps(fjz1,tz);
492 /**************************
493 * CALCULATE INTERACTIONS *
494 **************************/
496 /* REACTION-FIELD ELECTROSTATICS */
497 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
498 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
500 /* Update potential sum for this i atom from the interaction with this j atom. */
501 velecsum = _mm256_add_ps(velecsum,velec);
505 /* Calculate temporary vectorial force */
506 tx = _mm256_mul_ps(fscal,dx12);
507 ty = _mm256_mul_ps(fscal,dy12);
508 tz = _mm256_mul_ps(fscal,dz12);
510 /* Update vectorial force */
511 fix1 = _mm256_add_ps(fix1,tx);
512 fiy1 = _mm256_add_ps(fiy1,ty);
513 fiz1 = _mm256_add_ps(fiz1,tz);
515 fjx2 = _mm256_add_ps(fjx2,tx);
516 fjy2 = _mm256_add_ps(fjy2,ty);
517 fjz2 = _mm256_add_ps(fjz2,tz);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 /* REACTION-FIELD ELECTROSTATICS */
524 velec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
525 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
527 /* Update potential sum for this i atom from the interaction with this j atom. */
528 velecsum = _mm256_add_ps(velecsum,velec);
532 /* Calculate temporary vectorial force */
533 tx = _mm256_mul_ps(fscal,dx20);
534 ty = _mm256_mul_ps(fscal,dy20);
535 tz = _mm256_mul_ps(fscal,dz20);
537 /* Update vectorial force */
538 fix2 = _mm256_add_ps(fix2,tx);
539 fiy2 = _mm256_add_ps(fiy2,ty);
540 fiz2 = _mm256_add_ps(fiz2,tz);
542 fjx0 = _mm256_add_ps(fjx0,tx);
543 fjy0 = _mm256_add_ps(fjy0,ty);
544 fjz0 = _mm256_add_ps(fjz0,tz);
546 /**************************
547 * CALCULATE INTERACTIONS *
548 **************************/
550 /* REACTION-FIELD ELECTROSTATICS */
551 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
552 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
554 /* Update potential sum for this i atom from the interaction with this j atom. */
555 velecsum = _mm256_add_ps(velecsum,velec);
559 /* Calculate temporary vectorial force */
560 tx = _mm256_mul_ps(fscal,dx21);
561 ty = _mm256_mul_ps(fscal,dy21);
562 tz = _mm256_mul_ps(fscal,dz21);
564 /* Update vectorial force */
565 fix2 = _mm256_add_ps(fix2,tx);
566 fiy2 = _mm256_add_ps(fiy2,ty);
567 fiz2 = _mm256_add_ps(fiz2,tz);
569 fjx1 = _mm256_add_ps(fjx1,tx);
570 fjy1 = _mm256_add_ps(fjy1,ty);
571 fjz1 = _mm256_add_ps(fjz1,tz);
573 /**************************
574 * CALCULATE INTERACTIONS *
575 **************************/
577 /* REACTION-FIELD ELECTROSTATICS */
578 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
579 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
581 /* Update potential sum for this i atom from the interaction with this j atom. */
582 velecsum = _mm256_add_ps(velecsum,velec);
586 /* Calculate temporary vectorial force */
587 tx = _mm256_mul_ps(fscal,dx22);
588 ty = _mm256_mul_ps(fscal,dy22);
589 tz = _mm256_mul_ps(fscal,dz22);
591 /* Update vectorial force */
592 fix2 = _mm256_add_ps(fix2,tx);
593 fiy2 = _mm256_add_ps(fiy2,ty);
594 fiz2 = _mm256_add_ps(fiz2,tz);
596 fjx2 = _mm256_add_ps(fjx2,tx);
597 fjy2 = _mm256_add_ps(fjy2,ty);
598 fjz2 = _mm256_add_ps(fjz2,tz);
600 fjptrA = f+j_coord_offsetA;
601 fjptrB = f+j_coord_offsetB;
602 fjptrC = f+j_coord_offsetC;
603 fjptrD = f+j_coord_offsetD;
604 fjptrE = f+j_coord_offsetE;
605 fjptrF = f+j_coord_offsetF;
606 fjptrG = f+j_coord_offsetG;
607 fjptrH = f+j_coord_offsetH;
609 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
610 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
612 /* Inner loop uses 323 flops */
618 /* Get j neighbor index, and coordinate index */
619 jnrlistA = jjnr[jidx];
620 jnrlistB = jjnr[jidx+1];
621 jnrlistC = jjnr[jidx+2];
622 jnrlistD = jjnr[jidx+3];
623 jnrlistE = jjnr[jidx+4];
624 jnrlistF = jjnr[jidx+5];
625 jnrlistG = jjnr[jidx+6];
626 jnrlistH = jjnr[jidx+7];
627 /* Sign of each element will be negative for non-real atoms.
628 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
629 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
631 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
632 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
634 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
635 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
636 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
637 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
638 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
639 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
640 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
641 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
642 j_coord_offsetA = DIM*jnrA;
643 j_coord_offsetB = DIM*jnrB;
644 j_coord_offsetC = DIM*jnrC;
645 j_coord_offsetD = DIM*jnrD;
646 j_coord_offsetE = DIM*jnrE;
647 j_coord_offsetF = DIM*jnrF;
648 j_coord_offsetG = DIM*jnrG;
649 j_coord_offsetH = DIM*jnrH;
651 /* load j atom coordinates */
652 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
653 x+j_coord_offsetC,x+j_coord_offsetD,
654 x+j_coord_offsetE,x+j_coord_offsetF,
655 x+j_coord_offsetG,x+j_coord_offsetH,
656 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
658 /* Calculate displacement vector */
659 dx00 = _mm256_sub_ps(ix0,jx0);
660 dy00 = _mm256_sub_ps(iy0,jy0);
661 dz00 = _mm256_sub_ps(iz0,jz0);
662 dx01 = _mm256_sub_ps(ix0,jx1);
663 dy01 = _mm256_sub_ps(iy0,jy1);
664 dz01 = _mm256_sub_ps(iz0,jz1);
665 dx02 = _mm256_sub_ps(ix0,jx2);
666 dy02 = _mm256_sub_ps(iy0,jy2);
667 dz02 = _mm256_sub_ps(iz0,jz2);
668 dx10 = _mm256_sub_ps(ix1,jx0);
669 dy10 = _mm256_sub_ps(iy1,jy0);
670 dz10 = _mm256_sub_ps(iz1,jz0);
671 dx11 = _mm256_sub_ps(ix1,jx1);
672 dy11 = _mm256_sub_ps(iy1,jy1);
673 dz11 = _mm256_sub_ps(iz1,jz1);
674 dx12 = _mm256_sub_ps(ix1,jx2);
675 dy12 = _mm256_sub_ps(iy1,jy2);
676 dz12 = _mm256_sub_ps(iz1,jz2);
677 dx20 = _mm256_sub_ps(ix2,jx0);
678 dy20 = _mm256_sub_ps(iy2,jy0);
679 dz20 = _mm256_sub_ps(iz2,jz0);
680 dx21 = _mm256_sub_ps(ix2,jx1);
681 dy21 = _mm256_sub_ps(iy2,jy1);
682 dz21 = _mm256_sub_ps(iz2,jz1);
683 dx22 = _mm256_sub_ps(ix2,jx2);
684 dy22 = _mm256_sub_ps(iy2,jy2);
685 dz22 = _mm256_sub_ps(iz2,jz2);
687 /* Calculate squared distance and things based on it */
688 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
689 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
690 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
691 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
692 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
693 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
694 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
695 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
696 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
698 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
699 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
700 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
701 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
702 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
703 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
704 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
705 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
706 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
708 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
709 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
710 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
711 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
712 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
713 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
714 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
715 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
716 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
718 fjx0 = _mm256_setzero_ps();
719 fjy0 = _mm256_setzero_ps();
720 fjz0 = _mm256_setzero_ps();
721 fjx1 = _mm256_setzero_ps();
722 fjy1 = _mm256_setzero_ps();
723 fjz1 = _mm256_setzero_ps();
724 fjx2 = _mm256_setzero_ps();
725 fjy2 = _mm256_setzero_ps();
726 fjz2 = _mm256_setzero_ps();
728 /**************************
729 * CALCULATE INTERACTIONS *
730 **************************/
732 r00 = _mm256_mul_ps(rsq00,rinv00);
733 r00 = _mm256_andnot_ps(dummy_mask,r00);
735 /* Calculate table index by multiplying r with the table scale and truncating to integer */
736 rt = _mm256_mul_ps(r00,vftabscale);
737 vfitab = _mm256_cvttps_epi32(rt);
738 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
739 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
740 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
741 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
742 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
743 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
745 /* REACTION-FIELD ELECTROSTATICS */
746 velec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
747 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
749 /* CUBIC SPLINE TABLE DISPERSION */
750 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
751 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
752 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
753 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
754 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
755 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
756 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
757 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
758 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
759 Heps = _mm256_mul_ps(vfeps,H);
760 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
761 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
762 vvdw6 = _mm256_mul_ps(c6_00,VV);
763 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
764 fvdw6 = _mm256_mul_ps(c6_00,FF);
766 /* CUBIC SPLINE TABLE REPULSION */
767 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
768 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
769 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
770 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
771 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
772 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
773 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
774 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
775 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
776 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
777 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
778 Heps = _mm256_mul_ps(vfeps,H);
779 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
780 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
781 vvdw12 = _mm256_mul_ps(c12_00,VV);
782 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
783 fvdw12 = _mm256_mul_ps(c12_00,FF);
784 vvdw = _mm256_add_ps(vvdw12,vvdw6);
785 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
787 /* Update potential sum for this i atom from the interaction with this j atom. */
788 velec = _mm256_andnot_ps(dummy_mask,velec);
789 velecsum = _mm256_add_ps(velecsum,velec);
790 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
791 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
793 fscal = _mm256_add_ps(felec,fvdw);
795 fscal = _mm256_andnot_ps(dummy_mask,fscal);
797 /* Calculate temporary vectorial force */
798 tx = _mm256_mul_ps(fscal,dx00);
799 ty = _mm256_mul_ps(fscal,dy00);
800 tz = _mm256_mul_ps(fscal,dz00);
802 /* Update vectorial force */
803 fix0 = _mm256_add_ps(fix0,tx);
804 fiy0 = _mm256_add_ps(fiy0,ty);
805 fiz0 = _mm256_add_ps(fiz0,tz);
807 fjx0 = _mm256_add_ps(fjx0,tx);
808 fjy0 = _mm256_add_ps(fjy0,ty);
809 fjz0 = _mm256_add_ps(fjz0,tz);
811 /**************************
812 * CALCULATE INTERACTIONS *
813 **************************/
815 /* REACTION-FIELD ELECTROSTATICS */
816 velec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
817 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
819 /* Update potential sum for this i atom from the interaction with this j atom. */
820 velec = _mm256_andnot_ps(dummy_mask,velec);
821 velecsum = _mm256_add_ps(velecsum,velec);
825 fscal = _mm256_andnot_ps(dummy_mask,fscal);
827 /* Calculate temporary vectorial force */
828 tx = _mm256_mul_ps(fscal,dx01);
829 ty = _mm256_mul_ps(fscal,dy01);
830 tz = _mm256_mul_ps(fscal,dz01);
832 /* Update vectorial force */
833 fix0 = _mm256_add_ps(fix0,tx);
834 fiy0 = _mm256_add_ps(fiy0,ty);
835 fiz0 = _mm256_add_ps(fiz0,tz);
837 fjx1 = _mm256_add_ps(fjx1,tx);
838 fjy1 = _mm256_add_ps(fjy1,ty);
839 fjz1 = _mm256_add_ps(fjz1,tz);
841 /**************************
842 * CALCULATE INTERACTIONS *
843 **************************/
845 /* REACTION-FIELD ELECTROSTATICS */
846 velec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
847 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
849 /* Update potential sum for this i atom from the interaction with this j atom. */
850 velec = _mm256_andnot_ps(dummy_mask,velec);
851 velecsum = _mm256_add_ps(velecsum,velec);
855 fscal = _mm256_andnot_ps(dummy_mask,fscal);
857 /* Calculate temporary vectorial force */
858 tx = _mm256_mul_ps(fscal,dx02);
859 ty = _mm256_mul_ps(fscal,dy02);
860 tz = _mm256_mul_ps(fscal,dz02);
862 /* Update vectorial force */
863 fix0 = _mm256_add_ps(fix0,tx);
864 fiy0 = _mm256_add_ps(fiy0,ty);
865 fiz0 = _mm256_add_ps(fiz0,tz);
867 fjx2 = _mm256_add_ps(fjx2,tx);
868 fjy2 = _mm256_add_ps(fjy2,ty);
869 fjz2 = _mm256_add_ps(fjz2,tz);
871 /**************************
872 * CALCULATE INTERACTIONS *
873 **************************/
875 /* REACTION-FIELD ELECTROSTATICS */
876 velec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
877 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
879 /* Update potential sum for this i atom from the interaction with this j atom. */
880 velec = _mm256_andnot_ps(dummy_mask,velec);
881 velecsum = _mm256_add_ps(velecsum,velec);
885 fscal = _mm256_andnot_ps(dummy_mask,fscal);
887 /* Calculate temporary vectorial force */
888 tx = _mm256_mul_ps(fscal,dx10);
889 ty = _mm256_mul_ps(fscal,dy10);
890 tz = _mm256_mul_ps(fscal,dz10);
892 /* Update vectorial force */
893 fix1 = _mm256_add_ps(fix1,tx);
894 fiy1 = _mm256_add_ps(fiy1,ty);
895 fiz1 = _mm256_add_ps(fiz1,tz);
897 fjx0 = _mm256_add_ps(fjx0,tx);
898 fjy0 = _mm256_add_ps(fjy0,ty);
899 fjz0 = _mm256_add_ps(fjz0,tz);
901 /**************************
902 * CALCULATE INTERACTIONS *
903 **************************/
905 /* REACTION-FIELD ELECTROSTATICS */
906 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
907 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
909 /* Update potential sum for this i atom from the interaction with this j atom. */
910 velec = _mm256_andnot_ps(dummy_mask,velec);
911 velecsum = _mm256_add_ps(velecsum,velec);
915 fscal = _mm256_andnot_ps(dummy_mask,fscal);
917 /* Calculate temporary vectorial force */
918 tx = _mm256_mul_ps(fscal,dx11);
919 ty = _mm256_mul_ps(fscal,dy11);
920 tz = _mm256_mul_ps(fscal,dz11);
922 /* Update vectorial force */
923 fix1 = _mm256_add_ps(fix1,tx);
924 fiy1 = _mm256_add_ps(fiy1,ty);
925 fiz1 = _mm256_add_ps(fiz1,tz);
927 fjx1 = _mm256_add_ps(fjx1,tx);
928 fjy1 = _mm256_add_ps(fjy1,ty);
929 fjz1 = _mm256_add_ps(fjz1,tz);
931 /**************************
932 * CALCULATE INTERACTIONS *
933 **************************/
935 /* REACTION-FIELD ELECTROSTATICS */
936 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
937 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
939 /* Update potential sum for this i atom from the interaction with this j atom. */
940 velec = _mm256_andnot_ps(dummy_mask,velec);
941 velecsum = _mm256_add_ps(velecsum,velec);
945 fscal = _mm256_andnot_ps(dummy_mask,fscal);
947 /* Calculate temporary vectorial force */
948 tx = _mm256_mul_ps(fscal,dx12);
949 ty = _mm256_mul_ps(fscal,dy12);
950 tz = _mm256_mul_ps(fscal,dz12);
952 /* Update vectorial force */
953 fix1 = _mm256_add_ps(fix1,tx);
954 fiy1 = _mm256_add_ps(fiy1,ty);
955 fiz1 = _mm256_add_ps(fiz1,tz);
957 fjx2 = _mm256_add_ps(fjx2,tx);
958 fjy2 = _mm256_add_ps(fjy2,ty);
959 fjz2 = _mm256_add_ps(fjz2,tz);
961 /**************************
962 * CALCULATE INTERACTIONS *
963 **************************/
965 /* REACTION-FIELD ELECTROSTATICS */
966 velec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
967 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
969 /* Update potential sum for this i atom from the interaction with this j atom. */
970 velec = _mm256_andnot_ps(dummy_mask,velec);
971 velecsum = _mm256_add_ps(velecsum,velec);
975 fscal = _mm256_andnot_ps(dummy_mask,fscal);
977 /* Calculate temporary vectorial force */
978 tx = _mm256_mul_ps(fscal,dx20);
979 ty = _mm256_mul_ps(fscal,dy20);
980 tz = _mm256_mul_ps(fscal,dz20);
982 /* Update vectorial force */
983 fix2 = _mm256_add_ps(fix2,tx);
984 fiy2 = _mm256_add_ps(fiy2,ty);
985 fiz2 = _mm256_add_ps(fiz2,tz);
987 fjx0 = _mm256_add_ps(fjx0,tx);
988 fjy0 = _mm256_add_ps(fjy0,ty);
989 fjz0 = _mm256_add_ps(fjz0,tz);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 /* REACTION-FIELD ELECTROSTATICS */
996 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
997 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
999 /* Update potential sum for this i atom from the interaction with this j atom. */
1000 velec = _mm256_andnot_ps(dummy_mask,velec);
1001 velecsum = _mm256_add_ps(velecsum,velec);
1005 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1007 /* Calculate temporary vectorial force */
1008 tx = _mm256_mul_ps(fscal,dx21);
1009 ty = _mm256_mul_ps(fscal,dy21);
1010 tz = _mm256_mul_ps(fscal,dz21);
1012 /* Update vectorial force */
1013 fix2 = _mm256_add_ps(fix2,tx);
1014 fiy2 = _mm256_add_ps(fiy2,ty);
1015 fiz2 = _mm256_add_ps(fiz2,tz);
1017 fjx1 = _mm256_add_ps(fjx1,tx);
1018 fjy1 = _mm256_add_ps(fjy1,ty);
1019 fjz1 = _mm256_add_ps(fjz1,tz);
1021 /**************************
1022 * CALCULATE INTERACTIONS *
1023 **************************/
1025 /* REACTION-FIELD ELECTROSTATICS */
1026 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
1027 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1029 /* Update potential sum for this i atom from the interaction with this j atom. */
1030 velec = _mm256_andnot_ps(dummy_mask,velec);
1031 velecsum = _mm256_add_ps(velecsum,velec);
1035 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1037 /* Calculate temporary vectorial force */
1038 tx = _mm256_mul_ps(fscal,dx22);
1039 ty = _mm256_mul_ps(fscal,dy22);
1040 tz = _mm256_mul_ps(fscal,dz22);
1042 /* Update vectorial force */
1043 fix2 = _mm256_add_ps(fix2,tx);
1044 fiy2 = _mm256_add_ps(fiy2,ty);
1045 fiz2 = _mm256_add_ps(fiz2,tz);
1047 fjx2 = _mm256_add_ps(fjx2,tx);
1048 fjy2 = _mm256_add_ps(fjy2,ty);
1049 fjz2 = _mm256_add_ps(fjz2,tz);
1051 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1052 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1053 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1054 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1055 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1056 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1057 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1058 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1060 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1061 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1063 /* Inner loop uses 324 flops */
1066 /* End of innermost loop */
1068 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1069 f+i_coord_offset,fshift+i_shift_offset);
1072 /* Update potential energies */
1073 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1074 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1076 /* Increment number of inner iterations */
1077 inneriter += j_index_end - j_index_start;
1079 /* Outer loop uses 20 flops */
1082 /* Increment number of outer iterations */
1085 /* Update outer/inner flops */
1087 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*324);
1090 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_256_single
1091 * Electrostatics interaction: ReactionField
1092 * VdW interaction: CubicSplineTable
1093 * Geometry: Water3-Water3
1094 * Calculate force/pot: Force
1097 nb_kernel_ElecRF_VdwCSTab_GeomW3W3_F_avx_256_single
1098 (t_nblist * gmx_restrict nlist,
1099 rvec * gmx_restrict xx,
1100 rvec * gmx_restrict ff,
1101 t_forcerec * gmx_restrict fr,
1102 t_mdatoms * gmx_restrict mdatoms,
1103 nb_kernel_data_t * gmx_restrict kernel_data,
1104 t_nrnb * gmx_restrict nrnb)
1106 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1107 * just 0 for non-waters.
1108 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1109 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1111 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1112 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1113 int jnrA,jnrB,jnrC,jnrD;
1114 int jnrE,jnrF,jnrG,jnrH;
1115 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1116 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1117 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1118 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1119 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1120 real rcutoff_scalar;
1121 real *shiftvec,*fshift,*x,*f;
1122 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1123 real scratch[4*DIM];
1124 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1125 real * vdwioffsetptr0;
1126 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1127 real * vdwioffsetptr1;
1128 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1129 real * vdwioffsetptr2;
1130 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1131 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1132 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1133 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1134 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1135 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1136 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1137 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1138 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1139 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1140 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1141 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1142 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1143 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1144 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1145 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1146 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1149 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1152 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1153 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1155 __m128i vfitab_lo,vfitab_hi;
1156 __m128i ifour = _mm_set1_epi32(4);
1157 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1159 __m256 dummy_mask,cutoff_mask;
1160 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1161 __m256 one = _mm256_set1_ps(1.0);
1162 __m256 two = _mm256_set1_ps(2.0);
1168 jindex = nlist->jindex;
1170 shiftidx = nlist->shift;
1172 shiftvec = fr->shift_vec[0];
1173 fshift = fr->fshift[0];
1174 facel = _mm256_set1_ps(fr->epsfac);
1175 charge = mdatoms->chargeA;
1176 krf = _mm256_set1_ps(fr->ic->k_rf);
1177 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
1178 crf = _mm256_set1_ps(fr->ic->c_rf);
1179 nvdwtype = fr->ntype;
1180 vdwparam = fr->nbfp;
1181 vdwtype = mdatoms->typeA;
1183 vftab = kernel_data->table_vdw->data;
1184 vftabscale = _mm256_set1_ps(kernel_data->table_vdw->scale);
1186 /* Setup water-specific parameters */
1187 inr = nlist->iinr[0];
1188 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1189 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1190 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1191 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1193 jq0 = _mm256_set1_ps(charge[inr+0]);
1194 jq1 = _mm256_set1_ps(charge[inr+1]);
1195 jq2 = _mm256_set1_ps(charge[inr+2]);
1196 vdwjidx0A = 2*vdwtype[inr+0];
1197 qq00 = _mm256_mul_ps(iq0,jq0);
1198 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1199 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1200 qq01 = _mm256_mul_ps(iq0,jq1);
1201 qq02 = _mm256_mul_ps(iq0,jq2);
1202 qq10 = _mm256_mul_ps(iq1,jq0);
1203 qq11 = _mm256_mul_ps(iq1,jq1);
1204 qq12 = _mm256_mul_ps(iq1,jq2);
1205 qq20 = _mm256_mul_ps(iq2,jq0);
1206 qq21 = _mm256_mul_ps(iq2,jq1);
1207 qq22 = _mm256_mul_ps(iq2,jq2);
1209 /* Avoid stupid compiler warnings */
1210 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1211 j_coord_offsetA = 0;
1212 j_coord_offsetB = 0;
1213 j_coord_offsetC = 0;
1214 j_coord_offsetD = 0;
1215 j_coord_offsetE = 0;
1216 j_coord_offsetF = 0;
1217 j_coord_offsetG = 0;
1218 j_coord_offsetH = 0;
1223 for(iidx=0;iidx<4*DIM;iidx++)
1225 scratch[iidx] = 0.0;
1228 /* Start outer loop over neighborlists */
1229 for(iidx=0; iidx<nri; iidx++)
1231 /* Load shift vector for this list */
1232 i_shift_offset = DIM*shiftidx[iidx];
1234 /* Load limits for loop over neighbors */
1235 j_index_start = jindex[iidx];
1236 j_index_end = jindex[iidx+1];
1238 /* Get outer coordinate index */
1240 i_coord_offset = DIM*inr;
1242 /* Load i particle coords and add shift vector */
1243 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1244 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1246 fix0 = _mm256_setzero_ps();
1247 fiy0 = _mm256_setzero_ps();
1248 fiz0 = _mm256_setzero_ps();
1249 fix1 = _mm256_setzero_ps();
1250 fiy1 = _mm256_setzero_ps();
1251 fiz1 = _mm256_setzero_ps();
1252 fix2 = _mm256_setzero_ps();
1253 fiy2 = _mm256_setzero_ps();
1254 fiz2 = _mm256_setzero_ps();
1256 /* Start inner kernel loop */
1257 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1260 /* Get j neighbor index, and coordinate index */
1262 jnrB = jjnr[jidx+1];
1263 jnrC = jjnr[jidx+2];
1264 jnrD = jjnr[jidx+3];
1265 jnrE = jjnr[jidx+4];
1266 jnrF = jjnr[jidx+5];
1267 jnrG = jjnr[jidx+6];
1268 jnrH = jjnr[jidx+7];
1269 j_coord_offsetA = DIM*jnrA;
1270 j_coord_offsetB = DIM*jnrB;
1271 j_coord_offsetC = DIM*jnrC;
1272 j_coord_offsetD = DIM*jnrD;
1273 j_coord_offsetE = DIM*jnrE;
1274 j_coord_offsetF = DIM*jnrF;
1275 j_coord_offsetG = DIM*jnrG;
1276 j_coord_offsetH = DIM*jnrH;
1278 /* load j atom coordinates */
1279 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1280 x+j_coord_offsetC,x+j_coord_offsetD,
1281 x+j_coord_offsetE,x+j_coord_offsetF,
1282 x+j_coord_offsetG,x+j_coord_offsetH,
1283 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1285 /* Calculate displacement vector */
1286 dx00 = _mm256_sub_ps(ix0,jx0);
1287 dy00 = _mm256_sub_ps(iy0,jy0);
1288 dz00 = _mm256_sub_ps(iz0,jz0);
1289 dx01 = _mm256_sub_ps(ix0,jx1);
1290 dy01 = _mm256_sub_ps(iy0,jy1);
1291 dz01 = _mm256_sub_ps(iz0,jz1);
1292 dx02 = _mm256_sub_ps(ix0,jx2);
1293 dy02 = _mm256_sub_ps(iy0,jy2);
1294 dz02 = _mm256_sub_ps(iz0,jz2);
1295 dx10 = _mm256_sub_ps(ix1,jx0);
1296 dy10 = _mm256_sub_ps(iy1,jy0);
1297 dz10 = _mm256_sub_ps(iz1,jz0);
1298 dx11 = _mm256_sub_ps(ix1,jx1);
1299 dy11 = _mm256_sub_ps(iy1,jy1);
1300 dz11 = _mm256_sub_ps(iz1,jz1);
1301 dx12 = _mm256_sub_ps(ix1,jx2);
1302 dy12 = _mm256_sub_ps(iy1,jy2);
1303 dz12 = _mm256_sub_ps(iz1,jz2);
1304 dx20 = _mm256_sub_ps(ix2,jx0);
1305 dy20 = _mm256_sub_ps(iy2,jy0);
1306 dz20 = _mm256_sub_ps(iz2,jz0);
1307 dx21 = _mm256_sub_ps(ix2,jx1);
1308 dy21 = _mm256_sub_ps(iy2,jy1);
1309 dz21 = _mm256_sub_ps(iz2,jz1);
1310 dx22 = _mm256_sub_ps(ix2,jx2);
1311 dy22 = _mm256_sub_ps(iy2,jy2);
1312 dz22 = _mm256_sub_ps(iz2,jz2);
1314 /* Calculate squared distance and things based on it */
1315 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1316 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1317 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1318 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1319 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1320 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1321 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1322 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1323 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1325 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1326 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1327 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1328 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1329 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1330 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1331 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1332 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1333 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1335 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1336 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1337 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1338 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1339 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1340 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1341 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1342 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1343 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1345 fjx0 = _mm256_setzero_ps();
1346 fjy0 = _mm256_setzero_ps();
1347 fjz0 = _mm256_setzero_ps();
1348 fjx1 = _mm256_setzero_ps();
1349 fjy1 = _mm256_setzero_ps();
1350 fjz1 = _mm256_setzero_ps();
1351 fjx2 = _mm256_setzero_ps();
1352 fjy2 = _mm256_setzero_ps();
1353 fjz2 = _mm256_setzero_ps();
1355 /**************************
1356 * CALCULATE INTERACTIONS *
1357 **************************/
1359 r00 = _mm256_mul_ps(rsq00,rinv00);
1361 /* Calculate table index by multiplying r with table scale and truncate to integer */
1362 rt = _mm256_mul_ps(r00,vftabscale);
1363 vfitab = _mm256_cvttps_epi32(rt);
1364 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1365 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1366 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1367 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1368 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
1369 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
1371 /* REACTION-FIELD ELECTROSTATICS */
1372 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1374 /* CUBIC SPLINE TABLE DISPERSION */
1375 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1376 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1377 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1378 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1379 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1380 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1381 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1382 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1383 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1384 Heps = _mm256_mul_ps(vfeps,H);
1385 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1386 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1387 fvdw6 = _mm256_mul_ps(c6_00,FF);
1389 /* CUBIC SPLINE TABLE REPULSION */
1390 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1391 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1392 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1393 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1394 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1395 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1396 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1397 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1398 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1399 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1400 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1401 Heps = _mm256_mul_ps(vfeps,H);
1402 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1403 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1404 fvdw12 = _mm256_mul_ps(c12_00,FF);
1405 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1407 fscal = _mm256_add_ps(felec,fvdw);
1409 /* Calculate temporary vectorial force */
1410 tx = _mm256_mul_ps(fscal,dx00);
1411 ty = _mm256_mul_ps(fscal,dy00);
1412 tz = _mm256_mul_ps(fscal,dz00);
1414 /* Update vectorial force */
1415 fix0 = _mm256_add_ps(fix0,tx);
1416 fiy0 = _mm256_add_ps(fiy0,ty);
1417 fiz0 = _mm256_add_ps(fiz0,tz);
1419 fjx0 = _mm256_add_ps(fjx0,tx);
1420 fjy0 = _mm256_add_ps(fjy0,ty);
1421 fjz0 = _mm256_add_ps(fjz0,tz);
1423 /**************************
1424 * CALCULATE INTERACTIONS *
1425 **************************/
1427 /* REACTION-FIELD ELECTROSTATICS */
1428 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1432 /* Calculate temporary vectorial force */
1433 tx = _mm256_mul_ps(fscal,dx01);
1434 ty = _mm256_mul_ps(fscal,dy01);
1435 tz = _mm256_mul_ps(fscal,dz01);
1437 /* Update vectorial force */
1438 fix0 = _mm256_add_ps(fix0,tx);
1439 fiy0 = _mm256_add_ps(fiy0,ty);
1440 fiz0 = _mm256_add_ps(fiz0,tz);
1442 fjx1 = _mm256_add_ps(fjx1,tx);
1443 fjy1 = _mm256_add_ps(fjy1,ty);
1444 fjz1 = _mm256_add_ps(fjz1,tz);
1446 /**************************
1447 * CALCULATE INTERACTIONS *
1448 **************************/
1450 /* REACTION-FIELD ELECTROSTATICS */
1451 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1455 /* Calculate temporary vectorial force */
1456 tx = _mm256_mul_ps(fscal,dx02);
1457 ty = _mm256_mul_ps(fscal,dy02);
1458 tz = _mm256_mul_ps(fscal,dz02);
1460 /* Update vectorial force */
1461 fix0 = _mm256_add_ps(fix0,tx);
1462 fiy0 = _mm256_add_ps(fiy0,ty);
1463 fiz0 = _mm256_add_ps(fiz0,tz);
1465 fjx2 = _mm256_add_ps(fjx2,tx);
1466 fjy2 = _mm256_add_ps(fjy2,ty);
1467 fjz2 = _mm256_add_ps(fjz2,tz);
1469 /**************************
1470 * CALCULATE INTERACTIONS *
1471 **************************/
1473 /* REACTION-FIELD ELECTROSTATICS */
1474 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1478 /* Calculate temporary vectorial force */
1479 tx = _mm256_mul_ps(fscal,dx10);
1480 ty = _mm256_mul_ps(fscal,dy10);
1481 tz = _mm256_mul_ps(fscal,dz10);
1483 /* Update vectorial force */
1484 fix1 = _mm256_add_ps(fix1,tx);
1485 fiy1 = _mm256_add_ps(fiy1,ty);
1486 fiz1 = _mm256_add_ps(fiz1,tz);
1488 fjx0 = _mm256_add_ps(fjx0,tx);
1489 fjy0 = _mm256_add_ps(fjy0,ty);
1490 fjz0 = _mm256_add_ps(fjz0,tz);
1492 /**************************
1493 * CALCULATE INTERACTIONS *
1494 **************************/
1496 /* REACTION-FIELD ELECTROSTATICS */
1497 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1501 /* Calculate temporary vectorial force */
1502 tx = _mm256_mul_ps(fscal,dx11);
1503 ty = _mm256_mul_ps(fscal,dy11);
1504 tz = _mm256_mul_ps(fscal,dz11);
1506 /* Update vectorial force */
1507 fix1 = _mm256_add_ps(fix1,tx);
1508 fiy1 = _mm256_add_ps(fiy1,ty);
1509 fiz1 = _mm256_add_ps(fiz1,tz);
1511 fjx1 = _mm256_add_ps(fjx1,tx);
1512 fjy1 = _mm256_add_ps(fjy1,ty);
1513 fjz1 = _mm256_add_ps(fjz1,tz);
1515 /**************************
1516 * CALCULATE INTERACTIONS *
1517 **************************/
1519 /* REACTION-FIELD ELECTROSTATICS */
1520 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1524 /* Calculate temporary vectorial force */
1525 tx = _mm256_mul_ps(fscal,dx12);
1526 ty = _mm256_mul_ps(fscal,dy12);
1527 tz = _mm256_mul_ps(fscal,dz12);
1529 /* Update vectorial force */
1530 fix1 = _mm256_add_ps(fix1,tx);
1531 fiy1 = _mm256_add_ps(fiy1,ty);
1532 fiz1 = _mm256_add_ps(fiz1,tz);
1534 fjx2 = _mm256_add_ps(fjx2,tx);
1535 fjy2 = _mm256_add_ps(fjy2,ty);
1536 fjz2 = _mm256_add_ps(fjz2,tz);
1538 /**************************
1539 * CALCULATE INTERACTIONS *
1540 **************************/
1542 /* REACTION-FIELD ELECTROSTATICS */
1543 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1547 /* Calculate temporary vectorial force */
1548 tx = _mm256_mul_ps(fscal,dx20);
1549 ty = _mm256_mul_ps(fscal,dy20);
1550 tz = _mm256_mul_ps(fscal,dz20);
1552 /* Update vectorial force */
1553 fix2 = _mm256_add_ps(fix2,tx);
1554 fiy2 = _mm256_add_ps(fiy2,ty);
1555 fiz2 = _mm256_add_ps(fiz2,tz);
1557 fjx0 = _mm256_add_ps(fjx0,tx);
1558 fjy0 = _mm256_add_ps(fjy0,ty);
1559 fjz0 = _mm256_add_ps(fjz0,tz);
1561 /**************************
1562 * CALCULATE INTERACTIONS *
1563 **************************/
1565 /* REACTION-FIELD ELECTROSTATICS */
1566 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1570 /* Calculate temporary vectorial force */
1571 tx = _mm256_mul_ps(fscal,dx21);
1572 ty = _mm256_mul_ps(fscal,dy21);
1573 tz = _mm256_mul_ps(fscal,dz21);
1575 /* Update vectorial force */
1576 fix2 = _mm256_add_ps(fix2,tx);
1577 fiy2 = _mm256_add_ps(fiy2,ty);
1578 fiz2 = _mm256_add_ps(fiz2,tz);
1580 fjx1 = _mm256_add_ps(fjx1,tx);
1581 fjy1 = _mm256_add_ps(fjy1,ty);
1582 fjz1 = _mm256_add_ps(fjz1,tz);
1584 /**************************
1585 * CALCULATE INTERACTIONS *
1586 **************************/
1588 /* REACTION-FIELD ELECTROSTATICS */
1589 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1593 /* Calculate temporary vectorial force */
1594 tx = _mm256_mul_ps(fscal,dx22);
1595 ty = _mm256_mul_ps(fscal,dy22);
1596 tz = _mm256_mul_ps(fscal,dz22);
1598 /* Update vectorial force */
1599 fix2 = _mm256_add_ps(fix2,tx);
1600 fiy2 = _mm256_add_ps(fiy2,ty);
1601 fiz2 = _mm256_add_ps(fiz2,tz);
1603 fjx2 = _mm256_add_ps(fjx2,tx);
1604 fjy2 = _mm256_add_ps(fjy2,ty);
1605 fjz2 = _mm256_add_ps(fjz2,tz);
1607 fjptrA = f+j_coord_offsetA;
1608 fjptrB = f+j_coord_offsetB;
1609 fjptrC = f+j_coord_offsetC;
1610 fjptrD = f+j_coord_offsetD;
1611 fjptrE = f+j_coord_offsetE;
1612 fjptrF = f+j_coord_offsetF;
1613 fjptrG = f+j_coord_offsetG;
1614 fjptrH = f+j_coord_offsetH;
1616 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1617 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1619 /* Inner loop uses 270 flops */
1622 if(jidx<j_index_end)
1625 /* Get j neighbor index, and coordinate index */
1626 jnrlistA = jjnr[jidx];
1627 jnrlistB = jjnr[jidx+1];
1628 jnrlistC = jjnr[jidx+2];
1629 jnrlistD = jjnr[jidx+3];
1630 jnrlistE = jjnr[jidx+4];
1631 jnrlistF = jjnr[jidx+5];
1632 jnrlistG = jjnr[jidx+6];
1633 jnrlistH = jjnr[jidx+7];
1634 /* Sign of each element will be negative for non-real atoms.
1635 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1636 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1638 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1639 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1641 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1642 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1643 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1644 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1645 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1646 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1647 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1648 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1649 j_coord_offsetA = DIM*jnrA;
1650 j_coord_offsetB = DIM*jnrB;
1651 j_coord_offsetC = DIM*jnrC;
1652 j_coord_offsetD = DIM*jnrD;
1653 j_coord_offsetE = DIM*jnrE;
1654 j_coord_offsetF = DIM*jnrF;
1655 j_coord_offsetG = DIM*jnrG;
1656 j_coord_offsetH = DIM*jnrH;
1658 /* load j atom coordinates */
1659 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1660 x+j_coord_offsetC,x+j_coord_offsetD,
1661 x+j_coord_offsetE,x+j_coord_offsetF,
1662 x+j_coord_offsetG,x+j_coord_offsetH,
1663 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1665 /* Calculate displacement vector */
1666 dx00 = _mm256_sub_ps(ix0,jx0);
1667 dy00 = _mm256_sub_ps(iy0,jy0);
1668 dz00 = _mm256_sub_ps(iz0,jz0);
1669 dx01 = _mm256_sub_ps(ix0,jx1);
1670 dy01 = _mm256_sub_ps(iy0,jy1);
1671 dz01 = _mm256_sub_ps(iz0,jz1);
1672 dx02 = _mm256_sub_ps(ix0,jx2);
1673 dy02 = _mm256_sub_ps(iy0,jy2);
1674 dz02 = _mm256_sub_ps(iz0,jz2);
1675 dx10 = _mm256_sub_ps(ix1,jx0);
1676 dy10 = _mm256_sub_ps(iy1,jy0);
1677 dz10 = _mm256_sub_ps(iz1,jz0);
1678 dx11 = _mm256_sub_ps(ix1,jx1);
1679 dy11 = _mm256_sub_ps(iy1,jy1);
1680 dz11 = _mm256_sub_ps(iz1,jz1);
1681 dx12 = _mm256_sub_ps(ix1,jx2);
1682 dy12 = _mm256_sub_ps(iy1,jy2);
1683 dz12 = _mm256_sub_ps(iz1,jz2);
1684 dx20 = _mm256_sub_ps(ix2,jx0);
1685 dy20 = _mm256_sub_ps(iy2,jy0);
1686 dz20 = _mm256_sub_ps(iz2,jz0);
1687 dx21 = _mm256_sub_ps(ix2,jx1);
1688 dy21 = _mm256_sub_ps(iy2,jy1);
1689 dz21 = _mm256_sub_ps(iz2,jz1);
1690 dx22 = _mm256_sub_ps(ix2,jx2);
1691 dy22 = _mm256_sub_ps(iy2,jy2);
1692 dz22 = _mm256_sub_ps(iz2,jz2);
1694 /* Calculate squared distance and things based on it */
1695 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1696 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1697 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1698 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1699 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1700 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1701 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1702 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1703 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1705 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1706 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1707 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1708 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1709 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1710 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1711 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1712 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1713 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1715 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1716 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1717 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1718 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1719 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1720 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1721 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1722 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1723 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1725 fjx0 = _mm256_setzero_ps();
1726 fjy0 = _mm256_setzero_ps();
1727 fjz0 = _mm256_setzero_ps();
1728 fjx1 = _mm256_setzero_ps();
1729 fjy1 = _mm256_setzero_ps();
1730 fjz1 = _mm256_setzero_ps();
1731 fjx2 = _mm256_setzero_ps();
1732 fjy2 = _mm256_setzero_ps();
1733 fjz2 = _mm256_setzero_ps();
1735 /**************************
1736 * CALCULATE INTERACTIONS *
1737 **************************/
1739 r00 = _mm256_mul_ps(rsq00,rinv00);
1740 r00 = _mm256_andnot_ps(dummy_mask,r00);
1742 /* Calculate table index by multiplying r with table scale and truncate to integer */
1743 rt = _mm256_mul_ps(r00,vftabscale);
1744 vfitab = _mm256_cvttps_epi32(rt);
1745 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1746 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1747 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1748 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1749 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
1750 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
1752 /* REACTION-FIELD ELECTROSTATICS */
1753 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1755 /* CUBIC SPLINE TABLE DISPERSION */
1756 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1757 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1758 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1759 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1760 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1761 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1762 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1763 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1764 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1765 Heps = _mm256_mul_ps(vfeps,H);
1766 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1767 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1768 fvdw6 = _mm256_mul_ps(c6_00,FF);
1770 /* CUBIC SPLINE TABLE REPULSION */
1771 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1772 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1773 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1774 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1775 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1776 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1777 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1778 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1779 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1780 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1781 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1782 Heps = _mm256_mul_ps(vfeps,H);
1783 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1784 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1785 fvdw12 = _mm256_mul_ps(c12_00,FF);
1786 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1788 fscal = _mm256_add_ps(felec,fvdw);
1790 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1792 /* Calculate temporary vectorial force */
1793 tx = _mm256_mul_ps(fscal,dx00);
1794 ty = _mm256_mul_ps(fscal,dy00);
1795 tz = _mm256_mul_ps(fscal,dz00);
1797 /* Update vectorial force */
1798 fix0 = _mm256_add_ps(fix0,tx);
1799 fiy0 = _mm256_add_ps(fiy0,ty);
1800 fiz0 = _mm256_add_ps(fiz0,tz);
1802 fjx0 = _mm256_add_ps(fjx0,tx);
1803 fjy0 = _mm256_add_ps(fjy0,ty);
1804 fjz0 = _mm256_add_ps(fjz0,tz);
1806 /**************************
1807 * CALCULATE INTERACTIONS *
1808 **************************/
1810 /* REACTION-FIELD ELECTROSTATICS */
1811 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1815 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1817 /* Calculate temporary vectorial force */
1818 tx = _mm256_mul_ps(fscal,dx01);
1819 ty = _mm256_mul_ps(fscal,dy01);
1820 tz = _mm256_mul_ps(fscal,dz01);
1822 /* Update vectorial force */
1823 fix0 = _mm256_add_ps(fix0,tx);
1824 fiy0 = _mm256_add_ps(fiy0,ty);
1825 fiz0 = _mm256_add_ps(fiz0,tz);
1827 fjx1 = _mm256_add_ps(fjx1,tx);
1828 fjy1 = _mm256_add_ps(fjy1,ty);
1829 fjz1 = _mm256_add_ps(fjz1,tz);
1831 /**************************
1832 * CALCULATE INTERACTIONS *
1833 **************************/
1835 /* REACTION-FIELD ELECTROSTATICS */
1836 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1840 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1842 /* Calculate temporary vectorial force */
1843 tx = _mm256_mul_ps(fscal,dx02);
1844 ty = _mm256_mul_ps(fscal,dy02);
1845 tz = _mm256_mul_ps(fscal,dz02);
1847 /* Update vectorial force */
1848 fix0 = _mm256_add_ps(fix0,tx);
1849 fiy0 = _mm256_add_ps(fiy0,ty);
1850 fiz0 = _mm256_add_ps(fiz0,tz);
1852 fjx2 = _mm256_add_ps(fjx2,tx);
1853 fjy2 = _mm256_add_ps(fjy2,ty);
1854 fjz2 = _mm256_add_ps(fjz2,tz);
1856 /**************************
1857 * CALCULATE INTERACTIONS *
1858 **************************/
1860 /* REACTION-FIELD ELECTROSTATICS */
1861 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1865 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1867 /* Calculate temporary vectorial force */
1868 tx = _mm256_mul_ps(fscal,dx10);
1869 ty = _mm256_mul_ps(fscal,dy10);
1870 tz = _mm256_mul_ps(fscal,dz10);
1872 /* Update vectorial force */
1873 fix1 = _mm256_add_ps(fix1,tx);
1874 fiy1 = _mm256_add_ps(fiy1,ty);
1875 fiz1 = _mm256_add_ps(fiz1,tz);
1877 fjx0 = _mm256_add_ps(fjx0,tx);
1878 fjy0 = _mm256_add_ps(fjy0,ty);
1879 fjz0 = _mm256_add_ps(fjz0,tz);
1881 /**************************
1882 * CALCULATE INTERACTIONS *
1883 **************************/
1885 /* REACTION-FIELD ELECTROSTATICS */
1886 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1890 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1892 /* Calculate temporary vectorial force */
1893 tx = _mm256_mul_ps(fscal,dx11);
1894 ty = _mm256_mul_ps(fscal,dy11);
1895 tz = _mm256_mul_ps(fscal,dz11);
1897 /* Update vectorial force */
1898 fix1 = _mm256_add_ps(fix1,tx);
1899 fiy1 = _mm256_add_ps(fiy1,ty);
1900 fiz1 = _mm256_add_ps(fiz1,tz);
1902 fjx1 = _mm256_add_ps(fjx1,tx);
1903 fjy1 = _mm256_add_ps(fjy1,ty);
1904 fjz1 = _mm256_add_ps(fjz1,tz);
1906 /**************************
1907 * CALCULATE INTERACTIONS *
1908 **************************/
1910 /* REACTION-FIELD ELECTROSTATICS */
1911 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1915 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1917 /* Calculate temporary vectorial force */
1918 tx = _mm256_mul_ps(fscal,dx12);
1919 ty = _mm256_mul_ps(fscal,dy12);
1920 tz = _mm256_mul_ps(fscal,dz12);
1922 /* Update vectorial force */
1923 fix1 = _mm256_add_ps(fix1,tx);
1924 fiy1 = _mm256_add_ps(fiy1,ty);
1925 fiz1 = _mm256_add_ps(fiz1,tz);
1927 fjx2 = _mm256_add_ps(fjx2,tx);
1928 fjy2 = _mm256_add_ps(fjy2,ty);
1929 fjz2 = _mm256_add_ps(fjz2,tz);
1931 /**************************
1932 * CALCULATE INTERACTIONS *
1933 **************************/
1935 /* REACTION-FIELD ELECTROSTATICS */
1936 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1940 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1942 /* Calculate temporary vectorial force */
1943 tx = _mm256_mul_ps(fscal,dx20);
1944 ty = _mm256_mul_ps(fscal,dy20);
1945 tz = _mm256_mul_ps(fscal,dz20);
1947 /* Update vectorial force */
1948 fix2 = _mm256_add_ps(fix2,tx);
1949 fiy2 = _mm256_add_ps(fiy2,ty);
1950 fiz2 = _mm256_add_ps(fiz2,tz);
1952 fjx0 = _mm256_add_ps(fjx0,tx);
1953 fjy0 = _mm256_add_ps(fjy0,ty);
1954 fjz0 = _mm256_add_ps(fjz0,tz);
1956 /**************************
1957 * CALCULATE INTERACTIONS *
1958 **************************/
1960 /* REACTION-FIELD ELECTROSTATICS */
1961 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1965 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1967 /* Calculate temporary vectorial force */
1968 tx = _mm256_mul_ps(fscal,dx21);
1969 ty = _mm256_mul_ps(fscal,dy21);
1970 tz = _mm256_mul_ps(fscal,dz21);
1972 /* Update vectorial force */
1973 fix2 = _mm256_add_ps(fix2,tx);
1974 fiy2 = _mm256_add_ps(fiy2,ty);
1975 fiz2 = _mm256_add_ps(fiz2,tz);
1977 fjx1 = _mm256_add_ps(fjx1,tx);
1978 fjy1 = _mm256_add_ps(fjy1,ty);
1979 fjz1 = _mm256_add_ps(fjz1,tz);
1981 /**************************
1982 * CALCULATE INTERACTIONS *
1983 **************************/
1985 /* REACTION-FIELD ELECTROSTATICS */
1986 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1990 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1992 /* Calculate temporary vectorial force */
1993 tx = _mm256_mul_ps(fscal,dx22);
1994 ty = _mm256_mul_ps(fscal,dy22);
1995 tz = _mm256_mul_ps(fscal,dz22);
1997 /* Update vectorial force */
1998 fix2 = _mm256_add_ps(fix2,tx);
1999 fiy2 = _mm256_add_ps(fiy2,ty);
2000 fiz2 = _mm256_add_ps(fiz2,tz);
2002 fjx2 = _mm256_add_ps(fjx2,tx);
2003 fjy2 = _mm256_add_ps(fjy2,ty);
2004 fjz2 = _mm256_add_ps(fjz2,tz);
2006 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2007 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2008 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2009 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2010 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2011 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2012 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2013 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2015 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2016 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2018 /* Inner loop uses 271 flops */
2021 /* End of innermost loop */
2023 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2024 f+i_coord_offset,fshift+i_shift_offset);
2026 /* Increment number of inner iterations */
2027 inneriter += j_index_end - j_index_start;
2029 /* Outer loop uses 18 flops */
2032 /* Increment number of outer iterations */
2035 /* Update outer/inner flops */
2037 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*271);