2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_256_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
/*
 * Water4-Water4 kernel body: reaction-field electrostatics plus Lennard-Jones,
 * evaluating both potentials and forces with 256-bit AVX in double precision.
 *
 * Pair terms evaluated for every i/j water pair:
 *   - site 0 with site 0: Lennard-Jones only (c6_00, c12_00)
 *   - sites 1..3 with sites 1..3: reaction field only (qq11 .. qq33)
 * Four j waters are handled at a time, one per SIMD lane (suffixes A-D).
 *
 * Charges and LJ parameters are read once, from the water at nlist->iinr[0],
 * and those constants are reused for every i and j water in the list.
 *
 * Inputs read here:
 *   nlist        jindex, shift and iinr arrays of the neighbour list
 *   fr           shift vectors, shift forces, epsfac, the reaction-field
 *                constants ic->k_rf / ic->c_rf, and ntype
 *   mdatoms      chargeA and typeA
 * Outputs produced here:
 *   j forces are handed to gmx_mm256_decrement_4rvec_4ptr_swizzle_pd (on f),
 *   i forces and shift forces to gmx_mm256_update_iforce_4atom_swizzle_pd
 *   (on f and fshift), the potential sums to gmx_mm256_update_1pot_pd
 *   (on kernel_data->energygrp_elec+ggid and energygrp_vdw+ggid), and the
 *   flop estimate to inc_nrnb (on nrnb).
 * NOTE(review): the statements that bind x/f to xx/ff and that set nri, ggid,
 * jjnr and the LJ parameter table are not visible in this excerpt - confirm
 * them against the full file.
 */
44 nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
81 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
83 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
85 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
86 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
87 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
88 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
89 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
90 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
91 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
92 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
93 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
94 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
95 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
96 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
103 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
104 __m256d dummy_mask,cutoff_mask;
105 __m128 tmpmask0,tmpmask1;
106 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
107 __m256d one = _mm256_set1_pd(1.0);
108 __m256d two = _mm256_set1_pd(2.0);
114 jindex = nlist->jindex;
116 shiftidx = nlist->shift;
118 shiftvec = fr->shift_vec[0];
119 fshift = fr->fshift[0];
120 facel = _mm256_set1_pd(fr->epsfac);
121 charge = mdatoms->chargeA;
122 krf = _mm256_set1_pd(fr->ic->k_rf);
123 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
124 crf = _mm256_set1_pd(fr->ic->c_rf);
125 nvdwtype = fr->ntype;
127 vdwtype = mdatoms->typeA;
129 /* Setup water-specific parameters */
130 inr = nlist->iinr[0];
131 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
132 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
133 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
134 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
136 jq1 = _mm256_set1_pd(charge[inr+1]);
137 jq2 = _mm256_set1_pd(charge[inr+2]);
138 jq3 = _mm256_set1_pd(charge[inr+3]);
139 vdwjidx0A = 2*vdwtype[inr+0];
140 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
141 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
142 qq11 = _mm256_mul_pd(iq1,jq1);
143 qq12 = _mm256_mul_pd(iq1,jq2);
144 qq13 = _mm256_mul_pd(iq1,jq3);
145 qq21 = _mm256_mul_pd(iq2,jq1);
146 qq22 = _mm256_mul_pd(iq2,jq2);
147 qq23 = _mm256_mul_pd(iq2,jq3);
148 qq31 = _mm256_mul_pd(iq3,jq1);
149 qq32 = _mm256_mul_pd(iq3,jq2);
150 qq33 = _mm256_mul_pd(iq3,jq3);
152 /* Avoid stupid compiler warnings */
153 jnrA = jnrB = jnrC = jnrD = 0;
162 for(iidx=0;iidx<4*DIM;iidx++)
167 /* Start outer loop over neighborlists */
168 for(iidx=0; iidx<nri; iidx++)
170 /* Load shift vector for this list */
171 i_shift_offset = DIM*shiftidx[iidx];
173 /* Load limits for loop over neighbors */
174 j_index_start = jindex[iidx];
175 j_index_end = jindex[iidx+1];
177 /* Get outer coordinate index */
179 i_coord_offset = DIM*inr;
181 /* Load i particle coords and add shift vector */
182 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
183 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
185 fix0 = _mm256_setzero_pd();
186 fiy0 = _mm256_setzero_pd();
187 fiz0 = _mm256_setzero_pd();
188 fix1 = _mm256_setzero_pd();
189 fiy1 = _mm256_setzero_pd();
190 fiz1 = _mm256_setzero_pd();
191 fix2 = _mm256_setzero_pd();
192 fiy2 = _mm256_setzero_pd();
193 fiz2 = _mm256_setzero_pd();
194 fix3 = _mm256_setzero_pd();
195 fiy3 = _mm256_setzero_pd();
196 fiz3 = _mm256_setzero_pd();
198 /* Reset potential sums */
199 velecsum = _mm256_setzero_pd();
200 vvdwsum = _mm256_setzero_pd();
202 /* Start inner kernel loop */
/* This loop advances four list entries at a time and only continues while the
 * fourth entry of the current group (jjnr[jidx+3]) is non-negative, so no
 * masking is needed in its body.
 */
203 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
206 /* Get j neighbor index, and coordinate index */
211 j_coord_offsetA = DIM*jnrA;
212 j_coord_offsetB = DIM*jnrB;
213 j_coord_offsetC = DIM*jnrC;
214 j_coord_offsetD = DIM*jnrD;
216 /* load j atom coordinates */
217 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
218 x+j_coord_offsetC,x+j_coord_offsetD,
219 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
220 &jy2,&jz2,&jx3,&jy3,&jz3);
222 /* Calculate displacement vector */
223 dx00 = _mm256_sub_pd(ix0,jx0);
224 dy00 = _mm256_sub_pd(iy0,jy0);
225 dz00 = _mm256_sub_pd(iz0,jz0);
226 dx11 = _mm256_sub_pd(ix1,jx1);
227 dy11 = _mm256_sub_pd(iy1,jy1);
228 dz11 = _mm256_sub_pd(iz1,jz1);
229 dx12 = _mm256_sub_pd(ix1,jx2);
230 dy12 = _mm256_sub_pd(iy1,jy2);
231 dz12 = _mm256_sub_pd(iz1,jz2);
232 dx13 = _mm256_sub_pd(ix1,jx3);
233 dy13 = _mm256_sub_pd(iy1,jy3);
234 dz13 = _mm256_sub_pd(iz1,jz3);
235 dx21 = _mm256_sub_pd(ix2,jx1);
236 dy21 = _mm256_sub_pd(iy2,jy1);
237 dz21 = _mm256_sub_pd(iz2,jz1);
238 dx22 = _mm256_sub_pd(ix2,jx2);
239 dy22 = _mm256_sub_pd(iy2,jy2);
240 dz22 = _mm256_sub_pd(iz2,jz2);
241 dx23 = _mm256_sub_pd(ix2,jx3);
242 dy23 = _mm256_sub_pd(iy2,jy3);
243 dz23 = _mm256_sub_pd(iz2,jz3);
244 dx31 = _mm256_sub_pd(ix3,jx1);
245 dy31 = _mm256_sub_pd(iy3,jy1);
246 dz31 = _mm256_sub_pd(iz3,jz1);
247 dx32 = _mm256_sub_pd(ix3,jx2);
248 dy32 = _mm256_sub_pd(iy3,jy2);
249 dz32 = _mm256_sub_pd(iz3,jz2);
250 dx33 = _mm256_sub_pd(ix3,jx3);
251 dy33 = _mm256_sub_pd(iy3,jy3);
252 dz33 = _mm256_sub_pd(iz3,jz3);
254 /* Calculate squared distance and things based on it */
255 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
256 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
257 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
258 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
259 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
260 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
261 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
262 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
263 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
264 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
266 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
267 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
268 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
269 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
270 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
271 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
272 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
273 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
274 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
276 rinvsq00 = gmx_mm256_inv_pd(rsq00);
277 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
278 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
279 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
280 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
281 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
282 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
283 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
284 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
285 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
287 fjx0 = _mm256_setzero_pd();
288 fjy0 = _mm256_setzero_pd();
289 fjz0 = _mm256_setzero_pd();
290 fjx1 = _mm256_setzero_pd();
291 fjy1 = _mm256_setzero_pd();
292 fjz1 = _mm256_setzero_pd();
293 fjx2 = _mm256_setzero_pd();
294 fjy2 = _mm256_setzero_pd();
295 fjz2 = _mm256_setzero_pd();
296 fjx3 = _mm256_setzero_pd();
297 fjy3 = _mm256_setzero_pd();
298 fjz3 = _mm256_setzero_pd();
300 /**************************
301 * CALCULATE INTERACTIONS *
302 **************************/
304 /* LENNARD-JONES DISPERSION/REPULSION */
/* rinvsix = rinvsq00^3; vvdw = vvdw12/12 - vvdw6/6; fvdw = (vvdw12 - vvdw6)*rinvsq00 */
306 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
307 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
308 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
309 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
310 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
312 /* Update potential sum for this i atom from the interaction with this j atom. */
313 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
317 /* Calculate temporary vectorial force */
318 tx = _mm256_mul_pd(fscal,dx00);
319 ty = _mm256_mul_pd(fscal,dy00);
320 tz = _mm256_mul_pd(fscal,dz00);
322 /* Update vectorial force */
323 fix0 = _mm256_add_pd(fix0,tx);
324 fiy0 = _mm256_add_pd(fiy0,ty);
325 fiz0 = _mm256_add_pd(fiz0,tz);
327 fjx0 = _mm256_add_pd(fjx0,tx);
328 fjy0 = _mm256_add_pd(fjy0,ty);
329 fjz0 = _mm256_add_pd(fjz0,tz);
331 /**************************
332 * CALCULATE INTERACTIONS *
333 **************************/
335 /* REACTION-FIELD ELECTROSTATICS */
/* velec = qq*(rinv + krf*rsq - crf); felec = qq*(rinv*rinvsq - krf2), where krf2 is 2*k_rf.
 * The same two expressions are repeated for each of the nine site pairs below.
 */
336 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
337 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
339 /* Update potential sum for this i atom from the interaction with this j atom. */
340 velecsum = _mm256_add_pd(velecsum,velec);
344 /* Calculate temporary vectorial force */
345 tx = _mm256_mul_pd(fscal,dx11);
346 ty = _mm256_mul_pd(fscal,dy11);
347 tz = _mm256_mul_pd(fscal,dz11);
349 /* Update vectorial force */
350 fix1 = _mm256_add_pd(fix1,tx);
351 fiy1 = _mm256_add_pd(fiy1,ty);
352 fiz1 = _mm256_add_pd(fiz1,tz);
354 fjx1 = _mm256_add_pd(fjx1,tx);
355 fjy1 = _mm256_add_pd(fjy1,ty);
356 fjz1 = _mm256_add_pd(fjz1,tz);
358 /**************************
359 * CALCULATE INTERACTIONS *
360 **************************/
362 /* REACTION-FIELD ELECTROSTATICS */
363 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
364 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
366 /* Update potential sum for this i atom from the interaction with this j atom. */
367 velecsum = _mm256_add_pd(velecsum,velec);
371 /* Calculate temporary vectorial force */
372 tx = _mm256_mul_pd(fscal,dx12);
373 ty = _mm256_mul_pd(fscal,dy12);
374 tz = _mm256_mul_pd(fscal,dz12);
376 /* Update vectorial force */
377 fix1 = _mm256_add_pd(fix1,tx);
378 fiy1 = _mm256_add_pd(fiy1,ty);
379 fiz1 = _mm256_add_pd(fiz1,tz);
381 fjx2 = _mm256_add_pd(fjx2,tx);
382 fjy2 = _mm256_add_pd(fjy2,ty);
383 fjz2 = _mm256_add_pd(fjz2,tz);
385 /**************************
386 * CALCULATE INTERACTIONS *
387 **************************/
389 /* REACTION-FIELD ELECTROSTATICS */
390 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
391 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velecsum = _mm256_add_pd(velecsum,velec);
398 /* Calculate temporary vectorial force */
399 tx = _mm256_mul_pd(fscal,dx13);
400 ty = _mm256_mul_pd(fscal,dy13);
401 tz = _mm256_mul_pd(fscal,dz13);
403 /* Update vectorial force */
404 fix1 = _mm256_add_pd(fix1,tx);
405 fiy1 = _mm256_add_pd(fiy1,ty);
406 fiz1 = _mm256_add_pd(fiz1,tz);
408 fjx3 = _mm256_add_pd(fjx3,tx);
409 fjy3 = _mm256_add_pd(fjy3,ty);
410 fjz3 = _mm256_add_pd(fjz3,tz);
412 /**************************
413 * CALCULATE INTERACTIONS *
414 **************************/
416 /* REACTION-FIELD ELECTROSTATICS */
417 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
418 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
420 /* Update potential sum for this i atom from the interaction with this j atom. */
421 velecsum = _mm256_add_pd(velecsum,velec);
425 /* Calculate temporary vectorial force */
426 tx = _mm256_mul_pd(fscal,dx21);
427 ty = _mm256_mul_pd(fscal,dy21);
428 tz = _mm256_mul_pd(fscal,dz21);
430 /* Update vectorial force */
431 fix2 = _mm256_add_pd(fix2,tx);
432 fiy2 = _mm256_add_pd(fiy2,ty);
433 fiz2 = _mm256_add_pd(fiz2,tz);
435 fjx1 = _mm256_add_pd(fjx1,tx);
436 fjy1 = _mm256_add_pd(fjy1,ty);
437 fjz1 = _mm256_add_pd(fjz1,tz);
439 /**************************
440 * CALCULATE INTERACTIONS *
441 **************************/
443 /* REACTION-FIELD ELECTROSTATICS */
444 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
445 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
447 /* Update potential sum for this i atom from the interaction with this j atom. */
448 velecsum = _mm256_add_pd(velecsum,velec);
452 /* Calculate temporary vectorial force */
453 tx = _mm256_mul_pd(fscal,dx22);
454 ty = _mm256_mul_pd(fscal,dy22);
455 tz = _mm256_mul_pd(fscal,dz22);
457 /* Update vectorial force */
458 fix2 = _mm256_add_pd(fix2,tx);
459 fiy2 = _mm256_add_pd(fiy2,ty);
460 fiz2 = _mm256_add_pd(fiz2,tz);
462 fjx2 = _mm256_add_pd(fjx2,tx);
463 fjy2 = _mm256_add_pd(fjy2,ty);
464 fjz2 = _mm256_add_pd(fjz2,tz);
466 /**************************
467 * CALCULATE INTERACTIONS *
468 **************************/
470 /* REACTION-FIELD ELECTROSTATICS */
471 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
472 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
474 /* Update potential sum for this i atom from the interaction with this j atom. */
475 velecsum = _mm256_add_pd(velecsum,velec);
479 /* Calculate temporary vectorial force */
480 tx = _mm256_mul_pd(fscal,dx23);
481 ty = _mm256_mul_pd(fscal,dy23);
482 tz = _mm256_mul_pd(fscal,dz23);
484 /* Update vectorial force */
485 fix2 = _mm256_add_pd(fix2,tx);
486 fiy2 = _mm256_add_pd(fiy2,ty);
487 fiz2 = _mm256_add_pd(fiz2,tz);
489 fjx3 = _mm256_add_pd(fjx3,tx);
490 fjy3 = _mm256_add_pd(fjy3,ty);
491 fjz3 = _mm256_add_pd(fjz3,tz);
493 /**************************
494 * CALCULATE INTERACTIONS *
495 **************************/
497 /* REACTION-FIELD ELECTROSTATICS */
498 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
499 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
501 /* Update potential sum for this i atom from the interaction with this j atom. */
502 velecsum = _mm256_add_pd(velecsum,velec);
506 /* Calculate temporary vectorial force */
507 tx = _mm256_mul_pd(fscal,dx31);
508 ty = _mm256_mul_pd(fscal,dy31);
509 tz = _mm256_mul_pd(fscal,dz31);
511 /* Update vectorial force */
512 fix3 = _mm256_add_pd(fix3,tx);
513 fiy3 = _mm256_add_pd(fiy3,ty);
514 fiz3 = _mm256_add_pd(fiz3,tz);
516 fjx1 = _mm256_add_pd(fjx1,tx);
517 fjy1 = _mm256_add_pd(fjy1,ty);
518 fjz1 = _mm256_add_pd(fjz1,tz);
520 /**************************
521 * CALCULATE INTERACTIONS *
522 **************************/
524 /* REACTION-FIELD ELECTROSTATICS */
525 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
526 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
528 /* Update potential sum for this i atom from the interaction with this j atom. */
529 velecsum = _mm256_add_pd(velecsum,velec);
533 /* Calculate temporary vectorial force */
534 tx = _mm256_mul_pd(fscal,dx32);
535 ty = _mm256_mul_pd(fscal,dy32);
536 tz = _mm256_mul_pd(fscal,dz32);
538 /* Update vectorial force */
539 fix3 = _mm256_add_pd(fix3,tx);
540 fiy3 = _mm256_add_pd(fiy3,ty);
541 fiz3 = _mm256_add_pd(fiz3,tz);
543 fjx2 = _mm256_add_pd(fjx2,tx);
544 fjy2 = _mm256_add_pd(fjy2,ty);
545 fjz2 = _mm256_add_pd(fjz2,tz);
547 /**************************
548 * CALCULATE INTERACTIONS *
549 **************************/
551 /* REACTION-FIELD ELECTROSTATICS */
552 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
553 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
555 /* Update potential sum for this i atom from the interaction with this j atom. */
556 velecsum = _mm256_add_pd(velecsum,velec);
560 /* Calculate temporary vectorial force */
561 tx = _mm256_mul_pd(fscal,dx33);
562 ty = _mm256_mul_pd(fscal,dy33);
563 tz = _mm256_mul_pd(fscal,dz33);
565 /* Update vectorial force */
566 fix3 = _mm256_add_pd(fix3,tx);
567 fiy3 = _mm256_add_pd(fiy3,ty);
568 fiz3 = _mm256_add_pd(fiz3,tz);
570 fjx3 = _mm256_add_pd(fjx3,tx);
571 fjy3 = _mm256_add_pd(fjy3,ty);
572 fjz3 = _mm256_add_pd(fjz3,tz);
574 fjptrA = f+j_coord_offsetA;
575 fjptrB = f+j_coord_offsetB;
576 fjptrC = f+j_coord_offsetC;
577 fjptrD = f+j_coord_offsetD;
579 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
580 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
581 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
583 /* Inner loop uses 323 flops */
589 /* Get j neighbor index, and coordinate index */
590 jnrlistA = jjnr[jidx];
591 jnrlistB = jjnr[jidx+1];
592 jnrlistC = jjnr[jidx+2];
593 jnrlistD = jjnr[jidx+3];
594 /* Sign of each element will be negative for non-real atoms.
595 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
596 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
 * The integer compare result is reinterpreted as __m128 (single-precision
 * vector type) because tmpmask0/tmpmask1 are declared __m128 and are fed to
 * _mm_permute_ps; the cast changes only the type, not the bits.
 * NOTE(review): the (3,3,2,2)/(1,1,0,0) permutes appear to duplicate each
 * 32-bit mask so that it fills one 64-bit lane of dummy_mask - confirm
 * against the _GMX_MM_PERMUTE definition.
598 tmpmask0 = _mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
600 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
601 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
602 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
604 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
605 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
606 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
607 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
608 j_coord_offsetA = DIM*jnrA;
609 j_coord_offsetB = DIM*jnrB;
610 j_coord_offsetC = DIM*jnrC;
611 j_coord_offsetD = DIM*jnrD;
613 /* load j atom coordinates */
614 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
615 x+j_coord_offsetC,x+j_coord_offsetD,
616 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
617 &jy2,&jz2,&jx3,&jy3,&jz3);
619 /* Calculate displacement vector */
620 dx00 = _mm256_sub_pd(ix0,jx0);
621 dy00 = _mm256_sub_pd(iy0,jy0);
622 dz00 = _mm256_sub_pd(iz0,jz0);
623 dx11 = _mm256_sub_pd(ix1,jx1);
624 dy11 = _mm256_sub_pd(iy1,jy1);
625 dz11 = _mm256_sub_pd(iz1,jz1);
626 dx12 = _mm256_sub_pd(ix1,jx2);
627 dy12 = _mm256_sub_pd(iy1,jy2);
628 dz12 = _mm256_sub_pd(iz1,jz2);
629 dx13 = _mm256_sub_pd(ix1,jx3);
630 dy13 = _mm256_sub_pd(iy1,jy3);
631 dz13 = _mm256_sub_pd(iz1,jz3);
632 dx21 = _mm256_sub_pd(ix2,jx1);
633 dy21 = _mm256_sub_pd(iy2,jy1);
634 dz21 = _mm256_sub_pd(iz2,jz1);
635 dx22 = _mm256_sub_pd(ix2,jx2);
636 dy22 = _mm256_sub_pd(iy2,jy2);
637 dz22 = _mm256_sub_pd(iz2,jz2);
638 dx23 = _mm256_sub_pd(ix2,jx3);
639 dy23 = _mm256_sub_pd(iy2,jy3);
640 dz23 = _mm256_sub_pd(iz2,jz3);
641 dx31 = _mm256_sub_pd(ix3,jx1);
642 dy31 = _mm256_sub_pd(iy3,jy1);
643 dz31 = _mm256_sub_pd(iz3,jz1);
644 dx32 = _mm256_sub_pd(ix3,jx2);
645 dy32 = _mm256_sub_pd(iy3,jy2);
646 dz32 = _mm256_sub_pd(iz3,jz2);
647 dx33 = _mm256_sub_pd(ix3,jx3);
648 dy33 = _mm256_sub_pd(iy3,jy3);
649 dz33 = _mm256_sub_pd(iz3,jz3);
651 /* Calculate squared distance and things based on it */
652 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
653 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
654 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
655 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
656 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
657 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
658 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
659 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
660 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
661 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
663 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
664 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
665 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
666 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
667 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
668 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
669 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
670 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
671 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
673 rinvsq00 = gmx_mm256_inv_pd(rsq00);
674 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
675 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
676 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
677 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
678 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
679 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
680 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
681 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
682 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
684 fjx0 = _mm256_setzero_pd();
685 fjy0 = _mm256_setzero_pd();
686 fjz0 = _mm256_setzero_pd();
687 fjx1 = _mm256_setzero_pd();
688 fjy1 = _mm256_setzero_pd();
689 fjz1 = _mm256_setzero_pd();
690 fjx2 = _mm256_setzero_pd();
691 fjy2 = _mm256_setzero_pd();
692 fjz2 = _mm256_setzero_pd();
693 fjx3 = _mm256_setzero_pd();
694 fjy3 = _mm256_setzero_pd();
695 fjz3 = _mm256_setzero_pd();
697 /**************************
698 * CALCULATE INTERACTIONS *
699 **************************/
701 /* LENNARD-JONES DISPERSION/REPULSION */
703 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
704 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
705 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
706 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
707 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
709 /* Update potential sum for this i atom from the interaction with this j atom. */
710 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
711 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
715 fscal = _mm256_andnot_pd(dummy_mask,fscal);
717 /* Calculate temporary vectorial force */
718 tx = _mm256_mul_pd(fscal,dx00);
719 ty = _mm256_mul_pd(fscal,dy00);
720 tz = _mm256_mul_pd(fscal,dz00);
722 /* Update vectorial force */
723 fix0 = _mm256_add_pd(fix0,tx);
724 fiy0 = _mm256_add_pd(fiy0,ty);
725 fiz0 = _mm256_add_pd(fiz0,tz);
727 fjx0 = _mm256_add_pd(fjx0,tx);
728 fjy0 = _mm256_add_pd(fjy0,ty);
729 fjz0 = _mm256_add_pd(fjz0,tz);
731 /**************************
732 * CALCULATE INTERACTIONS *
733 **************************/
735 /* REACTION-FIELD ELECTROSTATICS */
736 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
737 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
739 /* Update potential sum for this i atom from the interaction with this j atom. */
740 velec = _mm256_andnot_pd(dummy_mask,velec);
741 velecsum = _mm256_add_pd(velecsum,velec);
745 fscal = _mm256_andnot_pd(dummy_mask,fscal);
747 /* Calculate temporary vectorial force */
748 tx = _mm256_mul_pd(fscal,dx11);
749 ty = _mm256_mul_pd(fscal,dy11);
750 tz = _mm256_mul_pd(fscal,dz11);
752 /* Update vectorial force */
753 fix1 = _mm256_add_pd(fix1,tx);
754 fiy1 = _mm256_add_pd(fiy1,ty);
755 fiz1 = _mm256_add_pd(fiz1,tz);
757 fjx1 = _mm256_add_pd(fjx1,tx);
758 fjy1 = _mm256_add_pd(fjy1,ty);
759 fjz1 = _mm256_add_pd(fjz1,tz);
761 /**************************
762 * CALCULATE INTERACTIONS *
763 **************************/
765 /* REACTION-FIELD ELECTROSTATICS */
766 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
767 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
769 /* Update potential sum for this i atom from the interaction with this j atom. */
770 velec = _mm256_andnot_pd(dummy_mask,velec);
771 velecsum = _mm256_add_pd(velecsum,velec);
775 fscal = _mm256_andnot_pd(dummy_mask,fscal);
777 /* Calculate temporary vectorial force */
778 tx = _mm256_mul_pd(fscal,dx12);
779 ty = _mm256_mul_pd(fscal,dy12);
780 tz = _mm256_mul_pd(fscal,dz12);
782 /* Update vectorial force */
783 fix1 = _mm256_add_pd(fix1,tx);
784 fiy1 = _mm256_add_pd(fiy1,ty);
785 fiz1 = _mm256_add_pd(fiz1,tz);
787 fjx2 = _mm256_add_pd(fjx2,tx);
788 fjy2 = _mm256_add_pd(fjy2,ty);
789 fjz2 = _mm256_add_pd(fjz2,tz);
791 /**************************
792 * CALCULATE INTERACTIONS *
793 **************************/
795 /* REACTION-FIELD ELECTROSTATICS */
796 velec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_add_pd(rinv13,_mm256_mul_pd(krf,rsq13)),crf));
797 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
799 /* Update potential sum for this i atom from the interaction with this j atom. */
800 velec = _mm256_andnot_pd(dummy_mask,velec);
801 velecsum = _mm256_add_pd(velecsum,velec);
805 fscal = _mm256_andnot_pd(dummy_mask,fscal);
807 /* Calculate temporary vectorial force */
808 tx = _mm256_mul_pd(fscal,dx13);
809 ty = _mm256_mul_pd(fscal,dy13);
810 tz = _mm256_mul_pd(fscal,dz13);
812 /* Update vectorial force */
813 fix1 = _mm256_add_pd(fix1,tx);
814 fiy1 = _mm256_add_pd(fiy1,ty);
815 fiz1 = _mm256_add_pd(fiz1,tz);
817 fjx3 = _mm256_add_pd(fjx3,tx);
818 fjy3 = _mm256_add_pd(fjy3,ty);
819 fjz3 = _mm256_add_pd(fjz3,tz);
821 /**************************
822 * CALCULATE INTERACTIONS *
823 **************************/
825 /* REACTION-FIELD ELECTROSTATICS */
826 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
827 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
829 /* Update potential sum for this i atom from the interaction with this j atom. */
830 velec = _mm256_andnot_pd(dummy_mask,velec);
831 velecsum = _mm256_add_pd(velecsum,velec);
835 fscal = _mm256_andnot_pd(dummy_mask,fscal);
837 /* Calculate temporary vectorial force */
838 tx = _mm256_mul_pd(fscal,dx21);
839 ty = _mm256_mul_pd(fscal,dy21);
840 tz = _mm256_mul_pd(fscal,dz21);
842 /* Update vectorial force */
843 fix2 = _mm256_add_pd(fix2,tx);
844 fiy2 = _mm256_add_pd(fiy2,ty);
845 fiz2 = _mm256_add_pd(fiz2,tz);
847 fjx1 = _mm256_add_pd(fjx1,tx);
848 fjy1 = _mm256_add_pd(fjy1,ty);
849 fjz1 = _mm256_add_pd(fjz1,tz);
851 /**************************
852 * CALCULATE INTERACTIONS *
853 **************************/
855 /* REACTION-FIELD ELECTROSTATICS */
856 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
857 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
859 /* Update potential sum for this i atom from the interaction with this j atom. */
860 velec = _mm256_andnot_pd(dummy_mask,velec);
861 velecsum = _mm256_add_pd(velecsum,velec);
865 fscal = _mm256_andnot_pd(dummy_mask,fscal);
867 /* Calculate temporary vectorial force */
868 tx = _mm256_mul_pd(fscal,dx22);
869 ty = _mm256_mul_pd(fscal,dy22);
870 tz = _mm256_mul_pd(fscal,dz22);
872 /* Update vectorial force */
873 fix2 = _mm256_add_pd(fix2,tx);
874 fiy2 = _mm256_add_pd(fiy2,ty);
875 fiz2 = _mm256_add_pd(fiz2,tz);
877 fjx2 = _mm256_add_pd(fjx2,tx);
878 fjy2 = _mm256_add_pd(fjy2,ty);
879 fjz2 = _mm256_add_pd(fjz2,tz);
881 /**************************
882 * CALCULATE INTERACTIONS *
883 **************************/
885 /* REACTION-FIELD ELECTROSTATICS */
886 velec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_add_pd(rinv23,_mm256_mul_pd(krf,rsq23)),crf));
887 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
889 /* Update potential sum for this i atom from the interaction with this j atom. */
890 velec = _mm256_andnot_pd(dummy_mask,velec);
891 velecsum = _mm256_add_pd(velecsum,velec);
895 fscal = _mm256_andnot_pd(dummy_mask,fscal);
897 /* Calculate temporary vectorial force */
898 tx = _mm256_mul_pd(fscal,dx23);
899 ty = _mm256_mul_pd(fscal,dy23);
900 tz = _mm256_mul_pd(fscal,dz23);
902 /* Update vectorial force */
903 fix2 = _mm256_add_pd(fix2,tx);
904 fiy2 = _mm256_add_pd(fiy2,ty);
905 fiz2 = _mm256_add_pd(fiz2,tz);
907 fjx3 = _mm256_add_pd(fjx3,tx);
908 fjy3 = _mm256_add_pd(fjy3,ty);
909 fjz3 = _mm256_add_pd(fjz3,tz);
911 /**************************
912 * CALCULATE INTERACTIONS *
913 **************************/
915 /* REACTION-FIELD ELECTROSTATICS */
916 velec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_add_pd(rinv31,_mm256_mul_pd(krf,rsq31)),crf));
917 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
919 /* Update potential sum for this i atom from the interaction with this j atom. */
920 velec = _mm256_andnot_pd(dummy_mask,velec);
921 velecsum = _mm256_add_pd(velecsum,velec);
925 fscal = _mm256_andnot_pd(dummy_mask,fscal);
927 /* Calculate temporary vectorial force */
928 tx = _mm256_mul_pd(fscal,dx31);
929 ty = _mm256_mul_pd(fscal,dy31);
930 tz = _mm256_mul_pd(fscal,dz31);
932 /* Update vectorial force */
933 fix3 = _mm256_add_pd(fix3,tx);
934 fiy3 = _mm256_add_pd(fiy3,ty);
935 fiz3 = _mm256_add_pd(fiz3,tz);
937 fjx1 = _mm256_add_pd(fjx1,tx);
938 fjy1 = _mm256_add_pd(fjy1,ty);
939 fjz1 = _mm256_add_pd(fjz1,tz);
941 /**************************
942 * CALCULATE INTERACTIONS *
943 **************************/
945 /* REACTION-FIELD ELECTROSTATICS */
946 velec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_add_pd(rinv32,_mm256_mul_pd(krf,rsq32)),crf));
947 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
949 /* Update potential sum for this i atom from the interaction with this j atom. */
950 velec = _mm256_andnot_pd(dummy_mask,velec);
951 velecsum = _mm256_add_pd(velecsum,velec);
955 fscal = _mm256_andnot_pd(dummy_mask,fscal);
957 /* Calculate temporary vectorial force */
958 tx = _mm256_mul_pd(fscal,dx32);
959 ty = _mm256_mul_pd(fscal,dy32);
960 tz = _mm256_mul_pd(fscal,dz32);
962 /* Update vectorial force */
963 fix3 = _mm256_add_pd(fix3,tx);
964 fiy3 = _mm256_add_pd(fiy3,ty);
965 fiz3 = _mm256_add_pd(fiz3,tz);
967 fjx2 = _mm256_add_pd(fjx2,tx);
968 fjy2 = _mm256_add_pd(fjy2,ty);
969 fjz2 = _mm256_add_pd(fjz2,tz);
971 /**************************
972 * CALCULATE INTERACTIONS *
973 **************************/
975 /* REACTION-FIELD ELECTROSTATICS */
976 velec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_add_pd(rinv33,_mm256_mul_pd(krf,rsq33)),crf));
977 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
979 /* Update potential sum for this i atom from the interaction with this j atom. */
980 velec = _mm256_andnot_pd(dummy_mask,velec);
981 velecsum = _mm256_add_pd(velecsum,velec);
985 fscal = _mm256_andnot_pd(dummy_mask,fscal);
987 /* Calculate temporary vectorial force */
988 tx = _mm256_mul_pd(fscal,dx33);
989 ty = _mm256_mul_pd(fscal,dy33);
990 tz = _mm256_mul_pd(fscal,dz33);
992 /* Update vectorial force */
993 fix3 = _mm256_add_pd(fix3,tx);
994 fiy3 = _mm256_add_pd(fiy3,ty);
995 fiz3 = _mm256_add_pd(fiz3,tz);
997 fjx3 = _mm256_add_pd(fjx3,tx);
998 fjy3 = _mm256_add_pd(fjy3,ty);
999 fjz3 = _mm256_add_pd(fjz3,tz);
/* For dummy entries (negative list index) the j-force update is pointed at
 * scratch instead of f, so nothing is written to a real atom's force.
 */
1001 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1002 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1003 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1004 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1006 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1007 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1008 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1010 /* Inner loop uses 323 flops */
1013 /* End of innermost loop */
1015 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1016 f+i_coord_offset,fshift+i_shift_offset);
1019 /* Update potential energies */
1020 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1021 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1023 /* Increment number of inner iterations */
1024 inneriter += j_index_end - j_index_start;
1026 /* Outer loop uses 26 flops */
1029 /* Increment number of outer iterations */
1032 /* Update outer/inner flops */
1034 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*323);
1037 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_256_double
1038 * Electrostatics interaction: ReactionField
1039 * VdW interaction: LennardJones
1040 * Geometry: Water4-Water4
1041 * Calculate force/pot: Force
/*
 * Force-only nonbonded kernel (AVX-256, double precision) for pairs of four-site
 * waters: Lennard-Jones between site 0 of the i and j molecules, reaction-field
 * electrostatics between sites 1-3 of the i molecule and sites 1-3 of the j one.
 * No potential energies are accumulated in this variant.
 *
 * nlist       - neighbor list; jindex, shift and iinr[0] are read below.
 * xx, ff      - coordinate and force arrays (presumably what x and f point into;
 *               TODO confirm against the assignments of x and f).
 * fr          - supplies shift_vec, fshift, epsfac, ic->k_rf, ic->c_rf, ntype, nbfp.
 * mdatoms     - supplies chargeA and typeA.
 * kernel_data - not read by the statements shown for this kernel.
 * nrnb        - flop counter, updated once through inc_nrnb() at the end.
 */
1044 nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_256_double
1045 (t_nblist * gmx_restrict nlist,
1046 rvec * gmx_restrict xx,
1047 rvec * gmx_restrict ff,
1048 t_forcerec * gmx_restrict fr,
1049 t_mdatoms * gmx_restrict mdatoms,
1050 nb_kernel_data_t * gmx_restrict kernel_data,
1051 t_nrnb * gmx_restrict nrnb)
1053 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1054 * just 0 for non-waters.
1055 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1056 * jnr indices corresponding to data put in the four positions in the SIMD register.
1058 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1059 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1060 int jnrA,jnrB,jnrC,jnrD;
1061 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1062 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1063 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1064 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1065 real rcutoff_scalar;
1066 real *shiftvec,*fshift,*x,*f;
1067 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1068 real scratch[4*DIM];
1069 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1070 real * vdwioffsetptr0;
1071 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1072 real * vdwioffsetptr1;
1073 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1074 real * vdwioffsetptr2;
1075 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1076 real * vdwioffsetptr3;
1077 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1078 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1079 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1080 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1081 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1082 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1083 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1084 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1085 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1086 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1087 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1088 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1089 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1090 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1091 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1092 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1093 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1094 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1095 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1096 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1099 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1102 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1103 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1104 __m256d dummy_mask,cutoff_mask;
1105 __m128 tmpmask0,tmpmask1;
1106 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1107 __m256d one = _mm256_set1_pd(1.0);
1108 __m256d two = _mm256_set1_pd(2.0);
1114 jindex = nlist->jindex;
1116 shiftidx = nlist->shift;
1118 shiftvec = fr->shift_vec[0];
1119 fshift = fr->fshift[0];
1120 facel = _mm256_set1_pd(fr->epsfac);
1121 charge = mdatoms->chargeA;
1122 krf = _mm256_set1_pd(fr->ic->k_rf);
1123 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
1124 crf = _mm256_set1_pd(fr->ic->c_rf);
1125 nvdwtype = fr->ntype;
1126 vdwparam = fr->nbfp;
1127 vdwtype = mdatoms->typeA;
1129 /* Setup water-specific parameters */
/* inr is the first i water of the list; its charges and LJ type parameterize every pair below. */
1130 inr = nlist->iinr[0];
/* Charges of sites 1-3 of that water, pre-scaled by facel (fr->epsfac). */
1131 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1132 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1133 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
/* Start of the vdwparam row selected by the type of site 0. */
1134 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
/* j-side charges and the LJ column index are taken from the same water (inr), not from the individual j entries. */
1136 jq1 = _mm256_set1_pd(charge[inr+1]);
1137 jq2 = _mm256_set1_pd(charge[inr+2]);
1138 jq3 = _mm256_set1_pd(charge[inr+3]);
1139 vdwjidx0A = 2*vdwtype[inr+0];
/* c6 and c12 for the site0-site0 pair, broadcast to all four lanes. */
1140 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1141 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
/* Charge products for the nine site pairs (i site 1-3 times j site 1-3); facel is already folded into iq1-iq3. */
1142 qq11 = _mm256_mul_pd(iq1,jq1);
1143 qq12 = _mm256_mul_pd(iq1,jq2);
1144 qq13 = _mm256_mul_pd(iq1,jq3);
1145 qq21 = _mm256_mul_pd(iq2,jq1);
1146 qq22 = _mm256_mul_pd(iq2,jq2);
1147 qq23 = _mm256_mul_pd(iq2,jq3);
1148 qq31 = _mm256_mul_pd(iq3,jq1);
1149 qq32 = _mm256_mul_pd(iq3,jq2);
1150 qq33 = _mm256_mul_pd(iq3,jq3);
1152 /* Avoid stupid compiler warnings */
1153 jnrA = jnrB = jnrC = jnrD = 0;
1154 j_coord_offsetA = 0;
1155 j_coord_offsetB = 0;
1156 j_coord_offsetC = 0;
1157 j_coord_offsetD = 0;
/* Clear scratch; the remainder branch points the force pointers of padded j entries at it. */
1162 for(iidx=0;iidx<4*DIM;iidx++)
1164 scratch[iidx] = 0.0;
1167 /* Start outer loop over neighborlists */
1168 for(iidx=0; iidx<nri; iidx++)
1170 /* Load shift vector for this list */
1171 i_shift_offset = DIM*shiftidx[iidx];
1173 /* Load limits for loop over neighbors */
1174 j_index_start = jindex[iidx];
1175 j_index_end = jindex[iidx+1];
1177 /* Get outer coordinate index */
1179 i_coord_offset = DIM*inr;
1181 /* Load i particle coords and add shift vector */
1182 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1183 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Zero the force accumulators of the four i sites for this list. */
1185 fix0 = _mm256_setzero_pd();
1186 fiy0 = _mm256_setzero_pd();
1187 fiz0 = _mm256_setzero_pd();
1188 fix1 = _mm256_setzero_pd();
1189 fiy1 = _mm256_setzero_pd();
1190 fiz1 = _mm256_setzero_pd();
1191 fix2 = _mm256_setzero_pd();
1192 fiy2 = _mm256_setzero_pd();
1193 fiz2 = _mm256_setzero_pd();
1194 fix3 = _mm256_setzero_pd();
1195 fiy3 = _mm256_setzero_pd();
1196 fiz3 = _mm256_setzero_pd();
1198 /* Start inner kernel loop */
/* Full groups of four j waters: runs only while the fourth entry of the group is non-negative. */
1199 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1202 /* Get j neighbor index, and coordinate index */
1204 jnrB = jjnr[jidx+1];
1205 jnrC = jjnr[jidx+2];
1206 jnrD = jjnr[jidx+3];
1207 j_coord_offsetA = DIM*jnrA;
1208 j_coord_offsetB = DIM*jnrB;
1209 j_coord_offsetC = DIM*jnrC;
1210 j_coord_offsetD = DIM*jnrD;
1212 /* load j atom coordinates */
1213 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1214 x+j_coord_offsetC,x+j_coord_offsetD,
1215 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1216 &jy2,&jz2,&jx3,&jy3,&jz3);
1218 /* Calculate displacement vector */
1219 dx00 = _mm256_sub_pd(ix0,jx0);
1220 dy00 = _mm256_sub_pd(iy0,jy0);
1221 dz00 = _mm256_sub_pd(iz0,jz0);
1222 dx11 = _mm256_sub_pd(ix1,jx1);
1223 dy11 = _mm256_sub_pd(iy1,jy1);
1224 dz11 = _mm256_sub_pd(iz1,jz1);
1225 dx12 = _mm256_sub_pd(ix1,jx2);
1226 dy12 = _mm256_sub_pd(iy1,jy2);
1227 dz12 = _mm256_sub_pd(iz1,jz2);
1228 dx13 = _mm256_sub_pd(ix1,jx3);
1229 dy13 = _mm256_sub_pd(iy1,jy3);
1230 dz13 = _mm256_sub_pd(iz1,jz3);
1231 dx21 = _mm256_sub_pd(ix2,jx1);
1232 dy21 = _mm256_sub_pd(iy2,jy1);
1233 dz21 = _mm256_sub_pd(iz2,jz1);
1234 dx22 = _mm256_sub_pd(ix2,jx2);
1235 dy22 = _mm256_sub_pd(iy2,jy2);
1236 dz22 = _mm256_sub_pd(iz2,jz2);
1237 dx23 = _mm256_sub_pd(ix2,jx3);
1238 dy23 = _mm256_sub_pd(iy2,jy3);
1239 dz23 = _mm256_sub_pd(iz2,jz3);
1240 dx31 = _mm256_sub_pd(ix3,jx1);
1241 dy31 = _mm256_sub_pd(iy3,jy1);
1242 dz31 = _mm256_sub_pd(iz3,jz1);
1243 dx32 = _mm256_sub_pd(ix3,jx2);
1244 dy32 = _mm256_sub_pd(iy3,jy2);
1245 dz32 = _mm256_sub_pd(iz3,jz2);
1246 dx33 = _mm256_sub_pd(ix3,jx3);
1247 dy33 = _mm256_sub_pd(iy3,jy3);
1248 dz33 = _mm256_sub_pd(iz3,jz3);
1250 /* Calculate squared distance and things based on it */
1251 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1252 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1253 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1254 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1255 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1256 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1257 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1258 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1259 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1260 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
/* Inverse distance for the nine charged site pairs (per the helper name); the LJ pair 00 only gets an inverse of rsq00 below. */
1262 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1263 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1264 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1265 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1266 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1267 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1268 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1269 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1270 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1272 rinvsq00 = gmx_mm256_inv_pd(rsq00);
1273 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1274 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1275 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1276 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1277 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1278 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1279 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1280 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1281 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
/* Reset the j force accumulators for this group of four j waters. */
1283 fjx0 = _mm256_setzero_pd();
1284 fjy0 = _mm256_setzero_pd();
1285 fjz0 = _mm256_setzero_pd();
1286 fjx1 = _mm256_setzero_pd();
1287 fjy1 = _mm256_setzero_pd();
1288 fjz1 = _mm256_setzero_pd();
1289 fjx2 = _mm256_setzero_pd();
1290 fjy2 = _mm256_setzero_pd();
1291 fjz2 = _mm256_setzero_pd();
1292 fjx3 = _mm256_setzero_pd();
1293 fjy3 = _mm256_setzero_pd();
1294 fjz3 = _mm256_setzero_pd();
1296 /**************************
1297 * CALCULATE INTERACTIONS *
1298 **************************/
1300 /* LENNARD-JONES DISPERSION/REPULSION */
/* rinvsix = rinvsq00^3; fvdw = (c12_00*rinvsix - c6_00) * rinvsix * rinvsq00. */
1302 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1303 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1307 /* Calculate temporary vectorial force */
1308 tx = _mm256_mul_pd(fscal,dx00);
1309 ty = _mm256_mul_pd(fscal,dy00);
1310 tz = _mm256_mul_pd(fscal,dz00);
1312 /* Update vectorial force */
/* The same vector (tx,ty,tz) is added to the i-site and to the j-site accumulator; the j side is
 * handed to the decrement helper once all ten pairs of this group are done. */
1313 fix0 = _mm256_add_pd(fix0,tx);
1314 fiy0 = _mm256_add_pd(fiy0,ty);
1315 fiz0 = _mm256_add_pd(fiz0,tz);
1317 fjx0 = _mm256_add_pd(fjx0,tx);
1318 fjy0 = _mm256_add_pd(fjy0,ty);
1319 fjz0 = _mm256_add_pd(fjz0,tz);
1321 /**************************
1322 * CALCULATE INTERACTIONS *
1323 **************************/
1325 /* REACTION-FIELD ELECTROSTATICS */
/* felec = qq * (rinv*rinvsq - krf2), with krf2 = 2*k_rf; the remaining eight site pairs repeat this pattern. */
1326 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1330 /* Calculate temporary vectorial force */
1331 tx = _mm256_mul_pd(fscal,dx11);
1332 ty = _mm256_mul_pd(fscal,dy11);
1333 tz = _mm256_mul_pd(fscal,dz11);
1335 /* Update vectorial force */
1336 fix1 = _mm256_add_pd(fix1,tx);
1337 fiy1 = _mm256_add_pd(fiy1,ty);
1338 fiz1 = _mm256_add_pd(fiz1,tz);
1340 fjx1 = _mm256_add_pd(fjx1,tx);
1341 fjy1 = _mm256_add_pd(fjy1,ty);
1342 fjz1 = _mm256_add_pd(fjz1,tz);
1344 /**************************
1345 * CALCULATE INTERACTIONS *
1346 **************************/
1348 /* REACTION-FIELD ELECTROSTATICS */
1349 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1353 /* Calculate temporary vectorial force */
1354 tx = _mm256_mul_pd(fscal,dx12);
1355 ty = _mm256_mul_pd(fscal,dy12);
1356 tz = _mm256_mul_pd(fscal,dz12);
1358 /* Update vectorial force */
1359 fix1 = _mm256_add_pd(fix1,tx);
1360 fiy1 = _mm256_add_pd(fiy1,ty);
1361 fiz1 = _mm256_add_pd(fiz1,tz);
1363 fjx2 = _mm256_add_pd(fjx2,tx);
1364 fjy2 = _mm256_add_pd(fjy2,ty);
1365 fjz2 = _mm256_add_pd(fjz2,tz);
1367 /**************************
1368 * CALCULATE INTERACTIONS *
1369 **************************/
1371 /* REACTION-FIELD ELECTROSTATICS */
1372 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1376 /* Calculate temporary vectorial force */
1377 tx = _mm256_mul_pd(fscal,dx13);
1378 ty = _mm256_mul_pd(fscal,dy13);
1379 tz = _mm256_mul_pd(fscal,dz13);
1381 /* Update vectorial force */
1382 fix1 = _mm256_add_pd(fix1,tx);
1383 fiy1 = _mm256_add_pd(fiy1,ty);
1384 fiz1 = _mm256_add_pd(fiz1,tz);
1386 fjx3 = _mm256_add_pd(fjx3,tx);
1387 fjy3 = _mm256_add_pd(fjy3,ty);
1388 fjz3 = _mm256_add_pd(fjz3,tz);
1390 /**************************
1391 * CALCULATE INTERACTIONS *
1392 **************************/
1394 /* REACTION-FIELD ELECTROSTATICS */
1395 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1399 /* Calculate temporary vectorial force */
1400 tx = _mm256_mul_pd(fscal,dx21);
1401 ty = _mm256_mul_pd(fscal,dy21);
1402 tz = _mm256_mul_pd(fscal,dz21);
1404 /* Update vectorial force */
1405 fix2 = _mm256_add_pd(fix2,tx);
1406 fiy2 = _mm256_add_pd(fiy2,ty);
1407 fiz2 = _mm256_add_pd(fiz2,tz);
1409 fjx1 = _mm256_add_pd(fjx1,tx);
1410 fjy1 = _mm256_add_pd(fjy1,ty);
1411 fjz1 = _mm256_add_pd(fjz1,tz);
1413 /**************************
1414 * CALCULATE INTERACTIONS *
1415 **************************/
1417 /* REACTION-FIELD ELECTROSTATICS */
1418 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1422 /* Calculate temporary vectorial force */
1423 tx = _mm256_mul_pd(fscal,dx22);
1424 ty = _mm256_mul_pd(fscal,dy22);
1425 tz = _mm256_mul_pd(fscal,dz22);
1427 /* Update vectorial force */
1428 fix2 = _mm256_add_pd(fix2,tx);
1429 fiy2 = _mm256_add_pd(fiy2,ty);
1430 fiz2 = _mm256_add_pd(fiz2,tz);
1432 fjx2 = _mm256_add_pd(fjx2,tx);
1433 fjy2 = _mm256_add_pd(fjy2,ty);
1434 fjz2 = _mm256_add_pd(fjz2,tz);
1436 /**************************
1437 * CALCULATE INTERACTIONS *
1438 **************************/
1440 /* REACTION-FIELD ELECTROSTATICS */
1441 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1445 /* Calculate temporary vectorial force */
1446 tx = _mm256_mul_pd(fscal,dx23);
1447 ty = _mm256_mul_pd(fscal,dy23);
1448 tz = _mm256_mul_pd(fscal,dz23);
1450 /* Update vectorial force */
1451 fix2 = _mm256_add_pd(fix2,tx);
1452 fiy2 = _mm256_add_pd(fiy2,ty);
1453 fiz2 = _mm256_add_pd(fiz2,tz);
1455 fjx3 = _mm256_add_pd(fjx3,tx);
1456 fjy3 = _mm256_add_pd(fjy3,ty);
1457 fjz3 = _mm256_add_pd(fjz3,tz);
1459 /**************************
1460 * CALCULATE INTERACTIONS *
1461 **************************/
1463 /* REACTION-FIELD ELECTROSTATICS */
1464 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1468 /* Calculate temporary vectorial force */
1469 tx = _mm256_mul_pd(fscal,dx31);
1470 ty = _mm256_mul_pd(fscal,dy31);
1471 tz = _mm256_mul_pd(fscal,dz31);
1473 /* Update vectorial force */
1474 fix3 = _mm256_add_pd(fix3,tx);
1475 fiy3 = _mm256_add_pd(fiy3,ty);
1476 fiz3 = _mm256_add_pd(fiz3,tz);
1478 fjx1 = _mm256_add_pd(fjx1,tx);
1479 fjy1 = _mm256_add_pd(fjy1,ty);
1480 fjz1 = _mm256_add_pd(fjz1,tz);
1482 /**************************
1483 * CALCULATE INTERACTIONS *
1484 **************************/
1486 /* REACTION-FIELD ELECTROSTATICS */
1487 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1491 /* Calculate temporary vectorial force */
1492 tx = _mm256_mul_pd(fscal,dx32);
1493 ty = _mm256_mul_pd(fscal,dy32);
1494 tz = _mm256_mul_pd(fscal,dz32);
1496 /* Update vectorial force */
1497 fix3 = _mm256_add_pd(fix3,tx);
1498 fiy3 = _mm256_add_pd(fiy3,ty);
1499 fiz3 = _mm256_add_pd(fiz3,tz);
1501 fjx2 = _mm256_add_pd(fjx2,tx);
1502 fjy2 = _mm256_add_pd(fjy2,ty);
1503 fjz2 = _mm256_add_pd(fjz2,tz);
1505 /**************************
1506 * CALCULATE INTERACTIONS *
1507 **************************/
1509 /* REACTION-FIELD ELECTROSTATICS */
1510 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1514 /* Calculate temporary vectorial force */
1515 tx = _mm256_mul_pd(fscal,dx33);
1516 ty = _mm256_mul_pd(fscal,dy33);
1517 tz = _mm256_mul_pd(fscal,dz33);
1519 /* Update vectorial force */
1520 fix3 = _mm256_add_pd(fix3,tx);
1521 fiy3 = _mm256_add_pd(fiy3,ty);
1522 fiz3 = _mm256_add_pd(fiz3,tz);
1524 fjx3 = _mm256_add_pd(fjx3,tx);
1525 fjy3 = _mm256_add_pd(fjy3,ty);
1526 fjz3 = _mm256_add_pd(fjz3,tz);
/* One force pointer per SIMD lane; all four j entries are real in this loop, so they point straight into f. */
1528 fjptrA = f+j_coord_offsetA;
1529 fjptrB = f+j_coord_offsetB;
1530 fjptrC = f+j_coord_offsetC;
1531 fjptrD = f+j_coord_offsetD;
/* Apply the accumulated j forces (helper not shown; its name suggests they are subtracted from the targets). */
1533 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1534 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1535 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1537 /* Inner loop uses 273 flops */
/*
 * Remainder group: taken when jidx is still below j_index_end after the loop above,
 * i.e. the loop stopped on a negative fourth entry. Negative jjnr entries are treated
 * as padding: their lanes are flagged in dummy_mask, their coordinates are loaded from
 * index 0, fscal is cleared for them with _mm256_andnot_pd(dummy_mask,fscal), and their
 * force pointers are redirected to scratch.
 */
1540 if(jidx<j_index_end)
1543 /* Get j neighbor index, and coordinate index */
1544 jnrlistA = jjnr[jidx];
1545 jnrlistB = jjnr[jidx+1];
1546 jnrlistC = jjnr[jidx+2];
1547 jnrlistD = jjnr[jidx+3];
1548 /* Sign of each element will be negative for non-real atoms.
1549 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1550 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1552 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1554 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1555 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1556 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1558 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1559 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1560 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1561 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1562 j_coord_offsetA = DIM*jnrA;
1563 j_coord_offsetB = DIM*jnrB;
1564 j_coord_offsetC = DIM*jnrC;
1565 j_coord_offsetD = DIM*jnrD;
1567 /* load j atom coordinates */
1568 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1569 x+j_coord_offsetC,x+j_coord_offsetD,
1570 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1571 &jy2,&jz2,&jx3,&jy3,&jz3);
1573 /* Calculate displacement vector */
1574 dx00 = _mm256_sub_pd(ix0,jx0);
1575 dy00 = _mm256_sub_pd(iy0,jy0);
1576 dz00 = _mm256_sub_pd(iz0,jz0);
1577 dx11 = _mm256_sub_pd(ix1,jx1);
1578 dy11 = _mm256_sub_pd(iy1,jy1);
1579 dz11 = _mm256_sub_pd(iz1,jz1);
1580 dx12 = _mm256_sub_pd(ix1,jx2);
1581 dy12 = _mm256_sub_pd(iy1,jy2);
1582 dz12 = _mm256_sub_pd(iz1,jz2);
1583 dx13 = _mm256_sub_pd(ix1,jx3);
1584 dy13 = _mm256_sub_pd(iy1,jy3);
1585 dz13 = _mm256_sub_pd(iz1,jz3);
1586 dx21 = _mm256_sub_pd(ix2,jx1);
1587 dy21 = _mm256_sub_pd(iy2,jy1);
1588 dz21 = _mm256_sub_pd(iz2,jz1);
1589 dx22 = _mm256_sub_pd(ix2,jx2);
1590 dy22 = _mm256_sub_pd(iy2,jy2);
1591 dz22 = _mm256_sub_pd(iz2,jz2);
1592 dx23 = _mm256_sub_pd(ix2,jx3);
1593 dy23 = _mm256_sub_pd(iy2,jy3);
1594 dz23 = _mm256_sub_pd(iz2,jz3);
1595 dx31 = _mm256_sub_pd(ix3,jx1);
1596 dy31 = _mm256_sub_pd(iy3,jy1);
1597 dz31 = _mm256_sub_pd(iz3,jz1);
1598 dx32 = _mm256_sub_pd(ix3,jx2);
1599 dy32 = _mm256_sub_pd(iy3,jy2);
1600 dz32 = _mm256_sub_pd(iz3,jz2);
1601 dx33 = _mm256_sub_pd(ix3,jx3);
1602 dy33 = _mm256_sub_pd(iy3,jy3);
1603 dz33 = _mm256_sub_pd(iz3,jz3);
1605 /* Calculate squared distance and things based on it */
1606 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1607 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1608 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1609 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1610 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1611 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1612 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1613 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1614 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1615 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1617 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1618 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1619 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1620 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1621 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1622 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1623 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1624 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1625 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1627 rinvsq00 = gmx_mm256_inv_pd(rsq00);
1628 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1629 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1630 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1631 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1632 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1633 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1634 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1635 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1636 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1638 fjx0 = _mm256_setzero_pd();
1639 fjy0 = _mm256_setzero_pd();
1640 fjz0 = _mm256_setzero_pd();
1641 fjx1 = _mm256_setzero_pd();
1642 fjy1 = _mm256_setzero_pd();
1643 fjz1 = _mm256_setzero_pd();
1644 fjx2 = _mm256_setzero_pd();
1645 fjy2 = _mm256_setzero_pd();
1646 fjz2 = _mm256_setzero_pd();
1647 fjx3 = _mm256_setzero_pd();
1648 fjy3 = _mm256_setzero_pd();
1649 fjz3 = _mm256_setzero_pd();
1651 /**************************
1652 * CALCULATE INTERACTIONS *
1653 **************************/
1655 /* LENNARD-JONES DISPERSION/REPULSION */
1657 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1658 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
/* Zero the force factor in padded lanes so they add nothing to fi or fj; repeated for every pair below. */
1662 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1664 /* Calculate temporary vectorial force */
1665 tx = _mm256_mul_pd(fscal,dx00);
1666 ty = _mm256_mul_pd(fscal,dy00);
1667 tz = _mm256_mul_pd(fscal,dz00);
1669 /* Update vectorial force */
1670 fix0 = _mm256_add_pd(fix0,tx);
1671 fiy0 = _mm256_add_pd(fiy0,ty);
1672 fiz0 = _mm256_add_pd(fiz0,tz);
1674 fjx0 = _mm256_add_pd(fjx0,tx);
1675 fjy0 = _mm256_add_pd(fjy0,ty);
1676 fjz0 = _mm256_add_pd(fjz0,tz);
1678 /**************************
1679 * CALCULATE INTERACTIONS *
1680 **************************/
1682 /* REACTION-FIELD ELECTROSTATICS */
1683 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1687 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1689 /* Calculate temporary vectorial force */
1690 tx = _mm256_mul_pd(fscal,dx11);
1691 ty = _mm256_mul_pd(fscal,dy11);
1692 tz = _mm256_mul_pd(fscal,dz11);
1694 /* Update vectorial force */
1695 fix1 = _mm256_add_pd(fix1,tx);
1696 fiy1 = _mm256_add_pd(fiy1,ty);
1697 fiz1 = _mm256_add_pd(fiz1,tz);
1699 fjx1 = _mm256_add_pd(fjx1,tx);
1700 fjy1 = _mm256_add_pd(fjy1,ty);
1701 fjz1 = _mm256_add_pd(fjz1,tz);
1703 /**************************
1704 * CALCULATE INTERACTIONS *
1705 **************************/
1707 /* REACTION-FIELD ELECTROSTATICS */
1708 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1712 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1714 /* Calculate temporary vectorial force */
1715 tx = _mm256_mul_pd(fscal,dx12);
1716 ty = _mm256_mul_pd(fscal,dy12);
1717 tz = _mm256_mul_pd(fscal,dz12);
1719 /* Update vectorial force */
1720 fix1 = _mm256_add_pd(fix1,tx);
1721 fiy1 = _mm256_add_pd(fiy1,ty);
1722 fiz1 = _mm256_add_pd(fiz1,tz);
1724 fjx2 = _mm256_add_pd(fjx2,tx);
1725 fjy2 = _mm256_add_pd(fjy2,ty);
1726 fjz2 = _mm256_add_pd(fjz2,tz);
1728 /**************************
1729 * CALCULATE INTERACTIONS *
1730 **************************/
1732 /* REACTION-FIELD ELECTROSTATICS */
1733 felec = _mm256_mul_pd(qq13,_mm256_sub_pd(_mm256_mul_pd(rinv13,rinvsq13),krf2));
1737 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1739 /* Calculate temporary vectorial force */
1740 tx = _mm256_mul_pd(fscal,dx13);
1741 ty = _mm256_mul_pd(fscal,dy13);
1742 tz = _mm256_mul_pd(fscal,dz13);
1744 /* Update vectorial force */
1745 fix1 = _mm256_add_pd(fix1,tx);
1746 fiy1 = _mm256_add_pd(fiy1,ty);
1747 fiz1 = _mm256_add_pd(fiz1,tz);
1749 fjx3 = _mm256_add_pd(fjx3,tx);
1750 fjy3 = _mm256_add_pd(fjy3,ty);
1751 fjz3 = _mm256_add_pd(fjz3,tz);
1753 /**************************
1754 * CALCULATE INTERACTIONS *
1755 **************************/
1757 /* REACTION-FIELD ELECTROSTATICS */
1758 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1762 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1764 /* Calculate temporary vectorial force */
1765 tx = _mm256_mul_pd(fscal,dx21);
1766 ty = _mm256_mul_pd(fscal,dy21);
1767 tz = _mm256_mul_pd(fscal,dz21);
1769 /* Update vectorial force */
1770 fix2 = _mm256_add_pd(fix2,tx);
1771 fiy2 = _mm256_add_pd(fiy2,ty);
1772 fiz2 = _mm256_add_pd(fiz2,tz);
1774 fjx1 = _mm256_add_pd(fjx1,tx);
1775 fjy1 = _mm256_add_pd(fjy1,ty);
1776 fjz1 = _mm256_add_pd(fjz1,tz);
1778 /**************************
1779 * CALCULATE INTERACTIONS *
1780 **************************/
1782 /* REACTION-FIELD ELECTROSTATICS */
1783 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1787 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1789 /* Calculate temporary vectorial force */
1790 tx = _mm256_mul_pd(fscal,dx22);
1791 ty = _mm256_mul_pd(fscal,dy22);
1792 tz = _mm256_mul_pd(fscal,dz22);
1794 /* Update vectorial force */
1795 fix2 = _mm256_add_pd(fix2,tx);
1796 fiy2 = _mm256_add_pd(fiy2,ty);
1797 fiz2 = _mm256_add_pd(fiz2,tz);
1799 fjx2 = _mm256_add_pd(fjx2,tx);
1800 fjy2 = _mm256_add_pd(fjy2,ty);
1801 fjz2 = _mm256_add_pd(fjz2,tz);
1803 /**************************
1804 * CALCULATE INTERACTIONS *
1805 **************************/
1807 /* REACTION-FIELD ELECTROSTATICS */
1808 felec = _mm256_mul_pd(qq23,_mm256_sub_pd(_mm256_mul_pd(rinv23,rinvsq23),krf2));
1812 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1814 /* Calculate temporary vectorial force */
1815 tx = _mm256_mul_pd(fscal,dx23);
1816 ty = _mm256_mul_pd(fscal,dy23);
1817 tz = _mm256_mul_pd(fscal,dz23);
1819 /* Update vectorial force */
1820 fix2 = _mm256_add_pd(fix2,tx);
1821 fiy2 = _mm256_add_pd(fiy2,ty);
1822 fiz2 = _mm256_add_pd(fiz2,tz);
1824 fjx3 = _mm256_add_pd(fjx3,tx);
1825 fjy3 = _mm256_add_pd(fjy3,ty);
1826 fjz3 = _mm256_add_pd(fjz3,tz);
1828 /**************************
1829 * CALCULATE INTERACTIONS *
1830 **************************/
1832 /* REACTION-FIELD ELECTROSTATICS */
1833 felec = _mm256_mul_pd(qq31,_mm256_sub_pd(_mm256_mul_pd(rinv31,rinvsq31),krf2));
1837 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1839 /* Calculate temporary vectorial force */
1840 tx = _mm256_mul_pd(fscal,dx31);
1841 ty = _mm256_mul_pd(fscal,dy31);
1842 tz = _mm256_mul_pd(fscal,dz31);
1844 /* Update vectorial force */
1845 fix3 = _mm256_add_pd(fix3,tx);
1846 fiy3 = _mm256_add_pd(fiy3,ty);
1847 fiz3 = _mm256_add_pd(fiz3,tz);
1849 fjx1 = _mm256_add_pd(fjx1,tx);
1850 fjy1 = _mm256_add_pd(fjy1,ty);
1851 fjz1 = _mm256_add_pd(fjz1,tz);
1853 /**************************
1854 * CALCULATE INTERACTIONS *
1855 **************************/
1857 /* REACTION-FIELD ELECTROSTATICS */
1858 felec = _mm256_mul_pd(qq32,_mm256_sub_pd(_mm256_mul_pd(rinv32,rinvsq32),krf2));
1862 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1864 /* Calculate temporary vectorial force */
1865 tx = _mm256_mul_pd(fscal,dx32);
1866 ty = _mm256_mul_pd(fscal,dy32);
1867 tz = _mm256_mul_pd(fscal,dz32);
1869 /* Update vectorial force */
1870 fix3 = _mm256_add_pd(fix3,tx);
1871 fiy3 = _mm256_add_pd(fiy3,ty);
1872 fiz3 = _mm256_add_pd(fiz3,tz);
1874 fjx2 = _mm256_add_pd(fjx2,tx);
1875 fjy2 = _mm256_add_pd(fjy2,ty);
1876 fjz2 = _mm256_add_pd(fjz2,tz);
1878 /**************************
1879 * CALCULATE INTERACTIONS *
1880 **************************/
1882 /* REACTION-FIELD ELECTROSTATICS */
1883 felec = _mm256_mul_pd(qq33,_mm256_sub_pd(_mm256_mul_pd(rinv33,rinvsq33),krf2));
1887 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1889 /* Calculate temporary vectorial force */
1890 tx = _mm256_mul_pd(fscal,dx33);
1891 ty = _mm256_mul_pd(fscal,dy33);
1892 tz = _mm256_mul_pd(fscal,dz33);
1894 /* Update vectorial force */
1895 fix3 = _mm256_add_pd(fix3,tx);
1896 fiy3 = _mm256_add_pd(fiy3,ty);
1897 fiz3 = _mm256_add_pd(fiz3,tz);
1899 fjx3 = _mm256_add_pd(fjx3,tx);
1900 fjy3 = _mm256_add_pd(fjy3,ty);
1901 fjz3 = _mm256_add_pd(fjz3,tz);
/* Padded lanes (negative list entry) are pointed at scratch instead of f. */
1903 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1904 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1905 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1906 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1908 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1909 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1910 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1912 /* Inner loop uses 273 flops */
1915 /* End of innermost loop */
/* Hand the twelve i-site accumulators to the update helper together with the f and fshift targets
 * of this list (helper not shown; presumably it reduces over the SIMD lanes - TODO confirm). */
1917 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1918 f+i_coord_offset,fshift+i_shift_offset);
1920 /* Increment number of inner iterations */
1921 inneriter += j_index_end - j_index_start;
1923 /* Outer loop uses 24 flops */
1926 /* Increment number of outer iterations */
1929 /* Update outer/inner flops */
/* Charges outeriter*24 + inneriter*273 flops to the W4W4 force-only counter. */
1931 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*273);