2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_256_double
54 * Electrostatics interaction: ReactionField
55 * VdW interaction: None
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
/*
 * Water3-Water3 reaction-field electrostatics kernel (no Van der Waals term)
 * that accumulates both the potential energy and the forces, handling four
 * j waters per step in the four lanes of a 256-bit double-precision register.
 *
 * nlist       - neighbour list; jindex, shift and iinr[0] are read here.
 * xx, ff      - coordinate and force arrays. NOTE(review): the code below works
 *               on the locals x and f, whose assignments are not visible in
 *               this listing; presumably they alias xx and ff - confirm.
 * fr          - supplies shift_vec, fshift, epsfac and the reaction-field
 *               constants ic->k_rf and ic->c_rf.
 * mdatoms     - supplies the chargeA array.
 * kernel_data - energygrp_elec receives the summed potential (the parameter
 *               is tagged gmx_unused but is read below).
 * nrnb        - flop counter updated through inc_nrnb().
 *
 * The nine site-site charge products qqIJ are built once, before the
 * neighbour-list loop, from the three charges at nlist->iinr[0] (the i side
 * additionally scaled by fr->epsfac) and are reused for every pair.
 *
 * NOTE(review): several identifiers used below (charge, scratch, nri, jjnr,
 * x, f, ggid, fscal, outeriter, inneriter) have no visible declaration or
 * initialisation in this listing; check them against the complete file.
 */
60 nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_256_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
85 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
95 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
97 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
109 __m256d dummy_mask,cutoff_mask;
110 __m128 tmpmask0,tmpmask1;
111 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
112 __m256d one = _mm256_set1_pd(1.0);
113 __m256d two = _mm256_set1_pd(2.0);
119 jindex = nlist->jindex;
121 shiftidx = nlist->shift;
123 shiftvec = fr->shift_vec[0];
124 fshift = fr->fshift[0];
125 facel = _mm256_set1_pd(fr->epsfac);
126 charge = mdatoms->chargeA;
127 krf = _mm256_set1_pd(fr->ic->k_rf);
128 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
129 crf = _mm256_set1_pd(fr->ic->c_rf);
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
134 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
135 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
137 jq0 = _mm256_set1_pd(charge[inr+0]);
138 jq1 = _mm256_set1_pd(charge[inr+1]);
139 jq2 = _mm256_set1_pd(charge[inr+2]);
140 qq00 = _mm256_mul_pd(iq0,jq0);
141 qq01 = _mm256_mul_pd(iq0,jq1);
142 qq02 = _mm256_mul_pd(iq0,jq2);
143 qq10 = _mm256_mul_pd(iq1,jq0);
144 qq11 = _mm256_mul_pd(iq1,jq1);
145 qq12 = _mm256_mul_pd(iq1,jq2);
146 qq20 = _mm256_mul_pd(iq2,jq0);
147 qq21 = _mm256_mul_pd(iq2,jq1);
148 qq22 = _mm256_mul_pd(iq2,jq2);
150 /* Avoid stupid compiler warnings */
151 jnrA = jnrB = jnrC = jnrD = 0;
/* NOTE(review): the body of the following 4*DIM-step loop is not visible in
 * this listing. A buffer named scratch is used further down as the force
 * target for padding entries, so this loop presumably clears it - confirm. */
160 for(iidx=0;iidx<4*DIM;iidx++)
165 /* Start outer loop over neighborlists */
166 for(iidx=0; iidx<nri; iidx++)
168 /* Load shift vector for this list */
169 i_shift_offset = DIM*shiftidx[iidx];
171 /* Load limits for loop over neighbors */
172 j_index_start = jindex[iidx];
173 j_index_end = jindex[iidx+1];
/* NOTE(review): no per-list assignment to inr is visible at this point; as
 * shown, inr still holds nlist->iinr[0] - confirm against the complete file. */
175 /* Get outer coordinate index */
177 i_coord_offset = DIM*inr;
179 /* Load i particle coords and add shift vector */
180 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
181 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* The i-site force accumulators are reset for every neighbour list. */
183 fix0 = _mm256_setzero_pd();
184 fiy0 = _mm256_setzero_pd();
185 fiz0 = _mm256_setzero_pd();
186 fix1 = _mm256_setzero_pd();
187 fiy1 = _mm256_setzero_pd();
188 fiz1 = _mm256_setzero_pd();
189 fix2 = _mm256_setzero_pd();
190 fiy2 = _mm256_setzero_pd();
191 fiz2 = _mm256_setzero_pd();
193 /* Reset potential sums */
194 velecsum = _mm256_setzero_pd();
/* Full-width iterations: the loop advances four list entries at a time and
 * keeps going only while jjnr[jidx+3] is non-negative. Negative entries mark
 * non-real atoms (see the mask comment in the remainder section), so no
 * masking is applied in this loop. */
196 /* Start inner kernel loop */
197 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200 /* Get j neighbor index, and coordinate index */
205 j_coord_offsetA = DIM*jnrA;
206 j_coord_offsetB = DIM*jnrB;
207 j_coord_offsetC = DIM*jnrC;
208 j_coord_offsetD = DIM*jnrD;
210 /* load j atom coordinates */
211 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
212 x+j_coord_offsetC,x+j_coord_offsetD,
213 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
/* dxIJ/dyIJ/dzIJ hold i-site I minus j-site J for each of the nine site pairs. */
215 /* Calculate displacement vector */
216 dx00 = _mm256_sub_pd(ix0,jx0);
217 dy00 = _mm256_sub_pd(iy0,jy0);
218 dz00 = _mm256_sub_pd(iz0,jz0);
219 dx01 = _mm256_sub_pd(ix0,jx1);
220 dy01 = _mm256_sub_pd(iy0,jy1);
221 dz01 = _mm256_sub_pd(iz0,jz1);
222 dx02 = _mm256_sub_pd(ix0,jx2);
223 dy02 = _mm256_sub_pd(iy0,jy2);
224 dz02 = _mm256_sub_pd(iz0,jz2);
225 dx10 = _mm256_sub_pd(ix1,jx0);
226 dy10 = _mm256_sub_pd(iy1,jy0);
227 dz10 = _mm256_sub_pd(iz1,jz0);
228 dx11 = _mm256_sub_pd(ix1,jx1);
229 dy11 = _mm256_sub_pd(iy1,jy1);
230 dz11 = _mm256_sub_pd(iz1,jz1);
231 dx12 = _mm256_sub_pd(ix1,jx2);
232 dy12 = _mm256_sub_pd(iy1,jy2);
233 dz12 = _mm256_sub_pd(iz1,jz2);
234 dx20 = _mm256_sub_pd(ix2,jx0);
235 dy20 = _mm256_sub_pd(iy2,jy0);
236 dz20 = _mm256_sub_pd(iz2,jz0);
237 dx21 = _mm256_sub_pd(ix2,jx1);
238 dy21 = _mm256_sub_pd(iy2,jy1);
239 dz21 = _mm256_sub_pd(iz2,jz1);
240 dx22 = _mm256_sub_pd(ix2,jx2);
241 dy22 = _mm256_sub_pd(iy2,jy2);
242 dz22 = _mm256_sub_pd(iz2,jz2);
244 /* Calculate squared distance and things based on it */
245 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
246 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
247 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
248 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
249 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
250 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
251 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
252 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
253 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
255 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
256 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
257 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
258 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
259 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
260 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
261 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
262 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
263 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
265 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
266 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
267 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
268 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
269 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
270 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
271 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
272 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
273 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
/* The j-site force accumulators start from zero in every iteration. */
275 fjx0 = _mm256_setzero_pd();
276 fjy0 = _mm256_setzero_pd();
277 fjz0 = _mm256_setzero_pd();
278 fjx1 = _mm256_setzero_pd();
279 fjy1 = _mm256_setzero_pd();
280 fjz1 = _mm256_setzero_pd();
281 fjx2 = _mm256_setzero_pd();
282 fjy2 = _mm256_setzero_pd();
283 fjz2 = _mm256_setzero_pd();
285 /**************************
286 * CALCULATE INTERACTIONS *
287 **************************/
/* Site pair 0-0: velec = qq*(rinv + krf*rsq - crf) and
 * felec = qq*(rinv*rinvsq - krf2), where krf2 is 2*k_rf. The eight blocks
 * that follow repeat the same arithmetic for the remaining site pairs. */
289 /* REACTION-FIELD ELECTROSTATICS */
290 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
291 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
293 /* Update potential sum for this i atom from the interaction with this j atom. */
294 velecsum = _mm256_add_pd(velecsum,velec);
/* NOTE(review): fscal scales the displacement below, but no assignment to it
 * is visible in this listing - presumably it is set from felec; confirm. */
298 /* Calculate temporary vectorial force */
299 tx = _mm256_mul_pd(fscal,dx00);
300 ty = _mm256_mul_pd(fscal,dy00);
301 tz = _mm256_mul_pd(fscal,dz00);
303 /* Update vectorial force */
304 fix0 = _mm256_add_pd(fix0,tx);
305 fiy0 = _mm256_add_pd(fiy0,ty);
306 fiz0 = _mm256_add_pd(fiz0,tz);
/* The same vector is also added to the j-site accumulator; the j totals are
 * handed to the decrement helper after the last site pair. */
308 fjx0 = _mm256_add_pd(fjx0,tx);
309 fjy0 = _mm256_add_pd(fjy0,ty);
310 fjz0 = _mm256_add_pd(fjz0,tz);
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 /* REACTION-FIELD ELECTROSTATICS */
317 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
318 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
320 /* Update potential sum for this i atom from the interaction with this j atom. */
321 velecsum = _mm256_add_pd(velecsum,velec);
325 /* Calculate temporary vectorial force */
326 tx = _mm256_mul_pd(fscal,dx01);
327 ty = _mm256_mul_pd(fscal,dy01);
328 tz = _mm256_mul_pd(fscal,dz01);
330 /* Update vectorial force */
331 fix0 = _mm256_add_pd(fix0,tx);
332 fiy0 = _mm256_add_pd(fiy0,ty);
333 fiz0 = _mm256_add_pd(fiz0,tz);
335 fjx1 = _mm256_add_pd(fjx1,tx);
336 fjy1 = _mm256_add_pd(fjy1,ty);
337 fjz1 = _mm256_add_pd(fjz1,tz);
339 /**************************
340 * CALCULATE INTERACTIONS *
341 **************************/
343 /* REACTION-FIELD ELECTROSTATICS */
344 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
345 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
347 /* Update potential sum for this i atom from the interaction with this j atom. */
348 velecsum = _mm256_add_pd(velecsum,velec);
352 /* Calculate temporary vectorial force */
353 tx = _mm256_mul_pd(fscal,dx02);
354 ty = _mm256_mul_pd(fscal,dy02);
355 tz = _mm256_mul_pd(fscal,dz02);
357 /* Update vectorial force */
358 fix0 = _mm256_add_pd(fix0,tx);
359 fiy0 = _mm256_add_pd(fiy0,ty);
360 fiz0 = _mm256_add_pd(fiz0,tz);
362 fjx2 = _mm256_add_pd(fjx2,tx);
363 fjy2 = _mm256_add_pd(fjy2,ty);
364 fjz2 = _mm256_add_pd(fjz2,tz);
366 /**************************
367 * CALCULATE INTERACTIONS *
368 **************************/
370 /* REACTION-FIELD ELECTROSTATICS */
371 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
372 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
374 /* Update potential sum for this i atom from the interaction with this j atom. */
375 velecsum = _mm256_add_pd(velecsum,velec);
379 /* Calculate temporary vectorial force */
380 tx = _mm256_mul_pd(fscal,dx10);
381 ty = _mm256_mul_pd(fscal,dy10);
382 tz = _mm256_mul_pd(fscal,dz10);
384 /* Update vectorial force */
385 fix1 = _mm256_add_pd(fix1,tx);
386 fiy1 = _mm256_add_pd(fiy1,ty);
387 fiz1 = _mm256_add_pd(fiz1,tz);
389 fjx0 = _mm256_add_pd(fjx0,tx);
390 fjy0 = _mm256_add_pd(fjy0,ty);
391 fjz0 = _mm256_add_pd(fjz0,tz);
393 /**************************
394 * CALCULATE INTERACTIONS *
395 **************************/
397 /* REACTION-FIELD ELECTROSTATICS */
398 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
399 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
401 /* Update potential sum for this i atom from the interaction with this j atom. */
402 velecsum = _mm256_add_pd(velecsum,velec);
406 /* Calculate temporary vectorial force */
407 tx = _mm256_mul_pd(fscal,dx11);
408 ty = _mm256_mul_pd(fscal,dy11);
409 tz = _mm256_mul_pd(fscal,dz11);
411 /* Update vectorial force */
412 fix1 = _mm256_add_pd(fix1,tx);
413 fiy1 = _mm256_add_pd(fiy1,ty);
414 fiz1 = _mm256_add_pd(fiz1,tz);
416 fjx1 = _mm256_add_pd(fjx1,tx);
417 fjy1 = _mm256_add_pd(fjy1,ty);
418 fjz1 = _mm256_add_pd(fjz1,tz);
420 /**************************
421 * CALCULATE INTERACTIONS *
422 **************************/
424 /* REACTION-FIELD ELECTROSTATICS */
425 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
426 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
428 /* Update potential sum for this i atom from the interaction with this j atom. */
429 velecsum = _mm256_add_pd(velecsum,velec);
433 /* Calculate temporary vectorial force */
434 tx = _mm256_mul_pd(fscal,dx12);
435 ty = _mm256_mul_pd(fscal,dy12);
436 tz = _mm256_mul_pd(fscal,dz12);
438 /* Update vectorial force */
439 fix1 = _mm256_add_pd(fix1,tx);
440 fiy1 = _mm256_add_pd(fiy1,ty);
441 fiz1 = _mm256_add_pd(fiz1,tz);
443 fjx2 = _mm256_add_pd(fjx2,tx);
444 fjy2 = _mm256_add_pd(fjy2,ty);
445 fjz2 = _mm256_add_pd(fjz2,tz);
447 /**************************
448 * CALCULATE INTERACTIONS *
449 **************************/
451 /* REACTION-FIELD ELECTROSTATICS */
452 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
453 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
455 /* Update potential sum for this i atom from the interaction with this j atom. */
456 velecsum = _mm256_add_pd(velecsum,velec);
460 /* Calculate temporary vectorial force */
461 tx = _mm256_mul_pd(fscal,dx20);
462 ty = _mm256_mul_pd(fscal,dy20);
463 tz = _mm256_mul_pd(fscal,dz20);
465 /* Update vectorial force */
466 fix2 = _mm256_add_pd(fix2,tx);
467 fiy2 = _mm256_add_pd(fiy2,ty);
468 fiz2 = _mm256_add_pd(fiz2,tz);
470 fjx0 = _mm256_add_pd(fjx0,tx);
471 fjy0 = _mm256_add_pd(fjy0,ty);
472 fjz0 = _mm256_add_pd(fjz0,tz);
474 /**************************
475 * CALCULATE INTERACTIONS *
476 **************************/
478 /* REACTION-FIELD ELECTROSTATICS */
479 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
480 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
482 /* Update potential sum for this i atom from the interaction with this j atom. */
483 velecsum = _mm256_add_pd(velecsum,velec);
487 /* Calculate temporary vectorial force */
488 tx = _mm256_mul_pd(fscal,dx21);
489 ty = _mm256_mul_pd(fscal,dy21);
490 tz = _mm256_mul_pd(fscal,dz21);
492 /* Update vectorial force */
493 fix2 = _mm256_add_pd(fix2,tx);
494 fiy2 = _mm256_add_pd(fiy2,ty);
495 fiz2 = _mm256_add_pd(fiz2,tz);
497 fjx1 = _mm256_add_pd(fjx1,tx);
498 fjy1 = _mm256_add_pd(fjy1,ty);
499 fjz1 = _mm256_add_pd(fjz1,tz);
501 /**************************
502 * CALCULATE INTERACTIONS *
503 **************************/
505 /* REACTION-FIELD ELECTROSTATICS */
506 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
507 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
509 /* Update potential sum for this i atom from the interaction with this j atom. */
510 velecsum = _mm256_add_pd(velecsum,velec);
514 /* Calculate temporary vectorial force */
515 tx = _mm256_mul_pd(fscal,dx22);
516 ty = _mm256_mul_pd(fscal,dy22);
517 tz = _mm256_mul_pd(fscal,dz22);
519 /* Update vectorial force */
520 fix2 = _mm256_add_pd(fix2,tx);
521 fiy2 = _mm256_add_pd(fiy2,ty);
522 fiz2 = _mm256_add_pd(fiz2,tz);
524 fjx2 = _mm256_add_pd(fjx2,tx);
525 fjy2 = _mm256_add_pd(fjy2,ty);
526 fjz2 = _mm256_add_pd(fjz2,tz);
/* Hand the nine accumulated j-site force vectors to the decrement helper,
 * with one force pointer per lane (A-D). */
528 fjptrA = f+j_coord_offsetA;
529 fjptrB = f+j_coord_offsetB;
530 fjptrC = f+j_coord_offsetC;
531 fjptrD = f+j_coord_offsetD;
533 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
534 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
536 /* Inner loop uses 288 flops */
/* Remainder handling: the list entries at jidx..jidx+3 are loaded and any
 * negative entry is treated as padding. dummy_mask is built from a 32-bit
 * "less than zero" compare on those four entries; the two permutes with
 * patterns (3,3,2,2) and (1,1,0,0) appear to duplicate each 32-bit result so
 * that it spans a 64-bit lane. Padding entries are clamped to atom index 0
 * for the coordinate loads, and their potential and force contributions are
 * cleared with andnot(dummy_mask, ...) in every site-pair block below.
 * NOTE(review): the condition guarding this section is not visible in this
 * listing. */
542 /* Get j neighbor index, and coordinate index */
543 jnrlistA = jjnr[jidx];
544 jnrlistB = jjnr[jidx+1];
545 jnrlistC = jjnr[jidx+2];
546 jnrlistD = jjnr[jidx+3];
547 /* Sign of each element will be negative for non-real atoms.
548 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
549 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
551 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
553 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
554 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
555 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
557 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
558 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
559 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
560 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
561 j_coord_offsetA = DIM*jnrA;
562 j_coord_offsetB = DIM*jnrB;
563 j_coord_offsetC = DIM*jnrC;
564 j_coord_offsetD = DIM*jnrD;
566 /* load j atom coordinates */
567 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
568 x+j_coord_offsetC,x+j_coord_offsetD,
569 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
571 /* Calculate displacement vector */
572 dx00 = _mm256_sub_pd(ix0,jx0);
573 dy00 = _mm256_sub_pd(iy0,jy0);
574 dz00 = _mm256_sub_pd(iz0,jz0);
575 dx01 = _mm256_sub_pd(ix0,jx1);
576 dy01 = _mm256_sub_pd(iy0,jy1);
577 dz01 = _mm256_sub_pd(iz0,jz1);
578 dx02 = _mm256_sub_pd(ix0,jx2);
579 dy02 = _mm256_sub_pd(iy0,jy2);
580 dz02 = _mm256_sub_pd(iz0,jz2);
581 dx10 = _mm256_sub_pd(ix1,jx0);
582 dy10 = _mm256_sub_pd(iy1,jy0);
583 dz10 = _mm256_sub_pd(iz1,jz0);
584 dx11 = _mm256_sub_pd(ix1,jx1);
585 dy11 = _mm256_sub_pd(iy1,jy1);
586 dz11 = _mm256_sub_pd(iz1,jz1);
587 dx12 = _mm256_sub_pd(ix1,jx2);
588 dy12 = _mm256_sub_pd(iy1,jy2);
589 dz12 = _mm256_sub_pd(iz1,jz2);
590 dx20 = _mm256_sub_pd(ix2,jx0);
591 dy20 = _mm256_sub_pd(iy2,jy0);
592 dz20 = _mm256_sub_pd(iz2,jz0);
593 dx21 = _mm256_sub_pd(ix2,jx1);
594 dy21 = _mm256_sub_pd(iy2,jy1);
595 dz21 = _mm256_sub_pd(iz2,jz1);
596 dx22 = _mm256_sub_pd(ix2,jx2);
597 dy22 = _mm256_sub_pd(iy2,jy2);
598 dz22 = _mm256_sub_pd(iz2,jz2);
600 /* Calculate squared distance and things based on it */
601 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
602 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
603 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
604 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
605 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
606 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
607 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
608 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
609 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
611 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
612 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
613 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
614 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
615 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
616 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
617 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
618 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
619 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
621 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
622 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
623 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
624 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
625 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
626 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
627 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
628 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
629 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
631 fjx0 = _mm256_setzero_pd();
632 fjy0 = _mm256_setzero_pd();
633 fjz0 = _mm256_setzero_pd();
634 fjx1 = _mm256_setzero_pd();
635 fjy1 = _mm256_setzero_pd();
636 fjz1 = _mm256_setzero_pd();
637 fjx2 = _mm256_setzero_pd();
638 fjy2 = _mm256_setzero_pd();
639 fjz2 = _mm256_setzero_pd();
641 /**************************
642 * CALCULATE INTERACTIONS *
643 **************************/
645 /* REACTION-FIELD ELECTROSTATICS */
646 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
647 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
649 /* Update potential sum for this i atom from the interaction with this j atom. */
650 velec = _mm256_andnot_pd(dummy_mask,velec);
651 velecsum = _mm256_add_pd(velecsum,velec);
/* Clear the force factor in padding lanes so they add nothing to the i or j
 * force accumulators. */
655 fscal = _mm256_andnot_pd(dummy_mask,fscal);
657 /* Calculate temporary vectorial force */
658 tx = _mm256_mul_pd(fscal,dx00);
659 ty = _mm256_mul_pd(fscal,dy00);
660 tz = _mm256_mul_pd(fscal,dz00);
662 /* Update vectorial force */
663 fix0 = _mm256_add_pd(fix0,tx);
664 fiy0 = _mm256_add_pd(fiy0,ty);
665 fiz0 = _mm256_add_pd(fiz0,tz);
667 fjx0 = _mm256_add_pd(fjx0,tx);
668 fjy0 = _mm256_add_pd(fjy0,ty);
669 fjz0 = _mm256_add_pd(fjz0,tz);
671 /**************************
672 * CALCULATE INTERACTIONS *
673 **************************/
675 /* REACTION-FIELD ELECTROSTATICS */
676 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_add_pd(rinv01,_mm256_mul_pd(krf,rsq01)),crf));
677 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
679 /* Update potential sum for this i atom from the interaction with this j atom. */
680 velec = _mm256_andnot_pd(dummy_mask,velec);
681 velecsum = _mm256_add_pd(velecsum,velec);
685 fscal = _mm256_andnot_pd(dummy_mask,fscal);
687 /* Calculate temporary vectorial force */
688 tx = _mm256_mul_pd(fscal,dx01);
689 ty = _mm256_mul_pd(fscal,dy01);
690 tz = _mm256_mul_pd(fscal,dz01);
692 /* Update vectorial force */
693 fix0 = _mm256_add_pd(fix0,tx);
694 fiy0 = _mm256_add_pd(fiy0,ty);
695 fiz0 = _mm256_add_pd(fiz0,tz);
697 fjx1 = _mm256_add_pd(fjx1,tx);
698 fjy1 = _mm256_add_pd(fjy1,ty);
699 fjz1 = _mm256_add_pd(fjz1,tz);
701 /**************************
702 * CALCULATE INTERACTIONS *
703 **************************/
705 /* REACTION-FIELD ELECTROSTATICS */
706 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_add_pd(rinv02,_mm256_mul_pd(krf,rsq02)),crf));
707 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
709 /* Update potential sum for this i atom from the interaction with this j atom. */
710 velec = _mm256_andnot_pd(dummy_mask,velec);
711 velecsum = _mm256_add_pd(velecsum,velec);
715 fscal = _mm256_andnot_pd(dummy_mask,fscal);
717 /* Calculate temporary vectorial force */
718 tx = _mm256_mul_pd(fscal,dx02);
719 ty = _mm256_mul_pd(fscal,dy02);
720 tz = _mm256_mul_pd(fscal,dz02);
722 /* Update vectorial force */
723 fix0 = _mm256_add_pd(fix0,tx);
724 fiy0 = _mm256_add_pd(fiy0,ty);
725 fiz0 = _mm256_add_pd(fiz0,tz);
727 fjx2 = _mm256_add_pd(fjx2,tx);
728 fjy2 = _mm256_add_pd(fjy2,ty);
729 fjz2 = _mm256_add_pd(fjz2,tz);
731 /**************************
732 * CALCULATE INTERACTIONS *
733 **************************/
735 /* REACTION-FIELD ELECTROSTATICS */
736 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
737 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
739 /* Update potential sum for this i atom from the interaction with this j atom. */
740 velec = _mm256_andnot_pd(dummy_mask,velec);
741 velecsum = _mm256_add_pd(velecsum,velec);
745 fscal = _mm256_andnot_pd(dummy_mask,fscal);
747 /* Calculate temporary vectorial force */
748 tx = _mm256_mul_pd(fscal,dx10);
749 ty = _mm256_mul_pd(fscal,dy10);
750 tz = _mm256_mul_pd(fscal,dz10);
752 /* Update vectorial force */
753 fix1 = _mm256_add_pd(fix1,tx);
754 fiy1 = _mm256_add_pd(fiy1,ty);
755 fiz1 = _mm256_add_pd(fiz1,tz);
757 fjx0 = _mm256_add_pd(fjx0,tx);
758 fjy0 = _mm256_add_pd(fjy0,ty);
759 fjz0 = _mm256_add_pd(fjz0,tz);
761 /**************************
762 * CALCULATE INTERACTIONS *
763 **************************/
765 /* REACTION-FIELD ELECTROSTATICS */
766 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_add_pd(rinv11,_mm256_mul_pd(krf,rsq11)),crf));
767 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
769 /* Update potential sum for this i atom from the interaction with this j atom. */
770 velec = _mm256_andnot_pd(dummy_mask,velec);
771 velecsum = _mm256_add_pd(velecsum,velec);
775 fscal = _mm256_andnot_pd(dummy_mask,fscal);
777 /* Calculate temporary vectorial force */
778 tx = _mm256_mul_pd(fscal,dx11);
779 ty = _mm256_mul_pd(fscal,dy11);
780 tz = _mm256_mul_pd(fscal,dz11);
782 /* Update vectorial force */
783 fix1 = _mm256_add_pd(fix1,tx);
784 fiy1 = _mm256_add_pd(fiy1,ty);
785 fiz1 = _mm256_add_pd(fiz1,tz);
787 fjx1 = _mm256_add_pd(fjx1,tx);
788 fjy1 = _mm256_add_pd(fjy1,ty);
789 fjz1 = _mm256_add_pd(fjz1,tz);
791 /**************************
792 * CALCULATE INTERACTIONS *
793 **************************/
795 /* REACTION-FIELD ELECTROSTATICS */
796 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_add_pd(rinv12,_mm256_mul_pd(krf,rsq12)),crf));
797 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
799 /* Update potential sum for this i atom from the interaction with this j atom. */
800 velec = _mm256_andnot_pd(dummy_mask,velec);
801 velecsum = _mm256_add_pd(velecsum,velec);
805 fscal = _mm256_andnot_pd(dummy_mask,fscal);
807 /* Calculate temporary vectorial force */
808 tx = _mm256_mul_pd(fscal,dx12);
809 ty = _mm256_mul_pd(fscal,dy12);
810 tz = _mm256_mul_pd(fscal,dz12);
812 /* Update vectorial force */
813 fix1 = _mm256_add_pd(fix1,tx);
814 fiy1 = _mm256_add_pd(fiy1,ty);
815 fiz1 = _mm256_add_pd(fiz1,tz);
817 fjx2 = _mm256_add_pd(fjx2,tx);
818 fjy2 = _mm256_add_pd(fjy2,ty);
819 fjz2 = _mm256_add_pd(fjz2,tz);
821 /**************************
822 * CALCULATE INTERACTIONS *
823 **************************/
825 /* REACTION-FIELD ELECTROSTATICS */
826 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
827 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
829 /* Update potential sum for this i atom from the interaction with this j atom. */
830 velec = _mm256_andnot_pd(dummy_mask,velec);
831 velecsum = _mm256_add_pd(velecsum,velec);
835 fscal = _mm256_andnot_pd(dummy_mask,fscal);
837 /* Calculate temporary vectorial force */
838 tx = _mm256_mul_pd(fscal,dx20);
839 ty = _mm256_mul_pd(fscal,dy20);
840 tz = _mm256_mul_pd(fscal,dz20);
842 /* Update vectorial force */
843 fix2 = _mm256_add_pd(fix2,tx);
844 fiy2 = _mm256_add_pd(fiy2,ty);
845 fiz2 = _mm256_add_pd(fiz2,tz);
847 fjx0 = _mm256_add_pd(fjx0,tx);
848 fjy0 = _mm256_add_pd(fjy0,ty);
849 fjz0 = _mm256_add_pd(fjz0,tz);
851 /**************************
852 * CALCULATE INTERACTIONS *
853 **************************/
855 /* REACTION-FIELD ELECTROSTATICS */
856 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_add_pd(rinv21,_mm256_mul_pd(krf,rsq21)),crf));
857 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
859 /* Update potential sum for this i atom from the interaction with this j atom. */
860 velec = _mm256_andnot_pd(dummy_mask,velec);
861 velecsum = _mm256_add_pd(velecsum,velec);
865 fscal = _mm256_andnot_pd(dummy_mask,fscal);
867 /* Calculate temporary vectorial force */
868 tx = _mm256_mul_pd(fscal,dx21);
869 ty = _mm256_mul_pd(fscal,dy21);
870 tz = _mm256_mul_pd(fscal,dz21);
872 /* Update vectorial force */
873 fix2 = _mm256_add_pd(fix2,tx);
874 fiy2 = _mm256_add_pd(fiy2,ty);
875 fiz2 = _mm256_add_pd(fiz2,tz);
877 fjx1 = _mm256_add_pd(fjx1,tx);
878 fjy1 = _mm256_add_pd(fjy1,ty);
879 fjz1 = _mm256_add_pd(fjz1,tz);
881 /**************************
882 * CALCULATE INTERACTIONS *
883 **************************/
885 /* REACTION-FIELD ELECTROSTATICS */
886 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_add_pd(rinv22,_mm256_mul_pd(krf,rsq22)),crf));
887 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
889 /* Update potential sum for this i atom from the interaction with this j atom. */
890 velec = _mm256_andnot_pd(dummy_mask,velec);
891 velecsum = _mm256_add_pd(velecsum,velec);
895 fscal = _mm256_andnot_pd(dummy_mask,fscal);
897 /* Calculate temporary vectorial force */
898 tx = _mm256_mul_pd(fscal,dx22);
899 ty = _mm256_mul_pd(fscal,dy22);
900 tz = _mm256_mul_pd(fscal,dz22);
902 /* Update vectorial force */
903 fix2 = _mm256_add_pd(fix2,tx);
904 fiy2 = _mm256_add_pd(fiy2,ty);
905 fiz2 = _mm256_add_pd(fiz2,tz);
907 fjx2 = _mm256_add_pd(fjx2,tx);
908 fjy2 = _mm256_add_pd(fjy2,ty);
909 fjz2 = _mm256_add_pd(fjz2,tz);
/* Lanes whose list entry is negative are pointed at scratch instead of f, so
 * the helper's write for a padding lane does not land in the force array. */
911 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
912 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
913 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
914 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
916 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
917 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
919 /* Inner loop uses 288 flops */
922 /* End of innermost loop */
/* Pass the nine i-site force accumulators to the update helper together with
 * the i water's slot in f and this list's slot in fshift. */
924 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
925 f+i_coord_offset,fshift+i_shift_offset);
928 /* Update potential energies */
929 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
931 /* Increment number of inner iterations */
932 inneriter += j_index_end - j_index_start;
934 /* Outer loop uses 19 flops */
937 /* Increment number of outer iterations */
/* Flop estimate handed to inc_nrnb: 19 per outer iteration plus 288 per j
 * list entry counted in inneriter. */
940 /* Update outer/inner flops */
942 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*288);
945 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_256_double
946 * Electrostatics interaction: ReactionField
947 * VdW interaction: None
948 * Geometry: Water3-Water3
949 * Calculate force/pot: Force
952 nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_256_double
953 (t_nblist * gmx_restrict nlist,
954 rvec * gmx_restrict xx,
955 rvec * gmx_restrict ff,
956 t_forcerec * gmx_restrict fr,
957 t_mdatoms * gmx_restrict mdatoms,
958 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
959 t_nrnb * gmx_restrict nrnb)
961 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
962 * just 0 for non-waters.
963 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
964 * jnr indices corresponding to data put in the four positions in the SIMD register.
966 int i_shift_offset,i_coord_offset,outeriter,inneriter;
967 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
968 int jnrA,jnrB,jnrC,jnrD;
969 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
970 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
971 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
972 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
974 real *shiftvec,*fshift,*x,*f;
975 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
977 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
978 real * vdwioffsetptr0;
979 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
980 real * vdwioffsetptr1;
981 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
982 real * vdwioffsetptr2;
983 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
984 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
985 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
986 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
987 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
988 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
989 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
990 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
991 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
992 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
993 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
994 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
995 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
996 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
997 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
998 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
999 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1001 __m256d dummy_mask,cutoff_mask;
1002 __m128 tmpmask0,tmpmask1;
1003 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1004 __m256d one = _mm256_set1_pd(1.0);
1005 __m256d two = _mm256_set1_pd(2.0);
1011 jindex = nlist->jindex;
1013 shiftidx = nlist->shift;
1015 shiftvec = fr->shift_vec[0];
1016 fshift = fr->fshift[0];
1017 facel = _mm256_set1_pd(fr->epsfac);
1018 charge = mdatoms->chargeA;
1019 krf = _mm256_set1_pd(fr->ic->k_rf);
1020 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
1021 crf = _mm256_set1_pd(fr->ic->c_rf);
1023 /* Setup water-specific parameters */
1024 inr = nlist->iinr[0];
1025 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1026 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1027 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1029 jq0 = _mm256_set1_pd(charge[inr+0]);
1030 jq1 = _mm256_set1_pd(charge[inr+1]);
1031 jq2 = _mm256_set1_pd(charge[inr+2]);
1032 qq00 = _mm256_mul_pd(iq0,jq0);
1033 qq01 = _mm256_mul_pd(iq0,jq1);
1034 qq02 = _mm256_mul_pd(iq0,jq2);
1035 qq10 = _mm256_mul_pd(iq1,jq0);
1036 qq11 = _mm256_mul_pd(iq1,jq1);
1037 qq12 = _mm256_mul_pd(iq1,jq2);
1038 qq20 = _mm256_mul_pd(iq2,jq0);
1039 qq21 = _mm256_mul_pd(iq2,jq1);
1040 qq22 = _mm256_mul_pd(iq2,jq2);
1042 /* Zero-initialize the j indices and offsets to avoid spurious uninitialized-variable compiler warnings */
1043 jnrA = jnrB = jnrC = jnrD = 0;
1044 j_coord_offsetA = 0;
1045 j_coord_offsetB = 0;
1046 j_coord_offsetC = 0;
1047 j_coord_offsetD = 0;
1052 for(iidx=0;iidx<4*DIM;iidx++)
1054 scratch[iidx] = 0.0;
1057 /* Start outer loop over neighborlists */
1058 for(iidx=0; iidx<nri; iidx++)
1060 /* Load shift vector for this list */
1061 i_shift_offset = DIM*shiftidx[iidx];
1063 /* Load limits for loop over neighbors */
1064 j_index_start = jindex[iidx];
1065 j_index_end = jindex[iidx+1];
1067 /* Get outer coordinate index */
1069 i_coord_offset = DIM*inr;
1071 /* Load i particle coords and add shift vector */
1072 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1073 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1075 fix0 = _mm256_setzero_pd();
1076 fiy0 = _mm256_setzero_pd();
1077 fiz0 = _mm256_setzero_pd();
1078 fix1 = _mm256_setzero_pd();
1079 fiy1 = _mm256_setzero_pd();
1080 fiz1 = _mm256_setzero_pd();
1081 fix2 = _mm256_setzero_pd();
1082 fiy2 = _mm256_setzero_pd();
1083 fiz2 = _mm256_setzero_pd();
1085 /* Start inner kernel loop */
1086 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1089 /* Get j neighbor index, and coordinate index */
1091 jnrB = jjnr[jidx+1];
1092 jnrC = jjnr[jidx+2];
1093 jnrD = jjnr[jidx+3];
1094 j_coord_offsetA = DIM*jnrA;
1095 j_coord_offsetB = DIM*jnrB;
1096 j_coord_offsetC = DIM*jnrC;
1097 j_coord_offsetD = DIM*jnrD;
1099 /* load j atom coordinates */
1100 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1101 x+j_coord_offsetC,x+j_coord_offsetD,
1102 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1104 /* Calculate displacement vector */
1105 dx00 = _mm256_sub_pd(ix0,jx0);
1106 dy00 = _mm256_sub_pd(iy0,jy0);
1107 dz00 = _mm256_sub_pd(iz0,jz0);
1108 dx01 = _mm256_sub_pd(ix0,jx1);
1109 dy01 = _mm256_sub_pd(iy0,jy1);
1110 dz01 = _mm256_sub_pd(iz0,jz1);
1111 dx02 = _mm256_sub_pd(ix0,jx2);
1112 dy02 = _mm256_sub_pd(iy0,jy2);
1113 dz02 = _mm256_sub_pd(iz0,jz2);
1114 dx10 = _mm256_sub_pd(ix1,jx0);
1115 dy10 = _mm256_sub_pd(iy1,jy0);
1116 dz10 = _mm256_sub_pd(iz1,jz0);
1117 dx11 = _mm256_sub_pd(ix1,jx1);
1118 dy11 = _mm256_sub_pd(iy1,jy1);
1119 dz11 = _mm256_sub_pd(iz1,jz1);
1120 dx12 = _mm256_sub_pd(ix1,jx2);
1121 dy12 = _mm256_sub_pd(iy1,jy2);
1122 dz12 = _mm256_sub_pd(iz1,jz2);
1123 dx20 = _mm256_sub_pd(ix2,jx0);
1124 dy20 = _mm256_sub_pd(iy2,jy0);
1125 dz20 = _mm256_sub_pd(iz2,jz0);
1126 dx21 = _mm256_sub_pd(ix2,jx1);
1127 dy21 = _mm256_sub_pd(iy2,jy1);
1128 dz21 = _mm256_sub_pd(iz2,jz1);
1129 dx22 = _mm256_sub_pd(ix2,jx2);
1130 dy22 = _mm256_sub_pd(iy2,jy2);
1131 dz22 = _mm256_sub_pd(iz2,jz2);
1133 /* Calculate squared distance and things based on it */
1134 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1135 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1136 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1137 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1138 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1139 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1140 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1141 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1142 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1144 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1145 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1146 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1147 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1148 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1149 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1150 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1151 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1152 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1154 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1155 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1156 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1157 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1158 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1159 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1160 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1161 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1162 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1164 fjx0 = _mm256_setzero_pd();
1165 fjy0 = _mm256_setzero_pd();
1166 fjz0 = _mm256_setzero_pd();
1167 fjx1 = _mm256_setzero_pd();
1168 fjy1 = _mm256_setzero_pd();
1169 fjz1 = _mm256_setzero_pd();
1170 fjx2 = _mm256_setzero_pd();
1171 fjy2 = _mm256_setzero_pd();
1172 fjz2 = _mm256_setzero_pd();
1174 /**************************
1175 * CALCULATE INTERACTIONS *
1176 **************************/
1178 /* REACTION-FIELD ELECTROSTATICS */
1179 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1183 /* Calculate temporary vectorial force */
1184 tx = _mm256_mul_pd(fscal,dx00);
1185 ty = _mm256_mul_pd(fscal,dy00);
1186 tz = _mm256_mul_pd(fscal,dz00);
1188 /* Update vectorial force */
1189 fix0 = _mm256_add_pd(fix0,tx);
1190 fiy0 = _mm256_add_pd(fiy0,ty);
1191 fiz0 = _mm256_add_pd(fiz0,tz);
1193 fjx0 = _mm256_add_pd(fjx0,tx);
1194 fjy0 = _mm256_add_pd(fjy0,ty);
1195 fjz0 = _mm256_add_pd(fjz0,tz);
1197 /**************************
1198 * CALCULATE INTERACTIONS *
1199 **************************/
1201 /* REACTION-FIELD ELECTROSTATICS */
1202 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1206 /* Calculate temporary vectorial force */
1207 tx = _mm256_mul_pd(fscal,dx01);
1208 ty = _mm256_mul_pd(fscal,dy01);
1209 tz = _mm256_mul_pd(fscal,dz01);
1211 /* Update vectorial force */
1212 fix0 = _mm256_add_pd(fix0,tx);
1213 fiy0 = _mm256_add_pd(fiy0,ty);
1214 fiz0 = _mm256_add_pd(fiz0,tz);
1216 fjx1 = _mm256_add_pd(fjx1,tx);
1217 fjy1 = _mm256_add_pd(fjy1,ty);
1218 fjz1 = _mm256_add_pd(fjz1,tz);
1220 /**************************
1221 * CALCULATE INTERACTIONS *
1222 **************************/
1224 /* REACTION-FIELD ELECTROSTATICS */
1225 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1229 /* Calculate temporary vectorial force */
1230 tx = _mm256_mul_pd(fscal,dx02);
1231 ty = _mm256_mul_pd(fscal,dy02);
1232 tz = _mm256_mul_pd(fscal,dz02);
1234 /* Update vectorial force */
1235 fix0 = _mm256_add_pd(fix0,tx);
1236 fiy0 = _mm256_add_pd(fiy0,ty);
1237 fiz0 = _mm256_add_pd(fiz0,tz);
1239 fjx2 = _mm256_add_pd(fjx2,tx);
1240 fjy2 = _mm256_add_pd(fjy2,ty);
1241 fjz2 = _mm256_add_pd(fjz2,tz);
1243 /**************************
1244 * CALCULATE INTERACTIONS *
1245 **************************/
1247 /* REACTION-FIELD ELECTROSTATICS */
1248 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1252 /* Calculate temporary vectorial force */
1253 tx = _mm256_mul_pd(fscal,dx10);
1254 ty = _mm256_mul_pd(fscal,dy10);
1255 tz = _mm256_mul_pd(fscal,dz10);
1257 /* Update vectorial force */
1258 fix1 = _mm256_add_pd(fix1,tx);
1259 fiy1 = _mm256_add_pd(fiy1,ty);
1260 fiz1 = _mm256_add_pd(fiz1,tz);
1262 fjx0 = _mm256_add_pd(fjx0,tx);
1263 fjy0 = _mm256_add_pd(fjy0,ty);
1264 fjz0 = _mm256_add_pd(fjz0,tz);
1266 /**************************
1267 * CALCULATE INTERACTIONS *
1268 **************************/
1270 /* REACTION-FIELD ELECTROSTATICS */
1271 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1275 /* Calculate temporary vectorial force */
1276 tx = _mm256_mul_pd(fscal,dx11);
1277 ty = _mm256_mul_pd(fscal,dy11);
1278 tz = _mm256_mul_pd(fscal,dz11);
1280 /* Update vectorial force */
1281 fix1 = _mm256_add_pd(fix1,tx);
1282 fiy1 = _mm256_add_pd(fiy1,ty);
1283 fiz1 = _mm256_add_pd(fiz1,tz);
1285 fjx1 = _mm256_add_pd(fjx1,tx);
1286 fjy1 = _mm256_add_pd(fjy1,ty);
1287 fjz1 = _mm256_add_pd(fjz1,tz);
1289 /**************************
1290 * CALCULATE INTERACTIONS *
1291 **************************/
1293 /* REACTION-FIELD ELECTROSTATICS */
1294 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1298 /* Calculate temporary vectorial force */
1299 tx = _mm256_mul_pd(fscal,dx12);
1300 ty = _mm256_mul_pd(fscal,dy12);
1301 tz = _mm256_mul_pd(fscal,dz12);
1303 /* Update vectorial force */
1304 fix1 = _mm256_add_pd(fix1,tx);
1305 fiy1 = _mm256_add_pd(fiy1,ty);
1306 fiz1 = _mm256_add_pd(fiz1,tz);
1308 fjx2 = _mm256_add_pd(fjx2,tx);
1309 fjy2 = _mm256_add_pd(fjy2,ty);
1310 fjz2 = _mm256_add_pd(fjz2,tz);
1312 /**************************
1313 * CALCULATE INTERACTIONS *
1314 **************************/
1316 /* REACTION-FIELD ELECTROSTATICS */
1317 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1321 /* Calculate temporary vectorial force */
1322 tx = _mm256_mul_pd(fscal,dx20);
1323 ty = _mm256_mul_pd(fscal,dy20);
1324 tz = _mm256_mul_pd(fscal,dz20);
1326 /* Update vectorial force */
1327 fix2 = _mm256_add_pd(fix2,tx);
1328 fiy2 = _mm256_add_pd(fiy2,ty);
1329 fiz2 = _mm256_add_pd(fiz2,tz);
1331 fjx0 = _mm256_add_pd(fjx0,tx);
1332 fjy0 = _mm256_add_pd(fjy0,ty);
1333 fjz0 = _mm256_add_pd(fjz0,tz);
1335 /**************************
1336 * CALCULATE INTERACTIONS *
1337 **************************/
1339 /* REACTION-FIELD ELECTROSTATICS */
1340 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1344 /* Calculate temporary vectorial force */
1345 tx = _mm256_mul_pd(fscal,dx21);
1346 ty = _mm256_mul_pd(fscal,dy21);
1347 tz = _mm256_mul_pd(fscal,dz21);
1349 /* Update vectorial force */
1350 fix2 = _mm256_add_pd(fix2,tx);
1351 fiy2 = _mm256_add_pd(fiy2,ty);
1352 fiz2 = _mm256_add_pd(fiz2,tz);
1354 fjx1 = _mm256_add_pd(fjx1,tx);
1355 fjy1 = _mm256_add_pd(fjy1,ty);
1356 fjz1 = _mm256_add_pd(fjz1,tz);
1358 /**************************
1359 * CALCULATE INTERACTIONS *
1360 **************************/
1362 /* REACTION-FIELD ELECTROSTATICS */
1363 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1367 /* Calculate temporary vectorial force */
1368 tx = _mm256_mul_pd(fscal,dx22);
1369 ty = _mm256_mul_pd(fscal,dy22);
1370 tz = _mm256_mul_pd(fscal,dz22);
1372 /* Update vectorial force */
1373 fix2 = _mm256_add_pd(fix2,tx);
1374 fiy2 = _mm256_add_pd(fiy2,ty);
1375 fiz2 = _mm256_add_pd(fiz2,tz);
1377 fjx2 = _mm256_add_pd(fjx2,tx);
1378 fjy2 = _mm256_add_pd(fjy2,ty);
1379 fjz2 = _mm256_add_pd(fjz2,tz);
1381 fjptrA = f+j_coord_offsetA;
1382 fjptrB = f+j_coord_offsetB;
1383 fjptrC = f+j_coord_offsetC;
1384 fjptrD = f+j_coord_offsetD;
1386 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1387 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1389 /* Inner loop uses 243 flops */
1392 if(jidx<j_index_end)
1395 /* Get j neighbor index, and coordinate index */
1396 jnrlistA = jjnr[jidx];
1397 jnrlistB = jjnr[jidx+1];
1398 jnrlistC = jjnr[jidx+2];
1399 jnrlistD = jjnr[jidx+3];
1400 /* Sign of each element will be negative for non-real atoms.
1401 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1402 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1404 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1406 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1407 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1408 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1410 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1411 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1412 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1413 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1414 j_coord_offsetA = DIM*jnrA;
1415 j_coord_offsetB = DIM*jnrB;
1416 j_coord_offsetC = DIM*jnrC;
1417 j_coord_offsetD = DIM*jnrD;
1419 /* load j atom coordinates */
1420 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1421 x+j_coord_offsetC,x+j_coord_offsetD,
1422 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1424 /* Calculate displacement vector */
1425 dx00 = _mm256_sub_pd(ix0,jx0);
1426 dy00 = _mm256_sub_pd(iy0,jy0);
1427 dz00 = _mm256_sub_pd(iz0,jz0);
1428 dx01 = _mm256_sub_pd(ix0,jx1);
1429 dy01 = _mm256_sub_pd(iy0,jy1);
1430 dz01 = _mm256_sub_pd(iz0,jz1);
1431 dx02 = _mm256_sub_pd(ix0,jx2);
1432 dy02 = _mm256_sub_pd(iy0,jy2);
1433 dz02 = _mm256_sub_pd(iz0,jz2);
1434 dx10 = _mm256_sub_pd(ix1,jx0);
1435 dy10 = _mm256_sub_pd(iy1,jy0);
1436 dz10 = _mm256_sub_pd(iz1,jz0);
1437 dx11 = _mm256_sub_pd(ix1,jx1);
1438 dy11 = _mm256_sub_pd(iy1,jy1);
1439 dz11 = _mm256_sub_pd(iz1,jz1);
1440 dx12 = _mm256_sub_pd(ix1,jx2);
1441 dy12 = _mm256_sub_pd(iy1,jy2);
1442 dz12 = _mm256_sub_pd(iz1,jz2);
1443 dx20 = _mm256_sub_pd(ix2,jx0);
1444 dy20 = _mm256_sub_pd(iy2,jy0);
1445 dz20 = _mm256_sub_pd(iz2,jz0);
1446 dx21 = _mm256_sub_pd(ix2,jx1);
1447 dy21 = _mm256_sub_pd(iy2,jy1);
1448 dz21 = _mm256_sub_pd(iz2,jz1);
1449 dx22 = _mm256_sub_pd(ix2,jx2);
1450 dy22 = _mm256_sub_pd(iy2,jy2);
1451 dz22 = _mm256_sub_pd(iz2,jz2);
1453 /* Calculate squared distance and things based on it */
1454 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1455 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1456 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1457 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1458 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1459 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1460 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1461 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1462 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1464 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1465 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1466 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1467 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1468 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1469 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1470 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1471 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1472 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1474 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1475 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1476 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1477 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1478 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1479 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1480 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1481 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1482 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1484 fjx0 = _mm256_setzero_pd();
1485 fjy0 = _mm256_setzero_pd();
1486 fjz0 = _mm256_setzero_pd();
1487 fjx1 = _mm256_setzero_pd();
1488 fjy1 = _mm256_setzero_pd();
1489 fjz1 = _mm256_setzero_pd();
1490 fjx2 = _mm256_setzero_pd();
1491 fjy2 = _mm256_setzero_pd();
1492 fjz2 = _mm256_setzero_pd();
1494 /**************************
1495 * CALCULATE INTERACTIONS *
1496 **************************/
1498 /* REACTION-FIELD ELECTROSTATICS */
1499 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
1503 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1505 /* Calculate temporary vectorial force */
1506 tx = _mm256_mul_pd(fscal,dx00);
1507 ty = _mm256_mul_pd(fscal,dy00);
1508 tz = _mm256_mul_pd(fscal,dz00);
1510 /* Update vectorial force */
1511 fix0 = _mm256_add_pd(fix0,tx);
1512 fiy0 = _mm256_add_pd(fiy0,ty);
1513 fiz0 = _mm256_add_pd(fiz0,tz);
1515 fjx0 = _mm256_add_pd(fjx0,tx);
1516 fjy0 = _mm256_add_pd(fjy0,ty);
1517 fjz0 = _mm256_add_pd(fjz0,tz);
1519 /**************************
1520 * CALCULATE INTERACTIONS *
1521 **************************/
1523 /* REACTION-FIELD ELECTROSTATICS */
1524 felec = _mm256_mul_pd(qq01,_mm256_sub_pd(_mm256_mul_pd(rinv01,rinvsq01),krf2));
1528 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1530 /* Calculate temporary vectorial force */
1531 tx = _mm256_mul_pd(fscal,dx01);
1532 ty = _mm256_mul_pd(fscal,dy01);
1533 tz = _mm256_mul_pd(fscal,dz01);
1535 /* Update vectorial force */
1536 fix0 = _mm256_add_pd(fix0,tx);
1537 fiy0 = _mm256_add_pd(fiy0,ty);
1538 fiz0 = _mm256_add_pd(fiz0,tz);
1540 fjx1 = _mm256_add_pd(fjx1,tx);
1541 fjy1 = _mm256_add_pd(fjy1,ty);
1542 fjz1 = _mm256_add_pd(fjz1,tz);
1544 /**************************
1545 * CALCULATE INTERACTIONS *
1546 **************************/
1548 /* REACTION-FIELD ELECTROSTATICS */
1549 felec = _mm256_mul_pd(qq02,_mm256_sub_pd(_mm256_mul_pd(rinv02,rinvsq02),krf2));
1553 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1555 /* Calculate temporary vectorial force */
1556 tx = _mm256_mul_pd(fscal,dx02);
1557 ty = _mm256_mul_pd(fscal,dy02);
1558 tz = _mm256_mul_pd(fscal,dz02);
1560 /* Update vectorial force */
1561 fix0 = _mm256_add_pd(fix0,tx);
1562 fiy0 = _mm256_add_pd(fiy0,ty);
1563 fiz0 = _mm256_add_pd(fiz0,tz);
1565 fjx2 = _mm256_add_pd(fjx2,tx);
1566 fjy2 = _mm256_add_pd(fjy2,ty);
1567 fjz2 = _mm256_add_pd(fjz2,tz);
1569 /**************************
1570 * CALCULATE INTERACTIONS *
1571 **************************/
1573 /* REACTION-FIELD ELECTROSTATICS */
1574 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1578 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1580 /* Calculate temporary vectorial force */
1581 tx = _mm256_mul_pd(fscal,dx10);
1582 ty = _mm256_mul_pd(fscal,dy10);
1583 tz = _mm256_mul_pd(fscal,dz10);
1585 /* Update vectorial force */
1586 fix1 = _mm256_add_pd(fix1,tx);
1587 fiy1 = _mm256_add_pd(fiy1,ty);
1588 fiz1 = _mm256_add_pd(fiz1,tz);
1590 fjx0 = _mm256_add_pd(fjx0,tx);
1591 fjy0 = _mm256_add_pd(fjy0,ty);
1592 fjz0 = _mm256_add_pd(fjz0,tz);
1594 /**************************
1595 * CALCULATE INTERACTIONS *
1596 **************************/
1598 /* REACTION-FIELD ELECTROSTATICS */
1599 felec = _mm256_mul_pd(qq11,_mm256_sub_pd(_mm256_mul_pd(rinv11,rinvsq11),krf2));
1603 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1605 /* Calculate temporary vectorial force */
1606 tx = _mm256_mul_pd(fscal,dx11);
1607 ty = _mm256_mul_pd(fscal,dy11);
1608 tz = _mm256_mul_pd(fscal,dz11);
1610 /* Update vectorial force */
1611 fix1 = _mm256_add_pd(fix1,tx);
1612 fiy1 = _mm256_add_pd(fiy1,ty);
1613 fiz1 = _mm256_add_pd(fiz1,tz);
1615 fjx1 = _mm256_add_pd(fjx1,tx);
1616 fjy1 = _mm256_add_pd(fjy1,ty);
1617 fjz1 = _mm256_add_pd(fjz1,tz);
1619 /**************************
1620 * CALCULATE INTERACTIONS *
1621 **************************/
1623 /* REACTION-FIELD ELECTROSTATICS */
1624 felec = _mm256_mul_pd(qq12,_mm256_sub_pd(_mm256_mul_pd(rinv12,rinvsq12),krf2));
1628 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1630 /* Calculate temporary vectorial force */
1631 tx = _mm256_mul_pd(fscal,dx12);
1632 ty = _mm256_mul_pd(fscal,dy12);
1633 tz = _mm256_mul_pd(fscal,dz12);
1635 /* Update vectorial force */
1636 fix1 = _mm256_add_pd(fix1,tx);
1637 fiy1 = _mm256_add_pd(fiy1,ty);
1638 fiz1 = _mm256_add_pd(fiz1,tz);
1640 fjx2 = _mm256_add_pd(fjx2,tx);
1641 fjy2 = _mm256_add_pd(fjy2,ty);
1642 fjz2 = _mm256_add_pd(fjz2,tz);
1644 /**************************
1645 * CALCULATE INTERACTIONS *
1646 **************************/
1648 /* REACTION-FIELD ELECTROSTATICS */
1649 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1653 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1655 /* Calculate temporary vectorial force */
1656 tx = _mm256_mul_pd(fscal,dx20);
1657 ty = _mm256_mul_pd(fscal,dy20);
1658 tz = _mm256_mul_pd(fscal,dz20);
1660 /* Update vectorial force */
1661 fix2 = _mm256_add_pd(fix2,tx);
1662 fiy2 = _mm256_add_pd(fiy2,ty);
1663 fiz2 = _mm256_add_pd(fiz2,tz);
1665 fjx0 = _mm256_add_pd(fjx0,tx);
1666 fjy0 = _mm256_add_pd(fjy0,ty);
1667 fjz0 = _mm256_add_pd(fjz0,tz);
1669 /**************************
1670 * CALCULATE INTERACTIONS *
1671 **************************/
1673 /* REACTION-FIELD ELECTROSTATICS */
1674 felec = _mm256_mul_pd(qq21,_mm256_sub_pd(_mm256_mul_pd(rinv21,rinvsq21),krf2));
1678 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1680 /* Calculate temporary vectorial force */
1681 tx = _mm256_mul_pd(fscal,dx21);
1682 ty = _mm256_mul_pd(fscal,dy21);
1683 tz = _mm256_mul_pd(fscal,dz21);
1685 /* Update vectorial force */
1686 fix2 = _mm256_add_pd(fix2,tx);
1687 fiy2 = _mm256_add_pd(fiy2,ty);
1688 fiz2 = _mm256_add_pd(fiz2,tz);
1690 fjx1 = _mm256_add_pd(fjx1,tx);
1691 fjy1 = _mm256_add_pd(fjy1,ty);
1692 fjz1 = _mm256_add_pd(fjz1,tz);
1694 /**************************
1695 * CALCULATE INTERACTIONS *
1696 **************************/
1698 /* REACTION-FIELD ELECTROSTATICS */
1699 felec = _mm256_mul_pd(qq22,_mm256_sub_pd(_mm256_mul_pd(rinv22,rinvsq22),krf2));
1703 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1705 /* Calculate temporary vectorial force */
1706 tx = _mm256_mul_pd(fscal,dx22);
1707 ty = _mm256_mul_pd(fscal,dy22);
1708 tz = _mm256_mul_pd(fscal,dz22);
1710 /* Update vectorial force */
1711 fix2 = _mm256_add_pd(fix2,tx);
1712 fiy2 = _mm256_add_pd(fiy2,ty);
1713 fiz2 = _mm256_add_pd(fiz2,tz);
1715 fjx2 = _mm256_add_pd(fjx2,tx);
1716 fjy2 = _mm256_add_pd(fjy2,ty);
1717 fjz2 = _mm256_add_pd(fjz2,tz);
1719 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1720 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1721 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1722 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1724 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1725 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1727 /* Inner loop uses 243 flops */
1730 /* End of innermost loop */
1732 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1733 f+i_coord_offset,fshift+i_shift_offset);
1735 /* Increment number of inner iterations */
1736 inneriter += j_index_end - j_index_start;
1738 /* Outer loop uses 18 flops */
1741 /* Increment number of outer iterations */
1744 /* Update outer/inner flops */
1746 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*243);