2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_256_double.h"
50 #include "kernelutil_x86_avx_256_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_256_double
54 * Electrostatics interaction: ReactionField
55 * VdW interaction: None
56 * Geometry: Water3-Particle
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecRF_VdwNone_GeomW3P1_VF_avx_256_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
79 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
85 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
96 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
97 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m256d dummy_mask,cutoff_mask;
100 __m128 tmpmask0,tmpmask1;
101 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
102 __m256d one = _mm256_set1_pd(1.0);
103 __m256d two = _mm256_set1_pd(2.0);
109 jindex = nlist->jindex;
111 shiftidx = nlist->shift;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm256_set1_pd(fr->epsfac);
116 charge = mdatoms->chargeA;
117 krf = _mm256_set1_pd(fr->ic->k_rf);
118 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
119 crf = _mm256_set1_pd(fr->ic->c_rf);
121 /* Setup water-specific parameters */
122 inr = nlist->iinr[0];
123 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
124 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
125 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
127 /* Avoid stupid compiler warnings */
128 jnrA = jnrB = jnrC = jnrD = 0;
137 for(iidx=0;iidx<4*DIM;iidx++)
142 /* Start outer loop over neighborlists */
143 for(iidx=0; iidx<nri; iidx++)
145 /* Load shift vector for this list */
146 i_shift_offset = DIM*shiftidx[iidx];
148 /* Load limits for loop over neighbors */
149 j_index_start = jindex[iidx];
150 j_index_end = jindex[iidx+1];
152 /* Get outer coordinate index */
154 i_coord_offset = DIM*inr;
156 /* Load i particle coords and add shift vector */
157 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
158 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
160 fix0 = _mm256_setzero_pd();
161 fiy0 = _mm256_setzero_pd();
162 fiz0 = _mm256_setzero_pd();
163 fix1 = _mm256_setzero_pd();
164 fiy1 = _mm256_setzero_pd();
165 fiz1 = _mm256_setzero_pd();
166 fix2 = _mm256_setzero_pd();
167 fiy2 = _mm256_setzero_pd();
168 fiz2 = _mm256_setzero_pd();
170 /* Reset potential sums */
171 velecsum = _mm256_setzero_pd();
173 /* Start inner kernel loop */
174 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
177 /* Get j neighbor index, and coordinate index */
182 j_coord_offsetA = DIM*jnrA;
183 j_coord_offsetB = DIM*jnrB;
184 j_coord_offsetC = DIM*jnrC;
185 j_coord_offsetD = DIM*jnrD;
187 /* load j atom coordinates */
188 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
189 x+j_coord_offsetC,x+j_coord_offsetD,
192 /* Calculate displacement vector */
193 dx00 = _mm256_sub_pd(ix0,jx0);
194 dy00 = _mm256_sub_pd(iy0,jy0);
195 dz00 = _mm256_sub_pd(iz0,jz0);
196 dx10 = _mm256_sub_pd(ix1,jx0);
197 dy10 = _mm256_sub_pd(iy1,jy0);
198 dz10 = _mm256_sub_pd(iz1,jz0);
199 dx20 = _mm256_sub_pd(ix2,jx0);
200 dy20 = _mm256_sub_pd(iy2,jy0);
201 dz20 = _mm256_sub_pd(iz2,jz0);
203 /* Calculate squared distance and things based on it */
204 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
205 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
206 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
208 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
209 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
210 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
212 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
213 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
214 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
216 /* Load parameters for j particles */
217 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
218 charge+jnrC+0,charge+jnrD+0);
220 fjx0 = _mm256_setzero_pd();
221 fjy0 = _mm256_setzero_pd();
222 fjz0 = _mm256_setzero_pd();
224 /**************************
225 * CALCULATE INTERACTIONS *
226 **************************/
228 /* Compute parameters for interactions between i and j atoms */
229 qq00 = _mm256_mul_pd(iq0,jq0);
231 /* REACTION-FIELD ELECTROSTATICS */
232 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
233 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
235 /* Update potential sum for this i atom from the interaction with this j atom. */
236 velecsum = _mm256_add_pd(velecsum,velec);
240 /* Calculate temporary vectorial force */
241 tx = _mm256_mul_pd(fscal,dx00);
242 ty = _mm256_mul_pd(fscal,dy00);
243 tz = _mm256_mul_pd(fscal,dz00);
245 /* Update vectorial force */
246 fix0 = _mm256_add_pd(fix0,tx);
247 fiy0 = _mm256_add_pd(fiy0,ty);
248 fiz0 = _mm256_add_pd(fiz0,tz);
250 fjx0 = _mm256_add_pd(fjx0,tx);
251 fjy0 = _mm256_add_pd(fjy0,ty);
252 fjz0 = _mm256_add_pd(fjz0,tz);
254 /**************************
255 * CALCULATE INTERACTIONS *
256 **************************/
258 /* Compute parameters for interactions between i and j atoms */
259 qq10 = _mm256_mul_pd(iq1,jq0);
261 /* REACTION-FIELD ELECTROSTATICS */
262 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
263 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
265 /* Update potential sum for this i atom from the interaction with this j atom. */
266 velecsum = _mm256_add_pd(velecsum,velec);
270 /* Calculate temporary vectorial force */
271 tx = _mm256_mul_pd(fscal,dx10);
272 ty = _mm256_mul_pd(fscal,dy10);
273 tz = _mm256_mul_pd(fscal,dz10);
275 /* Update vectorial force */
276 fix1 = _mm256_add_pd(fix1,tx);
277 fiy1 = _mm256_add_pd(fiy1,ty);
278 fiz1 = _mm256_add_pd(fiz1,tz);
280 fjx0 = _mm256_add_pd(fjx0,tx);
281 fjy0 = _mm256_add_pd(fjy0,ty);
282 fjz0 = _mm256_add_pd(fjz0,tz);
284 /**************************
285 * CALCULATE INTERACTIONS *
286 **************************/
288 /* Compute parameters for interactions between i and j atoms */
289 qq20 = _mm256_mul_pd(iq2,jq0);
291 /* REACTION-FIELD ELECTROSTATICS */
292 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
293 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
295 /* Update potential sum for this i atom from the interaction with this j atom. */
296 velecsum = _mm256_add_pd(velecsum,velec);
300 /* Calculate temporary vectorial force */
301 tx = _mm256_mul_pd(fscal,dx20);
302 ty = _mm256_mul_pd(fscal,dy20);
303 tz = _mm256_mul_pd(fscal,dz20);
305 /* Update vectorial force */
306 fix2 = _mm256_add_pd(fix2,tx);
307 fiy2 = _mm256_add_pd(fiy2,ty);
308 fiz2 = _mm256_add_pd(fiz2,tz);
310 fjx0 = _mm256_add_pd(fjx0,tx);
311 fjy0 = _mm256_add_pd(fjy0,ty);
312 fjz0 = _mm256_add_pd(fjz0,tz);
314 fjptrA = f+j_coord_offsetA;
315 fjptrB = f+j_coord_offsetB;
316 fjptrC = f+j_coord_offsetC;
317 fjptrD = f+j_coord_offsetD;
319 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
321 /* Inner loop uses 99 flops */
327 /* Get j neighbor index, and coordinate index */
328 jnrlistA = jjnr[jidx];
329 jnrlistB = jjnr[jidx+1];
330 jnrlistC = jjnr[jidx+2];
331 jnrlistD = jjnr[jidx+3];
332 /* Sign of each element will be negative for non-real atoms.
333 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
334 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
336 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
338 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
339 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
340 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
342 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
343 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
344 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
345 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
346 j_coord_offsetA = DIM*jnrA;
347 j_coord_offsetB = DIM*jnrB;
348 j_coord_offsetC = DIM*jnrC;
349 j_coord_offsetD = DIM*jnrD;
351 /* load j atom coordinates */
352 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
353 x+j_coord_offsetC,x+j_coord_offsetD,
356 /* Calculate displacement vector */
357 dx00 = _mm256_sub_pd(ix0,jx0);
358 dy00 = _mm256_sub_pd(iy0,jy0);
359 dz00 = _mm256_sub_pd(iz0,jz0);
360 dx10 = _mm256_sub_pd(ix1,jx0);
361 dy10 = _mm256_sub_pd(iy1,jy0);
362 dz10 = _mm256_sub_pd(iz1,jz0);
363 dx20 = _mm256_sub_pd(ix2,jx0);
364 dy20 = _mm256_sub_pd(iy2,jy0);
365 dz20 = _mm256_sub_pd(iz2,jz0);
367 /* Calculate squared distance and things based on it */
368 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
369 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
370 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
372 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
373 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
374 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
376 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
377 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
378 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
380 /* Load parameters for j particles */
381 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
382 charge+jnrC+0,charge+jnrD+0);
384 fjx0 = _mm256_setzero_pd();
385 fjy0 = _mm256_setzero_pd();
386 fjz0 = _mm256_setzero_pd();
388 /**************************
389 * CALCULATE INTERACTIONS *
390 **************************/
392 /* Compute parameters for interactions between i and j atoms */
393 qq00 = _mm256_mul_pd(iq0,jq0);
395 /* REACTION-FIELD ELECTROSTATICS */
396 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_add_pd(rinv00,_mm256_mul_pd(krf,rsq00)),crf));
397 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velec = _mm256_andnot_pd(dummy_mask,velec);
401 velecsum = _mm256_add_pd(velecsum,velec);
405 fscal = _mm256_andnot_pd(dummy_mask,fscal);
407 /* Calculate temporary vectorial force */
408 tx = _mm256_mul_pd(fscal,dx00);
409 ty = _mm256_mul_pd(fscal,dy00);
410 tz = _mm256_mul_pd(fscal,dz00);
412 /* Update vectorial force */
413 fix0 = _mm256_add_pd(fix0,tx);
414 fiy0 = _mm256_add_pd(fiy0,ty);
415 fiz0 = _mm256_add_pd(fiz0,tz);
417 fjx0 = _mm256_add_pd(fjx0,tx);
418 fjy0 = _mm256_add_pd(fjy0,ty);
419 fjz0 = _mm256_add_pd(fjz0,tz);
421 /**************************
422 * CALCULATE INTERACTIONS *
423 **************************/
425 /* Compute parameters for interactions between i and j atoms */
426 qq10 = _mm256_mul_pd(iq1,jq0);
428 /* REACTION-FIELD ELECTROSTATICS */
429 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
430 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velec = _mm256_andnot_pd(dummy_mask,velec);
434 velecsum = _mm256_add_pd(velecsum,velec);
438 fscal = _mm256_andnot_pd(dummy_mask,fscal);
440 /* Calculate temporary vectorial force */
441 tx = _mm256_mul_pd(fscal,dx10);
442 ty = _mm256_mul_pd(fscal,dy10);
443 tz = _mm256_mul_pd(fscal,dz10);
445 /* Update vectorial force */
446 fix1 = _mm256_add_pd(fix1,tx);
447 fiy1 = _mm256_add_pd(fiy1,ty);
448 fiz1 = _mm256_add_pd(fiz1,tz);
450 fjx0 = _mm256_add_pd(fjx0,tx);
451 fjy0 = _mm256_add_pd(fjy0,ty);
452 fjz0 = _mm256_add_pd(fjz0,tz);
454 /**************************
455 * CALCULATE INTERACTIONS *
456 **************************/
458 /* Compute parameters for interactions between i and j atoms */
459 qq20 = _mm256_mul_pd(iq2,jq0);
461 /* REACTION-FIELD ELECTROSTATICS */
462 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
463 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
465 /* Update potential sum for this i atom from the interaction with this j atom. */
466 velec = _mm256_andnot_pd(dummy_mask,velec);
467 velecsum = _mm256_add_pd(velecsum,velec);
471 fscal = _mm256_andnot_pd(dummy_mask,fscal);
473 /* Calculate temporary vectorial force */
474 tx = _mm256_mul_pd(fscal,dx20);
475 ty = _mm256_mul_pd(fscal,dy20);
476 tz = _mm256_mul_pd(fscal,dz20);
478 /* Update vectorial force */
479 fix2 = _mm256_add_pd(fix2,tx);
480 fiy2 = _mm256_add_pd(fiy2,ty);
481 fiz2 = _mm256_add_pd(fiz2,tz);
483 fjx0 = _mm256_add_pd(fjx0,tx);
484 fjy0 = _mm256_add_pd(fjy0,ty);
485 fjz0 = _mm256_add_pd(fjz0,tz);
487 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
488 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
489 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
490 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
492 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
494 /* Inner loop uses 99 flops */
497 /* End of innermost loop */
499 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
500 f+i_coord_offset,fshift+i_shift_offset);
503 /* Update potential energies */
504 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
506 /* Increment number of inner iterations */
507 inneriter += j_index_end - j_index_start;
509 /* Outer loop uses 19 flops */
512 /* Increment number of outer iterations */
515 /* Update outer/inner flops */
517 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*99);
520 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_256_double
521 * Electrostatics interaction: ReactionField
522 * VdW interaction: None
523 * Geometry: Water3-Particle
524 * Calculate force/pot: Force
527 nb_kernel_ElecRF_VdwNone_GeomW3P1_F_avx_256_double
528 (t_nblist * gmx_restrict nlist,
529 rvec * gmx_restrict xx,
530 rvec * gmx_restrict ff,
531 t_forcerec * gmx_restrict fr,
532 t_mdatoms * gmx_restrict mdatoms,
533 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
534 t_nrnb * gmx_restrict nrnb)
536 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
537 * just 0 for non-waters.
538 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
539 * jnr indices corresponding to data put in the four positions in the SIMD register.
541 int i_shift_offset,i_coord_offset,outeriter,inneriter;
542 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
543 int jnrA,jnrB,jnrC,jnrD;
544 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
545 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
546 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
547 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
549 real *shiftvec,*fshift,*x,*f;
550 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
552 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
553 real * vdwioffsetptr0;
554 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
555 real * vdwioffsetptr1;
556 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
557 real * vdwioffsetptr2;
558 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
559 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
560 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
561 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
562 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
563 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
564 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
566 __m256d dummy_mask,cutoff_mask;
567 __m128 tmpmask0,tmpmask1;
568 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
569 __m256d one = _mm256_set1_pd(1.0);
570 __m256d two = _mm256_set1_pd(2.0);
576 jindex = nlist->jindex;
578 shiftidx = nlist->shift;
580 shiftvec = fr->shift_vec[0];
581 fshift = fr->fshift[0];
582 facel = _mm256_set1_pd(fr->epsfac);
583 charge = mdatoms->chargeA;
584 krf = _mm256_set1_pd(fr->ic->k_rf);
585 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
586 crf = _mm256_set1_pd(fr->ic->c_rf);
588 /* Setup water-specific parameters */
589 inr = nlist->iinr[0];
590 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
591 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
592 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
594 /* Avoid stupid compiler warnings */
595 jnrA = jnrB = jnrC = jnrD = 0;
604 for(iidx=0;iidx<4*DIM;iidx++)
609 /* Start outer loop over neighborlists */
610 for(iidx=0; iidx<nri; iidx++)
612 /* Load shift vector for this list */
613 i_shift_offset = DIM*shiftidx[iidx];
615 /* Load limits for loop over neighbors */
616 j_index_start = jindex[iidx];
617 j_index_end = jindex[iidx+1];
619 /* Get outer coordinate index */
621 i_coord_offset = DIM*inr;
623 /* Load i particle coords and add shift vector */
624 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
625 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
627 fix0 = _mm256_setzero_pd();
628 fiy0 = _mm256_setzero_pd();
629 fiz0 = _mm256_setzero_pd();
630 fix1 = _mm256_setzero_pd();
631 fiy1 = _mm256_setzero_pd();
632 fiz1 = _mm256_setzero_pd();
633 fix2 = _mm256_setzero_pd();
634 fiy2 = _mm256_setzero_pd();
635 fiz2 = _mm256_setzero_pd();
637 /* Start inner kernel loop */
638 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
641 /* Get j neighbor index, and coordinate index */
646 j_coord_offsetA = DIM*jnrA;
647 j_coord_offsetB = DIM*jnrB;
648 j_coord_offsetC = DIM*jnrC;
649 j_coord_offsetD = DIM*jnrD;
651 /* load j atom coordinates */
652 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
653 x+j_coord_offsetC,x+j_coord_offsetD,
656 /* Calculate displacement vector */
657 dx00 = _mm256_sub_pd(ix0,jx0);
658 dy00 = _mm256_sub_pd(iy0,jy0);
659 dz00 = _mm256_sub_pd(iz0,jz0);
660 dx10 = _mm256_sub_pd(ix1,jx0);
661 dy10 = _mm256_sub_pd(iy1,jy0);
662 dz10 = _mm256_sub_pd(iz1,jz0);
663 dx20 = _mm256_sub_pd(ix2,jx0);
664 dy20 = _mm256_sub_pd(iy2,jy0);
665 dz20 = _mm256_sub_pd(iz2,jz0);
667 /* Calculate squared distance and things based on it */
668 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
669 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
670 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
672 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
673 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
674 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
676 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
677 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
678 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
680 /* Load parameters for j particles */
681 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
682 charge+jnrC+0,charge+jnrD+0);
684 fjx0 = _mm256_setzero_pd();
685 fjy0 = _mm256_setzero_pd();
686 fjz0 = _mm256_setzero_pd();
688 /**************************
689 * CALCULATE INTERACTIONS *
690 **************************/
692 /* Compute parameters for interactions between i and j atoms */
693 qq00 = _mm256_mul_pd(iq0,jq0);
695 /* REACTION-FIELD ELECTROSTATICS */
696 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
700 /* Calculate temporary vectorial force */
701 tx = _mm256_mul_pd(fscal,dx00);
702 ty = _mm256_mul_pd(fscal,dy00);
703 tz = _mm256_mul_pd(fscal,dz00);
705 /* Update vectorial force */
706 fix0 = _mm256_add_pd(fix0,tx);
707 fiy0 = _mm256_add_pd(fiy0,ty);
708 fiz0 = _mm256_add_pd(fiz0,tz);
710 fjx0 = _mm256_add_pd(fjx0,tx);
711 fjy0 = _mm256_add_pd(fjy0,ty);
712 fjz0 = _mm256_add_pd(fjz0,tz);
714 /**************************
715 * CALCULATE INTERACTIONS *
716 **************************/
718 /* Compute parameters for interactions between i and j atoms */
719 qq10 = _mm256_mul_pd(iq1,jq0);
721 /* REACTION-FIELD ELECTROSTATICS */
722 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
726 /* Calculate temporary vectorial force */
727 tx = _mm256_mul_pd(fscal,dx10);
728 ty = _mm256_mul_pd(fscal,dy10);
729 tz = _mm256_mul_pd(fscal,dz10);
731 /* Update vectorial force */
732 fix1 = _mm256_add_pd(fix1,tx);
733 fiy1 = _mm256_add_pd(fiy1,ty);
734 fiz1 = _mm256_add_pd(fiz1,tz);
736 fjx0 = _mm256_add_pd(fjx0,tx);
737 fjy0 = _mm256_add_pd(fjy0,ty);
738 fjz0 = _mm256_add_pd(fjz0,tz);
740 /**************************
741 * CALCULATE INTERACTIONS *
742 **************************/
744 /* Compute parameters for interactions between i and j atoms */
745 qq20 = _mm256_mul_pd(iq2,jq0);
747 /* REACTION-FIELD ELECTROSTATICS */
748 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
752 /* Calculate temporary vectorial force */
753 tx = _mm256_mul_pd(fscal,dx20);
754 ty = _mm256_mul_pd(fscal,dy20);
755 tz = _mm256_mul_pd(fscal,dz20);
757 /* Update vectorial force */
758 fix2 = _mm256_add_pd(fix2,tx);
759 fiy2 = _mm256_add_pd(fiy2,ty);
760 fiz2 = _mm256_add_pd(fiz2,tz);
762 fjx0 = _mm256_add_pd(fjx0,tx);
763 fjy0 = _mm256_add_pd(fjy0,ty);
764 fjz0 = _mm256_add_pd(fjz0,tz);
766 fjptrA = f+j_coord_offsetA;
767 fjptrB = f+j_coord_offsetB;
768 fjptrC = f+j_coord_offsetC;
769 fjptrD = f+j_coord_offsetD;
771 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
773 /* Inner loop uses 84 flops */
779 /* Get j neighbor index, and coordinate index */
780 jnrlistA = jjnr[jidx];
781 jnrlistB = jjnr[jidx+1];
782 jnrlistC = jjnr[jidx+2];
783 jnrlistD = jjnr[jidx+3];
784 /* Sign of each element will be negative for non-real atoms.
785 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
786 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
788 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
790 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
791 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
792 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
794 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
795 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
796 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
797 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
798 j_coord_offsetA = DIM*jnrA;
799 j_coord_offsetB = DIM*jnrB;
800 j_coord_offsetC = DIM*jnrC;
801 j_coord_offsetD = DIM*jnrD;
803 /* load j atom coordinates */
804 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
805 x+j_coord_offsetC,x+j_coord_offsetD,
808 /* Calculate displacement vector */
809 dx00 = _mm256_sub_pd(ix0,jx0);
810 dy00 = _mm256_sub_pd(iy0,jy0);
811 dz00 = _mm256_sub_pd(iz0,jz0);
812 dx10 = _mm256_sub_pd(ix1,jx0);
813 dy10 = _mm256_sub_pd(iy1,jy0);
814 dz10 = _mm256_sub_pd(iz1,jz0);
815 dx20 = _mm256_sub_pd(ix2,jx0);
816 dy20 = _mm256_sub_pd(iy2,jy0);
817 dz20 = _mm256_sub_pd(iz2,jz0);
819 /* Calculate squared distance and things based on it */
820 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
821 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
822 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
824 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
825 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
826 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
828 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
829 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
830 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
832 /* Load parameters for j particles */
833 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
834 charge+jnrC+0,charge+jnrD+0);
836 fjx0 = _mm256_setzero_pd();
837 fjy0 = _mm256_setzero_pd();
838 fjz0 = _mm256_setzero_pd();
840 /**************************
841 * CALCULATE INTERACTIONS *
842 **************************/
844 /* Compute parameters for interactions between i and j atoms */
845 qq00 = _mm256_mul_pd(iq0,jq0);
847 /* REACTION-FIELD ELECTROSTATICS */
848 felec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_mul_pd(rinv00,rinvsq00),krf2));
852 fscal = _mm256_andnot_pd(dummy_mask,fscal);
854 /* Calculate temporary vectorial force */
855 tx = _mm256_mul_pd(fscal,dx00);
856 ty = _mm256_mul_pd(fscal,dy00);
857 tz = _mm256_mul_pd(fscal,dz00);
859 /* Update vectorial force */
860 fix0 = _mm256_add_pd(fix0,tx);
861 fiy0 = _mm256_add_pd(fiy0,ty);
862 fiz0 = _mm256_add_pd(fiz0,tz);
864 fjx0 = _mm256_add_pd(fjx0,tx);
865 fjy0 = _mm256_add_pd(fjy0,ty);
866 fjz0 = _mm256_add_pd(fjz0,tz);
868 /**************************
869 * CALCULATE INTERACTIONS *
870 **************************/
872 /* Compute parameters for interactions between i and j atoms */
873 qq10 = _mm256_mul_pd(iq1,jq0);
875 /* REACTION-FIELD ELECTROSTATICS */
876 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
880 fscal = _mm256_andnot_pd(dummy_mask,fscal);
882 /* Calculate temporary vectorial force */
883 tx = _mm256_mul_pd(fscal,dx10);
884 ty = _mm256_mul_pd(fscal,dy10);
885 tz = _mm256_mul_pd(fscal,dz10);
887 /* Update vectorial force */
888 fix1 = _mm256_add_pd(fix1,tx);
889 fiy1 = _mm256_add_pd(fiy1,ty);
890 fiz1 = _mm256_add_pd(fiz1,tz);
892 fjx0 = _mm256_add_pd(fjx0,tx);
893 fjy0 = _mm256_add_pd(fjy0,ty);
894 fjz0 = _mm256_add_pd(fjz0,tz);
896 /**************************
897 * CALCULATE INTERACTIONS *
898 **************************/
900 /* Compute parameters for interactions between i and j atoms */
901 qq20 = _mm256_mul_pd(iq2,jq0);
903 /* REACTION-FIELD ELECTROSTATICS */
904 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
908 fscal = _mm256_andnot_pd(dummy_mask,fscal);
910 /* Calculate temporary vectorial force */
911 tx = _mm256_mul_pd(fscal,dx20);
912 ty = _mm256_mul_pd(fscal,dy20);
913 tz = _mm256_mul_pd(fscal,dz20);
915 /* Update vectorial force */
916 fix2 = _mm256_add_pd(fix2,tx);
917 fiy2 = _mm256_add_pd(fiy2,ty);
918 fiz2 = _mm256_add_pd(fiz2,tz);
920 fjx0 = _mm256_add_pd(fjx0,tx);
921 fjy0 = _mm256_add_pd(fjy0,ty);
922 fjz0 = _mm256_add_pd(fjz0,tz);
924 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
925 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
926 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
927 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
929 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
931 /* Inner loop uses 84 flops */
934 /* End of innermost loop */
936 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
937 f+i_coord_offset,fshift+i_shift_offset);
939 /* Increment number of inner iterations */
940 inneriter += j_index_end - j_index_start;
942 /* Outer loop uses 18 flops */
945 /* Increment number of outer iterations */
948 /* Update outer/inner flops */
950 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*84);