2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
50 #include "kernelutil_x86_avx_128_fma_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_double
54 * Electrostatics interaction: ReactionField
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Particle
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_VF_avx_128_fma_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90 int vdwjidx0A,vdwjidx0B;
91 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
94 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
95 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
96 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
103 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
104 __m128d dummy_mask,cutoff_mask;
105 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
106 __m128d one = _mm_set1_pd(1.0);
107 __m128d two = _mm_set1_pd(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm_set1_pd(fr->epsfac);
120 charge = mdatoms->chargeA;
121 krf = _mm_set1_pd(fr->ic->k_rf);
122 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
123 crf = _mm_set1_pd(fr->ic->c_rf);
124 nvdwtype = fr->ntype;
126 vdwtype = mdatoms->typeA;
128 /* Setup water-specific parameters */
129 inr = nlist->iinr[0];
130 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
131 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
132 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
133 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
135 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
136 rcutoff_scalar = fr->rcoulomb;
137 rcutoff = _mm_set1_pd(rcutoff_scalar);
138 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
140 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
141 rvdw = _mm_set1_pd(fr->rvdw);
143 /* Avoid stupid compiler warnings */
151 /* Start outer loop over neighborlists */
152 for(iidx=0; iidx<nri; iidx++)
154 /* Load shift vector for this list */
155 i_shift_offset = DIM*shiftidx[iidx];
157 /* Load limits for loop over neighbors */
158 j_index_start = jindex[iidx];
159 j_index_end = jindex[iidx+1];
161 /* Get outer coordinate index */
163 i_coord_offset = DIM*inr;
165 /* Load i particle coords and add shift vector */
166 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
167 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
169 fix0 = _mm_setzero_pd();
170 fiy0 = _mm_setzero_pd();
171 fiz0 = _mm_setzero_pd();
172 fix1 = _mm_setzero_pd();
173 fiy1 = _mm_setzero_pd();
174 fiz1 = _mm_setzero_pd();
175 fix2 = _mm_setzero_pd();
176 fiy2 = _mm_setzero_pd();
177 fiz2 = _mm_setzero_pd();
178 fix3 = _mm_setzero_pd();
179 fiy3 = _mm_setzero_pd();
180 fiz3 = _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_pd();
184 vvdwsum = _mm_setzero_pd();
186 /* Start inner kernel loop */
187 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
190 /* Get j neighbor index, and coordinate index */
193 j_coord_offsetA = DIM*jnrA;
194 j_coord_offsetB = DIM*jnrB;
196 /* load j atom coordinates */
197 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
200 /* Calculate displacement vector */
201 dx00 = _mm_sub_pd(ix0,jx0);
202 dy00 = _mm_sub_pd(iy0,jy0);
203 dz00 = _mm_sub_pd(iz0,jz0);
204 dx10 = _mm_sub_pd(ix1,jx0);
205 dy10 = _mm_sub_pd(iy1,jy0);
206 dz10 = _mm_sub_pd(iz1,jz0);
207 dx20 = _mm_sub_pd(ix2,jx0);
208 dy20 = _mm_sub_pd(iy2,jy0);
209 dz20 = _mm_sub_pd(iz2,jz0);
210 dx30 = _mm_sub_pd(ix3,jx0);
211 dy30 = _mm_sub_pd(iy3,jy0);
212 dz30 = _mm_sub_pd(iz3,jz0);
214 /* Calculate squared distance and things based on it */
215 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
216 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
217 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
218 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
220 rinv10 = gmx_mm_invsqrt_pd(rsq10);
221 rinv20 = gmx_mm_invsqrt_pd(rsq20);
222 rinv30 = gmx_mm_invsqrt_pd(rsq30);
224 rinvsq00 = gmx_mm_inv_pd(rsq00);
225 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
226 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
227 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
229 /* Load parameters for j particles */
230 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
231 vdwjidx0A = 2*vdwtype[jnrA+0];
232 vdwjidx0B = 2*vdwtype[jnrB+0];
234 fjx0 = _mm_setzero_pd();
235 fjy0 = _mm_setzero_pd();
236 fjz0 = _mm_setzero_pd();
238 /**************************
239 * CALCULATE INTERACTIONS *
240 **************************/
242 if (gmx_mm_any_lt(rsq00,rcutoff2))
245 /* Compute parameters for interactions between i and j atoms */
246 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
247 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
249 /* LENNARD-JONES DISPERSION/REPULSION */
251 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
252 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
253 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
254 vvdw = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
255 _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
256 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
258 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
260 /* Update potential sum for this i atom from the interaction with this j atom. */
261 vvdw = _mm_and_pd(vvdw,cutoff_mask);
262 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
266 fscal = _mm_and_pd(fscal,cutoff_mask);
268 /* Update vectorial force */
269 fix0 = _mm_macc_pd(dx00,fscal,fix0);
270 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
271 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
273 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
274 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
275 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
279 /**************************
280 * CALCULATE INTERACTIONS *
281 **************************/
283 if (gmx_mm_any_lt(rsq10,rcutoff2))
286 /* Compute parameters for interactions between i and j atoms */
287 qq10 = _mm_mul_pd(iq1,jq0);
289 /* REACTION-FIELD ELECTROSTATICS */
290 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_macc_pd(krf,rsq10,rinv10),crf));
291 felec = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
293 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
295 /* Update potential sum for this i atom from the interaction with this j atom. */
296 velec = _mm_and_pd(velec,cutoff_mask);
297 velecsum = _mm_add_pd(velecsum,velec);
301 fscal = _mm_and_pd(fscal,cutoff_mask);
303 /* Update vectorial force */
304 fix1 = _mm_macc_pd(dx10,fscal,fix1);
305 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
306 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
308 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
309 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
310 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
314 /**************************
315 * CALCULATE INTERACTIONS *
316 **************************/
318 if (gmx_mm_any_lt(rsq20,rcutoff2))
321 /* Compute parameters for interactions between i and j atoms */
322 qq20 = _mm_mul_pd(iq2,jq0);
324 /* REACTION-FIELD ELECTROSTATICS */
325 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_macc_pd(krf,rsq20,rinv20),crf));
326 felec = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
328 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
330 /* Update potential sum for this i atom from the interaction with this j atom. */
331 velec = _mm_and_pd(velec,cutoff_mask);
332 velecsum = _mm_add_pd(velecsum,velec);
336 fscal = _mm_and_pd(fscal,cutoff_mask);
338 /* Update vectorial force */
339 fix2 = _mm_macc_pd(dx20,fscal,fix2);
340 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
341 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
343 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
344 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
345 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
349 /**************************
350 * CALCULATE INTERACTIONS *
351 **************************/
353 if (gmx_mm_any_lt(rsq30,rcutoff2))
356 /* Compute parameters for interactions between i and j atoms */
357 qq30 = _mm_mul_pd(iq3,jq0);
359 /* REACTION-FIELD ELECTROSTATICS */
360 velec = _mm_mul_pd(qq30,_mm_sub_pd(_mm_macc_pd(krf,rsq30,rinv30),crf));
361 felec = _mm_mul_pd(qq30,_mm_msub_pd(rinv30,rinvsq30,krf2));
363 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
365 /* Update potential sum for this i atom from the interaction with this j atom. */
366 velec = _mm_and_pd(velec,cutoff_mask);
367 velecsum = _mm_add_pd(velecsum,velec);
371 fscal = _mm_and_pd(fscal,cutoff_mask);
373 /* Update vectorial force */
374 fix3 = _mm_macc_pd(dx30,fscal,fix3);
375 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
376 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
378 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
379 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
380 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
384 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
386 /* Inner loop uses 164 flops */
393 j_coord_offsetA = DIM*jnrA;
395 /* load j atom coordinates */
396 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
399 /* Calculate displacement vector */
400 dx00 = _mm_sub_pd(ix0,jx0);
401 dy00 = _mm_sub_pd(iy0,jy0);
402 dz00 = _mm_sub_pd(iz0,jz0);
403 dx10 = _mm_sub_pd(ix1,jx0);
404 dy10 = _mm_sub_pd(iy1,jy0);
405 dz10 = _mm_sub_pd(iz1,jz0);
406 dx20 = _mm_sub_pd(ix2,jx0);
407 dy20 = _mm_sub_pd(iy2,jy0);
408 dz20 = _mm_sub_pd(iz2,jz0);
409 dx30 = _mm_sub_pd(ix3,jx0);
410 dy30 = _mm_sub_pd(iy3,jy0);
411 dz30 = _mm_sub_pd(iz3,jz0);
413 /* Calculate squared distance and things based on it */
414 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
415 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
416 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
417 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
419 rinv10 = gmx_mm_invsqrt_pd(rsq10);
420 rinv20 = gmx_mm_invsqrt_pd(rsq20);
421 rinv30 = gmx_mm_invsqrt_pd(rsq30);
423 rinvsq00 = gmx_mm_inv_pd(rsq00);
424 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
425 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
426 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
428 /* Load parameters for j particles */
429 jq0 = _mm_load_sd(charge+jnrA+0);
430 vdwjidx0A = 2*vdwtype[jnrA+0];
432 fjx0 = _mm_setzero_pd();
433 fjy0 = _mm_setzero_pd();
434 fjz0 = _mm_setzero_pd();
436 /**************************
437 * CALCULATE INTERACTIONS *
438 **************************/
440 if (gmx_mm_any_lt(rsq00,rcutoff2))
443 /* Compute parameters for interactions between i and j atoms */
444 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
446 /* LENNARD-JONES DISPERSION/REPULSION */
448 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
449 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
450 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
451 vvdw = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
452 _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
453 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
455 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
457 /* Update potential sum for this i atom from the interaction with this j atom. */
458 vvdw = _mm_and_pd(vvdw,cutoff_mask);
459 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
460 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
464 fscal = _mm_and_pd(fscal,cutoff_mask);
466 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
468 /* Update vectorial force */
469 fix0 = _mm_macc_pd(dx00,fscal,fix0);
470 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
471 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
473 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
474 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
475 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
479 /**************************
480 * CALCULATE INTERACTIONS *
481 **************************/
483 if (gmx_mm_any_lt(rsq10,rcutoff2))
486 /* Compute parameters for interactions between i and j atoms */
487 qq10 = _mm_mul_pd(iq1,jq0);
489 /* REACTION-FIELD ELECTROSTATICS */
490 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_macc_pd(krf,rsq10,rinv10),crf));
491 felec = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
493 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
495 /* Update potential sum for this i atom from the interaction with this j atom. */
496 velec = _mm_and_pd(velec,cutoff_mask);
497 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
498 velecsum = _mm_add_pd(velecsum,velec);
502 fscal = _mm_and_pd(fscal,cutoff_mask);
504 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
506 /* Update vectorial force */
507 fix1 = _mm_macc_pd(dx10,fscal,fix1);
508 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
509 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
511 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
512 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
513 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
517 /**************************
518 * CALCULATE INTERACTIONS *
519 **************************/
521 if (gmx_mm_any_lt(rsq20,rcutoff2))
524 /* Compute parameters for interactions between i and j atoms */
525 qq20 = _mm_mul_pd(iq2,jq0);
527 /* REACTION-FIELD ELECTROSTATICS */
528 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_macc_pd(krf,rsq20,rinv20),crf));
529 felec = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
531 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
533 /* Update potential sum for this i atom from the interaction with this j atom. */
534 velec = _mm_and_pd(velec,cutoff_mask);
535 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
536 velecsum = _mm_add_pd(velecsum,velec);
540 fscal = _mm_and_pd(fscal,cutoff_mask);
542 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
544 /* Update vectorial force */
545 fix2 = _mm_macc_pd(dx20,fscal,fix2);
546 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
547 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
549 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
550 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
551 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 if (gmx_mm_any_lt(rsq30,rcutoff2))
562 /* Compute parameters for interactions between i and j atoms */
563 qq30 = _mm_mul_pd(iq3,jq0);
565 /* REACTION-FIELD ELECTROSTATICS */
566 velec = _mm_mul_pd(qq30,_mm_sub_pd(_mm_macc_pd(krf,rsq30,rinv30),crf));
567 felec = _mm_mul_pd(qq30,_mm_msub_pd(rinv30,rinvsq30,krf2));
569 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
571 /* Update potential sum for this i atom from the interaction with this j atom. */
572 velec = _mm_and_pd(velec,cutoff_mask);
573 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
574 velecsum = _mm_add_pd(velecsum,velec);
578 fscal = _mm_and_pd(fscal,cutoff_mask);
580 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
582 /* Update vectorial force */
583 fix3 = _mm_macc_pd(dx30,fscal,fix3);
584 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
585 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
587 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
588 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
589 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
593 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
595 /* Inner loop uses 164 flops */
598 /* End of innermost loop */
600 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
601 f+i_coord_offset,fshift+i_shift_offset);
604 /* Update potential energies */
605 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
606 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
608 /* Increment number of inner iterations */
609 inneriter += j_index_end - j_index_start;
611 /* Outer loop uses 26 flops */
614 /* Increment number of outer iterations */
617 /* Update outer/inner flops */
619 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*164);
622 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_double
623 * Electrostatics interaction: ReactionField
624 * VdW interaction: LennardJones
625 * Geometry: Water4-Particle
626 * Calculate force/pot: Force
629 nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_F_avx_128_fma_double
630 (t_nblist * gmx_restrict nlist,
631 rvec * gmx_restrict xx,
632 rvec * gmx_restrict ff,
633 t_forcerec * gmx_restrict fr,
634 t_mdatoms * gmx_restrict mdatoms,
635 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
636 t_nrnb * gmx_restrict nrnb)
638 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
639 * just 0 for non-waters.
640 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
641 * jnr indices corresponding to data put in the four positions in the SIMD register.
643 int i_shift_offset,i_coord_offset,outeriter,inneriter;
644 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
646 int j_coord_offsetA,j_coord_offsetB;
647 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
649 real *shiftvec,*fshift,*x,*f;
650 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
652 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
654 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
656 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
658 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
659 int vdwjidx0A,vdwjidx0B;
660 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
661 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
662 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
663 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
664 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
665 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
668 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
671 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
672 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
673 __m128d dummy_mask,cutoff_mask;
674 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
675 __m128d one = _mm_set1_pd(1.0);
676 __m128d two = _mm_set1_pd(2.0);
682 jindex = nlist->jindex;
684 shiftidx = nlist->shift;
686 shiftvec = fr->shift_vec[0];
687 fshift = fr->fshift[0];
688 facel = _mm_set1_pd(fr->epsfac);
689 charge = mdatoms->chargeA;
690 krf = _mm_set1_pd(fr->ic->k_rf);
691 krf2 = _mm_set1_pd(fr->ic->k_rf*2.0);
692 crf = _mm_set1_pd(fr->ic->c_rf);
693 nvdwtype = fr->ntype;
695 vdwtype = mdatoms->typeA;
697 /* Setup water-specific parameters */
698 inr = nlist->iinr[0];
699 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
700 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
701 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
702 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
704 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
705 rcutoff_scalar = fr->rcoulomb;
706 rcutoff = _mm_set1_pd(rcutoff_scalar);
707 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
709 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
710 rvdw = _mm_set1_pd(fr->rvdw);
712 /* Avoid stupid compiler warnings */
720 /* Start outer loop over neighborlists */
721 for(iidx=0; iidx<nri; iidx++)
723 /* Load shift vector for this list */
724 i_shift_offset = DIM*shiftidx[iidx];
726 /* Load limits for loop over neighbors */
727 j_index_start = jindex[iidx];
728 j_index_end = jindex[iidx+1];
730 /* Get outer coordinate index */
732 i_coord_offset = DIM*inr;
734 /* Load i particle coords and add shift vector */
735 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
736 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
738 fix0 = _mm_setzero_pd();
739 fiy0 = _mm_setzero_pd();
740 fiz0 = _mm_setzero_pd();
741 fix1 = _mm_setzero_pd();
742 fiy1 = _mm_setzero_pd();
743 fiz1 = _mm_setzero_pd();
744 fix2 = _mm_setzero_pd();
745 fiy2 = _mm_setzero_pd();
746 fiz2 = _mm_setzero_pd();
747 fix3 = _mm_setzero_pd();
748 fiy3 = _mm_setzero_pd();
749 fiz3 = _mm_setzero_pd();
751 /* Start inner kernel loop */
752 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
755 /* Get j neighbor index, and coordinate index */
758 j_coord_offsetA = DIM*jnrA;
759 j_coord_offsetB = DIM*jnrB;
761 /* load j atom coordinates */
762 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
765 /* Calculate displacement vector */
766 dx00 = _mm_sub_pd(ix0,jx0);
767 dy00 = _mm_sub_pd(iy0,jy0);
768 dz00 = _mm_sub_pd(iz0,jz0);
769 dx10 = _mm_sub_pd(ix1,jx0);
770 dy10 = _mm_sub_pd(iy1,jy0);
771 dz10 = _mm_sub_pd(iz1,jz0);
772 dx20 = _mm_sub_pd(ix2,jx0);
773 dy20 = _mm_sub_pd(iy2,jy0);
774 dz20 = _mm_sub_pd(iz2,jz0);
775 dx30 = _mm_sub_pd(ix3,jx0);
776 dy30 = _mm_sub_pd(iy3,jy0);
777 dz30 = _mm_sub_pd(iz3,jz0);
779 /* Calculate squared distance and things based on it */
780 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
781 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
782 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
783 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
785 rinv10 = gmx_mm_invsqrt_pd(rsq10);
786 rinv20 = gmx_mm_invsqrt_pd(rsq20);
787 rinv30 = gmx_mm_invsqrt_pd(rsq30);
789 rinvsq00 = gmx_mm_inv_pd(rsq00);
790 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
791 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
792 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
794 /* Load parameters for j particles */
795 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
796 vdwjidx0A = 2*vdwtype[jnrA+0];
797 vdwjidx0B = 2*vdwtype[jnrB+0];
799 fjx0 = _mm_setzero_pd();
800 fjy0 = _mm_setzero_pd();
801 fjz0 = _mm_setzero_pd();
803 /**************************
804 * CALCULATE INTERACTIONS *
805 **************************/
807 if (gmx_mm_any_lt(rsq00,rcutoff2))
810 /* Compute parameters for interactions between i and j atoms */
811 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
812 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
814 /* LENNARD-JONES DISPERSION/REPULSION */
816 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
817 fvdw = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
819 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
823 fscal = _mm_and_pd(fscal,cutoff_mask);
825 /* Update vectorial force */
826 fix0 = _mm_macc_pd(dx00,fscal,fix0);
827 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
828 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
830 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
831 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
832 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
836 /**************************
837 * CALCULATE INTERACTIONS *
838 **************************/
840 if (gmx_mm_any_lt(rsq10,rcutoff2))
843 /* Compute parameters for interactions between i and j atoms */
844 qq10 = _mm_mul_pd(iq1,jq0);
846 /* REACTION-FIELD ELECTROSTATICS */
847 felec = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
849 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
853 fscal = _mm_and_pd(fscal,cutoff_mask);
855 /* Update vectorial force */
856 fix1 = _mm_macc_pd(dx10,fscal,fix1);
857 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
858 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
860 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
861 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
862 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
866 /**************************
867 * CALCULATE INTERACTIONS *
868 **************************/
870 if (gmx_mm_any_lt(rsq20,rcutoff2))
873 /* Compute parameters for interactions between i and j atoms */
874 qq20 = _mm_mul_pd(iq2,jq0);
876 /* REACTION-FIELD ELECTROSTATICS */
877 felec = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
879 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
883 fscal = _mm_and_pd(fscal,cutoff_mask);
885 /* Update vectorial force */
886 fix2 = _mm_macc_pd(dx20,fscal,fix2);
887 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
888 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
890 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
891 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
892 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
896 /**************************
897 * CALCULATE INTERACTIONS *
898 **************************/
900 if (gmx_mm_any_lt(rsq30,rcutoff2))
903 /* Compute parameters for interactions between i and j atoms */
904 qq30 = _mm_mul_pd(iq3,jq0);
906 /* REACTION-FIELD ELECTROSTATICS */
907 felec = _mm_mul_pd(qq30,_mm_msub_pd(rinv30,rinvsq30,krf2));
909 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
913 fscal = _mm_and_pd(fscal,cutoff_mask);
915 /* Update vectorial force */
916 fix3 = _mm_macc_pd(dx30,fscal,fix3);
917 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
918 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
920 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
921 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
922 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
926 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
928 /* Inner loop uses 135 flops */
935 j_coord_offsetA = DIM*jnrA;
937 /* load j atom coordinates */
938 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
941 /* Calculate displacement vector */
942 dx00 = _mm_sub_pd(ix0,jx0);
943 dy00 = _mm_sub_pd(iy0,jy0);
944 dz00 = _mm_sub_pd(iz0,jz0);
945 dx10 = _mm_sub_pd(ix1,jx0);
946 dy10 = _mm_sub_pd(iy1,jy0);
947 dz10 = _mm_sub_pd(iz1,jz0);
948 dx20 = _mm_sub_pd(ix2,jx0);
949 dy20 = _mm_sub_pd(iy2,jy0);
950 dz20 = _mm_sub_pd(iz2,jz0);
951 dx30 = _mm_sub_pd(ix3,jx0);
952 dy30 = _mm_sub_pd(iy3,jy0);
953 dz30 = _mm_sub_pd(iz3,jz0);
955 /* Calculate squared distance and things based on it */
956 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
957 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
958 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
959 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
961 rinv10 = gmx_mm_invsqrt_pd(rsq10);
962 rinv20 = gmx_mm_invsqrt_pd(rsq20);
963 rinv30 = gmx_mm_invsqrt_pd(rsq30);
965 rinvsq00 = gmx_mm_inv_pd(rsq00);
966 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
967 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
968 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
970 /* Load parameters for j particles */
971 jq0 = _mm_load_sd(charge+jnrA+0);
972 vdwjidx0A = 2*vdwtype[jnrA+0];
974 fjx0 = _mm_setzero_pd();
975 fjy0 = _mm_setzero_pd();
976 fjz0 = _mm_setzero_pd();
978 /**************************
979 * CALCULATE INTERACTIONS *
980 **************************/
982 if (gmx_mm_any_lt(rsq00,rcutoff2))
985 /* Compute parameters for interactions between i and j atoms */
986 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
988 /* LENNARD-JONES DISPERSION/REPULSION */
990 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
991 fvdw = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
993 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
997 fscal = _mm_and_pd(fscal,cutoff_mask);
999 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1001 /* Update vectorial force */
1002 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1003 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1004 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1006 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1007 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1008 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1012 /**************************
1013 * CALCULATE INTERACTIONS *
1014 **************************/
1016 if (gmx_mm_any_lt(rsq10,rcutoff2))
1019 /* Compute parameters for interactions between i and j atoms */
1020 qq10 = _mm_mul_pd(iq1,jq0);
1022 /* REACTION-FIELD ELECTROSTATICS */
1023 felec = _mm_mul_pd(qq10,_mm_msub_pd(rinv10,rinvsq10,krf2));
1025 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1029 fscal = _mm_and_pd(fscal,cutoff_mask);
1031 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1033 /* Update vectorial force */
1034 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1035 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1036 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1038 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1039 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1040 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1044 /**************************
1045 * CALCULATE INTERACTIONS *
1046 **************************/
1048 if (gmx_mm_any_lt(rsq20,rcutoff2))
1051 /* Compute parameters for interactions between i and j atoms */
1052 qq20 = _mm_mul_pd(iq2,jq0);
1054 /* REACTION-FIELD ELECTROSTATICS */
1055 felec = _mm_mul_pd(qq20,_mm_msub_pd(rinv20,rinvsq20,krf2));
1057 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1061 fscal = _mm_and_pd(fscal,cutoff_mask);
1063 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1065 /* Update vectorial force */
1066 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1067 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1068 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1070 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1071 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1072 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1076 /**************************
1077 * CALCULATE INTERACTIONS *
1078 **************************/
1080 if (gmx_mm_any_lt(rsq30,rcutoff2))
1083 /* Compute parameters for interactions between i and j atoms */
1084 qq30 = _mm_mul_pd(iq3,jq0);
1086 /* REACTION-FIELD ELECTROSTATICS */
1087 felec = _mm_mul_pd(qq30,_mm_msub_pd(rinv30,rinvsq30,krf2));
1089 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1093 fscal = _mm_and_pd(fscal,cutoff_mask);
1095 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1097 /* Update vectorial force */
1098 fix3 = _mm_macc_pd(dx30,fscal,fix3);
1099 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
1100 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
1102 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
1103 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
1104 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
1108 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1110 /* Inner loop uses 135 flops */
1113 /* End of innermost loop */
1115 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1116 f+i_coord_offset,fshift+i_shift_offset);
1118 /* Increment number of inner iterations */
1119 inneriter += j_index_end - j_index_start;
1121 /* Outer loop uses 24 flops */
1124 /* Increment number of outer iterations */
1127 /* Update outer/inner flops */
1129 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*135);