2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
48 #include "kernelutil_x86_avx_128_fma_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_128_fma_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LJEwald
54 * Geometry: Water4-Particle
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_avx_128_fma_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
91 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
92 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
93 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
94 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
97 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
100 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
101 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
107 __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
108 __m128d one_half = _mm_set1_pd(0.5);
109 __m128d minus_one = _mm_set1_pd(-1.0);
111 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
113 __m128d dummy_mask,cutoff_mask;
114 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115 __m128d one = _mm_set1_pd(1.0);
116 __m128d two = _mm_set1_pd(2.0);
122 jindex = nlist->jindex;
124 shiftidx = nlist->shift;
126 shiftvec = fr->shift_vec[0];
127 fshift = fr->fshift[0];
128 facel = _mm_set1_pd(fr->epsfac);
129 charge = mdatoms->chargeA;
130 nvdwtype = fr->ntype;
132 vdwtype = mdatoms->typeA;
133 vdwgridparam = fr->ljpme_c6grid;
134 sh_lj_ewald = _mm_set1_pd(fr->ic->sh_lj_ewald);
135 ewclj = _mm_set1_pd(fr->ewaldcoeff_lj);
136 ewclj2 = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
138 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
139 ewtab = fr->ic->tabq_coul_FDV0;
140 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
141 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
143 /* Setup water-specific parameters */
144 inr = nlist->iinr[0];
145 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
146 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
147 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
148 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
150 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
151 rcutoff_scalar = fr->rcoulomb;
152 rcutoff = _mm_set1_pd(rcutoff_scalar);
153 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
155 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
156 rvdw = _mm_set1_pd(fr->rvdw);
158 /* Avoid stupid compiler warnings */
166 /* Start outer loop over neighborlists */
167 for(iidx=0; iidx<nri; iidx++)
169 /* Load shift vector for this list */
170 i_shift_offset = DIM*shiftidx[iidx];
172 /* Load limits for loop over neighbors */
173 j_index_start = jindex[iidx];
174 j_index_end = jindex[iidx+1];
176 /* Get outer coordinate index */
178 i_coord_offset = DIM*inr;
180 /* Load i particle coords and add shift vector */
181 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
182 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
184 fix0 = _mm_setzero_pd();
185 fiy0 = _mm_setzero_pd();
186 fiz0 = _mm_setzero_pd();
187 fix1 = _mm_setzero_pd();
188 fiy1 = _mm_setzero_pd();
189 fiz1 = _mm_setzero_pd();
190 fix2 = _mm_setzero_pd();
191 fiy2 = _mm_setzero_pd();
192 fiz2 = _mm_setzero_pd();
193 fix3 = _mm_setzero_pd();
194 fiy3 = _mm_setzero_pd();
195 fiz3 = _mm_setzero_pd();
197 /* Reset potential sums */
198 velecsum = _mm_setzero_pd();
199 vvdwsum = _mm_setzero_pd();
201 /* Start inner kernel loop */
202 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
205 /* Get j neighbor index, and coordinate index */
208 j_coord_offsetA = DIM*jnrA;
209 j_coord_offsetB = DIM*jnrB;
211 /* load j atom coordinates */
212 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
215 /* Calculate displacement vector */
216 dx00 = _mm_sub_pd(ix0,jx0);
217 dy00 = _mm_sub_pd(iy0,jy0);
218 dz00 = _mm_sub_pd(iz0,jz0);
219 dx10 = _mm_sub_pd(ix1,jx0);
220 dy10 = _mm_sub_pd(iy1,jy0);
221 dz10 = _mm_sub_pd(iz1,jz0);
222 dx20 = _mm_sub_pd(ix2,jx0);
223 dy20 = _mm_sub_pd(iy2,jy0);
224 dz20 = _mm_sub_pd(iz2,jz0);
225 dx30 = _mm_sub_pd(ix3,jx0);
226 dy30 = _mm_sub_pd(iy3,jy0);
227 dz30 = _mm_sub_pd(iz3,jz0);
229 /* Calculate squared distance and things based on it */
230 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
231 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
232 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
233 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
235 rinv00 = gmx_mm_invsqrt_pd(rsq00);
236 rinv10 = gmx_mm_invsqrt_pd(rsq10);
237 rinv20 = gmx_mm_invsqrt_pd(rsq20);
238 rinv30 = gmx_mm_invsqrt_pd(rsq30);
240 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
241 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
242 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
243 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
245 /* Load parameters for j particles */
246 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
247 vdwjidx0A = 2*vdwtype[jnrA+0];
248 vdwjidx0B = 2*vdwtype[jnrB+0];
250 fjx0 = _mm_setzero_pd();
251 fjy0 = _mm_setzero_pd();
252 fjz0 = _mm_setzero_pd();
254 /**************************
255 * CALCULATE INTERACTIONS *
256 **************************/
258 if (gmx_mm_any_lt(rsq00,rcutoff2))
261 r00 = _mm_mul_pd(rsq00,rinv00);
263 /* Compute parameters for interactions between i and j atoms */
264 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
265 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
266 c6grid_00 = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
267 vdwgridparam+vdwioffset0+vdwjidx0B);
269 /* Analytical LJ-PME */
270 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
271 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
272 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
273 exponent = gmx_simd_exp_d(ewcljrsq);
274 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
275 poly = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
276 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
277 vvdw6 = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
278 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
279 vvdw = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
280 _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
281 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
282 fvdw = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
284 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
286 /* Update potential sum for this i atom from the interaction with this j atom. */
287 vvdw = _mm_and_pd(vvdw,cutoff_mask);
288 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
292 fscal = _mm_and_pd(fscal,cutoff_mask);
294 /* Update vectorial force */
295 fix0 = _mm_macc_pd(dx00,fscal,fix0);
296 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
297 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
299 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
300 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
301 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
305 /**************************
306 * CALCULATE INTERACTIONS *
307 **************************/
309 if (gmx_mm_any_lt(rsq10,rcutoff2))
312 r10 = _mm_mul_pd(rsq10,rinv10);
314 /* Compute parameters for interactions between i and j atoms */
315 qq10 = _mm_mul_pd(iq1,jq0);
317 /* EWALD ELECTROSTATICS */
319 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
320 ewrt = _mm_mul_pd(r10,ewtabscale);
321 ewitab = _mm_cvttpd_epi32(ewrt);
323 eweps = _mm_frcz_pd(ewrt);
325 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
327 twoeweps = _mm_add_pd(eweps,eweps);
328 ewitab = _mm_slli_epi32(ewitab,2);
329 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
330 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
331 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
332 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
333 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
334 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
335 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
336 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
337 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
338 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
340 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
342 /* Update potential sum for this i atom from the interaction with this j atom. */
343 velec = _mm_and_pd(velec,cutoff_mask);
344 velecsum = _mm_add_pd(velecsum,velec);
348 fscal = _mm_and_pd(fscal,cutoff_mask);
350 /* Update vectorial force */
351 fix1 = _mm_macc_pd(dx10,fscal,fix1);
352 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
353 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
355 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
356 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
357 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
361 /**************************
362 * CALCULATE INTERACTIONS *
363 **************************/
365 if (gmx_mm_any_lt(rsq20,rcutoff2))
368 r20 = _mm_mul_pd(rsq20,rinv20);
370 /* Compute parameters for interactions between i and j atoms */
371 qq20 = _mm_mul_pd(iq2,jq0);
373 /* EWALD ELECTROSTATICS */
375 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
376 ewrt = _mm_mul_pd(r20,ewtabscale);
377 ewitab = _mm_cvttpd_epi32(ewrt);
379 eweps = _mm_frcz_pd(ewrt);
381 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
383 twoeweps = _mm_add_pd(eweps,eweps);
384 ewitab = _mm_slli_epi32(ewitab,2);
385 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
386 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
387 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
388 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
389 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
390 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
391 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
392 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
393 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
394 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
396 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
398 /* Update potential sum for this i atom from the interaction with this j atom. */
399 velec = _mm_and_pd(velec,cutoff_mask);
400 velecsum = _mm_add_pd(velecsum,velec);
404 fscal = _mm_and_pd(fscal,cutoff_mask);
406 /* Update vectorial force */
407 fix2 = _mm_macc_pd(dx20,fscal,fix2);
408 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
409 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
411 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
412 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
413 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
417 /**************************
418 * CALCULATE INTERACTIONS *
419 **************************/
421 if (gmx_mm_any_lt(rsq30,rcutoff2))
424 r30 = _mm_mul_pd(rsq30,rinv30);
426 /* Compute parameters for interactions between i and j atoms */
427 qq30 = _mm_mul_pd(iq3,jq0);
429 /* EWALD ELECTROSTATICS */
431 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
432 ewrt = _mm_mul_pd(r30,ewtabscale);
433 ewitab = _mm_cvttpd_epi32(ewrt);
435 eweps = _mm_frcz_pd(ewrt);
437 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
439 twoeweps = _mm_add_pd(eweps,eweps);
440 ewitab = _mm_slli_epi32(ewitab,2);
441 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
442 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
443 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
444 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
445 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
446 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
447 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
448 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
449 velec = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
450 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
452 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
454 /* Update potential sum for this i atom from the interaction with this j atom. */
455 velec = _mm_and_pd(velec,cutoff_mask);
456 velecsum = _mm_add_pd(velecsum,velec);
460 fscal = _mm_and_pd(fscal,cutoff_mask);
462 /* Update vectorial force */
463 fix3 = _mm_macc_pd(dx30,fscal,fix3);
464 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
465 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
467 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
468 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
469 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
473 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
475 /* Inner loop uses 208 flops */
482 j_coord_offsetA = DIM*jnrA;
484 /* load j atom coordinates */
485 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
488 /* Calculate displacement vector */
489 dx00 = _mm_sub_pd(ix0,jx0);
490 dy00 = _mm_sub_pd(iy0,jy0);
491 dz00 = _mm_sub_pd(iz0,jz0);
492 dx10 = _mm_sub_pd(ix1,jx0);
493 dy10 = _mm_sub_pd(iy1,jy0);
494 dz10 = _mm_sub_pd(iz1,jz0);
495 dx20 = _mm_sub_pd(ix2,jx0);
496 dy20 = _mm_sub_pd(iy2,jy0);
497 dz20 = _mm_sub_pd(iz2,jz0);
498 dx30 = _mm_sub_pd(ix3,jx0);
499 dy30 = _mm_sub_pd(iy3,jy0);
500 dz30 = _mm_sub_pd(iz3,jz0);
502 /* Calculate squared distance and things based on it */
503 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
504 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
505 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
506 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
508 rinv00 = gmx_mm_invsqrt_pd(rsq00);
509 rinv10 = gmx_mm_invsqrt_pd(rsq10);
510 rinv20 = gmx_mm_invsqrt_pd(rsq20);
511 rinv30 = gmx_mm_invsqrt_pd(rsq30);
513 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
514 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
515 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
516 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
518 /* Load parameters for j particles */
519 jq0 = _mm_load_sd(charge+jnrA+0);
520 vdwjidx0A = 2*vdwtype[jnrA+0];
522 fjx0 = _mm_setzero_pd();
523 fjy0 = _mm_setzero_pd();
524 fjz0 = _mm_setzero_pd();
526 /**************************
527 * CALCULATE INTERACTIONS *
528 **************************/
530 if (gmx_mm_any_lt(rsq00,rcutoff2))
533 r00 = _mm_mul_pd(rsq00,rinv00);
535 /* Compute parameters for interactions between i and j atoms */
536 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
537 c6grid_00 = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
539 /* Analytical LJ-PME */
540 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
541 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
542 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
543 exponent = gmx_simd_exp_d(ewcljrsq);
544 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
545 poly = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
546 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
547 vvdw6 = _mm_mul_pd(_mm_macc_pd(-c6grid_00,_mm_sub_pd(one,poly),c6_00),rinvsix);
548 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
549 vvdw = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
550 _mm_mul_pd(_mm_sub_pd(vvdw6,_mm_macc_pd(c6grid_00,sh_lj_ewald,_mm_mul_pd(c6_00,sh_vdw_invrcut6))),one_sixth));
551 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
552 fvdw = _mm_mul_pd(_mm_add_pd(vvdw12,_mm_msub_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6),vvdw6)),rinvsq00);
554 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
556 /* Update potential sum for this i atom from the interaction with this j atom. */
557 vvdw = _mm_and_pd(vvdw,cutoff_mask);
558 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
559 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
563 fscal = _mm_and_pd(fscal,cutoff_mask);
565 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
567 /* Update vectorial force */
568 fix0 = _mm_macc_pd(dx00,fscal,fix0);
569 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
570 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
572 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
573 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
574 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
578 /**************************
579 * CALCULATE INTERACTIONS *
580 **************************/
582 if (gmx_mm_any_lt(rsq10,rcutoff2))
585 r10 = _mm_mul_pd(rsq10,rinv10);
587 /* Compute parameters for interactions between i and j atoms */
588 qq10 = _mm_mul_pd(iq1,jq0);
590 /* EWALD ELECTROSTATICS */
592 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
593 ewrt = _mm_mul_pd(r10,ewtabscale);
594 ewitab = _mm_cvttpd_epi32(ewrt);
596 eweps = _mm_frcz_pd(ewrt);
598 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
600 twoeweps = _mm_add_pd(eweps,eweps);
601 ewitab = _mm_slli_epi32(ewitab,2);
602 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
603 ewtabD = _mm_setzero_pd();
604 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
605 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
606 ewtabFn = _mm_setzero_pd();
607 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
608 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
609 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
610 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
611 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
613 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
615 /* Update potential sum for this i atom from the interaction with this j atom. */
616 velec = _mm_and_pd(velec,cutoff_mask);
617 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
618 velecsum = _mm_add_pd(velecsum,velec);
622 fscal = _mm_and_pd(fscal,cutoff_mask);
624 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
626 /* Update vectorial force */
627 fix1 = _mm_macc_pd(dx10,fscal,fix1);
628 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
629 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
631 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
632 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
633 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
637 /**************************
638 * CALCULATE INTERACTIONS *
639 **************************/
641 if (gmx_mm_any_lt(rsq20,rcutoff2))
644 r20 = _mm_mul_pd(rsq20,rinv20);
646 /* Compute parameters for interactions between i and j atoms */
647 qq20 = _mm_mul_pd(iq2,jq0);
649 /* EWALD ELECTROSTATICS */
651 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
652 ewrt = _mm_mul_pd(r20,ewtabscale);
653 ewitab = _mm_cvttpd_epi32(ewrt);
655 eweps = _mm_frcz_pd(ewrt);
657 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
659 twoeweps = _mm_add_pd(eweps,eweps);
660 ewitab = _mm_slli_epi32(ewitab,2);
661 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
662 ewtabD = _mm_setzero_pd();
663 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
664 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
665 ewtabFn = _mm_setzero_pd();
666 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
667 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
668 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
669 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
670 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
672 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
674 /* Update potential sum for this i atom from the interaction with this j atom. */
675 velec = _mm_and_pd(velec,cutoff_mask);
676 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
677 velecsum = _mm_add_pd(velecsum,velec);
681 fscal = _mm_and_pd(fscal,cutoff_mask);
683 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
685 /* Update vectorial force */
686 fix2 = _mm_macc_pd(dx20,fscal,fix2);
687 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
688 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
690 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
691 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
692 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
696 /**************************
697 * CALCULATE INTERACTIONS *
698 **************************/
700 if (gmx_mm_any_lt(rsq30,rcutoff2))
703 r30 = _mm_mul_pd(rsq30,rinv30);
705 /* Compute parameters for interactions between i and j atoms */
706 qq30 = _mm_mul_pd(iq3,jq0);
708 /* EWALD ELECTROSTATICS */
710 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
711 ewrt = _mm_mul_pd(r30,ewtabscale);
712 ewitab = _mm_cvttpd_epi32(ewrt);
714 eweps = _mm_frcz_pd(ewrt);
716 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
718 twoeweps = _mm_add_pd(eweps,eweps);
719 ewitab = _mm_slli_epi32(ewitab,2);
720 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
721 ewtabD = _mm_setzero_pd();
722 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
723 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
724 ewtabFn = _mm_setzero_pd();
725 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
726 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
727 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
728 velec = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
729 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
731 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
733 /* Update potential sum for this i atom from the interaction with this j atom. */
734 velec = _mm_and_pd(velec,cutoff_mask);
735 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
736 velecsum = _mm_add_pd(velecsum,velec);
740 fscal = _mm_and_pd(fscal,cutoff_mask);
742 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
744 /* Update vectorial force */
745 fix3 = _mm_macc_pd(dx30,fscal,fix3);
746 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
747 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
749 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
750 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
751 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
755 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
757 /* Inner loop uses 208 flops */
760 /* End of innermost loop */
762 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
763 f+i_coord_offset,fshift+i_shift_offset);
766 /* Update potential energies */
767 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
768 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
770 /* Increment number of inner iterations */
771 inneriter += j_index_end - j_index_start;
773 /* Outer loop uses 26 flops */
776 /* Increment number of outer iterations */
779 /* Update outer/inner flops */
781 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*208);
784 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_128_fma_double
785 * Electrostatics interaction: Ewald
786 * VdW interaction: LJEwald
787 * Geometry: Water4-Particle
788 * Calculate force/pot: Force
791 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_avx_128_fma_double
792 (t_nblist * gmx_restrict nlist,
793 rvec * gmx_restrict xx,
794 rvec * gmx_restrict ff,
795 t_forcerec * gmx_restrict fr,
796 t_mdatoms * gmx_restrict mdatoms,
797 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
798 t_nrnb * gmx_restrict nrnb)
800 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
801 * just 0 for non-waters.
802 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
803 * jnr indices corresponding to data put in the four positions in the SIMD register.
805 int i_shift_offset,i_coord_offset,outeriter,inneriter;
806 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
808 int j_coord_offsetA,j_coord_offsetB;
809 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
811 real *shiftvec,*fshift,*x,*f;
812 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
814 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
816 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
818 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
820 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
821 int vdwjidx0A,vdwjidx0B;
822 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
823 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
824 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
825 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
826 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
827 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
830 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
833 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
834 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
840 __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
841 __m128d one_half = _mm_set1_pd(0.5);
842 __m128d minus_one = _mm_set1_pd(-1.0);
844 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
846 __m128d dummy_mask,cutoff_mask;
847 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
848 __m128d one = _mm_set1_pd(1.0);
849 __m128d two = _mm_set1_pd(2.0);
855 jindex = nlist->jindex;
857 shiftidx = nlist->shift;
859 shiftvec = fr->shift_vec[0];
860 fshift = fr->fshift[0];
861 facel = _mm_set1_pd(fr->epsfac);
862 charge = mdatoms->chargeA;
863 nvdwtype = fr->ntype;
865 vdwtype = mdatoms->typeA;
866 vdwgridparam = fr->ljpme_c6grid;
867 sh_lj_ewald = _mm_set1_pd(fr->ic->sh_lj_ewald);
868 ewclj = _mm_set1_pd(fr->ewaldcoeff_lj);
869 ewclj2 = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
871 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
872 ewtab = fr->ic->tabq_coul_F;
873 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
874 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
876 /* Setup water-specific parameters */
877 inr = nlist->iinr[0];
878 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
879 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
880 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
881 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
883 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
884 rcutoff_scalar = fr->rcoulomb;
885 rcutoff = _mm_set1_pd(rcutoff_scalar);
886 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
888 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
889 rvdw = _mm_set1_pd(fr->rvdw);
891 /* Avoid stupid compiler warnings */
899 /* Start outer loop over neighborlists */
900 for(iidx=0; iidx<nri; iidx++)
902 /* Load shift vector for this list */
903 i_shift_offset = DIM*shiftidx[iidx];
905 /* Load limits for loop over neighbors */
906 j_index_start = jindex[iidx];
907 j_index_end = jindex[iidx+1];
909 /* Get outer coordinate index */
911 i_coord_offset = DIM*inr;
913 /* Load i particle coords and add shift vector */
914 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
915 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
917 fix0 = _mm_setzero_pd();
918 fiy0 = _mm_setzero_pd();
919 fiz0 = _mm_setzero_pd();
920 fix1 = _mm_setzero_pd();
921 fiy1 = _mm_setzero_pd();
922 fiz1 = _mm_setzero_pd();
923 fix2 = _mm_setzero_pd();
924 fiy2 = _mm_setzero_pd();
925 fiz2 = _mm_setzero_pd();
926 fix3 = _mm_setzero_pd();
927 fiy3 = _mm_setzero_pd();
928 fiz3 = _mm_setzero_pd();
930 /* Start inner kernel loop */
931 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
934 /* Get j neighbor index, and coordinate index */
937 j_coord_offsetA = DIM*jnrA;
938 j_coord_offsetB = DIM*jnrB;
940 /* load j atom coordinates */
941 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
944 /* Calculate displacement vector */
945 dx00 = _mm_sub_pd(ix0,jx0);
946 dy00 = _mm_sub_pd(iy0,jy0);
947 dz00 = _mm_sub_pd(iz0,jz0);
948 dx10 = _mm_sub_pd(ix1,jx0);
949 dy10 = _mm_sub_pd(iy1,jy0);
950 dz10 = _mm_sub_pd(iz1,jz0);
951 dx20 = _mm_sub_pd(ix2,jx0);
952 dy20 = _mm_sub_pd(iy2,jy0);
953 dz20 = _mm_sub_pd(iz2,jz0);
954 dx30 = _mm_sub_pd(ix3,jx0);
955 dy30 = _mm_sub_pd(iy3,jy0);
956 dz30 = _mm_sub_pd(iz3,jz0);
958 /* Calculate squared distance and things based on it */
959 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
960 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
961 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
962 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
964 rinv00 = gmx_mm_invsqrt_pd(rsq00);
965 rinv10 = gmx_mm_invsqrt_pd(rsq10);
966 rinv20 = gmx_mm_invsqrt_pd(rsq20);
967 rinv30 = gmx_mm_invsqrt_pd(rsq30);
969 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
970 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
971 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
972 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
974 /* Load parameters for j particles */
975 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
976 vdwjidx0A = 2*vdwtype[jnrA+0];
977 vdwjidx0B = 2*vdwtype[jnrB+0];
979 fjx0 = _mm_setzero_pd();
980 fjy0 = _mm_setzero_pd();
981 fjz0 = _mm_setzero_pd();
983 /**************************
984 * CALCULATE INTERACTIONS *
985 **************************/
987 if (gmx_mm_any_lt(rsq00,rcutoff2))
990 r00 = _mm_mul_pd(rsq00,rinv00);
992 /* Compute parameters for interactions between i and j atoms */
993 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
994 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
995 c6grid_00 = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
996 vdwgridparam+vdwioffset0+vdwjidx0B);
998 /* Analytical LJ-PME */
999 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1000 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
1001 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
1002 exponent = gmx_simd_exp_d(ewcljrsq);
1003 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1004 poly = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
1005 /* f6A = 6 * C6grid * (1 - poly) */
1006 f6A = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
1007 /* f6B = C6grid * exponent * beta^6 */
1008 f6B = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
1009 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1010 fvdw = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1012 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1016 fscal = _mm_and_pd(fscal,cutoff_mask);
1018 /* Update vectorial force */
1019 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1020 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1021 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1023 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1024 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1025 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1029 /**************************
1030 * CALCULATE INTERACTIONS *
1031 **************************/
1033 if (gmx_mm_any_lt(rsq10,rcutoff2))
1036 r10 = _mm_mul_pd(rsq10,rinv10);
1038 /* Compute parameters for interactions between i and j atoms */
1039 qq10 = _mm_mul_pd(iq1,jq0);
1041 /* EWALD ELECTROSTATICS */
1043 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1044 ewrt = _mm_mul_pd(r10,ewtabscale);
1045 ewitab = _mm_cvttpd_epi32(ewrt);
1047 eweps = _mm_frcz_pd(ewrt);
1049 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1051 twoeweps = _mm_add_pd(eweps,eweps);
1052 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1054 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1055 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1057 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1061 fscal = _mm_and_pd(fscal,cutoff_mask);
1063 /* Update vectorial force */
1064 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1065 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1066 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1068 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1069 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1070 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1074 /**************************
1075 * CALCULATE INTERACTIONS *
1076 **************************/
1078 if (gmx_mm_any_lt(rsq20,rcutoff2))
1081 r20 = _mm_mul_pd(rsq20,rinv20);
1083 /* Compute parameters for interactions between i and j atoms */
1084 qq20 = _mm_mul_pd(iq2,jq0);
1086 /* EWALD ELECTROSTATICS */
1088 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1089 ewrt = _mm_mul_pd(r20,ewtabscale);
1090 ewitab = _mm_cvttpd_epi32(ewrt);
1092 eweps = _mm_frcz_pd(ewrt);
1094 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1096 twoeweps = _mm_add_pd(eweps,eweps);
1097 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1099 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1100 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1102 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1106 fscal = _mm_and_pd(fscal,cutoff_mask);
1108 /* Update vectorial force */
1109 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1110 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1111 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1113 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1114 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1115 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1119 /**************************
1120 * CALCULATE INTERACTIONS *
1121 **************************/
1123 if (gmx_mm_any_lt(rsq30,rcutoff2))
1126 r30 = _mm_mul_pd(rsq30,rinv30);
1128 /* Compute parameters for interactions between i and j atoms */
1129 qq30 = _mm_mul_pd(iq3,jq0);
1131 /* EWALD ELECTROSTATICS */
1133 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1134 ewrt = _mm_mul_pd(r30,ewtabscale);
1135 ewitab = _mm_cvttpd_epi32(ewrt);
1137 eweps = _mm_frcz_pd(ewrt);
1139 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1141 twoeweps = _mm_add_pd(eweps,eweps);
1142 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1144 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1145 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1147 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1151 fscal = _mm_and_pd(fscal,cutoff_mask);
1153 /* Update vectorial force */
1154 fix3 = _mm_macc_pd(dx30,fscal,fix3);
1155 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
1156 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
1158 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
1159 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
1160 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
1164 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1166 /* Inner loop uses 179 flops */
1169 if(jidx<j_index_end)
1173 j_coord_offsetA = DIM*jnrA;
1175 /* load j atom coordinates */
1176 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1179 /* Calculate displacement vector */
1180 dx00 = _mm_sub_pd(ix0,jx0);
1181 dy00 = _mm_sub_pd(iy0,jy0);
1182 dz00 = _mm_sub_pd(iz0,jz0);
1183 dx10 = _mm_sub_pd(ix1,jx0);
1184 dy10 = _mm_sub_pd(iy1,jy0);
1185 dz10 = _mm_sub_pd(iz1,jz0);
1186 dx20 = _mm_sub_pd(ix2,jx0);
1187 dy20 = _mm_sub_pd(iy2,jy0);
1188 dz20 = _mm_sub_pd(iz2,jz0);
1189 dx30 = _mm_sub_pd(ix3,jx0);
1190 dy30 = _mm_sub_pd(iy3,jy0);
1191 dz30 = _mm_sub_pd(iz3,jz0);
1193 /* Calculate squared distance and things based on it */
1194 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1195 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1196 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1197 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1199 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1200 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1201 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1202 rinv30 = gmx_mm_invsqrt_pd(rsq30);
1204 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1205 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1206 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1207 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
1209 /* Load parameters for j particles */
1210 jq0 = _mm_load_sd(charge+jnrA+0);
1211 vdwjidx0A = 2*vdwtype[jnrA+0];
1213 fjx0 = _mm_setzero_pd();
1214 fjy0 = _mm_setzero_pd();
1215 fjz0 = _mm_setzero_pd();
1217 /**************************
1218 * CALCULATE INTERACTIONS *
1219 **************************/
1221 if (gmx_mm_any_lt(rsq00,rcutoff2))
1224 r00 = _mm_mul_pd(rsq00,rinv00);
1226 /* Compute parameters for interactions between i and j atoms */
1227 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
1228 c6grid_00 = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
1230 /* Analytical LJ-PME */
1231 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1232 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
1233 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
1234 exponent = gmx_simd_exp_d(ewcljrsq);
1235 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1236 poly = _mm_mul_pd(exponent,_mm_macc_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half,_mm_sub_pd(one,ewcljrsq)));
1237 /* f6A = 6 * C6grid * (1 - poly) */
1238 f6A = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
1239 /* f6B = C6grid * exponent * beta^6 */
1240 f6B = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
1241 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1242 fvdw = _mm_mul_pd(_mm_macc_pd(_mm_msub_pd(c12_00,rinvsix,_mm_sub_pd(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1244 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1248 fscal = _mm_and_pd(fscal,cutoff_mask);
1250 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1252 /* Update vectorial force */
1253 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1254 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1255 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1257 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1258 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1259 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1263 /**************************
1264 * CALCULATE INTERACTIONS *
1265 **************************/
1267 if (gmx_mm_any_lt(rsq10,rcutoff2))
1270 r10 = _mm_mul_pd(rsq10,rinv10);
1272 /* Compute parameters for interactions between i and j atoms */
1273 qq10 = _mm_mul_pd(iq1,jq0);
1275 /* EWALD ELECTROSTATICS */
1277 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1278 ewrt = _mm_mul_pd(r10,ewtabscale);
1279 ewitab = _mm_cvttpd_epi32(ewrt);
1281 eweps = _mm_frcz_pd(ewrt);
1283 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1285 twoeweps = _mm_add_pd(eweps,eweps);
1286 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1287 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1288 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1290 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1294 fscal = _mm_and_pd(fscal,cutoff_mask);
1296 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1298 /* Update vectorial force */
1299 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1300 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1301 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1303 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1304 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1305 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1309 /**************************
1310 * CALCULATE INTERACTIONS *
1311 **************************/
1313 if (gmx_mm_any_lt(rsq20,rcutoff2))
1316 r20 = _mm_mul_pd(rsq20,rinv20);
1318 /* Compute parameters for interactions between i and j atoms */
1319 qq20 = _mm_mul_pd(iq2,jq0);
1321 /* EWALD ELECTROSTATICS */
1323 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1324 ewrt = _mm_mul_pd(r20,ewtabscale);
1325 ewitab = _mm_cvttpd_epi32(ewrt);
1327 eweps = _mm_frcz_pd(ewrt);
1329 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1331 twoeweps = _mm_add_pd(eweps,eweps);
1332 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1333 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1334 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1336 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1340 fscal = _mm_and_pd(fscal,cutoff_mask);
1342 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1344 /* Update vectorial force */
1345 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1346 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1347 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1349 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1350 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1351 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1355 /**************************
1356 * CALCULATE INTERACTIONS *
1357 **************************/
1359 if (gmx_mm_any_lt(rsq30,rcutoff2))
1362 r30 = _mm_mul_pd(rsq30,rinv30);
1364 /* Compute parameters for interactions between i and j atoms */
1365 qq30 = _mm_mul_pd(iq3,jq0);
1367 /* EWALD ELECTROSTATICS */
1369 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1370 ewrt = _mm_mul_pd(r30,ewtabscale);
1371 ewitab = _mm_cvttpd_epi32(ewrt);
1373 eweps = _mm_frcz_pd(ewrt);
1375 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1377 twoeweps = _mm_add_pd(eweps,eweps);
1378 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1379 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1380 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1382 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1386 fscal = _mm_and_pd(fscal,cutoff_mask);
1388 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1390 /* Update vectorial force */
1391 fix3 = _mm_macc_pd(dx30,fscal,fix3);
1392 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
1393 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
1395 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
1396 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
1397 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
1401 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1403 /* Inner loop uses 179 flops */
1406 /* End of innermost loop */
1408 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1409 f+i_coord_offset,fshift+i_shift_offset);
1411 /* Increment number of inner iterations */
1412 inneriter += j_index_end - j_index_start;
1414 /* Outer loop uses 24 flops */
1417 /* Increment number of outer iterations */
1420 /* Update outer/inner flops */
1422 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*179);