2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LJEwald
53 * Geometry: Water4-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87 int vdwjidx0A,vdwjidx0B;
88 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
90 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
91 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
92 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
93 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
96 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
100 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
105 __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
107 __m128d one_half = _mm_set1_pd(0.5);
108 __m128d minus_one = _mm_set1_pd(-1.0);
110 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
112 __m128d dummy_mask,cutoff_mask;
113 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
114 __m128d one = _mm_set1_pd(1.0);
115 __m128d two = _mm_set1_pd(2.0);
121 jindex = nlist->jindex;
123 shiftidx = nlist->shift;
125 shiftvec = fr->shift_vec[0];
126 fshift = fr->fshift[0];
127 facel = _mm_set1_pd(fr->ic->epsfac);
128 charge = mdatoms->chargeA;
129 nvdwtype = fr->ntype;
131 vdwtype = mdatoms->typeA;
132 vdwgridparam = fr->ljpme_c6grid;
133 sh_lj_ewald = _mm_set1_pd(fr->ic->sh_lj_ewald);
134 ewclj = _mm_set1_pd(fr->ic->ewaldcoeff_lj);
135 ewclj2 = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
137 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
138 ewtab = fr->ic->tabq_coul_FDV0;
139 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
140 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
142 /* Setup water-specific parameters */
143 inr = nlist->iinr[0];
144 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
145 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
146 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
147 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
149 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150 rcutoff_scalar = fr->ic->rcoulomb;
151 rcutoff = _mm_set1_pd(rcutoff_scalar);
152 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
154 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
155 rvdw = _mm_set1_pd(fr->ic->rvdw);
157 /* Avoid stupid compiler warnings */
165 /* Start outer loop over neighborlists */
166 for(iidx=0; iidx<nri; iidx++)
168 /* Load shift vector for this list */
169 i_shift_offset = DIM*shiftidx[iidx];
171 /* Load limits for loop over neighbors */
172 j_index_start = jindex[iidx];
173 j_index_end = jindex[iidx+1];
175 /* Get outer coordinate index */
177 i_coord_offset = DIM*inr;
179 /* Load i particle coords and add shift vector */
180 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
181 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
183 fix0 = _mm_setzero_pd();
184 fiy0 = _mm_setzero_pd();
185 fiz0 = _mm_setzero_pd();
186 fix1 = _mm_setzero_pd();
187 fiy1 = _mm_setzero_pd();
188 fiz1 = _mm_setzero_pd();
189 fix2 = _mm_setzero_pd();
190 fiy2 = _mm_setzero_pd();
191 fiz2 = _mm_setzero_pd();
192 fix3 = _mm_setzero_pd();
193 fiy3 = _mm_setzero_pd();
194 fiz3 = _mm_setzero_pd();
196 /* Reset potential sums */
197 velecsum = _mm_setzero_pd();
198 vvdwsum = _mm_setzero_pd();
200 /* Start inner kernel loop */
201 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
204 /* Get j neighbor index, and coordinate index */
207 j_coord_offsetA = DIM*jnrA;
208 j_coord_offsetB = DIM*jnrB;
210 /* load j atom coordinates */
211 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214 /* Calculate displacement vector */
215 dx00 = _mm_sub_pd(ix0,jx0);
216 dy00 = _mm_sub_pd(iy0,jy0);
217 dz00 = _mm_sub_pd(iz0,jz0);
218 dx10 = _mm_sub_pd(ix1,jx0);
219 dy10 = _mm_sub_pd(iy1,jy0);
220 dz10 = _mm_sub_pd(iz1,jz0);
221 dx20 = _mm_sub_pd(ix2,jx0);
222 dy20 = _mm_sub_pd(iy2,jy0);
223 dz20 = _mm_sub_pd(iz2,jz0);
224 dx30 = _mm_sub_pd(ix3,jx0);
225 dy30 = _mm_sub_pd(iy3,jy0);
226 dz30 = _mm_sub_pd(iz3,jz0);
228 /* Calculate squared distance and things based on it */
229 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
230 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
231 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
232 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
234 rinv00 = sse41_invsqrt_d(rsq00);
235 rinv10 = sse41_invsqrt_d(rsq10);
236 rinv20 = sse41_invsqrt_d(rsq20);
237 rinv30 = sse41_invsqrt_d(rsq30);
239 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
240 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
241 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
242 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
244 /* Load parameters for j particles */
245 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
246 vdwjidx0A = 2*vdwtype[jnrA+0];
247 vdwjidx0B = 2*vdwtype[jnrB+0];
249 fjx0 = _mm_setzero_pd();
250 fjy0 = _mm_setzero_pd();
251 fjz0 = _mm_setzero_pd();
253 /**************************
254 * CALCULATE INTERACTIONS *
255 **************************/
257 if (gmx_mm_any_lt(rsq00,rcutoff2))
260 r00 = _mm_mul_pd(rsq00,rinv00);
262 /* Compute parameters for interactions between i and j atoms */
263 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
264 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
265 c6grid_00 = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
266 vdwgridparam+vdwioffset0+vdwjidx0B);
268 /* Analytical LJ-PME */
269 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
270 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
271 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
272 exponent = sse41_exp_d(ewcljrsq);
273 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
274 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
275 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
276 vvdw6 = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
277 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
278 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
279 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
280 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
281 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
283 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
285 /* Update potential sum for this i atom from the interaction with this j atom. */
286 vvdw = _mm_and_pd(vvdw,cutoff_mask);
287 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
291 fscal = _mm_and_pd(fscal,cutoff_mask);
293 /* Calculate temporary vectorial force */
294 tx = _mm_mul_pd(fscal,dx00);
295 ty = _mm_mul_pd(fscal,dy00);
296 tz = _mm_mul_pd(fscal,dz00);
298 /* Update vectorial force */
299 fix0 = _mm_add_pd(fix0,tx);
300 fiy0 = _mm_add_pd(fiy0,ty);
301 fiz0 = _mm_add_pd(fiz0,tz);
303 fjx0 = _mm_add_pd(fjx0,tx);
304 fjy0 = _mm_add_pd(fjy0,ty);
305 fjz0 = _mm_add_pd(fjz0,tz);
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 if (gmx_mm_any_lt(rsq10,rcutoff2))
316 r10 = _mm_mul_pd(rsq10,rinv10);
318 /* Compute parameters for interactions between i and j atoms */
319 qq10 = _mm_mul_pd(iq1,jq0);
321 /* EWALD ELECTROSTATICS */
323 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
324 ewrt = _mm_mul_pd(r10,ewtabscale);
325 ewitab = _mm_cvttpd_epi32(ewrt);
326 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
327 ewitab = _mm_slli_epi32(ewitab,2);
328 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
329 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
330 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
331 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
332 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
333 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
334 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
335 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
336 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
337 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
339 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
341 /* Update potential sum for this i atom from the interaction with this j atom. */
342 velec = _mm_and_pd(velec,cutoff_mask);
343 velecsum = _mm_add_pd(velecsum,velec);
347 fscal = _mm_and_pd(fscal,cutoff_mask);
349 /* Calculate temporary vectorial force */
350 tx = _mm_mul_pd(fscal,dx10);
351 ty = _mm_mul_pd(fscal,dy10);
352 tz = _mm_mul_pd(fscal,dz10);
354 /* Update vectorial force */
355 fix1 = _mm_add_pd(fix1,tx);
356 fiy1 = _mm_add_pd(fiy1,ty);
357 fiz1 = _mm_add_pd(fiz1,tz);
359 fjx0 = _mm_add_pd(fjx0,tx);
360 fjy0 = _mm_add_pd(fjy0,ty);
361 fjz0 = _mm_add_pd(fjz0,tz);
365 /**************************
366 * CALCULATE INTERACTIONS *
367 **************************/
369 if (gmx_mm_any_lt(rsq20,rcutoff2))
372 r20 = _mm_mul_pd(rsq20,rinv20);
374 /* Compute parameters for interactions between i and j atoms */
375 qq20 = _mm_mul_pd(iq2,jq0);
377 /* EWALD ELECTROSTATICS */
379 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
380 ewrt = _mm_mul_pd(r20,ewtabscale);
381 ewitab = _mm_cvttpd_epi32(ewrt);
382 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
383 ewitab = _mm_slli_epi32(ewitab,2);
384 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
385 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
386 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
387 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
388 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
389 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
390 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
391 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
392 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
393 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
395 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
397 /* Update potential sum for this i atom from the interaction with this j atom. */
398 velec = _mm_and_pd(velec,cutoff_mask);
399 velecsum = _mm_add_pd(velecsum,velec);
403 fscal = _mm_and_pd(fscal,cutoff_mask);
405 /* Calculate temporary vectorial force */
406 tx = _mm_mul_pd(fscal,dx20);
407 ty = _mm_mul_pd(fscal,dy20);
408 tz = _mm_mul_pd(fscal,dz20);
410 /* Update vectorial force */
411 fix2 = _mm_add_pd(fix2,tx);
412 fiy2 = _mm_add_pd(fiy2,ty);
413 fiz2 = _mm_add_pd(fiz2,tz);
415 fjx0 = _mm_add_pd(fjx0,tx);
416 fjy0 = _mm_add_pd(fjy0,ty);
417 fjz0 = _mm_add_pd(fjz0,tz);
421 /**************************
422 * CALCULATE INTERACTIONS *
423 **************************/
425 if (gmx_mm_any_lt(rsq30,rcutoff2))
428 r30 = _mm_mul_pd(rsq30,rinv30);
430 /* Compute parameters for interactions between i and j atoms */
431 qq30 = _mm_mul_pd(iq3,jq0);
433 /* EWALD ELECTROSTATICS */
435 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
436 ewrt = _mm_mul_pd(r30,ewtabscale);
437 ewitab = _mm_cvttpd_epi32(ewrt);
438 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
439 ewitab = _mm_slli_epi32(ewitab,2);
440 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
441 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
442 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
443 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
444 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
445 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
446 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
447 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
448 velec = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
449 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
451 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
453 /* Update potential sum for this i atom from the interaction with this j atom. */
454 velec = _mm_and_pd(velec,cutoff_mask);
455 velecsum = _mm_add_pd(velecsum,velec);
459 fscal = _mm_and_pd(fscal,cutoff_mask);
461 /* Calculate temporary vectorial force */
462 tx = _mm_mul_pd(fscal,dx30);
463 ty = _mm_mul_pd(fscal,dy30);
464 tz = _mm_mul_pd(fscal,dz30);
466 /* Update vectorial force */
467 fix3 = _mm_add_pd(fix3,tx);
468 fiy3 = _mm_add_pd(fiy3,ty);
469 fiz3 = _mm_add_pd(fiz3,tz);
471 fjx0 = _mm_add_pd(fjx0,tx);
472 fjy0 = _mm_add_pd(fjy0,ty);
473 fjz0 = _mm_add_pd(fjz0,tz);
477 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
479 /* Inner loop uses 202 flops */
486 j_coord_offsetA = DIM*jnrA;
488 /* load j atom coordinates */
489 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
492 /* Calculate displacement vector */
493 dx00 = _mm_sub_pd(ix0,jx0);
494 dy00 = _mm_sub_pd(iy0,jy0);
495 dz00 = _mm_sub_pd(iz0,jz0);
496 dx10 = _mm_sub_pd(ix1,jx0);
497 dy10 = _mm_sub_pd(iy1,jy0);
498 dz10 = _mm_sub_pd(iz1,jz0);
499 dx20 = _mm_sub_pd(ix2,jx0);
500 dy20 = _mm_sub_pd(iy2,jy0);
501 dz20 = _mm_sub_pd(iz2,jz0);
502 dx30 = _mm_sub_pd(ix3,jx0);
503 dy30 = _mm_sub_pd(iy3,jy0);
504 dz30 = _mm_sub_pd(iz3,jz0);
506 /* Calculate squared distance and things based on it */
507 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
508 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
509 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
510 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
512 rinv00 = sse41_invsqrt_d(rsq00);
513 rinv10 = sse41_invsqrt_d(rsq10);
514 rinv20 = sse41_invsqrt_d(rsq20);
515 rinv30 = sse41_invsqrt_d(rsq30);
517 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
518 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
519 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
520 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
522 /* Load parameters for j particles */
523 jq0 = _mm_load_sd(charge+jnrA+0);
524 vdwjidx0A = 2*vdwtype[jnrA+0];
526 fjx0 = _mm_setzero_pd();
527 fjy0 = _mm_setzero_pd();
528 fjz0 = _mm_setzero_pd();
530 /**************************
531 * CALCULATE INTERACTIONS *
532 **************************/
534 if (gmx_mm_any_lt(rsq00,rcutoff2))
537 r00 = _mm_mul_pd(rsq00,rinv00);
539 /* Compute parameters for interactions between i and j atoms */
540 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
542 c6grid_00 = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
544 /* Analytical LJ-PME */
545 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
546 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
547 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
548 exponent = sse41_exp_d(ewcljrsq);
549 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
550 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
551 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
552 vvdw6 = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
553 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
554 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
555 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
556 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
557 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
559 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
561 /* Update potential sum for this i atom from the interaction with this j atom. */
562 vvdw = _mm_and_pd(vvdw,cutoff_mask);
563 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
564 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
568 fscal = _mm_and_pd(fscal,cutoff_mask);
570 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
572 /* Calculate temporary vectorial force */
573 tx = _mm_mul_pd(fscal,dx00);
574 ty = _mm_mul_pd(fscal,dy00);
575 tz = _mm_mul_pd(fscal,dz00);
577 /* Update vectorial force */
578 fix0 = _mm_add_pd(fix0,tx);
579 fiy0 = _mm_add_pd(fiy0,ty);
580 fiz0 = _mm_add_pd(fiz0,tz);
582 fjx0 = _mm_add_pd(fjx0,tx);
583 fjy0 = _mm_add_pd(fjy0,ty);
584 fjz0 = _mm_add_pd(fjz0,tz);
588 /**************************
589 * CALCULATE INTERACTIONS *
590 **************************/
592 if (gmx_mm_any_lt(rsq10,rcutoff2))
595 r10 = _mm_mul_pd(rsq10,rinv10);
597 /* Compute parameters for interactions between i and j atoms */
598 qq10 = _mm_mul_pd(iq1,jq0);
600 /* EWALD ELECTROSTATICS */
602 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
603 ewrt = _mm_mul_pd(r10,ewtabscale);
604 ewitab = _mm_cvttpd_epi32(ewrt);
605 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
606 ewitab = _mm_slli_epi32(ewitab,2);
607 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
608 ewtabD = _mm_setzero_pd();
609 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
610 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
611 ewtabFn = _mm_setzero_pd();
612 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
613 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
614 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
615 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
616 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
618 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
620 /* Update potential sum for this i atom from the interaction with this j atom. */
621 velec = _mm_and_pd(velec,cutoff_mask);
622 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
623 velecsum = _mm_add_pd(velecsum,velec);
627 fscal = _mm_and_pd(fscal,cutoff_mask);
629 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
631 /* Calculate temporary vectorial force */
632 tx = _mm_mul_pd(fscal,dx10);
633 ty = _mm_mul_pd(fscal,dy10);
634 tz = _mm_mul_pd(fscal,dz10);
636 /* Update vectorial force */
637 fix1 = _mm_add_pd(fix1,tx);
638 fiy1 = _mm_add_pd(fiy1,ty);
639 fiz1 = _mm_add_pd(fiz1,tz);
641 fjx0 = _mm_add_pd(fjx0,tx);
642 fjy0 = _mm_add_pd(fjy0,ty);
643 fjz0 = _mm_add_pd(fjz0,tz);
647 /**************************
648 * CALCULATE INTERACTIONS *
649 **************************/
651 if (gmx_mm_any_lt(rsq20,rcutoff2))
654 r20 = _mm_mul_pd(rsq20,rinv20);
656 /* Compute parameters for interactions between i and j atoms */
657 qq20 = _mm_mul_pd(iq2,jq0);
659 /* EWALD ELECTROSTATICS */
661 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
662 ewrt = _mm_mul_pd(r20,ewtabscale);
663 ewitab = _mm_cvttpd_epi32(ewrt);
664 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
665 ewitab = _mm_slli_epi32(ewitab,2);
666 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
667 ewtabD = _mm_setzero_pd();
668 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
669 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
670 ewtabFn = _mm_setzero_pd();
671 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
672 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
673 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
674 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
675 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
677 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
679 /* Update potential sum for this i atom from the interaction with this j atom. */
680 velec = _mm_and_pd(velec,cutoff_mask);
681 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
682 velecsum = _mm_add_pd(velecsum,velec);
686 fscal = _mm_and_pd(fscal,cutoff_mask);
688 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
690 /* Calculate temporary vectorial force */
691 tx = _mm_mul_pd(fscal,dx20);
692 ty = _mm_mul_pd(fscal,dy20);
693 tz = _mm_mul_pd(fscal,dz20);
695 /* Update vectorial force */
696 fix2 = _mm_add_pd(fix2,tx);
697 fiy2 = _mm_add_pd(fiy2,ty);
698 fiz2 = _mm_add_pd(fiz2,tz);
700 fjx0 = _mm_add_pd(fjx0,tx);
701 fjy0 = _mm_add_pd(fjy0,ty);
702 fjz0 = _mm_add_pd(fjz0,tz);
706 /**************************
707 * CALCULATE INTERACTIONS *
708 **************************/
710 if (gmx_mm_any_lt(rsq30,rcutoff2))
713 r30 = _mm_mul_pd(rsq30,rinv30);
715 /* Compute parameters for interactions between i and j atoms */
716 qq30 = _mm_mul_pd(iq3,jq0);
718 /* EWALD ELECTROSTATICS */
720 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
721 ewrt = _mm_mul_pd(r30,ewtabscale);
722 ewitab = _mm_cvttpd_epi32(ewrt);
723 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
724 ewitab = _mm_slli_epi32(ewitab,2);
725 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
726 ewtabD = _mm_setzero_pd();
727 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
728 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
729 ewtabFn = _mm_setzero_pd();
730 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
731 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
732 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
733 velec = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
734 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
736 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
738 /* Update potential sum for this i atom from the interaction with this j atom. */
739 velec = _mm_and_pd(velec,cutoff_mask);
740 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
741 velecsum = _mm_add_pd(velecsum,velec);
745 fscal = _mm_and_pd(fscal,cutoff_mask);
747 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
749 /* Calculate temporary vectorial force */
750 tx = _mm_mul_pd(fscal,dx30);
751 ty = _mm_mul_pd(fscal,dy30);
752 tz = _mm_mul_pd(fscal,dz30);
754 /* Update vectorial force */
755 fix3 = _mm_add_pd(fix3,tx);
756 fiy3 = _mm_add_pd(fiy3,ty);
757 fiz3 = _mm_add_pd(fiz3,tz);
759 fjx0 = _mm_add_pd(fjx0,tx);
760 fjy0 = _mm_add_pd(fjy0,ty);
761 fjz0 = _mm_add_pd(fjz0,tz);
765 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
767 /* Inner loop uses 202 flops */
770 /* End of innermost loop */
772 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
773 f+i_coord_offset,fshift+i_shift_offset);
776 /* Update potential energies */
777 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
778 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
780 /* Increment number of inner iterations */
781 inneriter += j_index_end - j_index_start;
783 /* Outer loop uses 26 flops */
786 /* Increment number of outer iterations */
789 /* Update outer/inner flops */
791 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*202);
794 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_double
795 * Electrostatics interaction: Ewald
796 * VdW interaction: LJEwald
797 * Geometry: Water4-Particle
798 * Calculate force/pot: Force
801 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_double
802 (t_nblist * gmx_restrict nlist,
803 rvec * gmx_restrict xx,
804 rvec * gmx_restrict ff,
805 struct t_forcerec * gmx_restrict fr,
806 t_mdatoms * gmx_restrict mdatoms,
807 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
808 t_nrnb * gmx_restrict nrnb)
810 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
811 * just 0 for non-waters.
812 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
813 * jnr indices corresponding to data put in the four positions in the SIMD register.
815 int i_shift_offset,i_coord_offset,outeriter,inneriter;
816 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
818 int j_coord_offsetA,j_coord_offsetB;
819 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
821 real *shiftvec,*fshift,*x,*f;
822 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
824 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
826 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
828 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
830 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
831 int vdwjidx0A,vdwjidx0B;
832 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
833 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
834 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
835 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
836 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
837 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
840 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
843 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
844 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
849 __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
851 __m128d one_half = _mm_set1_pd(0.5);
852 __m128d minus_one = _mm_set1_pd(-1.0);
854 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
856 __m128d dummy_mask,cutoff_mask;
857 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
858 __m128d one = _mm_set1_pd(1.0);
859 __m128d two = _mm_set1_pd(2.0);
865 jindex = nlist->jindex;
867 shiftidx = nlist->shift;
869 shiftvec = fr->shift_vec[0];
870 fshift = fr->fshift[0];
871 facel = _mm_set1_pd(fr->ic->epsfac);
872 charge = mdatoms->chargeA;
873 nvdwtype = fr->ntype;
875 vdwtype = mdatoms->typeA;
876 vdwgridparam = fr->ljpme_c6grid;
877 sh_lj_ewald = _mm_set1_pd(fr->ic->sh_lj_ewald);
878 ewclj = _mm_set1_pd(fr->ic->ewaldcoeff_lj);
879 ewclj2 = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
881 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
882 ewtab = fr->ic->tabq_coul_F;
883 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
884 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
886 /* Setup water-specific parameters */
887 inr = nlist->iinr[0];
888 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
889 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
890 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
891 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
893 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
894 rcutoff_scalar = fr->ic->rcoulomb;
895 rcutoff = _mm_set1_pd(rcutoff_scalar);
896 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
898 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
899 rvdw = _mm_set1_pd(fr->ic->rvdw);
901 /* Avoid stupid compiler warnings */
909 /* Start outer loop over neighborlists */
910 for(iidx=0; iidx<nri; iidx++)
912 /* Load shift vector for this list */
913 i_shift_offset = DIM*shiftidx[iidx];
915 /* Load limits for loop over neighbors */
916 j_index_start = jindex[iidx];
917 j_index_end = jindex[iidx+1];
919 /* Get outer coordinate index */
921 i_coord_offset = DIM*inr;
923 /* Load i particle coords and add shift vector */
924 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
925 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
927 fix0 = _mm_setzero_pd();
928 fiy0 = _mm_setzero_pd();
929 fiz0 = _mm_setzero_pd();
930 fix1 = _mm_setzero_pd();
931 fiy1 = _mm_setzero_pd();
932 fiz1 = _mm_setzero_pd();
933 fix2 = _mm_setzero_pd();
934 fiy2 = _mm_setzero_pd();
935 fiz2 = _mm_setzero_pd();
936 fix3 = _mm_setzero_pd();
937 fiy3 = _mm_setzero_pd();
938 fiz3 = _mm_setzero_pd();
940 /* Start inner kernel loop */
941 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
944 /* Get j neighbor index, and coordinate index */
947 j_coord_offsetA = DIM*jnrA;
948 j_coord_offsetB = DIM*jnrB;
950 /* load j atom coordinates */
951 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
954 /* Calculate displacement vector */
955 dx00 = _mm_sub_pd(ix0,jx0);
956 dy00 = _mm_sub_pd(iy0,jy0);
957 dz00 = _mm_sub_pd(iz0,jz0);
958 dx10 = _mm_sub_pd(ix1,jx0);
959 dy10 = _mm_sub_pd(iy1,jy0);
960 dz10 = _mm_sub_pd(iz1,jz0);
961 dx20 = _mm_sub_pd(ix2,jx0);
962 dy20 = _mm_sub_pd(iy2,jy0);
963 dz20 = _mm_sub_pd(iz2,jz0);
964 dx30 = _mm_sub_pd(ix3,jx0);
965 dy30 = _mm_sub_pd(iy3,jy0);
966 dz30 = _mm_sub_pd(iz3,jz0);
968 /* Calculate squared distance and things based on it */
969 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
970 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
971 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
972 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
974 rinv00 = sse41_invsqrt_d(rsq00);
975 rinv10 = sse41_invsqrt_d(rsq10);
976 rinv20 = sse41_invsqrt_d(rsq20);
977 rinv30 = sse41_invsqrt_d(rsq30);
979 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
980 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
981 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
982 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
984 /* Load parameters for j particles */
985 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
986 vdwjidx0A = 2*vdwtype[jnrA+0];
987 vdwjidx0B = 2*vdwtype[jnrB+0];
989 fjx0 = _mm_setzero_pd();
990 fjy0 = _mm_setzero_pd();
991 fjz0 = _mm_setzero_pd();
993 /**************************
994 * CALCULATE INTERACTIONS *
995 **************************/
997 if (gmx_mm_any_lt(rsq00,rcutoff2))
1000 r00 = _mm_mul_pd(rsq00,rinv00);
1002 /* Compute parameters for interactions between i and j atoms */
1003 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
1004 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
1005 c6grid_00 = gmx_mm_load_2real_swizzle_pd(vdwgridparam+vdwioffset0+vdwjidx0A,
1006 vdwgridparam+vdwioffset0+vdwjidx0B);
1008 /* Analytical LJ-PME */
1009 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1010 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
1011 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
1012 exponent = sse41_exp_d(ewcljrsq);
1013 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1014 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1015 /* f6A = 6 * C6grid * (1 - poly) */
1016 f6A = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
1017 /* f6B = C6grid * exponent * beta^6 */
1018 f6B = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
1019 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1020 fvdw = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1022 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1026 fscal = _mm_and_pd(fscal,cutoff_mask);
1028 /* Calculate temporary vectorial force */
1029 tx = _mm_mul_pd(fscal,dx00);
1030 ty = _mm_mul_pd(fscal,dy00);
1031 tz = _mm_mul_pd(fscal,dz00);
1033 /* Update vectorial force */
1034 fix0 = _mm_add_pd(fix0,tx);
1035 fiy0 = _mm_add_pd(fiy0,ty);
1036 fiz0 = _mm_add_pd(fiz0,tz);
1038 fjx0 = _mm_add_pd(fjx0,tx);
1039 fjy0 = _mm_add_pd(fjy0,ty);
1040 fjz0 = _mm_add_pd(fjz0,tz);
1044 /**************************
1045 * CALCULATE INTERACTIONS *
1046 **************************/
1048 if (gmx_mm_any_lt(rsq10,rcutoff2))
1051 r10 = _mm_mul_pd(rsq10,rinv10);
1053 /* Compute parameters for interactions between i and j atoms */
1054 qq10 = _mm_mul_pd(iq1,jq0);
1056 /* EWALD ELECTROSTATICS */
1058 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1059 ewrt = _mm_mul_pd(r10,ewtabscale);
1060 ewitab = _mm_cvttpd_epi32(ewrt);
1061 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1062 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1064 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1065 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1067 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1071 fscal = _mm_and_pd(fscal,cutoff_mask);
1073 /* Calculate temporary vectorial force */
1074 tx = _mm_mul_pd(fscal,dx10);
1075 ty = _mm_mul_pd(fscal,dy10);
1076 tz = _mm_mul_pd(fscal,dz10);
1078 /* Update vectorial force */
1079 fix1 = _mm_add_pd(fix1,tx);
1080 fiy1 = _mm_add_pd(fiy1,ty);
1081 fiz1 = _mm_add_pd(fiz1,tz);
1083 fjx0 = _mm_add_pd(fjx0,tx);
1084 fjy0 = _mm_add_pd(fjy0,ty);
1085 fjz0 = _mm_add_pd(fjz0,tz);
1089 /**************************
1090 * CALCULATE INTERACTIONS *
1091 **************************/
1093 if (gmx_mm_any_lt(rsq20,rcutoff2))
1096 r20 = _mm_mul_pd(rsq20,rinv20);
1098 /* Compute parameters for interactions between i and j atoms */
1099 qq20 = _mm_mul_pd(iq2,jq0);
1101 /* EWALD ELECTROSTATICS */
1103 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1104 ewrt = _mm_mul_pd(r20,ewtabscale);
1105 ewitab = _mm_cvttpd_epi32(ewrt);
1106 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1107 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1109 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1110 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1112 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1116 fscal = _mm_and_pd(fscal,cutoff_mask);
1118 /* Calculate temporary vectorial force */
1119 tx = _mm_mul_pd(fscal,dx20);
1120 ty = _mm_mul_pd(fscal,dy20);
1121 tz = _mm_mul_pd(fscal,dz20);
1123 /* Update vectorial force */
1124 fix2 = _mm_add_pd(fix2,tx);
1125 fiy2 = _mm_add_pd(fiy2,ty);
1126 fiz2 = _mm_add_pd(fiz2,tz);
1128 fjx0 = _mm_add_pd(fjx0,tx);
1129 fjy0 = _mm_add_pd(fjy0,ty);
1130 fjz0 = _mm_add_pd(fjz0,tz);
1134 /**************************
1135 * CALCULATE INTERACTIONS *
1136 **************************/
1138 if (gmx_mm_any_lt(rsq30,rcutoff2))
1141 r30 = _mm_mul_pd(rsq30,rinv30);
1143 /* Compute parameters for interactions between i and j atoms */
1144 qq30 = _mm_mul_pd(iq3,jq0);
1146 /* EWALD ELECTROSTATICS */
1148 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1149 ewrt = _mm_mul_pd(r30,ewtabscale);
1150 ewitab = _mm_cvttpd_epi32(ewrt);
1151 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1152 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1154 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1155 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1157 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1161 fscal = _mm_and_pd(fscal,cutoff_mask);
1163 /* Calculate temporary vectorial force */
1164 tx = _mm_mul_pd(fscal,dx30);
1165 ty = _mm_mul_pd(fscal,dy30);
1166 tz = _mm_mul_pd(fscal,dz30);
1168 /* Update vectorial force */
1169 fix3 = _mm_add_pd(fix3,tx);
1170 fiy3 = _mm_add_pd(fiy3,ty);
1171 fiz3 = _mm_add_pd(fiz3,tz);
1173 fjx0 = _mm_add_pd(fjx0,tx);
1174 fjy0 = _mm_add_pd(fjy0,ty);
1175 fjz0 = _mm_add_pd(fjz0,tz);
1179 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1181 /* Inner loop uses 169 flops */
1184 if(jidx<j_index_end)
1188 j_coord_offsetA = DIM*jnrA;
1190 /* load j atom coordinates */
1191 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1194 /* Calculate displacement vector */
1195 dx00 = _mm_sub_pd(ix0,jx0);
1196 dy00 = _mm_sub_pd(iy0,jy0);
1197 dz00 = _mm_sub_pd(iz0,jz0);
1198 dx10 = _mm_sub_pd(ix1,jx0);
1199 dy10 = _mm_sub_pd(iy1,jy0);
1200 dz10 = _mm_sub_pd(iz1,jz0);
1201 dx20 = _mm_sub_pd(ix2,jx0);
1202 dy20 = _mm_sub_pd(iy2,jy0);
1203 dz20 = _mm_sub_pd(iz2,jz0);
1204 dx30 = _mm_sub_pd(ix3,jx0);
1205 dy30 = _mm_sub_pd(iy3,jy0);
1206 dz30 = _mm_sub_pd(iz3,jz0);
1208 /* Calculate squared distance and things based on it */
1209 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1210 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1211 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1212 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1214 rinv00 = sse41_invsqrt_d(rsq00);
1215 rinv10 = sse41_invsqrt_d(rsq10);
1216 rinv20 = sse41_invsqrt_d(rsq20);
1217 rinv30 = sse41_invsqrt_d(rsq30);
1219 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1220 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1221 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1222 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
1224 /* Load parameters for j particles */
1225 jq0 = _mm_load_sd(charge+jnrA+0);
1226 vdwjidx0A = 2*vdwtype[jnrA+0];
1228 fjx0 = _mm_setzero_pd();
1229 fjy0 = _mm_setzero_pd();
1230 fjz0 = _mm_setzero_pd();
1232 /**************************
1233 * CALCULATE INTERACTIONS *
1234 **************************/
1236 if (gmx_mm_any_lt(rsq00,rcutoff2))
1239 r00 = _mm_mul_pd(rsq00,rinv00);
1241 /* Compute parameters for interactions between i and j atoms */
1242 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
1244 c6grid_00 = gmx_mm_load_1real_pd(vdwgridparam+vdwioffset0+vdwjidx0A);
1246 /* Analytical LJ-PME */
1247 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1248 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
1249 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
1250 exponent = sse41_exp_d(ewcljrsq);
1251 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1252 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1253 /* f6A = 6 * C6grid * (1 - poly) */
1254 f6A = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
1255 /* f6B = C6grid * exponent * beta^6 */
1256 f6B = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
1257 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1258 fvdw = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1260 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1264 fscal = _mm_and_pd(fscal,cutoff_mask);
1266 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1268 /* Calculate temporary vectorial force */
1269 tx = _mm_mul_pd(fscal,dx00);
1270 ty = _mm_mul_pd(fscal,dy00);
1271 tz = _mm_mul_pd(fscal,dz00);
1273 /* Update vectorial force */
1274 fix0 = _mm_add_pd(fix0,tx);
1275 fiy0 = _mm_add_pd(fiy0,ty);
1276 fiz0 = _mm_add_pd(fiz0,tz);
1278 fjx0 = _mm_add_pd(fjx0,tx);
1279 fjy0 = _mm_add_pd(fjy0,ty);
1280 fjz0 = _mm_add_pd(fjz0,tz);
1284 /**************************
1285 * CALCULATE INTERACTIONS *
1286 **************************/
1288 if (gmx_mm_any_lt(rsq10,rcutoff2))
1291 r10 = _mm_mul_pd(rsq10,rinv10);
1293 /* Compute parameters for interactions between i and j atoms */
1294 qq10 = _mm_mul_pd(iq1,jq0);
1296 /* EWALD ELECTROSTATICS */
1298 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1299 ewrt = _mm_mul_pd(r10,ewtabscale);
1300 ewitab = _mm_cvttpd_epi32(ewrt);
1301 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1302 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1303 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1304 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1306 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1310 fscal = _mm_and_pd(fscal,cutoff_mask);
1312 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1314 /* Calculate temporary vectorial force */
1315 tx = _mm_mul_pd(fscal,dx10);
1316 ty = _mm_mul_pd(fscal,dy10);
1317 tz = _mm_mul_pd(fscal,dz10);
1319 /* Update vectorial force */
1320 fix1 = _mm_add_pd(fix1,tx);
1321 fiy1 = _mm_add_pd(fiy1,ty);
1322 fiz1 = _mm_add_pd(fiz1,tz);
1324 fjx0 = _mm_add_pd(fjx0,tx);
1325 fjy0 = _mm_add_pd(fjy0,ty);
1326 fjz0 = _mm_add_pd(fjz0,tz);
1330 /**************************
1331 * CALCULATE INTERACTIONS *
1332 **************************/
1334 if (gmx_mm_any_lt(rsq20,rcutoff2))
1337 r20 = _mm_mul_pd(rsq20,rinv20);
1339 /* Compute parameters for interactions between i and j atoms */
1340 qq20 = _mm_mul_pd(iq2,jq0);
1342 /* EWALD ELECTROSTATICS */
1344 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1345 ewrt = _mm_mul_pd(r20,ewtabscale);
1346 ewitab = _mm_cvttpd_epi32(ewrt);
1347 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1348 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1349 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1350 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1352 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1356 fscal = _mm_and_pd(fscal,cutoff_mask);
1358 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1360 /* Calculate temporary vectorial force */
1361 tx = _mm_mul_pd(fscal,dx20);
1362 ty = _mm_mul_pd(fscal,dy20);
1363 tz = _mm_mul_pd(fscal,dz20);
1365 /* Update vectorial force */
1366 fix2 = _mm_add_pd(fix2,tx);
1367 fiy2 = _mm_add_pd(fiy2,ty);
1368 fiz2 = _mm_add_pd(fiz2,tz);
1370 fjx0 = _mm_add_pd(fjx0,tx);
1371 fjy0 = _mm_add_pd(fjy0,ty);
1372 fjz0 = _mm_add_pd(fjz0,tz);
1376 /**************************
1377 * CALCULATE INTERACTIONS *
1378 **************************/
1380 if (gmx_mm_any_lt(rsq30,rcutoff2))
1383 r30 = _mm_mul_pd(rsq30,rinv30);
1385 /* Compute parameters for interactions between i and j atoms */
1386 qq30 = _mm_mul_pd(iq3,jq0);
1388 /* EWALD ELECTROSTATICS */
1390 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1391 ewrt = _mm_mul_pd(r30,ewtabscale);
1392 ewitab = _mm_cvttpd_epi32(ewrt);
1393 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1394 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1395 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1396 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1398 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1402 fscal = _mm_and_pd(fscal,cutoff_mask);
1404 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1406 /* Calculate temporary vectorial force */
1407 tx = _mm_mul_pd(fscal,dx30);
1408 ty = _mm_mul_pd(fscal,dy30);
1409 tz = _mm_mul_pd(fscal,dz30);
1411 /* Update vectorial force */
1412 fix3 = _mm_add_pd(fix3,tx);
1413 fiy3 = _mm_add_pd(fiy3,ty);
1414 fiz3 = _mm_add_pd(fiz3,tz);
1416 fjx0 = _mm_add_pd(fjx0,tx);
1417 fjy0 = _mm_add_pd(fjy0,ty);
1418 fjz0 = _mm_add_pd(fjz0,tz);
1422 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1424 /* Inner loop uses 169 flops */
1427 /* End of innermost loop */
1429 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1430 f+i_coord_offset,fshift+i_shift_offset);
1432 /* Increment number of inner iterations */
1433 inneriter += j_index_end - j_index_start;
1435 /* Outer loop uses 24 flops */
1438 /* Increment number of outer iterations */
1441 /* Update outer/inner flops */
1443 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*169);