2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
77 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
83 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
86 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
87 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
89 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
91 __m128d dummy_mask,cutoff_mask;
92 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
93 __m128d one = _mm_set1_pd(1.0);
94 __m128d two = _mm_set1_pd(2.0);
100 jindex = nlist->jindex;
102 shiftidx = nlist->shift;
104 shiftvec = fr->shift_vec[0];
105 fshift = fr->fshift[0];
106 facel = _mm_set1_pd(fr->epsfac);
107 charge = mdatoms->chargeA;
108 nvdwtype = fr->ntype;
110 vdwtype = mdatoms->typeA;
112 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
113 ewtab = fr->ic->tabq_coul_FDV0;
114 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
115 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
117 /* Setup water-specific parameters */
118 inr = nlist->iinr[0];
119 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
120 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
121 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
122 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
124 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
125 rcutoff_scalar = fr->rcoulomb;
126 rcutoff = _mm_set1_pd(rcutoff_scalar);
127 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
129 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
130 rvdw = _mm_set1_pd(fr->rvdw);
132 /* Avoid stupid compiler warnings */
140 /* Start outer loop over neighborlists */
141 for(iidx=0; iidx<nri; iidx++)
143 /* Load shift vector for this list */
144 i_shift_offset = DIM*shiftidx[iidx];
146 /* Load limits for loop over neighbors */
147 j_index_start = jindex[iidx];
148 j_index_end = jindex[iidx+1];
150 /* Get outer coordinate index */
152 i_coord_offset = DIM*inr;
154 /* Load i particle coords and add shift vector */
155 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
156 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
158 fix0 = _mm_setzero_pd();
159 fiy0 = _mm_setzero_pd();
160 fiz0 = _mm_setzero_pd();
161 fix1 = _mm_setzero_pd();
162 fiy1 = _mm_setzero_pd();
163 fiz1 = _mm_setzero_pd();
164 fix2 = _mm_setzero_pd();
165 fiy2 = _mm_setzero_pd();
166 fiz2 = _mm_setzero_pd();
167 fix3 = _mm_setzero_pd();
168 fiy3 = _mm_setzero_pd();
169 fiz3 = _mm_setzero_pd();
171 /* Reset potential sums */
172 velecsum = _mm_setzero_pd();
173 vvdwsum = _mm_setzero_pd();
175 /* Start inner kernel loop */
176 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
179 /* Get j neighbor index, and coordinate index */
182 j_coord_offsetA = DIM*jnrA;
183 j_coord_offsetB = DIM*jnrB;
185 /* load j atom coordinates */
186 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
189 /* Calculate displacement vector */
190 dx00 = _mm_sub_pd(ix0,jx0);
191 dy00 = _mm_sub_pd(iy0,jy0);
192 dz00 = _mm_sub_pd(iz0,jz0);
193 dx10 = _mm_sub_pd(ix1,jx0);
194 dy10 = _mm_sub_pd(iy1,jy0);
195 dz10 = _mm_sub_pd(iz1,jz0);
196 dx20 = _mm_sub_pd(ix2,jx0);
197 dy20 = _mm_sub_pd(iy2,jy0);
198 dz20 = _mm_sub_pd(iz2,jz0);
199 dx30 = _mm_sub_pd(ix3,jx0);
200 dy30 = _mm_sub_pd(iy3,jy0);
201 dz30 = _mm_sub_pd(iz3,jz0);
203 /* Calculate squared distance and things based on it */
204 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
205 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
206 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
207 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
209 rinv10 = gmx_mm_invsqrt_pd(rsq10);
210 rinv20 = gmx_mm_invsqrt_pd(rsq20);
211 rinv30 = gmx_mm_invsqrt_pd(rsq30);
213 rinvsq00 = gmx_mm_inv_pd(rsq00);
214 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
215 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
216 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
218 /* Load parameters for j particles */
219 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
220 vdwjidx0A = 2*vdwtype[jnrA+0];
221 vdwjidx0B = 2*vdwtype[jnrB+0];
223 fjx0 = _mm_setzero_pd();
224 fjy0 = _mm_setzero_pd();
225 fjz0 = _mm_setzero_pd();
227 /**************************
228 * CALCULATE INTERACTIONS *
229 **************************/
231 if (gmx_mm_any_lt(rsq00,rcutoff2))
234 /* Compute parameters for interactions between i and j atoms */
235 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
236 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
238 /* LENNARD-JONES DISPERSION/REPULSION */
240 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
241 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
242 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
243 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
244 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
245 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
247 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
249 /* Update potential sum for this i atom from the interaction with this j atom. */
250 vvdw = _mm_and_pd(vvdw,cutoff_mask);
251 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
255 fscal = _mm_and_pd(fscal,cutoff_mask);
257 /* Calculate temporary vectorial force */
258 tx = _mm_mul_pd(fscal,dx00);
259 ty = _mm_mul_pd(fscal,dy00);
260 tz = _mm_mul_pd(fscal,dz00);
262 /* Update vectorial force */
263 fix0 = _mm_add_pd(fix0,tx);
264 fiy0 = _mm_add_pd(fiy0,ty);
265 fiz0 = _mm_add_pd(fiz0,tz);
267 fjx0 = _mm_add_pd(fjx0,tx);
268 fjy0 = _mm_add_pd(fjy0,ty);
269 fjz0 = _mm_add_pd(fjz0,tz);
273 /**************************
274 * CALCULATE INTERACTIONS *
275 **************************/
277 if (gmx_mm_any_lt(rsq10,rcutoff2))
280 r10 = _mm_mul_pd(rsq10,rinv10);
282 /* Compute parameters for interactions between i and j atoms */
283 qq10 = _mm_mul_pd(iq1,jq0);
285 /* EWALD ELECTROSTATICS */
287 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
288 ewrt = _mm_mul_pd(r10,ewtabscale);
289 ewitab = _mm_cvttpd_epi32(ewrt);
290 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
291 ewitab = _mm_slli_epi32(ewitab,2);
292 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
293 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
294 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
295 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
296 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
297 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
298 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
299 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
300 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
301 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
303 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
305 /* Update potential sum for this i atom from the interaction with this j atom. */
306 velec = _mm_and_pd(velec,cutoff_mask);
307 velecsum = _mm_add_pd(velecsum,velec);
311 fscal = _mm_and_pd(fscal,cutoff_mask);
313 /* Calculate temporary vectorial force */
314 tx = _mm_mul_pd(fscal,dx10);
315 ty = _mm_mul_pd(fscal,dy10);
316 tz = _mm_mul_pd(fscal,dz10);
318 /* Update vectorial force */
319 fix1 = _mm_add_pd(fix1,tx);
320 fiy1 = _mm_add_pd(fiy1,ty);
321 fiz1 = _mm_add_pd(fiz1,tz);
323 fjx0 = _mm_add_pd(fjx0,tx);
324 fjy0 = _mm_add_pd(fjy0,ty);
325 fjz0 = _mm_add_pd(fjz0,tz);
329 /**************************
330 * CALCULATE INTERACTIONS *
331 **************************/
333 if (gmx_mm_any_lt(rsq20,rcutoff2))
336 r20 = _mm_mul_pd(rsq20,rinv20);
338 /* Compute parameters for interactions between i and j atoms */
339 qq20 = _mm_mul_pd(iq2,jq0);
341 /* EWALD ELECTROSTATICS */
343 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
344 ewrt = _mm_mul_pd(r20,ewtabscale);
345 ewitab = _mm_cvttpd_epi32(ewrt);
346 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
347 ewitab = _mm_slli_epi32(ewitab,2);
348 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
349 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
350 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
351 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
352 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
353 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
354 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
355 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
356 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
357 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
359 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
361 /* Update potential sum for this i atom from the interaction with this j atom. */
362 velec = _mm_and_pd(velec,cutoff_mask);
363 velecsum = _mm_add_pd(velecsum,velec);
367 fscal = _mm_and_pd(fscal,cutoff_mask);
369 /* Calculate temporary vectorial force */
370 tx = _mm_mul_pd(fscal,dx20);
371 ty = _mm_mul_pd(fscal,dy20);
372 tz = _mm_mul_pd(fscal,dz20);
374 /* Update vectorial force */
375 fix2 = _mm_add_pd(fix2,tx);
376 fiy2 = _mm_add_pd(fiy2,ty);
377 fiz2 = _mm_add_pd(fiz2,tz);
379 fjx0 = _mm_add_pd(fjx0,tx);
380 fjy0 = _mm_add_pd(fjy0,ty);
381 fjz0 = _mm_add_pd(fjz0,tz);
385 /**************************
386 * CALCULATE INTERACTIONS *
387 **************************/
389 if (gmx_mm_any_lt(rsq30,rcutoff2))
392 r30 = _mm_mul_pd(rsq30,rinv30);
394 /* Compute parameters for interactions between i and j atoms */
395 qq30 = _mm_mul_pd(iq3,jq0);
397 /* EWALD ELECTROSTATICS */
399 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
400 ewrt = _mm_mul_pd(r30,ewtabscale);
401 ewitab = _mm_cvttpd_epi32(ewrt);
402 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
403 ewitab = _mm_slli_epi32(ewitab,2);
404 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
405 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
406 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
407 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
408 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
409 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
410 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
411 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
412 velec = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
413 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
415 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
417 /* Update potential sum for this i atom from the interaction with this j atom. */
418 velec = _mm_and_pd(velec,cutoff_mask);
419 velecsum = _mm_add_pd(velecsum,velec);
423 fscal = _mm_and_pd(fscal,cutoff_mask);
425 /* Calculate temporary vectorial force */
426 tx = _mm_mul_pd(fscal,dx30);
427 ty = _mm_mul_pd(fscal,dy30);
428 tz = _mm_mul_pd(fscal,dz30);
430 /* Update vectorial force */
431 fix3 = _mm_add_pd(fix3,tx);
432 fiy3 = _mm_add_pd(fiy3,ty);
433 fiz3 = _mm_add_pd(fiz3,tz);
435 fjx0 = _mm_add_pd(fjx0,tx);
436 fjy0 = _mm_add_pd(fjy0,ty);
437 fjz0 = _mm_add_pd(fjz0,tz);
441 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
443 /* Inner loop uses 182 flops */
450 j_coord_offsetA = DIM*jnrA;
452 /* load j atom coordinates */
453 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
456 /* Calculate displacement vector */
457 dx00 = _mm_sub_pd(ix0,jx0);
458 dy00 = _mm_sub_pd(iy0,jy0);
459 dz00 = _mm_sub_pd(iz0,jz0);
460 dx10 = _mm_sub_pd(ix1,jx0);
461 dy10 = _mm_sub_pd(iy1,jy0);
462 dz10 = _mm_sub_pd(iz1,jz0);
463 dx20 = _mm_sub_pd(ix2,jx0);
464 dy20 = _mm_sub_pd(iy2,jy0);
465 dz20 = _mm_sub_pd(iz2,jz0);
466 dx30 = _mm_sub_pd(ix3,jx0);
467 dy30 = _mm_sub_pd(iy3,jy0);
468 dz30 = _mm_sub_pd(iz3,jz0);
470 /* Calculate squared distance and things based on it */
471 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
472 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
473 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
474 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
476 rinv10 = gmx_mm_invsqrt_pd(rsq10);
477 rinv20 = gmx_mm_invsqrt_pd(rsq20);
478 rinv30 = gmx_mm_invsqrt_pd(rsq30);
480 rinvsq00 = gmx_mm_inv_pd(rsq00);
481 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
482 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
483 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
485 /* Load parameters for j particles */
486 jq0 = _mm_load_sd(charge+jnrA+0);
487 vdwjidx0A = 2*vdwtype[jnrA+0];
489 fjx0 = _mm_setzero_pd();
490 fjy0 = _mm_setzero_pd();
491 fjz0 = _mm_setzero_pd();
493 /**************************
494 * CALCULATE INTERACTIONS *
495 **************************/
497 if (gmx_mm_any_lt(rsq00,rcutoff2))
500 /* Compute parameters for interactions between i and j atoms */
501 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
503 /* LENNARD-JONES DISPERSION/REPULSION */
505 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
506 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
507 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
508 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
509 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
510 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
512 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
514 /* Update potential sum for this i atom from the interaction with this j atom. */
515 vvdw = _mm_and_pd(vvdw,cutoff_mask);
516 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
517 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
521 fscal = _mm_and_pd(fscal,cutoff_mask);
523 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
525 /* Calculate temporary vectorial force */
526 tx = _mm_mul_pd(fscal,dx00);
527 ty = _mm_mul_pd(fscal,dy00);
528 tz = _mm_mul_pd(fscal,dz00);
530 /* Update vectorial force */
531 fix0 = _mm_add_pd(fix0,tx);
532 fiy0 = _mm_add_pd(fiy0,ty);
533 fiz0 = _mm_add_pd(fiz0,tz);
535 fjx0 = _mm_add_pd(fjx0,tx);
536 fjy0 = _mm_add_pd(fjy0,ty);
537 fjz0 = _mm_add_pd(fjz0,tz);
541 /**************************
542 * CALCULATE INTERACTIONS *
543 **************************/
545 if (gmx_mm_any_lt(rsq10,rcutoff2))
548 r10 = _mm_mul_pd(rsq10,rinv10);
550 /* Compute parameters for interactions between i and j atoms */
551 qq10 = _mm_mul_pd(iq1,jq0);
553 /* EWALD ELECTROSTATICS */
555 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
556 ewrt = _mm_mul_pd(r10,ewtabscale);
557 ewitab = _mm_cvttpd_epi32(ewrt);
558 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
559 ewitab = _mm_slli_epi32(ewitab,2);
560 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
561 ewtabD = _mm_setzero_pd();
562 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
563 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
564 ewtabFn = _mm_setzero_pd();
565 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
566 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
567 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
568 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
569 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
571 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
573 /* Update potential sum for this i atom from the interaction with this j atom. */
574 velec = _mm_and_pd(velec,cutoff_mask);
575 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
576 velecsum = _mm_add_pd(velecsum,velec);
580 fscal = _mm_and_pd(fscal,cutoff_mask);
582 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
584 /* Calculate temporary vectorial force */
585 tx = _mm_mul_pd(fscal,dx10);
586 ty = _mm_mul_pd(fscal,dy10);
587 tz = _mm_mul_pd(fscal,dz10);
589 /* Update vectorial force */
590 fix1 = _mm_add_pd(fix1,tx);
591 fiy1 = _mm_add_pd(fiy1,ty);
592 fiz1 = _mm_add_pd(fiz1,tz);
594 fjx0 = _mm_add_pd(fjx0,tx);
595 fjy0 = _mm_add_pd(fjy0,ty);
596 fjz0 = _mm_add_pd(fjz0,tz);
600 /**************************
601 * CALCULATE INTERACTIONS *
602 **************************/
604 if (gmx_mm_any_lt(rsq20,rcutoff2))
607 r20 = _mm_mul_pd(rsq20,rinv20);
609 /* Compute parameters for interactions between i and j atoms */
610 qq20 = _mm_mul_pd(iq2,jq0);
612 /* EWALD ELECTROSTATICS */
614 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
615 ewrt = _mm_mul_pd(r20,ewtabscale);
616 ewitab = _mm_cvttpd_epi32(ewrt);
617 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
618 ewitab = _mm_slli_epi32(ewitab,2);
619 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
620 ewtabD = _mm_setzero_pd();
621 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
622 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
623 ewtabFn = _mm_setzero_pd();
624 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
625 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
626 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
627 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
628 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
630 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
632 /* Update potential sum for this i atom from the interaction with this j atom. */
633 velec = _mm_and_pd(velec,cutoff_mask);
634 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
635 velecsum = _mm_add_pd(velecsum,velec);
639 fscal = _mm_and_pd(fscal,cutoff_mask);
641 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
643 /* Calculate temporary vectorial force */
644 tx = _mm_mul_pd(fscal,dx20);
645 ty = _mm_mul_pd(fscal,dy20);
646 tz = _mm_mul_pd(fscal,dz20);
648 /* Update vectorial force */
649 fix2 = _mm_add_pd(fix2,tx);
650 fiy2 = _mm_add_pd(fiy2,ty);
651 fiz2 = _mm_add_pd(fiz2,tz);
653 fjx0 = _mm_add_pd(fjx0,tx);
654 fjy0 = _mm_add_pd(fjy0,ty);
655 fjz0 = _mm_add_pd(fjz0,tz);
659 /**************************
660 * CALCULATE INTERACTIONS *
661 **************************/
663 if (gmx_mm_any_lt(rsq30,rcutoff2))
666 r30 = _mm_mul_pd(rsq30,rinv30);
668 /* Compute parameters for interactions between i and j atoms */
669 qq30 = _mm_mul_pd(iq3,jq0);
671 /* EWALD ELECTROSTATICS */
673 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
674 ewrt = _mm_mul_pd(r30,ewtabscale);
675 ewitab = _mm_cvttpd_epi32(ewrt);
676 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
677 ewitab = _mm_slli_epi32(ewitab,2);
678 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
679 ewtabD = _mm_setzero_pd();
680 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
681 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
682 ewtabFn = _mm_setzero_pd();
683 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
684 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
685 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
686 velec = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
687 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
689 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
691 /* Update potential sum for this i atom from the interaction with this j atom. */
692 velec = _mm_and_pd(velec,cutoff_mask);
693 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
694 velecsum = _mm_add_pd(velecsum,velec);
698 fscal = _mm_and_pd(fscal,cutoff_mask);
700 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
702 /* Calculate temporary vectorial force */
703 tx = _mm_mul_pd(fscal,dx30);
704 ty = _mm_mul_pd(fscal,dy30);
705 tz = _mm_mul_pd(fscal,dz30);
707 /* Update vectorial force */
708 fix3 = _mm_add_pd(fix3,tx);
709 fiy3 = _mm_add_pd(fiy3,ty);
710 fiz3 = _mm_add_pd(fiz3,tz);
712 fjx0 = _mm_add_pd(fjx0,tx);
713 fjy0 = _mm_add_pd(fjy0,ty);
714 fjz0 = _mm_add_pd(fjz0,tz);
718 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
720 /* Inner loop uses 182 flops */
723 /* End of innermost loop */
725 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
726 f+i_coord_offset,fshift+i_shift_offset);
729 /* Update potential energies */
730 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
731 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
733 /* Increment number of inner iterations */
734 inneriter += j_index_end - j_index_start;
736 /* Outer loop uses 26 flops */
739 /* Increment number of outer iterations */
742 /* Update outer/inner flops */
744 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*182);
747 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_double
748 * Electrostatics interaction: Ewald
749 * VdW interaction: LennardJones
750 * Geometry: Water4-Particle
751 * Calculate force/pot: Force
754 nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_sse2_double
755 (t_nblist * gmx_restrict nlist,
756 rvec * gmx_restrict xx,
757 rvec * gmx_restrict ff,
758 t_forcerec * gmx_restrict fr,
759 t_mdatoms * gmx_restrict mdatoms,
760 nb_kernel_data_t * gmx_restrict kernel_data,
761 t_nrnb * gmx_restrict nrnb)
763 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
764 * just 0 for non-waters.
765 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
766 * jnr indices corresponding to data put in the four positions in the SIMD register.
768 int i_shift_offset,i_coord_offset,outeriter,inneriter;
769 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
771 int j_coord_offsetA,j_coord_offsetB;
772 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
774 real *shiftvec,*fshift,*x,*f;
775 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
777 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
779 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
781 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
783 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
784 int vdwjidx0A,vdwjidx0B;
785 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
786 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
787 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
788 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
789 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
790 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
793 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
796 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
797 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
799 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
801 __m128d dummy_mask,cutoff_mask;
802 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
803 __m128d one = _mm_set1_pd(1.0);
804 __m128d two = _mm_set1_pd(2.0);
810 jindex = nlist->jindex;
812 shiftidx = nlist->shift;
814 shiftvec = fr->shift_vec[0];
815 fshift = fr->fshift[0];
816 facel = _mm_set1_pd(fr->epsfac);
817 charge = mdatoms->chargeA;
818 nvdwtype = fr->ntype;
820 vdwtype = mdatoms->typeA;
822 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
823 ewtab = fr->ic->tabq_coul_F;
824 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
825 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
827 /* Setup water-specific parameters */
828 inr = nlist->iinr[0];
829 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
830 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
831 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
832 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
834 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
835 rcutoff_scalar = fr->rcoulomb;
836 rcutoff = _mm_set1_pd(rcutoff_scalar);
837 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
839 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
840 rvdw = _mm_set1_pd(fr->rvdw);
842 /* Avoid stupid compiler warnings */
850 /* Start outer loop over neighborlists */
851 for(iidx=0; iidx<nri; iidx++)
853 /* Load shift vector for this list */
854 i_shift_offset = DIM*shiftidx[iidx];
856 /* Load limits for loop over neighbors */
857 j_index_start = jindex[iidx];
858 j_index_end = jindex[iidx+1];
860 /* Get outer coordinate index */
862 i_coord_offset = DIM*inr;
864 /* Load i particle coords and add shift vector */
865 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
866 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
868 fix0 = _mm_setzero_pd();
869 fiy0 = _mm_setzero_pd();
870 fiz0 = _mm_setzero_pd();
871 fix1 = _mm_setzero_pd();
872 fiy1 = _mm_setzero_pd();
873 fiz1 = _mm_setzero_pd();
874 fix2 = _mm_setzero_pd();
875 fiy2 = _mm_setzero_pd();
876 fiz2 = _mm_setzero_pd();
877 fix3 = _mm_setzero_pd();
878 fiy3 = _mm_setzero_pd();
879 fiz3 = _mm_setzero_pd();
881 /* Start inner kernel loop */
882 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
885 /* Get j neighbor index, and coordinate index */
888 j_coord_offsetA = DIM*jnrA;
889 j_coord_offsetB = DIM*jnrB;
891 /* load j atom coordinates */
892 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
895 /* Calculate displacement vector */
896 dx00 = _mm_sub_pd(ix0,jx0);
897 dy00 = _mm_sub_pd(iy0,jy0);
898 dz00 = _mm_sub_pd(iz0,jz0);
899 dx10 = _mm_sub_pd(ix1,jx0);
900 dy10 = _mm_sub_pd(iy1,jy0);
901 dz10 = _mm_sub_pd(iz1,jz0);
902 dx20 = _mm_sub_pd(ix2,jx0);
903 dy20 = _mm_sub_pd(iy2,jy0);
904 dz20 = _mm_sub_pd(iz2,jz0);
905 dx30 = _mm_sub_pd(ix3,jx0);
906 dy30 = _mm_sub_pd(iy3,jy0);
907 dz30 = _mm_sub_pd(iz3,jz0);
909 /* Calculate squared distance and things based on it */
910 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
911 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
912 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
913 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
915 rinv10 = gmx_mm_invsqrt_pd(rsq10);
916 rinv20 = gmx_mm_invsqrt_pd(rsq20);
917 rinv30 = gmx_mm_invsqrt_pd(rsq30);
919 rinvsq00 = gmx_mm_inv_pd(rsq00);
920 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
921 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
922 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
924 /* Load parameters for j particles */
925 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
926 vdwjidx0A = 2*vdwtype[jnrA+0];
927 vdwjidx0B = 2*vdwtype[jnrB+0];
929 fjx0 = _mm_setzero_pd();
930 fjy0 = _mm_setzero_pd();
931 fjz0 = _mm_setzero_pd();
933 /**************************
934 * CALCULATE INTERACTIONS *
935 **************************/
937 if (gmx_mm_any_lt(rsq00,rcutoff2))
940 /* Compute parameters for interactions between i and j atoms */
941 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
942 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
944 /* LENNARD-JONES DISPERSION/REPULSION */
946 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
947 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
949 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
953 fscal = _mm_and_pd(fscal,cutoff_mask);
955 /* Calculate temporary vectorial force */
956 tx = _mm_mul_pd(fscal,dx00);
957 ty = _mm_mul_pd(fscal,dy00);
958 tz = _mm_mul_pd(fscal,dz00);
960 /* Update vectorial force */
961 fix0 = _mm_add_pd(fix0,tx);
962 fiy0 = _mm_add_pd(fiy0,ty);
963 fiz0 = _mm_add_pd(fiz0,tz);
965 fjx0 = _mm_add_pd(fjx0,tx);
966 fjy0 = _mm_add_pd(fjy0,ty);
967 fjz0 = _mm_add_pd(fjz0,tz);
971 /**************************
972 * CALCULATE INTERACTIONS *
973 **************************/
975 if (gmx_mm_any_lt(rsq10,rcutoff2))
978 r10 = _mm_mul_pd(rsq10,rinv10);
980 /* Compute parameters for interactions between i and j atoms */
981 qq10 = _mm_mul_pd(iq1,jq0);
983 /* EWALD ELECTROSTATICS */
985 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
986 ewrt = _mm_mul_pd(r10,ewtabscale);
987 ewitab = _mm_cvttpd_epi32(ewrt);
988 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
989 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
991 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
992 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
994 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
998 fscal = _mm_and_pd(fscal,cutoff_mask);
1000 /* Calculate temporary vectorial force */
1001 tx = _mm_mul_pd(fscal,dx10);
1002 ty = _mm_mul_pd(fscal,dy10);
1003 tz = _mm_mul_pd(fscal,dz10);
1005 /* Update vectorial force */
1006 fix1 = _mm_add_pd(fix1,tx);
1007 fiy1 = _mm_add_pd(fiy1,ty);
1008 fiz1 = _mm_add_pd(fiz1,tz);
1010 fjx0 = _mm_add_pd(fjx0,tx);
1011 fjy0 = _mm_add_pd(fjy0,ty);
1012 fjz0 = _mm_add_pd(fjz0,tz);
1016 /**************************
1017 * CALCULATE INTERACTIONS *
1018 **************************/
1020 if (gmx_mm_any_lt(rsq20,rcutoff2))
1023 r20 = _mm_mul_pd(rsq20,rinv20);
1025 /* Compute parameters for interactions between i and j atoms */
1026 qq20 = _mm_mul_pd(iq2,jq0);
1028 /* EWALD ELECTROSTATICS */
1030 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1031 ewrt = _mm_mul_pd(r20,ewtabscale);
1032 ewitab = _mm_cvttpd_epi32(ewrt);
1033 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1034 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1036 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1037 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1039 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1043 fscal = _mm_and_pd(fscal,cutoff_mask);
1045 /* Calculate temporary vectorial force */
1046 tx = _mm_mul_pd(fscal,dx20);
1047 ty = _mm_mul_pd(fscal,dy20);
1048 tz = _mm_mul_pd(fscal,dz20);
1050 /* Update vectorial force */
1051 fix2 = _mm_add_pd(fix2,tx);
1052 fiy2 = _mm_add_pd(fiy2,ty);
1053 fiz2 = _mm_add_pd(fiz2,tz);
1055 fjx0 = _mm_add_pd(fjx0,tx);
1056 fjy0 = _mm_add_pd(fjy0,ty);
1057 fjz0 = _mm_add_pd(fjz0,tz);
1061 /**************************
1062 * CALCULATE INTERACTIONS *
1063 **************************/
1065 if (gmx_mm_any_lt(rsq30,rcutoff2))
1068 r30 = _mm_mul_pd(rsq30,rinv30);
1070 /* Compute parameters for interactions between i and j atoms */
1071 qq30 = _mm_mul_pd(iq3,jq0);
1073 /* EWALD ELECTROSTATICS */
1075 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1076 ewrt = _mm_mul_pd(r30,ewtabscale);
1077 ewitab = _mm_cvttpd_epi32(ewrt);
1078 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1079 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1081 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1082 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1084 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1088 fscal = _mm_and_pd(fscal,cutoff_mask);
1090 /* Calculate temporary vectorial force */
1091 tx = _mm_mul_pd(fscal,dx30);
1092 ty = _mm_mul_pd(fscal,dy30);
1093 tz = _mm_mul_pd(fscal,dz30);
1095 /* Update vectorial force */
1096 fix3 = _mm_add_pd(fix3,tx);
1097 fiy3 = _mm_add_pd(fiy3,ty);
1098 fiz3 = _mm_add_pd(fiz3,tz);
1100 fjx0 = _mm_add_pd(fjx0,tx);
1101 fjy0 = _mm_add_pd(fjy0,ty);
1102 fjz0 = _mm_add_pd(fjz0,tz);
1106 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1108 /* Inner loop uses 150 flops */
1111 if(jidx<j_index_end)
1115 j_coord_offsetA = DIM*jnrA;
1117 /* load j atom coordinates */
1118 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1121 /* Calculate displacement vector */
1122 dx00 = _mm_sub_pd(ix0,jx0);
1123 dy00 = _mm_sub_pd(iy0,jy0);
1124 dz00 = _mm_sub_pd(iz0,jz0);
1125 dx10 = _mm_sub_pd(ix1,jx0);
1126 dy10 = _mm_sub_pd(iy1,jy0);
1127 dz10 = _mm_sub_pd(iz1,jz0);
1128 dx20 = _mm_sub_pd(ix2,jx0);
1129 dy20 = _mm_sub_pd(iy2,jy0);
1130 dz20 = _mm_sub_pd(iz2,jz0);
1131 dx30 = _mm_sub_pd(ix3,jx0);
1132 dy30 = _mm_sub_pd(iy3,jy0);
1133 dz30 = _mm_sub_pd(iz3,jz0);
1135 /* Calculate squared distance and things based on it */
1136 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1137 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1138 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1139 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1141 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1142 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1143 rinv30 = gmx_mm_invsqrt_pd(rsq30);
1145 rinvsq00 = gmx_mm_inv_pd(rsq00);
1146 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1147 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1148 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
1150 /* Load parameters for j particles */
1151 jq0 = _mm_load_sd(charge+jnrA+0);
1152 vdwjidx0A = 2*vdwtype[jnrA+0];
1154 fjx0 = _mm_setzero_pd();
1155 fjy0 = _mm_setzero_pd();
1156 fjz0 = _mm_setzero_pd();
1158 /**************************
1159 * CALCULATE INTERACTIONS *
1160 **************************/
1162 if (gmx_mm_any_lt(rsq00,rcutoff2))
1165 /* Compute parameters for interactions between i and j atoms */
1166 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
1168 /* LENNARD-JONES DISPERSION/REPULSION */
1170 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1171 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1173 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1177 fscal = _mm_and_pd(fscal,cutoff_mask);
1179 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1181 /* Calculate temporary vectorial force */
1182 tx = _mm_mul_pd(fscal,dx00);
1183 ty = _mm_mul_pd(fscal,dy00);
1184 tz = _mm_mul_pd(fscal,dz00);
1186 /* Update vectorial force */
1187 fix0 = _mm_add_pd(fix0,tx);
1188 fiy0 = _mm_add_pd(fiy0,ty);
1189 fiz0 = _mm_add_pd(fiz0,tz);
1191 fjx0 = _mm_add_pd(fjx0,tx);
1192 fjy0 = _mm_add_pd(fjy0,ty);
1193 fjz0 = _mm_add_pd(fjz0,tz);
1197 /**************************
1198 * CALCULATE INTERACTIONS *
1199 **************************/
1201 if (gmx_mm_any_lt(rsq10,rcutoff2))
1204 r10 = _mm_mul_pd(rsq10,rinv10);
1206 /* Compute parameters for interactions between i and j atoms */
1207 qq10 = _mm_mul_pd(iq1,jq0);
1209 /* EWALD ELECTROSTATICS */
1211 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1212 ewrt = _mm_mul_pd(r10,ewtabscale);
1213 ewitab = _mm_cvttpd_epi32(ewrt);
1214 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1215 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1216 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1217 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1219 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1223 fscal = _mm_and_pd(fscal,cutoff_mask);
1225 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1227 /* Calculate temporary vectorial force */
1228 tx = _mm_mul_pd(fscal,dx10);
1229 ty = _mm_mul_pd(fscal,dy10);
1230 tz = _mm_mul_pd(fscal,dz10);
1232 /* Update vectorial force */
1233 fix1 = _mm_add_pd(fix1,tx);
1234 fiy1 = _mm_add_pd(fiy1,ty);
1235 fiz1 = _mm_add_pd(fiz1,tz);
1237 fjx0 = _mm_add_pd(fjx0,tx);
1238 fjy0 = _mm_add_pd(fjy0,ty);
1239 fjz0 = _mm_add_pd(fjz0,tz);
1243 /**************************
1244 * CALCULATE INTERACTIONS *
1245 **************************/
1247 if (gmx_mm_any_lt(rsq20,rcutoff2))
1250 r20 = _mm_mul_pd(rsq20,rinv20);
1252 /* Compute parameters for interactions between i and j atoms */
1253 qq20 = _mm_mul_pd(iq2,jq0);
1255 /* EWALD ELECTROSTATICS */
1257 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1258 ewrt = _mm_mul_pd(r20,ewtabscale);
1259 ewitab = _mm_cvttpd_epi32(ewrt);
1260 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1261 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1262 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1263 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1265 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1269 fscal = _mm_and_pd(fscal,cutoff_mask);
1271 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1273 /* Calculate temporary vectorial force */
1274 tx = _mm_mul_pd(fscal,dx20);
1275 ty = _mm_mul_pd(fscal,dy20);
1276 tz = _mm_mul_pd(fscal,dz20);
1278 /* Update vectorial force */
1279 fix2 = _mm_add_pd(fix2,tx);
1280 fiy2 = _mm_add_pd(fiy2,ty);
1281 fiz2 = _mm_add_pd(fiz2,tz);
1283 fjx0 = _mm_add_pd(fjx0,tx);
1284 fjy0 = _mm_add_pd(fjy0,ty);
1285 fjz0 = _mm_add_pd(fjz0,tz);
1289 /**************************
1290 * CALCULATE INTERACTIONS *
1291 **************************/
1293 if (gmx_mm_any_lt(rsq30,rcutoff2))
1296 r30 = _mm_mul_pd(rsq30,rinv30);
1298 /* Compute parameters for interactions between i and j atoms */
1299 qq30 = _mm_mul_pd(iq3,jq0);
1301 /* EWALD ELECTROSTATICS */
1303 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1304 ewrt = _mm_mul_pd(r30,ewtabscale);
1305 ewitab = _mm_cvttpd_epi32(ewrt);
1306 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1307 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1308 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1309 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1311 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1315 fscal = _mm_and_pd(fscal,cutoff_mask);
1317 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1319 /* Calculate temporary vectorial force */
1320 tx = _mm_mul_pd(fscal,dx30);
1321 ty = _mm_mul_pd(fscal,dy30);
1322 tz = _mm_mul_pd(fscal,dz30);
1324 /* Update vectorial force */
1325 fix3 = _mm_add_pd(fix3,tx);
1326 fiy3 = _mm_add_pd(fiy3,ty);
1327 fiz3 = _mm_add_pd(fiz3,tz);
1329 fjx0 = _mm_add_pd(fjx0,tx);
1330 fjy0 = _mm_add_pd(fjy0,ty);
1331 fjz0 = _mm_add_pd(fjz0,tz);
1335 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1337 /* Inner loop uses 150 flops */
1340 /* End of innermost loop */
1342 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1343 f+i_coord_offset,fshift+i_shift_offset);
1345 /* Increment number of inner iterations */
1346 inneriter += j_index_end - j_index_start;
1348 /* Outer loop uses 24 flops */
1351 /* Increment number of outer iterations */
1354 /* Update outer/inner flops */
1356 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*150);