2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
77 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
83 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
86 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
87 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
89 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
91 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
92 real rswitch_scalar,d_scalar;
93 __m128d dummy_mask,cutoff_mask;
94 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
95 __m128d one = _mm_set1_pd(1.0);
96 __m128d two = _mm_set1_pd(2.0);
102 jindex = nlist->jindex;
104 shiftidx = nlist->shift;
106 shiftvec = fr->shift_vec[0];
107 fshift = fr->fshift[0];
108 facel = _mm_set1_pd(fr->epsfac);
109 charge = mdatoms->chargeA;
110 nvdwtype = fr->ntype;
112 vdwtype = mdatoms->typeA;
114 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
115 ewtab = fr->ic->tabq_coul_FDV0;
116 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
117 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
119 /* Setup water-specific parameters */
120 inr = nlist->iinr[0];
121 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
122 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
123 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
124 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
126 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
127 rcutoff_scalar = fr->rcoulomb;
128 rcutoff = _mm_set1_pd(rcutoff_scalar);
129 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
131 rswitch_scalar = fr->rcoulomb_switch;
132 rswitch = _mm_set1_pd(rswitch_scalar);
133 /* Setup switch parameters */
134 d_scalar = rcutoff_scalar-rswitch_scalar;
135 d = _mm_set1_pd(d_scalar);
136 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
137 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
138 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
139 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
140 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
141 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
143 /* Avoid stupid compiler warnings */
151 /* Start outer loop over neighborlists */
152 for(iidx=0; iidx<nri; iidx++)
154 /* Load shift vector for this list */
155 i_shift_offset = DIM*shiftidx[iidx];
157 /* Load limits for loop over neighbors */
158 j_index_start = jindex[iidx];
159 j_index_end = jindex[iidx+1];
161 /* Get outer coordinate index */
163 i_coord_offset = DIM*inr;
165 /* Load i particle coords and add shift vector */
166 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
167 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
169 fix0 = _mm_setzero_pd();
170 fiy0 = _mm_setzero_pd();
171 fiz0 = _mm_setzero_pd();
172 fix1 = _mm_setzero_pd();
173 fiy1 = _mm_setzero_pd();
174 fiz1 = _mm_setzero_pd();
175 fix2 = _mm_setzero_pd();
176 fiy2 = _mm_setzero_pd();
177 fiz2 = _mm_setzero_pd();
178 fix3 = _mm_setzero_pd();
179 fiy3 = _mm_setzero_pd();
180 fiz3 = _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_pd();
184 vvdwsum = _mm_setzero_pd();
186 /* Start inner kernel loop */
187 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
190 /* Get j neighbor index, and coordinate index */
193 j_coord_offsetA = DIM*jnrA;
194 j_coord_offsetB = DIM*jnrB;
196 /* load j atom coordinates */
197 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
200 /* Calculate displacement vector */
201 dx00 = _mm_sub_pd(ix0,jx0);
202 dy00 = _mm_sub_pd(iy0,jy0);
203 dz00 = _mm_sub_pd(iz0,jz0);
204 dx10 = _mm_sub_pd(ix1,jx0);
205 dy10 = _mm_sub_pd(iy1,jy0);
206 dz10 = _mm_sub_pd(iz1,jz0);
207 dx20 = _mm_sub_pd(ix2,jx0);
208 dy20 = _mm_sub_pd(iy2,jy0);
209 dz20 = _mm_sub_pd(iz2,jz0);
210 dx30 = _mm_sub_pd(ix3,jx0);
211 dy30 = _mm_sub_pd(iy3,jy0);
212 dz30 = _mm_sub_pd(iz3,jz0);
214 /* Calculate squared distance and things based on it */
215 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
216 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
217 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
218 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
220 rinv00 = gmx_mm_invsqrt_pd(rsq00);
221 rinv10 = gmx_mm_invsqrt_pd(rsq10);
222 rinv20 = gmx_mm_invsqrt_pd(rsq20);
223 rinv30 = gmx_mm_invsqrt_pd(rsq30);
225 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
226 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
227 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
228 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
230 /* Load parameters for j particles */
231 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
232 vdwjidx0A = 2*vdwtype[jnrA+0];
233 vdwjidx0B = 2*vdwtype[jnrB+0];
235 fjx0 = _mm_setzero_pd();
236 fjy0 = _mm_setzero_pd();
237 fjz0 = _mm_setzero_pd();
239 /**************************
240 * CALCULATE INTERACTIONS *
241 **************************/
243 if (gmx_mm_any_lt(rsq00,rcutoff2))
246 r00 = _mm_mul_pd(rsq00,rinv00);
248 /* Compute parameters for interactions between i and j atoms */
249 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
250 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
252 /* LENNARD-JONES DISPERSION/REPULSION */
254 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
255 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
256 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
257 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
258 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
260 d = _mm_sub_pd(r00,rswitch);
261 d = _mm_max_pd(d,_mm_setzero_pd());
262 d2 = _mm_mul_pd(d,d);
263 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
265 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
267 /* Evaluate switch function */
268 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
269 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
270 vvdw = _mm_mul_pd(vvdw,sw);
271 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
273 /* Update potential sum for this i atom from the interaction with this j atom. */
274 vvdw = _mm_and_pd(vvdw,cutoff_mask);
275 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
279 fscal = _mm_and_pd(fscal,cutoff_mask);
281 /* Calculate temporary vectorial force */
282 tx = _mm_mul_pd(fscal,dx00);
283 ty = _mm_mul_pd(fscal,dy00);
284 tz = _mm_mul_pd(fscal,dz00);
286 /* Update vectorial force */
287 fix0 = _mm_add_pd(fix0,tx);
288 fiy0 = _mm_add_pd(fiy0,ty);
289 fiz0 = _mm_add_pd(fiz0,tz);
291 fjx0 = _mm_add_pd(fjx0,tx);
292 fjy0 = _mm_add_pd(fjy0,ty);
293 fjz0 = _mm_add_pd(fjz0,tz);
297 /**************************
298 * CALCULATE INTERACTIONS *
299 **************************/
301 if (gmx_mm_any_lt(rsq10,rcutoff2))
304 r10 = _mm_mul_pd(rsq10,rinv10);
306 /* Compute parameters for interactions between i and j atoms */
307 qq10 = _mm_mul_pd(iq1,jq0);
309 /* EWALD ELECTROSTATICS */
311 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
312 ewrt = _mm_mul_pd(r10,ewtabscale);
313 ewitab = _mm_cvttpd_epi32(ewrt);
314 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
315 ewitab = _mm_slli_epi32(ewitab,2);
316 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
317 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
318 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
319 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
320 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
321 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
322 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
323 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
324 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
325 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
327 d = _mm_sub_pd(r10,rswitch);
328 d = _mm_max_pd(d,_mm_setzero_pd());
329 d2 = _mm_mul_pd(d,d);
330 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
332 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
334 /* Evaluate switch function */
335 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
336 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
337 velec = _mm_mul_pd(velec,sw);
338 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
340 /* Update potential sum for this i atom from the interaction with this j atom. */
341 velec = _mm_and_pd(velec,cutoff_mask);
342 velecsum = _mm_add_pd(velecsum,velec);
346 fscal = _mm_and_pd(fscal,cutoff_mask);
348 /* Calculate temporary vectorial force */
349 tx = _mm_mul_pd(fscal,dx10);
350 ty = _mm_mul_pd(fscal,dy10);
351 tz = _mm_mul_pd(fscal,dz10);
353 /* Update vectorial force */
354 fix1 = _mm_add_pd(fix1,tx);
355 fiy1 = _mm_add_pd(fiy1,ty);
356 fiz1 = _mm_add_pd(fiz1,tz);
358 fjx0 = _mm_add_pd(fjx0,tx);
359 fjy0 = _mm_add_pd(fjy0,ty);
360 fjz0 = _mm_add_pd(fjz0,tz);
364 /**************************
365 * CALCULATE INTERACTIONS *
366 **************************/
368 if (gmx_mm_any_lt(rsq20,rcutoff2))
371 r20 = _mm_mul_pd(rsq20,rinv20);
373 /* Compute parameters for interactions between i and j atoms */
374 qq20 = _mm_mul_pd(iq2,jq0);
376 /* EWALD ELECTROSTATICS */
378 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
379 ewrt = _mm_mul_pd(r20,ewtabscale);
380 ewitab = _mm_cvttpd_epi32(ewrt);
381 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
382 ewitab = _mm_slli_epi32(ewitab,2);
383 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
384 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
385 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
386 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
387 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
388 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
389 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
390 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
391 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
392 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
394 d = _mm_sub_pd(r20,rswitch);
395 d = _mm_max_pd(d,_mm_setzero_pd());
396 d2 = _mm_mul_pd(d,d);
397 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
399 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
401 /* Evaluate switch function */
402 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
403 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
404 velec = _mm_mul_pd(velec,sw);
405 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
407 /* Update potential sum for this i atom from the interaction with this j atom. */
408 velec = _mm_and_pd(velec,cutoff_mask);
409 velecsum = _mm_add_pd(velecsum,velec);
413 fscal = _mm_and_pd(fscal,cutoff_mask);
415 /* Calculate temporary vectorial force */
416 tx = _mm_mul_pd(fscal,dx20);
417 ty = _mm_mul_pd(fscal,dy20);
418 tz = _mm_mul_pd(fscal,dz20);
420 /* Update vectorial force */
421 fix2 = _mm_add_pd(fix2,tx);
422 fiy2 = _mm_add_pd(fiy2,ty);
423 fiz2 = _mm_add_pd(fiz2,tz);
425 fjx0 = _mm_add_pd(fjx0,tx);
426 fjy0 = _mm_add_pd(fjy0,ty);
427 fjz0 = _mm_add_pd(fjz0,tz);
431 /**************************
432 * CALCULATE INTERACTIONS *
433 **************************/
435 if (gmx_mm_any_lt(rsq30,rcutoff2))
438 r30 = _mm_mul_pd(rsq30,rinv30);
440 /* Compute parameters for interactions between i and j atoms */
441 qq30 = _mm_mul_pd(iq3,jq0);
443 /* EWALD ELECTROSTATICS */
445 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
446 ewrt = _mm_mul_pd(r30,ewtabscale);
447 ewitab = _mm_cvttpd_epi32(ewrt);
448 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
449 ewitab = _mm_slli_epi32(ewitab,2);
450 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
451 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
452 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
453 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
454 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
455 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
456 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
457 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
458 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
459 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
461 d = _mm_sub_pd(r30,rswitch);
462 d = _mm_max_pd(d,_mm_setzero_pd());
463 d2 = _mm_mul_pd(d,d);
464 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
466 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
468 /* Evaluate switch function */
469 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
470 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
471 velec = _mm_mul_pd(velec,sw);
472 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
474 /* Update potential sum for this i atom from the interaction with this j atom. */
475 velec = _mm_and_pd(velec,cutoff_mask);
476 velecsum = _mm_add_pd(velecsum,velec);
480 fscal = _mm_and_pd(fscal,cutoff_mask);
482 /* Calculate temporary vectorial force */
483 tx = _mm_mul_pd(fscal,dx30);
484 ty = _mm_mul_pd(fscal,dy30);
485 tz = _mm_mul_pd(fscal,dz30);
487 /* Update vectorial force */
488 fix3 = _mm_add_pd(fix3,tx);
489 fiy3 = _mm_add_pd(fiy3,ty);
490 fiz3 = _mm_add_pd(fiz3,tz);
492 fjx0 = _mm_add_pd(fjx0,tx);
493 fjy0 = _mm_add_pd(fjy0,ty);
494 fjz0 = _mm_add_pd(fjz0,tz);
498 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
500 /* Inner loop uses 257 flops */
507 j_coord_offsetA = DIM*jnrA;
509 /* load j atom coordinates */
510 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
513 /* Calculate displacement vector */
514 dx00 = _mm_sub_pd(ix0,jx0);
515 dy00 = _mm_sub_pd(iy0,jy0);
516 dz00 = _mm_sub_pd(iz0,jz0);
517 dx10 = _mm_sub_pd(ix1,jx0);
518 dy10 = _mm_sub_pd(iy1,jy0);
519 dz10 = _mm_sub_pd(iz1,jz0);
520 dx20 = _mm_sub_pd(ix2,jx0);
521 dy20 = _mm_sub_pd(iy2,jy0);
522 dz20 = _mm_sub_pd(iz2,jz0);
523 dx30 = _mm_sub_pd(ix3,jx0);
524 dy30 = _mm_sub_pd(iy3,jy0);
525 dz30 = _mm_sub_pd(iz3,jz0);
527 /* Calculate squared distance and things based on it */
528 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
529 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
530 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
531 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
533 rinv00 = gmx_mm_invsqrt_pd(rsq00);
534 rinv10 = gmx_mm_invsqrt_pd(rsq10);
535 rinv20 = gmx_mm_invsqrt_pd(rsq20);
536 rinv30 = gmx_mm_invsqrt_pd(rsq30);
538 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
539 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
540 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
541 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
543 /* Load parameters for j particles */
544 jq0 = _mm_load_sd(charge+jnrA+0);
545 vdwjidx0A = 2*vdwtype[jnrA+0];
547 fjx0 = _mm_setzero_pd();
548 fjy0 = _mm_setzero_pd();
549 fjz0 = _mm_setzero_pd();
551 /**************************
552 * CALCULATE INTERACTIONS *
553 **************************/
555 if (gmx_mm_any_lt(rsq00,rcutoff2))
558 r00 = _mm_mul_pd(rsq00,rinv00);
560 /* Compute parameters for interactions between i and j atoms */
561 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
563 /* LENNARD-JONES DISPERSION/REPULSION */
565 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
566 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
567 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
568 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
569 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
571 d = _mm_sub_pd(r00,rswitch);
572 d = _mm_max_pd(d,_mm_setzero_pd());
573 d2 = _mm_mul_pd(d,d);
574 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
576 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
578 /* Evaluate switch function */
579 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
580 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
581 vvdw = _mm_mul_pd(vvdw,sw);
582 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
584 /* Update potential sum for this i atom from the interaction with this j atom. */
585 vvdw = _mm_and_pd(vvdw,cutoff_mask);
586 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
587 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
591 fscal = _mm_and_pd(fscal,cutoff_mask);
593 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
595 /* Calculate temporary vectorial force */
596 tx = _mm_mul_pd(fscal,dx00);
597 ty = _mm_mul_pd(fscal,dy00);
598 tz = _mm_mul_pd(fscal,dz00);
600 /* Update vectorial force */
601 fix0 = _mm_add_pd(fix0,tx);
602 fiy0 = _mm_add_pd(fiy0,ty);
603 fiz0 = _mm_add_pd(fiz0,tz);
605 fjx0 = _mm_add_pd(fjx0,tx);
606 fjy0 = _mm_add_pd(fjy0,ty);
607 fjz0 = _mm_add_pd(fjz0,tz);
611 /**************************
612 * CALCULATE INTERACTIONS *
613 **************************/
615 if (gmx_mm_any_lt(rsq10,rcutoff2))
618 r10 = _mm_mul_pd(rsq10,rinv10);
620 /* Compute parameters for interactions between i and j atoms */
621 qq10 = _mm_mul_pd(iq1,jq0);
623 /* EWALD ELECTROSTATICS */
625 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
626 ewrt = _mm_mul_pd(r10,ewtabscale);
627 ewitab = _mm_cvttpd_epi32(ewrt);
628 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
629 ewitab = _mm_slli_epi32(ewitab,2);
630 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
631 ewtabD = _mm_setzero_pd();
632 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
633 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
634 ewtabFn = _mm_setzero_pd();
635 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
636 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
637 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
638 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
639 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
641 d = _mm_sub_pd(r10,rswitch);
642 d = _mm_max_pd(d,_mm_setzero_pd());
643 d2 = _mm_mul_pd(d,d);
644 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
646 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
648 /* Evaluate switch function */
649 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
650 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
651 velec = _mm_mul_pd(velec,sw);
652 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
654 /* Update potential sum for this i atom from the interaction with this j atom. */
655 velec = _mm_and_pd(velec,cutoff_mask);
656 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
657 velecsum = _mm_add_pd(velecsum,velec);
661 fscal = _mm_and_pd(fscal,cutoff_mask);
663 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
665 /* Calculate temporary vectorial force */
666 tx = _mm_mul_pd(fscal,dx10);
667 ty = _mm_mul_pd(fscal,dy10);
668 tz = _mm_mul_pd(fscal,dz10);
670 /* Update vectorial force */
671 fix1 = _mm_add_pd(fix1,tx);
672 fiy1 = _mm_add_pd(fiy1,ty);
673 fiz1 = _mm_add_pd(fiz1,tz);
675 fjx0 = _mm_add_pd(fjx0,tx);
676 fjy0 = _mm_add_pd(fjy0,ty);
677 fjz0 = _mm_add_pd(fjz0,tz);
681 /**************************
682 * CALCULATE INTERACTIONS *
683 **************************/
685 if (gmx_mm_any_lt(rsq20,rcutoff2))
688 r20 = _mm_mul_pd(rsq20,rinv20);
690 /* Compute parameters for interactions between i and j atoms */
691 qq20 = _mm_mul_pd(iq2,jq0);
693 /* EWALD ELECTROSTATICS */
695 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
696 ewrt = _mm_mul_pd(r20,ewtabscale);
697 ewitab = _mm_cvttpd_epi32(ewrt);
698 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
699 ewitab = _mm_slli_epi32(ewitab,2);
700 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
701 ewtabD = _mm_setzero_pd();
702 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
703 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
704 ewtabFn = _mm_setzero_pd();
705 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
706 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
707 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
708 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
709 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
711 d = _mm_sub_pd(r20,rswitch);
712 d = _mm_max_pd(d,_mm_setzero_pd());
713 d2 = _mm_mul_pd(d,d);
714 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
716 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
718 /* Evaluate switch function */
719 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
720 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
721 velec = _mm_mul_pd(velec,sw);
722 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
724 /* Update potential sum for this i atom from the interaction with this j atom. */
725 velec = _mm_and_pd(velec,cutoff_mask);
726 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
727 velecsum = _mm_add_pd(velecsum,velec);
731 fscal = _mm_and_pd(fscal,cutoff_mask);
733 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
735 /* Calculate temporary vectorial force */
736 tx = _mm_mul_pd(fscal,dx20);
737 ty = _mm_mul_pd(fscal,dy20);
738 tz = _mm_mul_pd(fscal,dz20);
740 /* Update vectorial force */
741 fix2 = _mm_add_pd(fix2,tx);
742 fiy2 = _mm_add_pd(fiy2,ty);
743 fiz2 = _mm_add_pd(fiz2,tz);
745 fjx0 = _mm_add_pd(fjx0,tx);
746 fjy0 = _mm_add_pd(fjy0,ty);
747 fjz0 = _mm_add_pd(fjz0,tz);
751 /**************************
752 * CALCULATE INTERACTIONS *
753 **************************/
755 if (gmx_mm_any_lt(rsq30,rcutoff2))
758 r30 = _mm_mul_pd(rsq30,rinv30);
760 /* Compute parameters for interactions between i and j atoms */
761 qq30 = _mm_mul_pd(iq3,jq0);
763 /* EWALD ELECTROSTATICS */
765 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
766 ewrt = _mm_mul_pd(r30,ewtabscale);
767 ewitab = _mm_cvttpd_epi32(ewrt);
768 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
769 ewitab = _mm_slli_epi32(ewitab,2);
770 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
771 ewtabD = _mm_setzero_pd();
772 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
773 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
774 ewtabFn = _mm_setzero_pd();
775 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
776 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
777 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
778 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
779 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
781 d = _mm_sub_pd(r30,rswitch);
782 d = _mm_max_pd(d,_mm_setzero_pd());
783 d2 = _mm_mul_pd(d,d);
784 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
786 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
788 /* Evaluate switch function */
789 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
790 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
791 velec = _mm_mul_pd(velec,sw);
792 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
794 /* Update potential sum for this i atom from the interaction with this j atom. */
795 velec = _mm_and_pd(velec,cutoff_mask);
796 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
797 velecsum = _mm_add_pd(velecsum,velec);
801 fscal = _mm_and_pd(fscal,cutoff_mask);
803 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
805 /* Calculate temporary vectorial force */
806 tx = _mm_mul_pd(fscal,dx30);
807 ty = _mm_mul_pd(fscal,dy30);
808 tz = _mm_mul_pd(fscal,dz30);
810 /* Update vectorial force */
811 fix3 = _mm_add_pd(fix3,tx);
812 fiy3 = _mm_add_pd(fiy3,ty);
813 fiz3 = _mm_add_pd(fiz3,tz);
815 fjx0 = _mm_add_pd(fjx0,tx);
816 fjy0 = _mm_add_pd(fjy0,ty);
817 fjz0 = _mm_add_pd(fjz0,tz);
821 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
823 /* Inner loop uses 257 flops */
826 /* End of innermost loop */
828 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
829 f+i_coord_offset,fshift+i_shift_offset);
832 /* Update potential energies */
833 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
834 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
836 /* Increment number of inner iterations */
837 inneriter += j_index_end - j_index_start;
839 /* Outer loop uses 26 flops */
842 /* Increment number of outer iterations */
845 /* Update outer/inner flops */
847 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*257);
850 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_double
851 * Electrostatics interaction: Ewald
852 * VdW interaction: LennardJones
853 * Geometry: Water4-Particle
854 * Calculate force/pot: Force
857 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_double
858 (t_nblist * gmx_restrict nlist,
859 rvec * gmx_restrict xx,
860 rvec * gmx_restrict ff,
861 t_forcerec * gmx_restrict fr,
862 t_mdatoms * gmx_restrict mdatoms,
863 nb_kernel_data_t * gmx_restrict kernel_data,
864 t_nrnb * gmx_restrict nrnb)
866 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
867 * just 0 for non-waters.
868 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
869 * jnr indices corresponding to data put in the four positions in the SIMD register.
871 int i_shift_offset,i_coord_offset,outeriter,inneriter;
872 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
874 int j_coord_offsetA,j_coord_offsetB;
875 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
877 real *shiftvec,*fshift,*x,*f;
878 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
880 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
882 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
884 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
886 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
887 int vdwjidx0A,vdwjidx0B;
888 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
889 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
890 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
891 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
892 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
893 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
896 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
899 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
900 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
902 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
904 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
905 real rswitch_scalar,d_scalar;
906 __m128d dummy_mask,cutoff_mask;
907 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
908 __m128d one = _mm_set1_pd(1.0);
909 __m128d two = _mm_set1_pd(2.0);
915 jindex = nlist->jindex;
917 shiftidx = nlist->shift;
919 shiftvec = fr->shift_vec[0];
920 fshift = fr->fshift[0];
921 facel = _mm_set1_pd(fr->epsfac);
922 charge = mdatoms->chargeA;
923 nvdwtype = fr->ntype;
925 vdwtype = mdatoms->typeA;
927 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
928 ewtab = fr->ic->tabq_coul_FDV0;
929 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
930 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
932 /* Setup water-specific parameters */
933 inr = nlist->iinr[0];
934 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
935 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
936 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
937 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
939 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
940 rcutoff_scalar = fr->rcoulomb;
941 rcutoff = _mm_set1_pd(rcutoff_scalar);
942 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
944 rswitch_scalar = fr->rcoulomb_switch;
945 rswitch = _mm_set1_pd(rswitch_scalar);
946 /* Setup switch parameters */
947 d_scalar = rcutoff_scalar-rswitch_scalar;
948 d = _mm_set1_pd(d_scalar);
949 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
950 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
951 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
952 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
953 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
954 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
956 /* Avoid stupid compiler warnings */
964 /* Start outer loop over neighborlists */
965 for(iidx=0; iidx<nri; iidx++)
967 /* Load shift vector for this list */
968 i_shift_offset = DIM*shiftidx[iidx];
970 /* Load limits for loop over neighbors */
971 j_index_start = jindex[iidx];
972 j_index_end = jindex[iidx+1];
974 /* Get outer coordinate index */
976 i_coord_offset = DIM*inr;
978 /* Load i particle coords and add shift vector */
979 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
980 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
982 fix0 = _mm_setzero_pd();
983 fiy0 = _mm_setzero_pd();
984 fiz0 = _mm_setzero_pd();
985 fix1 = _mm_setzero_pd();
986 fiy1 = _mm_setzero_pd();
987 fiz1 = _mm_setzero_pd();
988 fix2 = _mm_setzero_pd();
989 fiy2 = _mm_setzero_pd();
990 fiz2 = _mm_setzero_pd();
991 fix3 = _mm_setzero_pd();
992 fiy3 = _mm_setzero_pd();
993 fiz3 = _mm_setzero_pd();
995 /* Start inner kernel loop */
996 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
999 /* Get j neighbor index, and coordinate index */
1001 jnrB = jjnr[jidx+1];
1002 j_coord_offsetA = DIM*jnrA;
1003 j_coord_offsetB = DIM*jnrB;
1005 /* load j atom coordinates */
1006 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1009 /* Calculate displacement vector */
1010 dx00 = _mm_sub_pd(ix0,jx0);
1011 dy00 = _mm_sub_pd(iy0,jy0);
1012 dz00 = _mm_sub_pd(iz0,jz0);
1013 dx10 = _mm_sub_pd(ix1,jx0);
1014 dy10 = _mm_sub_pd(iy1,jy0);
1015 dz10 = _mm_sub_pd(iz1,jz0);
1016 dx20 = _mm_sub_pd(ix2,jx0);
1017 dy20 = _mm_sub_pd(iy2,jy0);
1018 dz20 = _mm_sub_pd(iz2,jz0);
1019 dx30 = _mm_sub_pd(ix3,jx0);
1020 dy30 = _mm_sub_pd(iy3,jy0);
1021 dz30 = _mm_sub_pd(iz3,jz0);
1023 /* Calculate squared distance and things based on it */
1024 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1025 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1026 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1027 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1029 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1030 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1031 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1032 rinv30 = gmx_mm_invsqrt_pd(rsq30);
1034 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1035 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1036 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1037 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
1039 /* Load parameters for j particles */
1040 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
1041 vdwjidx0A = 2*vdwtype[jnrA+0];
1042 vdwjidx0B = 2*vdwtype[jnrB+0];
1044 fjx0 = _mm_setzero_pd();
1045 fjy0 = _mm_setzero_pd();
1046 fjz0 = _mm_setzero_pd();
1048 /**************************
1049 * CALCULATE INTERACTIONS *
1050 **************************/
1052 if (gmx_mm_any_lt(rsq00,rcutoff2))
1055 r00 = _mm_mul_pd(rsq00,rinv00);
1057 /* Compute parameters for interactions between i and j atoms */
1058 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
1059 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
1061 /* LENNARD-JONES DISPERSION/REPULSION */
1063 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1064 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1065 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1066 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1067 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1069 d = _mm_sub_pd(r00,rswitch);
1070 d = _mm_max_pd(d,_mm_setzero_pd());
1071 d2 = _mm_mul_pd(d,d);
1072 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1074 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1076 /* Evaluate switch function */
1077 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1078 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1079 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1083 fscal = _mm_and_pd(fscal,cutoff_mask);
1085 /* Calculate temporary vectorial force */
1086 tx = _mm_mul_pd(fscal,dx00);
1087 ty = _mm_mul_pd(fscal,dy00);
1088 tz = _mm_mul_pd(fscal,dz00);
1090 /* Update vectorial force */
1091 fix0 = _mm_add_pd(fix0,tx);
1092 fiy0 = _mm_add_pd(fiy0,ty);
1093 fiz0 = _mm_add_pd(fiz0,tz);
1095 fjx0 = _mm_add_pd(fjx0,tx);
1096 fjy0 = _mm_add_pd(fjy0,ty);
1097 fjz0 = _mm_add_pd(fjz0,tz);
1101 /**************************
1102 * CALCULATE INTERACTIONS *
1103 **************************/
1105 if (gmx_mm_any_lt(rsq10,rcutoff2))
1108 r10 = _mm_mul_pd(rsq10,rinv10);
1110 /* Compute parameters for interactions between i and j atoms */
1111 qq10 = _mm_mul_pd(iq1,jq0);
1113 /* EWALD ELECTROSTATICS */
1115 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1116 ewrt = _mm_mul_pd(r10,ewtabscale);
1117 ewitab = _mm_cvttpd_epi32(ewrt);
1118 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1119 ewitab = _mm_slli_epi32(ewitab,2);
1120 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1121 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1122 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1123 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1124 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1125 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1126 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1127 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1128 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1129 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1131 d = _mm_sub_pd(r10,rswitch);
1132 d = _mm_max_pd(d,_mm_setzero_pd());
1133 d2 = _mm_mul_pd(d,d);
1134 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1136 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1138 /* Evaluate switch function */
1139 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1140 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1141 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1145 fscal = _mm_and_pd(fscal,cutoff_mask);
1147 /* Calculate temporary vectorial force */
1148 tx = _mm_mul_pd(fscal,dx10);
1149 ty = _mm_mul_pd(fscal,dy10);
1150 tz = _mm_mul_pd(fscal,dz10);
1152 /* Update vectorial force */
1153 fix1 = _mm_add_pd(fix1,tx);
1154 fiy1 = _mm_add_pd(fiy1,ty);
1155 fiz1 = _mm_add_pd(fiz1,tz);
1157 fjx0 = _mm_add_pd(fjx0,tx);
1158 fjy0 = _mm_add_pd(fjy0,ty);
1159 fjz0 = _mm_add_pd(fjz0,tz);
1163 /**************************
1164 * CALCULATE INTERACTIONS *
1165 **************************/
1167 if (gmx_mm_any_lt(rsq20,rcutoff2))
1170 r20 = _mm_mul_pd(rsq20,rinv20);
1172 /* Compute parameters for interactions between i and j atoms */
1173 qq20 = _mm_mul_pd(iq2,jq0);
1175 /* EWALD ELECTROSTATICS */
1177 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1178 ewrt = _mm_mul_pd(r20,ewtabscale);
1179 ewitab = _mm_cvttpd_epi32(ewrt);
1180 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1181 ewitab = _mm_slli_epi32(ewitab,2);
1182 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1183 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1184 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1185 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1186 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1187 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1188 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1189 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1190 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1191 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1193 d = _mm_sub_pd(r20,rswitch);
1194 d = _mm_max_pd(d,_mm_setzero_pd());
1195 d2 = _mm_mul_pd(d,d);
1196 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1198 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1200 /* Evaluate switch function */
1201 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1202 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1203 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1207 fscal = _mm_and_pd(fscal,cutoff_mask);
1209 /* Calculate temporary vectorial force */
1210 tx = _mm_mul_pd(fscal,dx20);
1211 ty = _mm_mul_pd(fscal,dy20);
1212 tz = _mm_mul_pd(fscal,dz20);
1214 /* Update vectorial force */
1215 fix2 = _mm_add_pd(fix2,tx);
1216 fiy2 = _mm_add_pd(fiy2,ty);
1217 fiz2 = _mm_add_pd(fiz2,tz);
1219 fjx0 = _mm_add_pd(fjx0,tx);
1220 fjy0 = _mm_add_pd(fjy0,ty);
1221 fjz0 = _mm_add_pd(fjz0,tz);
1225 /**************************
1226 * CALCULATE INTERACTIONS *
1227 **************************/
1229 if (gmx_mm_any_lt(rsq30,rcutoff2))
1232 r30 = _mm_mul_pd(rsq30,rinv30);
1234 /* Compute parameters for interactions between i and j atoms */
1235 qq30 = _mm_mul_pd(iq3,jq0);
1237 /* EWALD ELECTROSTATICS */
1239 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1240 ewrt = _mm_mul_pd(r30,ewtabscale);
1241 ewitab = _mm_cvttpd_epi32(ewrt);
1242 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1243 ewitab = _mm_slli_epi32(ewitab,2);
1244 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1245 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1246 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1247 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1248 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1249 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1250 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1251 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1252 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
1253 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1255 d = _mm_sub_pd(r30,rswitch);
1256 d = _mm_max_pd(d,_mm_setzero_pd());
1257 d2 = _mm_mul_pd(d,d);
1258 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1260 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1262 /* Evaluate switch function */
1263 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1264 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
1265 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1269 fscal = _mm_and_pd(fscal,cutoff_mask);
1271 /* Calculate temporary vectorial force */
1272 tx = _mm_mul_pd(fscal,dx30);
1273 ty = _mm_mul_pd(fscal,dy30);
1274 tz = _mm_mul_pd(fscal,dz30);
1276 /* Update vectorial force */
1277 fix3 = _mm_add_pd(fix3,tx);
1278 fiy3 = _mm_add_pd(fiy3,ty);
1279 fiz3 = _mm_add_pd(fiz3,tz);
1281 fjx0 = _mm_add_pd(fjx0,tx);
1282 fjy0 = _mm_add_pd(fjy0,ty);
1283 fjz0 = _mm_add_pd(fjz0,tz);
1287 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1289 /* Inner loop uses 245 flops */
1292 if(jidx<j_index_end)
1296 j_coord_offsetA = DIM*jnrA;
1298 /* load j atom coordinates */
1299 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1302 /* Calculate displacement vector */
1303 dx00 = _mm_sub_pd(ix0,jx0);
1304 dy00 = _mm_sub_pd(iy0,jy0);
1305 dz00 = _mm_sub_pd(iz0,jz0);
1306 dx10 = _mm_sub_pd(ix1,jx0);
1307 dy10 = _mm_sub_pd(iy1,jy0);
1308 dz10 = _mm_sub_pd(iz1,jz0);
1309 dx20 = _mm_sub_pd(ix2,jx0);
1310 dy20 = _mm_sub_pd(iy2,jy0);
1311 dz20 = _mm_sub_pd(iz2,jz0);
1312 dx30 = _mm_sub_pd(ix3,jx0);
1313 dy30 = _mm_sub_pd(iy3,jy0);
1314 dz30 = _mm_sub_pd(iz3,jz0);
1316 /* Calculate squared distance and things based on it */
1317 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1318 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1319 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1320 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1322 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1323 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1324 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1325 rinv30 = gmx_mm_invsqrt_pd(rsq30);
1327 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1328 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1329 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1330 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
1332 /* Load parameters for j particles */
1333 jq0 = _mm_load_sd(charge+jnrA+0);
1334 vdwjidx0A = 2*vdwtype[jnrA+0];
1336 fjx0 = _mm_setzero_pd();
1337 fjy0 = _mm_setzero_pd();
1338 fjz0 = _mm_setzero_pd();
1340 /**************************
1341 * CALCULATE INTERACTIONS *
1342 **************************/
1344 if (gmx_mm_any_lt(rsq00,rcutoff2))
1347 r00 = _mm_mul_pd(rsq00,rinv00);
1349 /* Compute parameters for interactions between i and j atoms */
1350 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
1352 /* LENNARD-JONES DISPERSION/REPULSION */
1354 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1355 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1356 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1357 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1358 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1360 d = _mm_sub_pd(r00,rswitch);
1361 d = _mm_max_pd(d,_mm_setzero_pd());
1362 d2 = _mm_mul_pd(d,d);
1363 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1365 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1367 /* Evaluate switch function */
1368 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1369 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1370 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1374 fscal = _mm_and_pd(fscal,cutoff_mask);
1376 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1378 /* Calculate temporary vectorial force */
1379 tx = _mm_mul_pd(fscal,dx00);
1380 ty = _mm_mul_pd(fscal,dy00);
1381 tz = _mm_mul_pd(fscal,dz00);
1383 /* Update vectorial force */
1384 fix0 = _mm_add_pd(fix0,tx);
1385 fiy0 = _mm_add_pd(fiy0,ty);
1386 fiz0 = _mm_add_pd(fiz0,tz);
1388 fjx0 = _mm_add_pd(fjx0,tx);
1389 fjy0 = _mm_add_pd(fjy0,ty);
1390 fjz0 = _mm_add_pd(fjz0,tz);
1394 /**************************
1395 * CALCULATE INTERACTIONS *
1396 **************************/
1398 if (gmx_mm_any_lt(rsq10,rcutoff2))
1401 r10 = _mm_mul_pd(rsq10,rinv10);
1403 /* Compute parameters for interactions between i and j atoms */
1404 qq10 = _mm_mul_pd(iq1,jq0);
1406 /* EWALD ELECTROSTATICS */
1408 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1409 ewrt = _mm_mul_pd(r10,ewtabscale);
1410 ewitab = _mm_cvttpd_epi32(ewrt);
1411 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1412 ewitab = _mm_slli_epi32(ewitab,2);
1413 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1414 ewtabD = _mm_setzero_pd();
1415 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1416 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1417 ewtabFn = _mm_setzero_pd();
1418 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1419 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1420 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1421 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1422 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1424 d = _mm_sub_pd(r10,rswitch);
1425 d = _mm_max_pd(d,_mm_setzero_pd());
1426 d2 = _mm_mul_pd(d,d);
1427 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1429 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1431 /* Evaluate switch function */
1432 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1433 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1434 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1438 fscal = _mm_and_pd(fscal,cutoff_mask);
1440 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1442 /* Calculate temporary vectorial force */
1443 tx = _mm_mul_pd(fscal,dx10);
1444 ty = _mm_mul_pd(fscal,dy10);
1445 tz = _mm_mul_pd(fscal,dz10);
1447 /* Update vectorial force */
1448 fix1 = _mm_add_pd(fix1,tx);
1449 fiy1 = _mm_add_pd(fiy1,ty);
1450 fiz1 = _mm_add_pd(fiz1,tz);
1452 fjx0 = _mm_add_pd(fjx0,tx);
1453 fjy0 = _mm_add_pd(fjy0,ty);
1454 fjz0 = _mm_add_pd(fjz0,tz);
1458 /**************************
1459 * CALCULATE INTERACTIONS *
1460 **************************/
1462 if (gmx_mm_any_lt(rsq20,rcutoff2))
1465 r20 = _mm_mul_pd(rsq20,rinv20);
1467 /* Compute parameters for interactions between i and j atoms */
1468 qq20 = _mm_mul_pd(iq2,jq0);
1470 /* EWALD ELECTROSTATICS */
1472 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1473 ewrt = _mm_mul_pd(r20,ewtabscale);
1474 ewitab = _mm_cvttpd_epi32(ewrt);
1475 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1476 ewitab = _mm_slli_epi32(ewitab,2);
1477 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1478 ewtabD = _mm_setzero_pd();
1479 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1480 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1481 ewtabFn = _mm_setzero_pd();
1482 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1483 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1484 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1485 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1486 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1488 d = _mm_sub_pd(r20,rswitch);
1489 d = _mm_max_pd(d,_mm_setzero_pd());
1490 d2 = _mm_mul_pd(d,d);
1491 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1493 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1495 /* Evaluate switch function */
1496 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1497 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1498 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1502 fscal = _mm_and_pd(fscal,cutoff_mask);
1504 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1506 /* Calculate temporary vectorial force */
1507 tx = _mm_mul_pd(fscal,dx20);
1508 ty = _mm_mul_pd(fscal,dy20);
1509 tz = _mm_mul_pd(fscal,dz20);
1511 /* Update vectorial force */
1512 fix2 = _mm_add_pd(fix2,tx);
1513 fiy2 = _mm_add_pd(fiy2,ty);
1514 fiz2 = _mm_add_pd(fiz2,tz);
1516 fjx0 = _mm_add_pd(fjx0,tx);
1517 fjy0 = _mm_add_pd(fjy0,ty);
1518 fjz0 = _mm_add_pd(fjz0,tz);
1522 /**************************
1523 * CALCULATE INTERACTIONS *
1524 **************************/
1526 if (gmx_mm_any_lt(rsq30,rcutoff2))
1529 r30 = _mm_mul_pd(rsq30,rinv30);
1531 /* Compute parameters for interactions between i and j atoms */
1532 qq30 = _mm_mul_pd(iq3,jq0);
1534 /* EWALD ELECTROSTATICS */
1536 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1537 ewrt = _mm_mul_pd(r30,ewtabscale);
1538 ewitab = _mm_cvttpd_epi32(ewrt);
1539 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1540 ewitab = _mm_slli_epi32(ewitab,2);
1541 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1542 ewtabD = _mm_setzero_pd();
1543 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1544 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1545 ewtabFn = _mm_setzero_pd();
1546 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1547 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1548 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1549 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
1550 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1552 d = _mm_sub_pd(r30,rswitch);
1553 d = _mm_max_pd(d,_mm_setzero_pd());
1554 d2 = _mm_mul_pd(d,d);
1555 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1557 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1559 /* Evaluate switch function */
1560 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1561 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
1562 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1566 fscal = _mm_and_pd(fscal,cutoff_mask);
1568 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1570 /* Calculate temporary vectorial force */
1571 tx = _mm_mul_pd(fscal,dx30);
1572 ty = _mm_mul_pd(fscal,dy30);
1573 tz = _mm_mul_pd(fscal,dz30);
1575 /* Update vectorial force */
1576 fix3 = _mm_add_pd(fix3,tx);
1577 fiy3 = _mm_add_pd(fiy3,ty);
1578 fiz3 = _mm_add_pd(fiz3,tz);
1580 fjx0 = _mm_add_pd(fjx0,tx);
1581 fjy0 = _mm_add_pd(fjy0,ty);
1582 fjz0 = _mm_add_pd(fjz0,tz);
1586 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1588 /* Inner loop uses 245 flops */
1591 /* End of innermost loop */
1593 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1594 f+i_coord_offset,fshift+i_shift_offset);
1596 /* Increment number of inner iterations */
1597 inneriter += j_index_end - j_index_start;
1599 /* Outer loop uses 24 flops */
1602 /* Increment number of outer iterations */
1605 /* Update outer/inner flops */
1607 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*245);