2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
77 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
83 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
86 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
87 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
89 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
91 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
92 real rswitch_scalar,d_scalar;
93 __m128 dummy_mask,cutoff_mask;
94 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
95 __m128 one = _mm_set1_ps(1.0);
96 __m128 two = _mm_set1_ps(2.0);
102 jindex = nlist->jindex;
104 shiftidx = nlist->shift;
106 shiftvec = fr->shift_vec[0];
107 fshift = fr->fshift[0];
108 facel = _mm_set1_ps(fr->epsfac);
109 charge = mdatoms->chargeA;
110 nvdwtype = fr->ntype;
112 vdwtype = mdatoms->typeA;
114 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
115 ewtab = fr->ic->tabq_coul_FDV0;
116 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
117 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
119 /* Setup water-specific parameters */
120 inr = nlist->iinr[0];
121 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
122 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
123 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
124 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
126 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
127 rcutoff_scalar = fr->rcoulomb;
128 rcutoff = _mm_set1_ps(rcutoff_scalar);
129 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
131 rswitch_scalar = fr->rcoulomb_switch;
132 rswitch = _mm_set1_ps(rswitch_scalar);
133 /* Setup switch parameters */
134 d_scalar = rcutoff_scalar-rswitch_scalar;
135 d = _mm_set1_ps(d_scalar);
136 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
137 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
138 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
139 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
140 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
141 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
143 /* Avoid stupid compiler warnings */
144 jnrA = jnrB = jnrC = jnrD = 0;
153 /* Start outer loop over neighborlists */
154 for(iidx=0; iidx<nri; iidx++)
156 /* Load shift vector for this list */
157 i_shift_offset = DIM*shiftidx[iidx];
158 shX = shiftvec[i_shift_offset+XX];
159 shY = shiftvec[i_shift_offset+YY];
160 shZ = shiftvec[i_shift_offset+ZZ];
162 /* Load limits for loop over neighbors */
163 j_index_start = jindex[iidx];
164 j_index_end = jindex[iidx+1];
166 /* Get outer coordinate index */
168 i_coord_offset = DIM*inr;
170 /* Load i particle coords and add shift vector */
171 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
172 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
173 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
174 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
175 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
176 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
177 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
178 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
179 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
180 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
181 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
182 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
184 fix0 = _mm_setzero_ps();
185 fiy0 = _mm_setzero_ps();
186 fiz0 = _mm_setzero_ps();
187 fix1 = _mm_setzero_ps();
188 fiy1 = _mm_setzero_ps();
189 fiz1 = _mm_setzero_ps();
190 fix2 = _mm_setzero_ps();
191 fiy2 = _mm_setzero_ps();
192 fiz2 = _mm_setzero_ps();
193 fix3 = _mm_setzero_ps();
194 fiy3 = _mm_setzero_ps();
195 fiz3 = _mm_setzero_ps();
197 /* Reset potential sums */
198 velecsum = _mm_setzero_ps();
199 vvdwsum = _mm_setzero_ps();
201 /* Start inner kernel loop */
202 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
205 /* Get j neighbor index, and coordinate index */
211 j_coord_offsetA = DIM*jnrA;
212 j_coord_offsetB = DIM*jnrB;
213 j_coord_offsetC = DIM*jnrC;
214 j_coord_offsetD = DIM*jnrD;
216 /* load j atom coordinates */
217 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
218 x+j_coord_offsetC,x+j_coord_offsetD,
221 /* Calculate displacement vector */
222 dx00 = _mm_sub_ps(ix0,jx0);
223 dy00 = _mm_sub_ps(iy0,jy0);
224 dz00 = _mm_sub_ps(iz0,jz0);
225 dx10 = _mm_sub_ps(ix1,jx0);
226 dy10 = _mm_sub_ps(iy1,jy0);
227 dz10 = _mm_sub_ps(iz1,jz0);
228 dx20 = _mm_sub_ps(ix2,jx0);
229 dy20 = _mm_sub_ps(iy2,jy0);
230 dz20 = _mm_sub_ps(iz2,jz0);
231 dx30 = _mm_sub_ps(ix3,jx0);
232 dy30 = _mm_sub_ps(iy3,jy0);
233 dz30 = _mm_sub_ps(iz3,jz0);
235 /* Calculate squared distance and things based on it */
236 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
237 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
238 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
239 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
241 rinv00 = gmx_mm_invsqrt_ps(rsq00);
242 rinv10 = gmx_mm_invsqrt_ps(rsq10);
243 rinv20 = gmx_mm_invsqrt_ps(rsq20);
244 rinv30 = gmx_mm_invsqrt_ps(rsq30);
246 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
247 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
248 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
249 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
251 /* Load parameters for j particles */
252 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
253 charge+jnrC+0,charge+jnrD+0);
254 vdwjidx0A = 2*vdwtype[jnrA+0];
255 vdwjidx0B = 2*vdwtype[jnrB+0];
256 vdwjidx0C = 2*vdwtype[jnrC+0];
257 vdwjidx0D = 2*vdwtype[jnrD+0];
259 /**************************
260 * CALCULATE INTERACTIONS *
261 **************************/
263 if (gmx_mm_any_lt(rsq00,rcutoff2))
266 r00 = _mm_mul_ps(rsq00,rinv00);
268 /* Compute parameters for interactions between i and j atoms */
269 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
270 vdwparam+vdwioffset0+vdwjidx0B,
271 vdwparam+vdwioffset0+vdwjidx0C,
272 vdwparam+vdwioffset0+vdwjidx0D,
275 /* LENNARD-JONES DISPERSION/REPULSION */
277 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
278 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
279 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
280 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
281 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
283 d = _mm_sub_ps(r00,rswitch);
284 d = _mm_max_ps(d,_mm_setzero_ps());
285 d2 = _mm_mul_ps(d,d);
286 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
288 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
290 /* Evaluate switch function */
291 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
292 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
293 vvdw = _mm_mul_ps(vvdw,sw);
294 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
296 /* Update potential sum for this i atom from the interaction with this j atom. */
297 vvdw = _mm_and_ps(vvdw,cutoff_mask);
298 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
302 fscal = _mm_and_ps(fscal,cutoff_mask);
304 /* Calculate temporary vectorial force */
305 tx = _mm_mul_ps(fscal,dx00);
306 ty = _mm_mul_ps(fscal,dy00);
307 tz = _mm_mul_ps(fscal,dz00);
309 /* Update vectorial force */
310 fix0 = _mm_add_ps(fix0,tx);
311 fiy0 = _mm_add_ps(fiy0,ty);
312 fiz0 = _mm_add_ps(fiz0,tz);
314 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
315 f+j_coord_offsetC,f+j_coord_offsetD,
320 /**************************
321 * CALCULATE INTERACTIONS *
322 **************************/
324 if (gmx_mm_any_lt(rsq10,rcutoff2))
327 r10 = _mm_mul_ps(rsq10,rinv10);
329 /* Compute parameters for interactions between i and j atoms */
330 qq10 = _mm_mul_ps(iq1,jq0);
332 /* EWALD ELECTROSTATICS */
334 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
335 ewrt = _mm_mul_ps(r10,ewtabscale);
336 ewitab = _mm_cvttps_epi32(ewrt);
337 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
338 ewitab = _mm_slli_epi32(ewitab,2);
339 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
340 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
341 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
342 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
343 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
344 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
345 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
346 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
347 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
349 d = _mm_sub_ps(r10,rswitch);
350 d = _mm_max_ps(d,_mm_setzero_ps());
351 d2 = _mm_mul_ps(d,d);
352 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
354 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
356 /* Evaluate switch function */
357 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
358 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
359 velec = _mm_mul_ps(velec,sw);
360 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velec = _mm_and_ps(velec,cutoff_mask);
364 velecsum = _mm_add_ps(velecsum,velec);
368 fscal = _mm_and_ps(fscal,cutoff_mask);
370 /* Calculate temporary vectorial force */
371 tx = _mm_mul_ps(fscal,dx10);
372 ty = _mm_mul_ps(fscal,dy10);
373 tz = _mm_mul_ps(fscal,dz10);
375 /* Update vectorial force */
376 fix1 = _mm_add_ps(fix1,tx);
377 fiy1 = _mm_add_ps(fiy1,ty);
378 fiz1 = _mm_add_ps(fiz1,tz);
380 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
381 f+j_coord_offsetC,f+j_coord_offsetD,
386 /**************************
387 * CALCULATE INTERACTIONS *
388 **************************/
390 if (gmx_mm_any_lt(rsq20,rcutoff2))
393 r20 = _mm_mul_ps(rsq20,rinv20);
395 /* Compute parameters for interactions between i and j atoms */
396 qq20 = _mm_mul_ps(iq2,jq0);
398 /* EWALD ELECTROSTATICS */
400 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
401 ewrt = _mm_mul_ps(r20,ewtabscale);
402 ewitab = _mm_cvttps_epi32(ewrt);
403 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
404 ewitab = _mm_slli_epi32(ewitab,2);
405 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
406 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
407 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
408 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
409 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
410 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
411 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
412 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
413 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
415 d = _mm_sub_ps(r20,rswitch);
416 d = _mm_max_ps(d,_mm_setzero_ps());
417 d2 = _mm_mul_ps(d,d);
418 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
420 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
422 /* Evaluate switch function */
423 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
424 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
425 velec = _mm_mul_ps(velec,sw);
426 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
428 /* Update potential sum for this i atom from the interaction with this j atom. */
429 velec = _mm_and_ps(velec,cutoff_mask);
430 velecsum = _mm_add_ps(velecsum,velec);
434 fscal = _mm_and_ps(fscal,cutoff_mask);
436 /* Calculate temporary vectorial force */
437 tx = _mm_mul_ps(fscal,dx20);
438 ty = _mm_mul_ps(fscal,dy20);
439 tz = _mm_mul_ps(fscal,dz20);
441 /* Update vectorial force */
442 fix2 = _mm_add_ps(fix2,tx);
443 fiy2 = _mm_add_ps(fiy2,ty);
444 fiz2 = _mm_add_ps(fiz2,tz);
446 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
447 f+j_coord_offsetC,f+j_coord_offsetD,
452 /**************************
453 * CALCULATE INTERACTIONS *
454 **************************/
456 if (gmx_mm_any_lt(rsq30,rcutoff2))
459 r30 = _mm_mul_ps(rsq30,rinv30);
461 /* Compute parameters for interactions between i and j atoms */
462 qq30 = _mm_mul_ps(iq3,jq0);
464 /* EWALD ELECTROSTATICS */
466 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
467 ewrt = _mm_mul_ps(r30,ewtabscale);
468 ewitab = _mm_cvttps_epi32(ewrt);
469 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
470 ewitab = _mm_slli_epi32(ewitab,2);
471 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
472 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
473 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
474 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
475 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
476 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
477 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
478 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
479 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
481 d = _mm_sub_ps(r30,rswitch);
482 d = _mm_max_ps(d,_mm_setzero_ps());
483 d2 = _mm_mul_ps(d,d);
484 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
486 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
488 /* Evaluate switch function */
489 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
490 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
491 velec = _mm_mul_ps(velec,sw);
492 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
494 /* Update potential sum for this i atom from the interaction with this j atom. */
495 velec = _mm_and_ps(velec,cutoff_mask);
496 velecsum = _mm_add_ps(velecsum,velec);
500 fscal = _mm_and_ps(fscal,cutoff_mask);
502 /* Calculate temporary vectorial force */
503 tx = _mm_mul_ps(fscal,dx30);
504 ty = _mm_mul_ps(fscal,dy30);
505 tz = _mm_mul_ps(fscal,dz30);
507 /* Update vectorial force */
508 fix3 = _mm_add_ps(fix3,tx);
509 fiy3 = _mm_add_ps(fiy3,ty);
510 fiz3 = _mm_add_ps(fiz3,tz);
512 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
513 f+j_coord_offsetC,f+j_coord_offsetD,
518 /* Inner loop uses 254 flops */
524 /* Get j neighbor index, and coordinate index */
530 /* Sign of each element will be negative for non-real atoms.
531 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
532 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
534 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
535 jnrA = (jnrA>=0) ? jnrA : 0;
536 jnrB = (jnrB>=0) ? jnrB : 0;
537 jnrC = (jnrC>=0) ? jnrC : 0;
538 jnrD = (jnrD>=0) ? jnrD : 0;
540 j_coord_offsetA = DIM*jnrA;
541 j_coord_offsetB = DIM*jnrB;
542 j_coord_offsetC = DIM*jnrC;
543 j_coord_offsetD = DIM*jnrD;
545 /* load j atom coordinates */
546 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
547 x+j_coord_offsetC,x+j_coord_offsetD,
550 /* Calculate displacement vector */
551 dx00 = _mm_sub_ps(ix0,jx0);
552 dy00 = _mm_sub_ps(iy0,jy0);
553 dz00 = _mm_sub_ps(iz0,jz0);
554 dx10 = _mm_sub_ps(ix1,jx0);
555 dy10 = _mm_sub_ps(iy1,jy0);
556 dz10 = _mm_sub_ps(iz1,jz0);
557 dx20 = _mm_sub_ps(ix2,jx0);
558 dy20 = _mm_sub_ps(iy2,jy0);
559 dz20 = _mm_sub_ps(iz2,jz0);
560 dx30 = _mm_sub_ps(ix3,jx0);
561 dy30 = _mm_sub_ps(iy3,jy0);
562 dz30 = _mm_sub_ps(iz3,jz0);
564 /* Calculate squared distance and things based on it */
565 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
566 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
567 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
568 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
570 rinv00 = gmx_mm_invsqrt_ps(rsq00);
571 rinv10 = gmx_mm_invsqrt_ps(rsq10);
572 rinv20 = gmx_mm_invsqrt_ps(rsq20);
573 rinv30 = gmx_mm_invsqrt_ps(rsq30);
575 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
576 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
577 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
578 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
580 /* Load parameters for j particles */
581 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
582 charge+jnrC+0,charge+jnrD+0);
583 vdwjidx0A = 2*vdwtype[jnrA+0];
584 vdwjidx0B = 2*vdwtype[jnrB+0];
585 vdwjidx0C = 2*vdwtype[jnrC+0];
586 vdwjidx0D = 2*vdwtype[jnrD+0];
588 /**************************
589 * CALCULATE INTERACTIONS *
590 **************************/
592 if (gmx_mm_any_lt(rsq00,rcutoff2))
595 r00 = _mm_mul_ps(rsq00,rinv00);
596 r00 = _mm_andnot_ps(dummy_mask,r00);
598 /* Compute parameters for interactions between i and j atoms */
599 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
600 vdwparam+vdwioffset0+vdwjidx0B,
601 vdwparam+vdwioffset0+vdwjidx0C,
602 vdwparam+vdwioffset0+vdwjidx0D,
605 /* LENNARD-JONES DISPERSION/REPULSION */
607 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
608 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
609 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
610 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
611 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
613 d = _mm_sub_ps(r00,rswitch);
614 d = _mm_max_ps(d,_mm_setzero_ps());
615 d2 = _mm_mul_ps(d,d);
616 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
618 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
620 /* Evaluate switch function */
621 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
622 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
623 vvdw = _mm_mul_ps(vvdw,sw);
624 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
626 /* Update potential sum for this i atom from the interaction with this j atom. */
627 vvdw = _mm_and_ps(vvdw,cutoff_mask);
628 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
629 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
633 fscal = _mm_and_ps(fscal,cutoff_mask);
635 fscal = _mm_andnot_ps(dummy_mask,fscal);
637 /* Calculate temporary vectorial force */
638 tx = _mm_mul_ps(fscal,dx00);
639 ty = _mm_mul_ps(fscal,dy00);
640 tz = _mm_mul_ps(fscal,dz00);
642 /* Update vectorial force */
643 fix0 = _mm_add_ps(fix0,tx);
644 fiy0 = _mm_add_ps(fiy0,ty);
645 fiz0 = _mm_add_ps(fiz0,tz);
647 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
648 f+j_coord_offsetC,f+j_coord_offsetD,
653 /**************************
654 * CALCULATE INTERACTIONS *
655 **************************/
657 if (gmx_mm_any_lt(rsq10,rcutoff2))
660 r10 = _mm_mul_ps(rsq10,rinv10);
661 r10 = _mm_andnot_ps(dummy_mask,r10);
663 /* Compute parameters for interactions between i and j atoms */
664 qq10 = _mm_mul_ps(iq1,jq0);
666 /* EWALD ELECTROSTATICS */
668 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
669 ewrt = _mm_mul_ps(r10,ewtabscale);
670 ewitab = _mm_cvttps_epi32(ewrt);
671 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
672 ewitab = _mm_slli_epi32(ewitab,2);
673 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
674 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
675 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
676 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
677 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
678 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
679 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
680 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
681 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
683 d = _mm_sub_ps(r10,rswitch);
684 d = _mm_max_ps(d,_mm_setzero_ps());
685 d2 = _mm_mul_ps(d,d);
686 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
688 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
690 /* Evaluate switch function */
691 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
692 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
693 velec = _mm_mul_ps(velec,sw);
694 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
696 /* Update potential sum for this i atom from the interaction with this j atom. */
697 velec = _mm_and_ps(velec,cutoff_mask);
698 velec = _mm_andnot_ps(dummy_mask,velec);
699 velecsum = _mm_add_ps(velecsum,velec);
703 fscal = _mm_and_ps(fscal,cutoff_mask);
705 fscal = _mm_andnot_ps(dummy_mask,fscal);
707 /* Calculate temporary vectorial force */
708 tx = _mm_mul_ps(fscal,dx10);
709 ty = _mm_mul_ps(fscal,dy10);
710 tz = _mm_mul_ps(fscal,dz10);
712 /* Update vectorial force */
713 fix1 = _mm_add_ps(fix1,tx);
714 fiy1 = _mm_add_ps(fiy1,ty);
715 fiz1 = _mm_add_ps(fiz1,tz);
717 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
718 f+j_coord_offsetC,f+j_coord_offsetD,
723 /**************************
724 * CALCULATE INTERACTIONS *
725 **************************/
727 if (gmx_mm_any_lt(rsq20,rcutoff2))
730 r20 = _mm_mul_ps(rsq20,rinv20);
731 r20 = _mm_andnot_ps(dummy_mask,r20);
733 /* Compute parameters for interactions between i and j atoms */
734 qq20 = _mm_mul_ps(iq2,jq0);
736 /* EWALD ELECTROSTATICS */
738 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
739 ewrt = _mm_mul_ps(r20,ewtabscale);
740 ewitab = _mm_cvttps_epi32(ewrt);
741 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
742 ewitab = _mm_slli_epi32(ewitab,2);
743 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
744 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
745 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
746 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
747 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
748 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
749 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
750 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
751 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
753 d = _mm_sub_ps(r20,rswitch);
754 d = _mm_max_ps(d,_mm_setzero_ps());
755 d2 = _mm_mul_ps(d,d);
756 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
758 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
760 /* Evaluate switch function */
761 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
762 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
763 velec = _mm_mul_ps(velec,sw);
764 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
766 /* Update potential sum for this i atom from the interaction with this j atom. */
767 velec = _mm_and_ps(velec,cutoff_mask);
768 velec = _mm_andnot_ps(dummy_mask,velec);
769 velecsum = _mm_add_ps(velecsum,velec);
773 fscal = _mm_and_ps(fscal,cutoff_mask);
775 fscal = _mm_andnot_ps(dummy_mask,fscal);
777 /* Calculate temporary vectorial force */
778 tx = _mm_mul_ps(fscal,dx20);
779 ty = _mm_mul_ps(fscal,dy20);
780 tz = _mm_mul_ps(fscal,dz20);
782 /* Update vectorial force */
783 fix2 = _mm_add_ps(fix2,tx);
784 fiy2 = _mm_add_ps(fiy2,ty);
785 fiz2 = _mm_add_ps(fiz2,tz);
787 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
788 f+j_coord_offsetC,f+j_coord_offsetD,
793 /**************************
794 * CALCULATE INTERACTIONS *
795 **************************/
797 if (gmx_mm_any_lt(rsq30,rcutoff2))
800 r30 = _mm_mul_ps(rsq30,rinv30);
801 r30 = _mm_andnot_ps(dummy_mask,r30);
803 /* Compute parameters for interactions between i and j atoms */
804 qq30 = _mm_mul_ps(iq3,jq0);
806 /* EWALD ELECTROSTATICS */
808 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
809 ewrt = _mm_mul_ps(r30,ewtabscale);
810 ewitab = _mm_cvttps_epi32(ewrt);
811 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
812 ewitab = _mm_slli_epi32(ewitab,2);
813 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
814 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
815 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
816 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
817 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
818 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
819 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
820 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
821 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
823 d = _mm_sub_ps(r30,rswitch);
824 d = _mm_max_ps(d,_mm_setzero_ps());
825 d2 = _mm_mul_ps(d,d);
826 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
828 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
830 /* Evaluate switch function */
831 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
832 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
833 velec = _mm_mul_ps(velec,sw);
834 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
836 /* Update potential sum for this i atom from the interaction with this j atom. */
837 velec = _mm_and_ps(velec,cutoff_mask);
838 velec = _mm_andnot_ps(dummy_mask,velec);
839 velecsum = _mm_add_ps(velecsum,velec);
843 fscal = _mm_and_ps(fscal,cutoff_mask);
845 fscal = _mm_andnot_ps(dummy_mask,fscal);
847 /* Calculate temporary vectorial force */
848 tx = _mm_mul_ps(fscal,dx30);
849 ty = _mm_mul_ps(fscal,dy30);
850 tz = _mm_mul_ps(fscal,dz30);
852 /* Update vectorial force */
853 fix3 = _mm_add_ps(fix3,tx);
854 fiy3 = _mm_add_ps(fiy3,ty);
855 fiz3 = _mm_add_ps(fiz3,tz);
857 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
858 f+j_coord_offsetC,f+j_coord_offsetD,
863 /* Inner loop uses 258 flops */
866 /* End of innermost loop */
868 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
869 f+i_coord_offset,fshift+i_shift_offset);
872 /* Update potential energies */
873 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
874 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
876 /* Increment number of inner iterations */
877 inneriter += j_index_end - j_index_start;
879 /* Outer loop uses 38 flops */
882 /* Increment number of outer iterations */
885 /* Update outer/inner flops */
887 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*258);
890 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
891 * Electrostatics interaction: Ewald
892 * VdW interaction: LennardJones
893 * Geometry: Water4-Particle
894 * Calculate force/pot: Force
897 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
898 (t_nblist * gmx_restrict nlist,
899 rvec * gmx_restrict xx,
900 rvec * gmx_restrict ff,
901 t_forcerec * gmx_restrict fr,
902 t_mdatoms * gmx_restrict mdatoms,
903 nb_kernel_data_t * gmx_restrict kernel_data,
904 t_nrnb * gmx_restrict nrnb)
906 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
907 * just 0 for non-waters.
908 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
909 * jnr indices corresponding to data put in the four positions in the SIMD register.
911 int i_shift_offset,i_coord_offset,outeriter,inneriter;
912 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
913 int jnrA,jnrB,jnrC,jnrD;
914 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
915 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
916 real shX,shY,shZ,rcutoff_scalar;
917 real *shiftvec,*fshift,*x,*f;
918 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
920 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
922 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
924 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
926 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
927 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
928 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
929 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
930 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
931 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
932 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
933 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
936 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
939 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
940 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
942 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
944 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
945 real rswitch_scalar,d_scalar;
946 __m128 dummy_mask,cutoff_mask;
947 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
948 __m128 one = _mm_set1_ps(1.0);
949 __m128 two = _mm_set1_ps(2.0);
955 jindex = nlist->jindex;
957 shiftidx = nlist->shift;
959 shiftvec = fr->shift_vec[0];
960 fshift = fr->fshift[0];
961 facel = _mm_set1_ps(fr->epsfac);
962 charge = mdatoms->chargeA;
963 nvdwtype = fr->ntype;
965 vdwtype = mdatoms->typeA;
967 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
968 ewtab = fr->ic->tabq_coul_FDV0;
969 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
970 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
972 /* Setup water-specific parameters */
973 inr = nlist->iinr[0];
974 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
975 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
976 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
977 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
979 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
980 rcutoff_scalar = fr->rcoulomb;
981 rcutoff = _mm_set1_ps(rcutoff_scalar);
982 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
984 rswitch_scalar = fr->rcoulomb_switch;
985 rswitch = _mm_set1_ps(rswitch_scalar);
986 /* Setup switch parameters */
987 d_scalar = rcutoff_scalar-rswitch_scalar;
988 d = _mm_set1_ps(d_scalar);
989 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
990 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
991 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
992 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
993 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
994 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
996 /* Avoid stupid compiler warnings */
997 jnrA = jnrB = jnrC = jnrD = 0;
1000 j_coord_offsetC = 0;
1001 j_coord_offsetD = 0;
1006 /* Start outer loop over neighborlists */
1007 for(iidx=0; iidx<nri; iidx++)
1009 /* Load shift vector for this list */
1010 i_shift_offset = DIM*shiftidx[iidx];
1011 shX = shiftvec[i_shift_offset+XX];
1012 shY = shiftvec[i_shift_offset+YY];
1013 shZ = shiftvec[i_shift_offset+ZZ];
1015 /* Load limits for loop over neighbors */
1016 j_index_start = jindex[iidx];
1017 j_index_end = jindex[iidx+1];
1019 /* Get outer coordinate index */
1021 i_coord_offset = DIM*inr;
1023 /* Load i particle coords and add shift vector */
1024 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1025 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1026 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1027 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1028 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1029 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1030 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1031 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1032 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1033 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1034 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1035 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1037 fix0 = _mm_setzero_ps();
1038 fiy0 = _mm_setzero_ps();
1039 fiz0 = _mm_setzero_ps();
1040 fix1 = _mm_setzero_ps();
1041 fiy1 = _mm_setzero_ps();
1042 fiz1 = _mm_setzero_ps();
1043 fix2 = _mm_setzero_ps();
1044 fiy2 = _mm_setzero_ps();
1045 fiz2 = _mm_setzero_ps();
1046 fix3 = _mm_setzero_ps();
1047 fiy3 = _mm_setzero_ps();
1048 fiz3 = _mm_setzero_ps();
1050 /* Start inner kernel loop */
1051 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1054 /* Get j neighbor index, and coordinate index */
1056 jnrB = jjnr[jidx+1];
1057 jnrC = jjnr[jidx+2];
1058 jnrD = jjnr[jidx+3];
1060 j_coord_offsetA = DIM*jnrA;
1061 j_coord_offsetB = DIM*jnrB;
1062 j_coord_offsetC = DIM*jnrC;
1063 j_coord_offsetD = DIM*jnrD;
1065 /* load j atom coordinates */
1066 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1067 x+j_coord_offsetC,x+j_coord_offsetD,
1070 /* Calculate displacement vector */
1071 dx00 = _mm_sub_ps(ix0,jx0);
1072 dy00 = _mm_sub_ps(iy0,jy0);
1073 dz00 = _mm_sub_ps(iz0,jz0);
1074 dx10 = _mm_sub_ps(ix1,jx0);
1075 dy10 = _mm_sub_ps(iy1,jy0);
1076 dz10 = _mm_sub_ps(iz1,jz0);
1077 dx20 = _mm_sub_ps(ix2,jx0);
1078 dy20 = _mm_sub_ps(iy2,jy0);
1079 dz20 = _mm_sub_ps(iz2,jz0);
1080 dx30 = _mm_sub_ps(ix3,jx0);
1081 dy30 = _mm_sub_ps(iy3,jy0);
1082 dz30 = _mm_sub_ps(iz3,jz0);
1084 /* Calculate squared distance and things based on it */
1085 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1086 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1087 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1088 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1090 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1091 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1092 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1093 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1095 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1096 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1097 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1098 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1100 /* Load parameters for j particles */
1101 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1102 charge+jnrC+0,charge+jnrD+0);
1103 vdwjidx0A = 2*vdwtype[jnrA+0];
1104 vdwjidx0B = 2*vdwtype[jnrB+0];
1105 vdwjidx0C = 2*vdwtype[jnrC+0];
1106 vdwjidx0D = 2*vdwtype[jnrD+0];
1108 /**************************
1109 * CALCULATE INTERACTIONS *
1110 **************************/
1112 if (gmx_mm_any_lt(rsq00,rcutoff2))
1115 r00 = _mm_mul_ps(rsq00,rinv00);
1117 /* Compute parameters for interactions between i and j atoms */
1118 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1119 vdwparam+vdwioffset0+vdwjidx0B,
1120 vdwparam+vdwioffset0+vdwjidx0C,
1121 vdwparam+vdwioffset0+vdwjidx0D,
1124 /* LENNARD-JONES DISPERSION/REPULSION */
1126 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1127 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1128 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1129 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1130 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1132 d = _mm_sub_ps(r00,rswitch);
1133 d = _mm_max_ps(d,_mm_setzero_ps());
1134 d2 = _mm_mul_ps(d,d);
1135 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1137 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1139 /* Evaluate switch function */
1140 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1141 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1142 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1146 fscal = _mm_and_ps(fscal,cutoff_mask);
1148 /* Calculate temporary vectorial force */
1149 tx = _mm_mul_ps(fscal,dx00);
1150 ty = _mm_mul_ps(fscal,dy00);
1151 tz = _mm_mul_ps(fscal,dz00);
1153 /* Update vectorial force */
1154 fix0 = _mm_add_ps(fix0,tx);
1155 fiy0 = _mm_add_ps(fiy0,ty);
1156 fiz0 = _mm_add_ps(fiz0,tz);
1158 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1159 f+j_coord_offsetC,f+j_coord_offsetD,
1164 /**************************
1165 * CALCULATE INTERACTIONS *
1166 **************************/
1168 if (gmx_mm_any_lt(rsq10,rcutoff2))
1171 r10 = _mm_mul_ps(rsq10,rinv10);
1173 /* Compute parameters for interactions between i and j atoms */
1174 qq10 = _mm_mul_ps(iq1,jq0);
1176 /* EWALD ELECTROSTATICS */
1178 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1179 ewrt = _mm_mul_ps(r10,ewtabscale);
1180 ewitab = _mm_cvttps_epi32(ewrt);
1181 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1182 ewitab = _mm_slli_epi32(ewitab,2);
1183 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1184 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1185 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1186 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1187 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1188 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1189 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1190 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1191 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1193 d = _mm_sub_ps(r10,rswitch);
1194 d = _mm_max_ps(d,_mm_setzero_ps());
1195 d2 = _mm_mul_ps(d,d);
1196 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1198 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1200 /* Evaluate switch function */
1201 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1202 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1203 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1207 fscal = _mm_and_ps(fscal,cutoff_mask);
1209 /* Calculate temporary vectorial force */
1210 tx = _mm_mul_ps(fscal,dx10);
1211 ty = _mm_mul_ps(fscal,dy10);
1212 tz = _mm_mul_ps(fscal,dz10);
1214 /* Update vectorial force */
1215 fix1 = _mm_add_ps(fix1,tx);
1216 fiy1 = _mm_add_ps(fiy1,ty);
1217 fiz1 = _mm_add_ps(fiz1,tz);
1219 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1220 f+j_coord_offsetC,f+j_coord_offsetD,
1225 /**************************
1226 * CALCULATE INTERACTIONS *
1227 **************************/
1229 if (gmx_mm_any_lt(rsq20,rcutoff2))
1232 r20 = _mm_mul_ps(rsq20,rinv20);
1234 /* Compute parameters for interactions between i and j atoms */
1235 qq20 = _mm_mul_ps(iq2,jq0);
1237 /* EWALD ELECTROSTATICS */
1239 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1240 ewrt = _mm_mul_ps(r20,ewtabscale);
1241 ewitab = _mm_cvttps_epi32(ewrt);
1242 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1243 ewitab = _mm_slli_epi32(ewitab,2);
1244 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1245 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1246 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1247 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1248 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1249 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1250 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1251 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1252 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1254 d = _mm_sub_ps(r20,rswitch);
1255 d = _mm_max_ps(d,_mm_setzero_ps());
1256 d2 = _mm_mul_ps(d,d);
1257 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1259 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1261 /* Evaluate switch function */
1262 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1263 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1264 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1268 fscal = _mm_and_ps(fscal,cutoff_mask);
1270 /* Calculate temporary vectorial force */
1271 tx = _mm_mul_ps(fscal,dx20);
1272 ty = _mm_mul_ps(fscal,dy20);
1273 tz = _mm_mul_ps(fscal,dz20);
1275 /* Update vectorial force */
1276 fix2 = _mm_add_ps(fix2,tx);
1277 fiy2 = _mm_add_ps(fiy2,ty);
1278 fiz2 = _mm_add_ps(fiz2,tz);
1280 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1281 f+j_coord_offsetC,f+j_coord_offsetD,
1286 /**************************
1287 * CALCULATE INTERACTIONS *
1288 **************************/
1290 if (gmx_mm_any_lt(rsq30,rcutoff2))
1293 r30 = _mm_mul_ps(rsq30,rinv30);
1295 /* Compute parameters for interactions between i and j atoms */
1296 qq30 = _mm_mul_ps(iq3,jq0);
1298 /* EWALD ELECTROSTATICS */
1300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1301 ewrt = _mm_mul_ps(r30,ewtabscale);
1302 ewitab = _mm_cvttps_epi32(ewrt);
1303 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1304 ewitab = _mm_slli_epi32(ewitab,2);
1305 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1306 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1307 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1308 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1309 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1310 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1311 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1312 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
1313 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1315 d = _mm_sub_ps(r30,rswitch);
1316 d = _mm_max_ps(d,_mm_setzero_ps());
1317 d2 = _mm_mul_ps(d,d);
1318 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1320 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1322 /* Evaluate switch function */
1323 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1324 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
1325 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1329 fscal = _mm_and_ps(fscal,cutoff_mask);
1331 /* Calculate temporary vectorial force */
1332 tx = _mm_mul_ps(fscal,dx30);
1333 ty = _mm_mul_ps(fscal,dy30);
1334 tz = _mm_mul_ps(fscal,dz30);
1336 /* Update vectorial force */
1337 fix3 = _mm_add_ps(fix3,tx);
1338 fiy3 = _mm_add_ps(fiy3,ty);
1339 fiz3 = _mm_add_ps(fiz3,tz);
1341 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1342 f+j_coord_offsetC,f+j_coord_offsetD,
1347 /* Inner loop uses 242 flops */
1350 if(jidx<j_index_end)
1353 /* Get j neighbor index, and coordinate index */
1355 jnrB = jjnr[jidx+1];
1356 jnrC = jjnr[jidx+2];
1357 jnrD = jjnr[jidx+3];
1359 /* Sign of each element will be negative for non-real atoms.
1360 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1361 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1363 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1364 jnrA = (jnrA>=0) ? jnrA : 0;
1365 jnrB = (jnrB>=0) ? jnrB : 0;
1366 jnrC = (jnrC>=0) ? jnrC : 0;
1367 jnrD = (jnrD>=0) ? jnrD : 0;
1369 j_coord_offsetA = DIM*jnrA;
1370 j_coord_offsetB = DIM*jnrB;
1371 j_coord_offsetC = DIM*jnrC;
1372 j_coord_offsetD = DIM*jnrD;
1374 /* load j atom coordinates */
1375 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1376 x+j_coord_offsetC,x+j_coord_offsetD,
1379 /* Calculate displacement vector */
1380 dx00 = _mm_sub_ps(ix0,jx0);
1381 dy00 = _mm_sub_ps(iy0,jy0);
1382 dz00 = _mm_sub_ps(iz0,jz0);
1383 dx10 = _mm_sub_ps(ix1,jx0);
1384 dy10 = _mm_sub_ps(iy1,jy0);
1385 dz10 = _mm_sub_ps(iz1,jz0);
1386 dx20 = _mm_sub_ps(ix2,jx0);
1387 dy20 = _mm_sub_ps(iy2,jy0);
1388 dz20 = _mm_sub_ps(iz2,jz0);
1389 dx30 = _mm_sub_ps(ix3,jx0);
1390 dy30 = _mm_sub_ps(iy3,jy0);
1391 dz30 = _mm_sub_ps(iz3,jz0);
1393 /* Calculate squared distance and things based on it */
1394 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1395 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1396 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1397 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1399 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1400 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1401 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1402 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1404 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1405 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1406 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1407 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1409 /* Load parameters for j particles */
1410 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1411 charge+jnrC+0,charge+jnrD+0);
1412 vdwjidx0A = 2*vdwtype[jnrA+0];
1413 vdwjidx0B = 2*vdwtype[jnrB+0];
1414 vdwjidx0C = 2*vdwtype[jnrC+0];
1415 vdwjidx0D = 2*vdwtype[jnrD+0];
1417 /**************************
1418 * CALCULATE INTERACTIONS *
1419 **************************/
1421 if (gmx_mm_any_lt(rsq00,rcutoff2))
1424 r00 = _mm_mul_ps(rsq00,rinv00);
1425 r00 = _mm_andnot_ps(dummy_mask,r00);
1427 /* Compute parameters for interactions between i and j atoms */
1428 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1429 vdwparam+vdwioffset0+vdwjidx0B,
1430 vdwparam+vdwioffset0+vdwjidx0C,
1431 vdwparam+vdwioffset0+vdwjidx0D,
1434 /* LENNARD-JONES DISPERSION/REPULSION */
1436 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1437 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1438 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1439 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1440 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1442 d = _mm_sub_ps(r00,rswitch);
1443 d = _mm_max_ps(d,_mm_setzero_ps());
1444 d2 = _mm_mul_ps(d,d);
1445 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1447 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1449 /* Evaluate switch function */
1450 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1451 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1452 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1456 fscal = _mm_and_ps(fscal,cutoff_mask);
1458 fscal = _mm_andnot_ps(dummy_mask,fscal);
1460 /* Calculate temporary vectorial force */
1461 tx = _mm_mul_ps(fscal,dx00);
1462 ty = _mm_mul_ps(fscal,dy00);
1463 tz = _mm_mul_ps(fscal,dz00);
1465 /* Update vectorial force */
1466 fix0 = _mm_add_ps(fix0,tx);
1467 fiy0 = _mm_add_ps(fiy0,ty);
1468 fiz0 = _mm_add_ps(fiz0,tz);
1470 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1471 f+j_coord_offsetC,f+j_coord_offsetD,
1476 /**************************
1477 * CALCULATE INTERACTIONS *
1478 **************************/
1480 if (gmx_mm_any_lt(rsq10,rcutoff2))
1483 r10 = _mm_mul_ps(rsq10,rinv10);
1484 r10 = _mm_andnot_ps(dummy_mask,r10);
1486 /* Compute parameters for interactions between i and j atoms */
1487 qq10 = _mm_mul_ps(iq1,jq0);
1489 /* EWALD ELECTROSTATICS */
1491 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1492 ewrt = _mm_mul_ps(r10,ewtabscale);
1493 ewitab = _mm_cvttps_epi32(ewrt);
1494 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1495 ewitab = _mm_slli_epi32(ewitab,2);
1496 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1497 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1498 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1499 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1500 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1501 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1502 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1503 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1504 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1506 d = _mm_sub_ps(r10,rswitch);
1507 d = _mm_max_ps(d,_mm_setzero_ps());
1508 d2 = _mm_mul_ps(d,d);
1509 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1511 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1513 /* Evaluate switch function */
1514 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1515 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1516 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1520 fscal = _mm_and_ps(fscal,cutoff_mask);
1522 fscal = _mm_andnot_ps(dummy_mask,fscal);
1524 /* Calculate temporary vectorial force */
1525 tx = _mm_mul_ps(fscal,dx10);
1526 ty = _mm_mul_ps(fscal,dy10);
1527 tz = _mm_mul_ps(fscal,dz10);
1529 /* Update vectorial force */
1530 fix1 = _mm_add_ps(fix1,tx);
1531 fiy1 = _mm_add_ps(fiy1,ty);
1532 fiz1 = _mm_add_ps(fiz1,tz);
1534 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1535 f+j_coord_offsetC,f+j_coord_offsetD,
1540 /**************************
1541 * CALCULATE INTERACTIONS *
1542 **************************/
1544 if (gmx_mm_any_lt(rsq20,rcutoff2))
1547 r20 = _mm_mul_ps(rsq20,rinv20);
1548 r20 = _mm_andnot_ps(dummy_mask,r20);
1550 /* Compute parameters for interactions between i and j atoms */
1551 qq20 = _mm_mul_ps(iq2,jq0);
1553 /* EWALD ELECTROSTATICS */
1555 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1556 ewrt = _mm_mul_ps(r20,ewtabscale);
1557 ewitab = _mm_cvttps_epi32(ewrt);
1558 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1559 ewitab = _mm_slli_epi32(ewitab,2);
1560 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1561 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1562 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1563 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1564 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1565 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1566 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1567 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1568 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1570 d = _mm_sub_ps(r20,rswitch);
1571 d = _mm_max_ps(d,_mm_setzero_ps());
1572 d2 = _mm_mul_ps(d,d);
1573 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1575 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1577 /* Evaluate switch function */
1578 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1579 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1580 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1584 fscal = _mm_and_ps(fscal,cutoff_mask);
1586 fscal = _mm_andnot_ps(dummy_mask,fscal);
1588 /* Calculate temporary vectorial force */
1589 tx = _mm_mul_ps(fscal,dx20);
1590 ty = _mm_mul_ps(fscal,dy20);
1591 tz = _mm_mul_ps(fscal,dz20);
1593 /* Update vectorial force */
1594 fix2 = _mm_add_ps(fix2,tx);
1595 fiy2 = _mm_add_ps(fiy2,ty);
1596 fiz2 = _mm_add_ps(fiz2,tz);
1598 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1599 f+j_coord_offsetC,f+j_coord_offsetD,
1604 /**************************
1605 * CALCULATE INTERACTIONS *
1606 **************************/
1608 if (gmx_mm_any_lt(rsq30,rcutoff2))
1611 r30 = _mm_mul_ps(rsq30,rinv30);
1612 r30 = _mm_andnot_ps(dummy_mask,r30);
1614 /* Compute parameters for interactions between i and j atoms */
1615 qq30 = _mm_mul_ps(iq3,jq0);
1617 /* EWALD ELECTROSTATICS */
1619 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1620 ewrt = _mm_mul_ps(r30,ewtabscale);
1621 ewitab = _mm_cvttps_epi32(ewrt);
1622 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1623 ewitab = _mm_slli_epi32(ewitab,2);
1624 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1625 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1626 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1627 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1628 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1629 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1630 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1631 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
1632 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1634 d = _mm_sub_ps(r30,rswitch);
1635 d = _mm_max_ps(d,_mm_setzero_ps());
1636 d2 = _mm_mul_ps(d,d);
1637 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1639 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1641 /* Evaluate switch function */
1642 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1643 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
1644 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1648 fscal = _mm_and_ps(fscal,cutoff_mask);
1650 fscal = _mm_andnot_ps(dummy_mask,fscal);
1652 /* Calculate temporary vectorial force */
1653 tx = _mm_mul_ps(fscal,dx30);
1654 ty = _mm_mul_ps(fscal,dy30);
1655 tz = _mm_mul_ps(fscal,dz30);
1657 /* Update vectorial force */
1658 fix3 = _mm_add_ps(fix3,tx);
1659 fiy3 = _mm_add_ps(fiy3,ty);
1660 fiz3 = _mm_add_ps(fiz3,tz);
1662 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1663 f+j_coord_offsetC,f+j_coord_offsetD,
1668 /* Inner loop uses 246 flops */
1671 /* End of innermost loop */
1673 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1674 f+i_coord_offset,fshift+i_shift_offset);
1676 /* Increment number of inner iterations */
1677 inneriter += j_index_end - j_index_start;
1679 /* Outer loop uses 36 flops */
1682 /* Increment number of outer iterations */
1685 /* Update outer/inner flops */
1687 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*246);