2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
77 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
83 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
86 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
87 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
89 __m128i ifour = _mm_set1_epi32(4);
90 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
93 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
95 __m128 dummy_mask,cutoff_mask;
96 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
97 __m128 one = _mm_set1_ps(1.0);
98 __m128 two = _mm_set1_ps(2.0);
104 jindex = nlist->jindex;
106 shiftidx = nlist->shift;
108 shiftvec = fr->shift_vec[0];
109 fshift = fr->fshift[0];
110 facel = _mm_set1_ps(fr->epsfac);
111 charge = mdatoms->chargeA;
112 nvdwtype = fr->ntype;
114 vdwtype = mdatoms->typeA;
116 vftab = kernel_data->table_vdw->data;
117 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
119 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
120 ewtab = fr->ic->tabq_coul_FDV0;
121 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
122 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
124 /* Setup water-specific parameters */
125 inr = nlist->iinr[0];
126 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
127 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
128 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
129 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
131 /* Avoid stupid compiler warnings */
132 jnrA = jnrB = jnrC = jnrD = 0;
141 /* Start outer loop over neighborlists */
142 for(iidx=0; iidx<nri; iidx++)
144 /* Load shift vector for this list */
145 i_shift_offset = DIM*shiftidx[iidx];
146 shX = shiftvec[i_shift_offset+XX];
147 shY = shiftvec[i_shift_offset+YY];
148 shZ = shiftvec[i_shift_offset+ZZ];
150 /* Load limits for loop over neighbors */
151 j_index_start = jindex[iidx];
152 j_index_end = jindex[iidx+1];
154 /* Get outer coordinate index */
156 i_coord_offset = DIM*inr;
158 /* Load i particle coords and add shift vector */
159 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
160 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
161 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
162 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
163 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
164 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
165 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
166 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
167 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
168 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
169 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
170 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
172 fix0 = _mm_setzero_ps();
173 fiy0 = _mm_setzero_ps();
174 fiz0 = _mm_setzero_ps();
175 fix1 = _mm_setzero_ps();
176 fiy1 = _mm_setzero_ps();
177 fiz1 = _mm_setzero_ps();
178 fix2 = _mm_setzero_ps();
179 fiy2 = _mm_setzero_ps();
180 fiz2 = _mm_setzero_ps();
181 fix3 = _mm_setzero_ps();
182 fiy3 = _mm_setzero_ps();
183 fiz3 = _mm_setzero_ps();
185 /* Reset potential sums */
186 velecsum = _mm_setzero_ps();
187 vvdwsum = _mm_setzero_ps();
189 /* Start inner kernel loop */
190 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
193 /* Get j neighbor index, and coordinate index */
199 j_coord_offsetA = DIM*jnrA;
200 j_coord_offsetB = DIM*jnrB;
201 j_coord_offsetC = DIM*jnrC;
202 j_coord_offsetD = DIM*jnrD;
204 /* load j atom coordinates */
205 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
206 x+j_coord_offsetC,x+j_coord_offsetD,
209 /* Calculate displacement vector */
210 dx00 = _mm_sub_ps(ix0,jx0);
211 dy00 = _mm_sub_ps(iy0,jy0);
212 dz00 = _mm_sub_ps(iz0,jz0);
213 dx10 = _mm_sub_ps(ix1,jx0);
214 dy10 = _mm_sub_ps(iy1,jy0);
215 dz10 = _mm_sub_ps(iz1,jz0);
216 dx20 = _mm_sub_ps(ix2,jx0);
217 dy20 = _mm_sub_ps(iy2,jy0);
218 dz20 = _mm_sub_ps(iz2,jz0);
219 dx30 = _mm_sub_ps(ix3,jx0);
220 dy30 = _mm_sub_ps(iy3,jy0);
221 dz30 = _mm_sub_ps(iz3,jz0);
223 /* Calculate squared distance and things based on it */
224 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
225 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
226 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
227 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
229 rinv00 = gmx_mm_invsqrt_ps(rsq00);
230 rinv10 = gmx_mm_invsqrt_ps(rsq10);
231 rinv20 = gmx_mm_invsqrt_ps(rsq20);
232 rinv30 = gmx_mm_invsqrt_ps(rsq30);
234 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
235 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
236 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
238 /* Load parameters for j particles */
239 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
240 charge+jnrC+0,charge+jnrD+0);
241 vdwjidx0A = 2*vdwtype[jnrA+0];
242 vdwjidx0B = 2*vdwtype[jnrB+0];
243 vdwjidx0C = 2*vdwtype[jnrC+0];
244 vdwjidx0D = 2*vdwtype[jnrD+0];
246 /**************************
247 * CALCULATE INTERACTIONS *
248 **************************/
250 r00 = _mm_mul_ps(rsq00,rinv00);
252 /* Compute parameters for interactions between i and j atoms */
253 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
254 vdwparam+vdwioffset0+vdwjidx0B,
255 vdwparam+vdwioffset0+vdwjidx0C,
256 vdwparam+vdwioffset0+vdwjidx0D,
259 /* Calculate table index by multiplying r with table scale and truncate to integer */
260 rt = _mm_mul_ps(r00,vftabscale);
261 vfitab = _mm_cvttps_epi32(rt);
262 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
263 vfitab = _mm_slli_epi32(vfitab,3);
265 /* CUBIC SPLINE TABLE DISPERSION */
266 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
267 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
268 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
269 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
270 _MM_TRANSPOSE4_PS(Y,F,G,H);
271 Heps = _mm_mul_ps(vfeps,H);
272 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
273 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
274 vvdw6 = _mm_mul_ps(c6_00,VV);
275 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
276 fvdw6 = _mm_mul_ps(c6_00,FF);
278 /* CUBIC SPLINE TABLE REPULSION */
279 vfitab = _mm_add_epi32(vfitab,ifour);
280 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
281 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
282 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
283 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
284 _MM_TRANSPOSE4_PS(Y,F,G,H);
285 Heps = _mm_mul_ps(vfeps,H);
286 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
287 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
288 vvdw12 = _mm_mul_ps(c12_00,VV);
289 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
290 fvdw12 = _mm_mul_ps(c12_00,FF);
291 vvdw = _mm_add_ps(vvdw12,vvdw6);
292 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
294 /* Update potential sum for this i atom from the interaction with this j atom. */
295 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
299 /* Calculate temporary vectorial force */
300 tx = _mm_mul_ps(fscal,dx00);
301 ty = _mm_mul_ps(fscal,dy00);
302 tz = _mm_mul_ps(fscal,dz00);
304 /* Update vectorial force */
305 fix0 = _mm_add_ps(fix0,tx);
306 fiy0 = _mm_add_ps(fiy0,ty);
307 fiz0 = _mm_add_ps(fiz0,tz);
309 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
310 f+j_coord_offsetC,f+j_coord_offsetD,
313 /**************************
314 * CALCULATE INTERACTIONS *
315 **************************/
317 r10 = _mm_mul_ps(rsq10,rinv10);
319 /* Compute parameters for interactions between i and j atoms */
320 qq10 = _mm_mul_ps(iq1,jq0);
322 /* EWALD ELECTROSTATICS */
324 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
325 ewrt = _mm_mul_ps(r10,ewtabscale);
326 ewitab = _mm_cvttps_epi32(ewrt);
327 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
328 ewitab = _mm_slli_epi32(ewitab,2);
329 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
330 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
331 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
332 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
333 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
334 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
335 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
336 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
337 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
339 /* Update potential sum for this i atom from the interaction with this j atom. */
340 velecsum = _mm_add_ps(velecsum,velec);
344 /* Calculate temporary vectorial force */
345 tx = _mm_mul_ps(fscal,dx10);
346 ty = _mm_mul_ps(fscal,dy10);
347 tz = _mm_mul_ps(fscal,dz10);
349 /* Update vectorial force */
350 fix1 = _mm_add_ps(fix1,tx);
351 fiy1 = _mm_add_ps(fiy1,ty);
352 fiz1 = _mm_add_ps(fiz1,tz);
354 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
355 f+j_coord_offsetC,f+j_coord_offsetD,
358 /**************************
359 * CALCULATE INTERACTIONS *
360 **************************/
362 r20 = _mm_mul_ps(rsq20,rinv20);
364 /* Compute parameters for interactions between i and j atoms */
365 qq20 = _mm_mul_ps(iq2,jq0);
367 /* EWALD ELECTROSTATICS */
369 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
370 ewrt = _mm_mul_ps(r20,ewtabscale);
371 ewitab = _mm_cvttps_epi32(ewrt);
372 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
373 ewitab = _mm_slli_epi32(ewitab,2);
374 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
375 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
376 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
377 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
378 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
379 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
380 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
381 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
382 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
384 /* Update potential sum for this i atom from the interaction with this j atom. */
385 velecsum = _mm_add_ps(velecsum,velec);
389 /* Calculate temporary vectorial force */
390 tx = _mm_mul_ps(fscal,dx20);
391 ty = _mm_mul_ps(fscal,dy20);
392 tz = _mm_mul_ps(fscal,dz20);
394 /* Update vectorial force */
395 fix2 = _mm_add_ps(fix2,tx);
396 fiy2 = _mm_add_ps(fiy2,ty);
397 fiz2 = _mm_add_ps(fiz2,tz);
399 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
400 f+j_coord_offsetC,f+j_coord_offsetD,
403 /**************************
404 * CALCULATE INTERACTIONS *
405 **************************/
407 r30 = _mm_mul_ps(rsq30,rinv30);
409 /* Compute parameters for interactions between i and j atoms */
410 qq30 = _mm_mul_ps(iq3,jq0);
412 /* EWALD ELECTROSTATICS */
414 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
415 ewrt = _mm_mul_ps(r30,ewtabscale);
416 ewitab = _mm_cvttps_epi32(ewrt);
417 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
418 ewitab = _mm_slli_epi32(ewitab,2);
419 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
420 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
421 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
422 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
423 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
424 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
425 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
426 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
427 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
429 /* Update potential sum for this i atom from the interaction with this j atom. */
430 velecsum = _mm_add_ps(velecsum,velec);
434 /* Calculate temporary vectorial force */
435 tx = _mm_mul_ps(fscal,dx30);
436 ty = _mm_mul_ps(fscal,dy30);
437 tz = _mm_mul_ps(fscal,dz30);
439 /* Update vectorial force */
440 fix3 = _mm_add_ps(fix3,tx);
441 fiy3 = _mm_add_ps(fiy3,ty);
442 fiz3 = _mm_add_ps(fiz3,tz);
444 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
445 f+j_coord_offsetC,f+j_coord_offsetD,
448 /* Inner loop uses 179 flops */
454 /* Get j neighbor index, and coordinate index */
460 /* Sign of each element will be negative for non-real atoms.
461 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
462 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
464 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
465 jnrA = (jnrA>=0) ? jnrA : 0;
466 jnrB = (jnrB>=0) ? jnrB : 0;
467 jnrC = (jnrC>=0) ? jnrC : 0;
468 jnrD = (jnrD>=0) ? jnrD : 0;
470 j_coord_offsetA = DIM*jnrA;
471 j_coord_offsetB = DIM*jnrB;
472 j_coord_offsetC = DIM*jnrC;
473 j_coord_offsetD = DIM*jnrD;
475 /* load j atom coordinates */
476 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
477 x+j_coord_offsetC,x+j_coord_offsetD,
480 /* Calculate displacement vector */
481 dx00 = _mm_sub_ps(ix0,jx0);
482 dy00 = _mm_sub_ps(iy0,jy0);
483 dz00 = _mm_sub_ps(iz0,jz0);
484 dx10 = _mm_sub_ps(ix1,jx0);
485 dy10 = _mm_sub_ps(iy1,jy0);
486 dz10 = _mm_sub_ps(iz1,jz0);
487 dx20 = _mm_sub_ps(ix2,jx0);
488 dy20 = _mm_sub_ps(iy2,jy0);
489 dz20 = _mm_sub_ps(iz2,jz0);
490 dx30 = _mm_sub_ps(ix3,jx0);
491 dy30 = _mm_sub_ps(iy3,jy0);
492 dz30 = _mm_sub_ps(iz3,jz0);
494 /* Calculate squared distance and things based on it */
495 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
496 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
497 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
498 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
500 rinv00 = gmx_mm_invsqrt_ps(rsq00);
501 rinv10 = gmx_mm_invsqrt_ps(rsq10);
502 rinv20 = gmx_mm_invsqrt_ps(rsq20);
503 rinv30 = gmx_mm_invsqrt_ps(rsq30);
505 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
506 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
507 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
509 /* Load parameters for j particles */
510 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
511 charge+jnrC+0,charge+jnrD+0);
512 vdwjidx0A = 2*vdwtype[jnrA+0];
513 vdwjidx0B = 2*vdwtype[jnrB+0];
514 vdwjidx0C = 2*vdwtype[jnrC+0];
515 vdwjidx0D = 2*vdwtype[jnrD+0];
517 /**************************
518 * CALCULATE INTERACTIONS *
519 **************************/
521 r00 = _mm_mul_ps(rsq00,rinv00);
522 r00 = _mm_andnot_ps(dummy_mask,r00);
524 /* Compute parameters for interactions between i and j atoms */
525 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
526 vdwparam+vdwioffset0+vdwjidx0B,
527 vdwparam+vdwioffset0+vdwjidx0C,
528 vdwparam+vdwioffset0+vdwjidx0D,
531 /* Calculate table index by multiplying r with table scale and truncate to integer */
532 rt = _mm_mul_ps(r00,vftabscale);
533 vfitab = _mm_cvttps_epi32(rt);
534 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
535 vfitab = _mm_slli_epi32(vfitab,3);
537 /* CUBIC SPLINE TABLE DISPERSION */
538 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
539 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
540 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
541 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
542 _MM_TRANSPOSE4_PS(Y,F,G,H);
543 Heps = _mm_mul_ps(vfeps,H);
544 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
545 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
546 vvdw6 = _mm_mul_ps(c6_00,VV);
547 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
548 fvdw6 = _mm_mul_ps(c6_00,FF);
550 /* CUBIC SPLINE TABLE REPULSION */
551 vfitab = _mm_add_epi32(vfitab,ifour);
552 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
553 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
554 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
555 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
556 _MM_TRANSPOSE4_PS(Y,F,G,H);
557 Heps = _mm_mul_ps(vfeps,H);
558 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
559 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
560 vvdw12 = _mm_mul_ps(c12_00,VV);
561 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
562 fvdw12 = _mm_mul_ps(c12_00,FF);
563 vvdw = _mm_add_ps(vvdw12,vvdw6);
564 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
566 /* Update potential sum for this i atom from the interaction with this j atom. */
567 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
568 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
572 fscal = _mm_andnot_ps(dummy_mask,fscal);
574 /* Calculate temporary vectorial force */
575 tx = _mm_mul_ps(fscal,dx00);
576 ty = _mm_mul_ps(fscal,dy00);
577 tz = _mm_mul_ps(fscal,dz00);
579 /* Update vectorial force */
580 fix0 = _mm_add_ps(fix0,tx);
581 fiy0 = _mm_add_ps(fiy0,ty);
582 fiz0 = _mm_add_ps(fiz0,tz);
584 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
585 f+j_coord_offsetC,f+j_coord_offsetD,
588 /**************************
589 * CALCULATE INTERACTIONS *
590 **************************/
592 r10 = _mm_mul_ps(rsq10,rinv10);
593 r10 = _mm_andnot_ps(dummy_mask,r10);
595 /* Compute parameters for interactions between i and j atoms */
596 qq10 = _mm_mul_ps(iq1,jq0);
598 /* EWALD ELECTROSTATICS */
600 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
601 ewrt = _mm_mul_ps(r10,ewtabscale);
602 ewitab = _mm_cvttps_epi32(ewrt);
603 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
604 ewitab = _mm_slli_epi32(ewitab,2);
605 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
606 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
607 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
608 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
609 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
610 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
611 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
612 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
613 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
615 /* Update potential sum for this i atom from the interaction with this j atom. */
616 velec = _mm_andnot_ps(dummy_mask,velec);
617 velecsum = _mm_add_ps(velecsum,velec);
621 fscal = _mm_andnot_ps(dummy_mask,fscal);
623 /* Calculate temporary vectorial force */
624 tx = _mm_mul_ps(fscal,dx10);
625 ty = _mm_mul_ps(fscal,dy10);
626 tz = _mm_mul_ps(fscal,dz10);
628 /* Update vectorial force */
629 fix1 = _mm_add_ps(fix1,tx);
630 fiy1 = _mm_add_ps(fiy1,ty);
631 fiz1 = _mm_add_ps(fiz1,tz);
633 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
634 f+j_coord_offsetC,f+j_coord_offsetD,
637 /**************************
638 * CALCULATE INTERACTIONS *
639 **************************/
641 r20 = _mm_mul_ps(rsq20,rinv20);
642 r20 = _mm_andnot_ps(dummy_mask,r20);
644 /* Compute parameters for interactions between i and j atoms */
645 qq20 = _mm_mul_ps(iq2,jq0);
647 /* EWALD ELECTROSTATICS */
649 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
650 ewrt = _mm_mul_ps(r20,ewtabscale);
651 ewitab = _mm_cvttps_epi32(ewrt);
652 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
653 ewitab = _mm_slli_epi32(ewitab,2);
654 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
655 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
656 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
657 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
658 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
659 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
660 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
661 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
662 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
664 /* Update potential sum for this i atom from the interaction with this j atom. */
665 velec = _mm_andnot_ps(dummy_mask,velec);
666 velecsum = _mm_add_ps(velecsum,velec);
670 fscal = _mm_andnot_ps(dummy_mask,fscal);
672 /* Calculate temporary vectorial force */
673 tx = _mm_mul_ps(fscal,dx20);
674 ty = _mm_mul_ps(fscal,dy20);
675 tz = _mm_mul_ps(fscal,dz20);
677 /* Update vectorial force */
678 fix2 = _mm_add_ps(fix2,tx);
679 fiy2 = _mm_add_ps(fiy2,ty);
680 fiz2 = _mm_add_ps(fiz2,tz);
682 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
683 f+j_coord_offsetC,f+j_coord_offsetD,
686 /**************************
687 * CALCULATE INTERACTIONS *
688 **************************/
690 r30 = _mm_mul_ps(rsq30,rinv30);
691 r30 = _mm_andnot_ps(dummy_mask,r30);
693 /* Compute parameters for interactions between i and j atoms */
694 qq30 = _mm_mul_ps(iq3,jq0);
696 /* EWALD ELECTROSTATICS */
698 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
699 ewrt = _mm_mul_ps(r30,ewtabscale);
700 ewitab = _mm_cvttps_epi32(ewrt);
701 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
702 ewitab = _mm_slli_epi32(ewitab,2);
703 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
704 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
705 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
706 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
707 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
708 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
709 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
710 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
711 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
713 /* Update potential sum for this i atom from the interaction with this j atom. */
714 velec = _mm_andnot_ps(dummy_mask,velec);
715 velecsum = _mm_add_ps(velecsum,velec);
719 fscal = _mm_andnot_ps(dummy_mask,fscal);
721 /* Calculate temporary vectorial force */
722 tx = _mm_mul_ps(fscal,dx30);
723 ty = _mm_mul_ps(fscal,dy30);
724 tz = _mm_mul_ps(fscal,dz30);
726 /* Update vectorial force */
727 fix3 = _mm_add_ps(fix3,tx);
728 fiy3 = _mm_add_ps(fiy3,ty);
729 fiz3 = _mm_add_ps(fiz3,tz);
731 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
732 f+j_coord_offsetC,f+j_coord_offsetD,
735 /* Inner loop uses 183 flops */
738 /* End of innermost loop */
740 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
741 f+i_coord_offset,fshift+i_shift_offset);
744 /* Update potential energies */
745 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
746 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
748 /* Increment number of inner iterations */
749 inneriter += j_index_end - j_index_start;
751 /* Outer loop uses 38 flops */
754 /* Increment number of outer iterations */
757 /* Update outer/inner flops */
759 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*38 + inneriter*183);
762 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
763 * Electrostatics interaction: Ewald
764 * VdW interaction: CubicSplineTable
765 * Geometry: Water4-Particle
766 * Calculate force/pot: Force
769 nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
770 (t_nblist * gmx_restrict nlist,
771 rvec * gmx_restrict xx,
772 rvec * gmx_restrict ff,
773 t_forcerec * gmx_restrict fr,
774 t_mdatoms * gmx_restrict mdatoms,
775 nb_kernel_data_t * gmx_restrict kernel_data,
776 t_nrnb * gmx_restrict nrnb)
778 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
779 * just 0 for non-waters.
780 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
781 * jnr indices corresponding to data put in the four positions in the SIMD register.
783 int i_shift_offset,i_coord_offset,outeriter,inneriter;
784 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
785 int jnrA,jnrB,jnrC,jnrD;
786 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
787 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
788 real shX,shY,shZ,rcutoff_scalar;
789 real *shiftvec,*fshift,*x,*f;
790 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
792 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
794 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
796 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
798 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
799 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
800 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
801 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
802 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
803 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
804 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
805 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
808 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
811 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
812 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
814 __m128i ifour = _mm_set1_epi32(4);
815 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
818 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
820 __m128 dummy_mask,cutoff_mask;
821 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
822 __m128 one = _mm_set1_ps(1.0);
823 __m128 two = _mm_set1_ps(2.0);
829 jindex = nlist->jindex;
831 shiftidx = nlist->shift;
833 shiftvec = fr->shift_vec[0];
834 fshift = fr->fshift[0];
835 facel = _mm_set1_ps(fr->epsfac);
836 charge = mdatoms->chargeA;
837 nvdwtype = fr->ntype;
839 vdwtype = mdatoms->typeA;
841 vftab = kernel_data->table_vdw->data;
842 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
844 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
845 ewtab = fr->ic->tabq_coul_F;
846 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
847 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
849 /* Setup water-specific parameters */
850 inr = nlist->iinr[0];
851 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
852 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
853 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
854 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
856 /* Avoid stupid compiler warnings */
857 jnrA = jnrB = jnrC = jnrD = 0;
866 /* Start outer loop over neighborlists */
867 for(iidx=0; iidx<nri; iidx++)
869 /* Load shift vector for this list */
870 i_shift_offset = DIM*shiftidx[iidx];
871 shX = shiftvec[i_shift_offset+XX];
872 shY = shiftvec[i_shift_offset+YY];
873 shZ = shiftvec[i_shift_offset+ZZ];
875 /* Load limits for loop over neighbors */
876 j_index_start = jindex[iidx];
877 j_index_end = jindex[iidx+1];
879 /* Get outer coordinate index */
881 i_coord_offset = DIM*inr;
883 /* Load i particle coords and add shift vector */
884 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
885 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
886 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
887 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
888 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
889 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
890 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
891 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
892 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
893 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
894 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
895 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
897 fix0 = _mm_setzero_ps();
898 fiy0 = _mm_setzero_ps();
899 fiz0 = _mm_setzero_ps();
900 fix1 = _mm_setzero_ps();
901 fiy1 = _mm_setzero_ps();
902 fiz1 = _mm_setzero_ps();
903 fix2 = _mm_setzero_ps();
904 fiy2 = _mm_setzero_ps();
905 fiz2 = _mm_setzero_ps();
906 fix3 = _mm_setzero_ps();
907 fiy3 = _mm_setzero_ps();
908 fiz3 = _mm_setzero_ps();
910 /* Start inner kernel loop */
911 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
914 /* Get j neighbor index, and coordinate index */
920 j_coord_offsetA = DIM*jnrA;
921 j_coord_offsetB = DIM*jnrB;
922 j_coord_offsetC = DIM*jnrC;
923 j_coord_offsetD = DIM*jnrD;
925 /* load j atom coordinates */
926 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
927 x+j_coord_offsetC,x+j_coord_offsetD,
930 /* Calculate displacement vector */
931 dx00 = _mm_sub_ps(ix0,jx0);
932 dy00 = _mm_sub_ps(iy0,jy0);
933 dz00 = _mm_sub_ps(iz0,jz0);
934 dx10 = _mm_sub_ps(ix1,jx0);
935 dy10 = _mm_sub_ps(iy1,jy0);
936 dz10 = _mm_sub_ps(iz1,jz0);
937 dx20 = _mm_sub_ps(ix2,jx0);
938 dy20 = _mm_sub_ps(iy2,jy0);
939 dz20 = _mm_sub_ps(iz2,jz0);
940 dx30 = _mm_sub_ps(ix3,jx0);
941 dy30 = _mm_sub_ps(iy3,jy0);
942 dz30 = _mm_sub_ps(iz3,jz0);
944 /* Calculate squared distance and things based on it */
945 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
946 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
947 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
948 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
950 rinv00 = gmx_mm_invsqrt_ps(rsq00);
951 rinv10 = gmx_mm_invsqrt_ps(rsq10);
952 rinv20 = gmx_mm_invsqrt_ps(rsq20);
953 rinv30 = gmx_mm_invsqrt_ps(rsq30);
955 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
956 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
957 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
959 /* Load parameters for j particles */
960 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
961 charge+jnrC+0,charge+jnrD+0);
962 vdwjidx0A = 2*vdwtype[jnrA+0];
963 vdwjidx0B = 2*vdwtype[jnrB+0];
964 vdwjidx0C = 2*vdwtype[jnrC+0];
965 vdwjidx0D = 2*vdwtype[jnrD+0];
967 /**************************
968 * CALCULATE INTERACTIONS *
969 **************************/
971 r00 = _mm_mul_ps(rsq00,rinv00);
973 /* Compute parameters for interactions between i and j atoms */
974 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
975 vdwparam+vdwioffset0+vdwjidx0B,
976 vdwparam+vdwioffset0+vdwjidx0C,
977 vdwparam+vdwioffset0+vdwjidx0D,
980 /* Calculate table index by multiplying r with table scale and truncate to integer */
981 rt = _mm_mul_ps(r00,vftabscale);
982 vfitab = _mm_cvttps_epi32(rt);
983 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
984 vfitab = _mm_slli_epi32(vfitab,3);
986 /* CUBIC SPLINE TABLE DISPERSION */
987 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
988 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
989 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
990 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
991 _MM_TRANSPOSE4_PS(Y,F,G,H);
992 Heps = _mm_mul_ps(vfeps,H);
993 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
994 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
995 fvdw6 = _mm_mul_ps(c6_00,FF);
997 /* CUBIC SPLINE TABLE REPULSION */
998 vfitab = _mm_add_epi32(vfitab,ifour);
999 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1000 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1001 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1002 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1003 _MM_TRANSPOSE4_PS(Y,F,G,H);
1004 Heps = _mm_mul_ps(vfeps,H);
1005 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1006 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1007 fvdw12 = _mm_mul_ps(c12_00,FF);
1008 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1012 /* Calculate temporary vectorial force */
1013 tx = _mm_mul_ps(fscal,dx00);
1014 ty = _mm_mul_ps(fscal,dy00);
1015 tz = _mm_mul_ps(fscal,dz00);
1017 /* Update vectorial force */
1018 fix0 = _mm_add_ps(fix0,tx);
1019 fiy0 = _mm_add_ps(fiy0,ty);
1020 fiz0 = _mm_add_ps(fiz0,tz);
1022 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1023 f+j_coord_offsetC,f+j_coord_offsetD,
1026 /**************************
1027 * CALCULATE INTERACTIONS *
1028 **************************/
1030 r10 = _mm_mul_ps(rsq10,rinv10);
1032 /* Compute parameters for interactions between i and j atoms */
1033 qq10 = _mm_mul_ps(iq1,jq0);
1035 /* EWALD ELECTROSTATICS */
1037 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1038 ewrt = _mm_mul_ps(r10,ewtabscale);
1039 ewitab = _mm_cvttps_epi32(ewrt);
1040 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1041 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1042 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1044 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1045 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1049 /* Calculate temporary vectorial force */
1050 tx = _mm_mul_ps(fscal,dx10);
1051 ty = _mm_mul_ps(fscal,dy10);
1052 tz = _mm_mul_ps(fscal,dz10);
1054 /* Update vectorial force */
1055 fix1 = _mm_add_ps(fix1,tx);
1056 fiy1 = _mm_add_ps(fiy1,ty);
1057 fiz1 = _mm_add_ps(fiz1,tz);
1059 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1060 f+j_coord_offsetC,f+j_coord_offsetD,
1063 /**************************
1064 * CALCULATE INTERACTIONS *
1065 **************************/
1067 r20 = _mm_mul_ps(rsq20,rinv20);
1069 /* Compute parameters for interactions between i and j atoms */
1070 qq20 = _mm_mul_ps(iq2,jq0);
1072 /* EWALD ELECTROSTATICS */
1074 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1075 ewrt = _mm_mul_ps(r20,ewtabscale);
1076 ewitab = _mm_cvttps_epi32(ewrt);
1077 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1078 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1079 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1081 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1082 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1086 /* Calculate temporary vectorial force */
1087 tx = _mm_mul_ps(fscal,dx20);
1088 ty = _mm_mul_ps(fscal,dy20);
1089 tz = _mm_mul_ps(fscal,dz20);
1091 /* Update vectorial force */
1092 fix2 = _mm_add_ps(fix2,tx);
1093 fiy2 = _mm_add_ps(fiy2,ty);
1094 fiz2 = _mm_add_ps(fiz2,tz);
1096 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1097 f+j_coord_offsetC,f+j_coord_offsetD,
1100 /**************************
1101 * CALCULATE INTERACTIONS *
1102 **************************/
1104 r30 = _mm_mul_ps(rsq30,rinv30);
1106 /* Compute parameters for interactions between i and j atoms */
1107 qq30 = _mm_mul_ps(iq3,jq0);
1109 /* EWALD ELECTROSTATICS */
1111 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1112 ewrt = _mm_mul_ps(r30,ewtabscale);
1113 ewitab = _mm_cvttps_epi32(ewrt);
1114 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1115 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1116 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1118 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1119 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1123 /* Calculate temporary vectorial force */
1124 tx = _mm_mul_ps(fscal,dx30);
1125 ty = _mm_mul_ps(fscal,dy30);
1126 tz = _mm_mul_ps(fscal,dz30);
1128 /* Update vectorial force */
1129 fix3 = _mm_add_ps(fix3,tx);
1130 fiy3 = _mm_add_ps(fiy3,ty);
1131 fiz3 = _mm_add_ps(fiz3,tz);
1133 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1134 f+j_coord_offsetC,f+j_coord_offsetD,
1137 /* Inner loop uses 156 flops */
1140 if(jidx<j_index_end)
1143 /* Get j neighbor index, and coordinate index */
1145 jnrB = jjnr[jidx+1];
1146 jnrC = jjnr[jidx+2];
1147 jnrD = jjnr[jidx+3];
1149 /* Sign of each element will be negative for non-real atoms.
1150 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1151 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1153 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1154 jnrA = (jnrA>=0) ? jnrA : 0;
1155 jnrB = (jnrB>=0) ? jnrB : 0;
1156 jnrC = (jnrC>=0) ? jnrC : 0;
1157 jnrD = (jnrD>=0) ? jnrD : 0;
1159 j_coord_offsetA = DIM*jnrA;
1160 j_coord_offsetB = DIM*jnrB;
1161 j_coord_offsetC = DIM*jnrC;
1162 j_coord_offsetD = DIM*jnrD;
1164 /* load j atom coordinates */
1165 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1166 x+j_coord_offsetC,x+j_coord_offsetD,
1169 /* Calculate displacement vector */
1170 dx00 = _mm_sub_ps(ix0,jx0);
1171 dy00 = _mm_sub_ps(iy0,jy0);
1172 dz00 = _mm_sub_ps(iz0,jz0);
1173 dx10 = _mm_sub_ps(ix1,jx0);
1174 dy10 = _mm_sub_ps(iy1,jy0);
1175 dz10 = _mm_sub_ps(iz1,jz0);
1176 dx20 = _mm_sub_ps(ix2,jx0);
1177 dy20 = _mm_sub_ps(iy2,jy0);
1178 dz20 = _mm_sub_ps(iz2,jz0);
1179 dx30 = _mm_sub_ps(ix3,jx0);
1180 dy30 = _mm_sub_ps(iy3,jy0);
1181 dz30 = _mm_sub_ps(iz3,jz0);
1183 /* Calculate squared distance and things based on it */
1184 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1185 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1186 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1187 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1189 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1190 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1191 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1192 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1194 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1195 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1196 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1198 /* Load parameters for j particles */
1199 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1200 charge+jnrC+0,charge+jnrD+0);
1201 vdwjidx0A = 2*vdwtype[jnrA+0];
1202 vdwjidx0B = 2*vdwtype[jnrB+0];
1203 vdwjidx0C = 2*vdwtype[jnrC+0];
1204 vdwjidx0D = 2*vdwtype[jnrD+0];
1206 /**************************
1207 * CALCULATE INTERACTIONS *
1208 **************************/
1210 r00 = _mm_mul_ps(rsq00,rinv00);
1211 r00 = _mm_andnot_ps(dummy_mask,r00);
1213 /* Compute parameters for interactions between i and j atoms */
1214 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1215 vdwparam+vdwioffset0+vdwjidx0B,
1216 vdwparam+vdwioffset0+vdwjidx0C,
1217 vdwparam+vdwioffset0+vdwjidx0D,
1220 /* Calculate table index by multiplying r with table scale and truncate to integer */
1221 rt = _mm_mul_ps(r00,vftabscale);
1222 vfitab = _mm_cvttps_epi32(rt);
1223 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1224 vfitab = _mm_slli_epi32(vfitab,3);
1226 /* CUBIC SPLINE TABLE DISPERSION */
1227 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1228 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1229 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1230 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1231 _MM_TRANSPOSE4_PS(Y,F,G,H);
1232 Heps = _mm_mul_ps(vfeps,H);
1233 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1234 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1235 fvdw6 = _mm_mul_ps(c6_00,FF);
1237 /* CUBIC SPLINE TABLE REPULSION */
1238 vfitab = _mm_add_epi32(vfitab,ifour);
1239 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1240 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1241 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1242 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1243 _MM_TRANSPOSE4_PS(Y,F,G,H);
1244 Heps = _mm_mul_ps(vfeps,H);
1245 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1246 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1247 fvdw12 = _mm_mul_ps(c12_00,FF);
1248 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1252 fscal = _mm_andnot_ps(dummy_mask,fscal);
1254 /* Calculate temporary vectorial force */
1255 tx = _mm_mul_ps(fscal,dx00);
1256 ty = _mm_mul_ps(fscal,dy00);
1257 tz = _mm_mul_ps(fscal,dz00);
1259 /* Update vectorial force */
1260 fix0 = _mm_add_ps(fix0,tx);
1261 fiy0 = _mm_add_ps(fiy0,ty);
1262 fiz0 = _mm_add_ps(fiz0,tz);
1264 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1265 f+j_coord_offsetC,f+j_coord_offsetD,
1268 /**************************
1269 * CALCULATE INTERACTIONS *
1270 **************************/
1272 r10 = _mm_mul_ps(rsq10,rinv10);
1273 r10 = _mm_andnot_ps(dummy_mask,r10);
1275 /* Compute parameters for interactions between i and j atoms */
1276 qq10 = _mm_mul_ps(iq1,jq0);
1278 /* EWALD ELECTROSTATICS */
1280 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1281 ewrt = _mm_mul_ps(r10,ewtabscale);
1282 ewitab = _mm_cvttps_epi32(ewrt);
1283 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1284 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1285 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1287 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1288 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1292 fscal = _mm_andnot_ps(dummy_mask,fscal);
1294 /* Calculate temporary vectorial force */
1295 tx = _mm_mul_ps(fscal,dx10);
1296 ty = _mm_mul_ps(fscal,dy10);
1297 tz = _mm_mul_ps(fscal,dz10);
1299 /* Update vectorial force */
1300 fix1 = _mm_add_ps(fix1,tx);
1301 fiy1 = _mm_add_ps(fiy1,ty);
1302 fiz1 = _mm_add_ps(fiz1,tz);
1304 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1305 f+j_coord_offsetC,f+j_coord_offsetD,
1308 /**************************
1309 * CALCULATE INTERACTIONS *
1310 **************************/
1312 r20 = _mm_mul_ps(rsq20,rinv20);
1313 r20 = _mm_andnot_ps(dummy_mask,r20);
1315 /* Compute parameters for interactions between i and j atoms */
1316 qq20 = _mm_mul_ps(iq2,jq0);
1318 /* EWALD ELECTROSTATICS */
1320 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1321 ewrt = _mm_mul_ps(r20,ewtabscale);
1322 ewitab = _mm_cvttps_epi32(ewrt);
1323 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1324 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1325 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1327 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1328 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1332 fscal = _mm_andnot_ps(dummy_mask,fscal);
1334 /* Calculate temporary vectorial force */
1335 tx = _mm_mul_ps(fscal,dx20);
1336 ty = _mm_mul_ps(fscal,dy20);
1337 tz = _mm_mul_ps(fscal,dz20);
1339 /* Update vectorial force */
1340 fix2 = _mm_add_ps(fix2,tx);
1341 fiy2 = _mm_add_ps(fiy2,ty);
1342 fiz2 = _mm_add_ps(fiz2,tz);
1344 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1345 f+j_coord_offsetC,f+j_coord_offsetD,
1348 /**************************
1349 * CALCULATE INTERACTIONS *
1350 **************************/
1352 r30 = _mm_mul_ps(rsq30,rinv30);
1353 r30 = _mm_andnot_ps(dummy_mask,r30);
1355 /* Compute parameters for interactions between i and j atoms */
1356 qq30 = _mm_mul_ps(iq3,jq0);
1358 /* EWALD ELECTROSTATICS */
1360 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1361 ewrt = _mm_mul_ps(r30,ewtabscale);
1362 ewitab = _mm_cvttps_epi32(ewrt);
1363 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1364 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1365 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1367 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1368 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1372 fscal = _mm_andnot_ps(dummy_mask,fscal);
1374 /* Calculate temporary vectorial force */
1375 tx = _mm_mul_ps(fscal,dx30);
1376 ty = _mm_mul_ps(fscal,dy30);
1377 tz = _mm_mul_ps(fscal,dz30);
1379 /* Update vectorial force */
1380 fix3 = _mm_add_ps(fix3,tx);
1381 fiy3 = _mm_add_ps(fiy3,ty);
1382 fiz3 = _mm_add_ps(fiz3,tz);
1384 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1385 f+j_coord_offsetC,f+j_coord_offsetD,
1388 /* Inner loop uses 160 flops */
1391 /* End of innermost loop */
1393 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1394 f+i_coord_offset,fshift+i_shift_offset);
1396 /* Increment number of inner iterations */
1397 inneriter += j_index_end - j_index_start;
1399 /* Outer loop uses 36 flops */
1402 /* Increment number of outer iterations */
1405 /* Update outer/inner flops */
1407 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*36 + inneriter*160);