2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
99 __m128i ifour = _mm_set1_epi32(4);
100 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
103 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
105 __m128 dummy_mask,cutoff_mask;
106 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
107 __m128 one = _mm_set1_ps(1.0);
108 __m128 two = _mm_set1_ps(2.0);
114 jindex = nlist->jindex;
116 shiftidx = nlist->shift;
118 shiftvec = fr->shift_vec[0];
119 fshift = fr->fshift[0];
120 facel = _mm_set1_ps(fr->epsfac);
121 charge = mdatoms->chargeA;
122 nvdwtype = fr->ntype;
124 vdwtype = mdatoms->typeA;
126 vftab = kernel_data->table_vdw->data;
127 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
129 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
130 ewtab = fr->ic->tabq_coul_FDV0;
131 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
132 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
134 /* Setup water-specific parameters */
135 inr = nlist->iinr[0];
136 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
137 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
138 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
139 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
141 jq0 = _mm_set1_ps(charge[inr+0]);
142 jq1 = _mm_set1_ps(charge[inr+1]);
143 jq2 = _mm_set1_ps(charge[inr+2]);
144 vdwjidx0A = 2*vdwtype[inr+0];
145 qq00 = _mm_mul_ps(iq0,jq0);
146 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
147 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
148 qq01 = _mm_mul_ps(iq0,jq1);
149 qq02 = _mm_mul_ps(iq0,jq2);
150 qq10 = _mm_mul_ps(iq1,jq0);
151 qq11 = _mm_mul_ps(iq1,jq1);
152 qq12 = _mm_mul_ps(iq1,jq2);
153 qq20 = _mm_mul_ps(iq2,jq0);
154 qq21 = _mm_mul_ps(iq2,jq1);
155 qq22 = _mm_mul_ps(iq2,jq2);
157 /* Avoid stupid compiler warnings */
158 jnrA = jnrB = jnrC = jnrD = 0;
167 for(iidx=0;iidx<4*DIM;iidx++)
172 /* Start outer loop over neighborlists */
173 for(iidx=0; iidx<nri; iidx++)
175 /* Load shift vector for this list */
176 i_shift_offset = DIM*shiftidx[iidx];
178 /* Load limits for loop over neighbors */
179 j_index_start = jindex[iidx];
180 j_index_end = jindex[iidx+1];
182 /* Get outer coordinate index */
184 i_coord_offset = DIM*inr;
186 /* Load i particle coords and add shift vector */
187 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
188 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
190 fix0 = _mm_setzero_ps();
191 fiy0 = _mm_setzero_ps();
192 fiz0 = _mm_setzero_ps();
193 fix1 = _mm_setzero_ps();
194 fiy1 = _mm_setzero_ps();
195 fiz1 = _mm_setzero_ps();
196 fix2 = _mm_setzero_ps();
197 fiy2 = _mm_setzero_ps();
198 fiz2 = _mm_setzero_ps();
200 /* Reset potential sums */
201 velecsum = _mm_setzero_ps();
202 vvdwsum = _mm_setzero_ps();
204 /* Start inner kernel loop */
205 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
208 /* Get j neighbor index, and coordinate index */
213 j_coord_offsetA = DIM*jnrA;
214 j_coord_offsetB = DIM*jnrB;
215 j_coord_offsetC = DIM*jnrC;
216 j_coord_offsetD = DIM*jnrD;
218 /* load j atom coordinates */
219 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
220 x+j_coord_offsetC,x+j_coord_offsetD,
221 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
223 /* Calculate displacement vector */
224 dx00 = _mm_sub_ps(ix0,jx0);
225 dy00 = _mm_sub_ps(iy0,jy0);
226 dz00 = _mm_sub_ps(iz0,jz0);
227 dx01 = _mm_sub_ps(ix0,jx1);
228 dy01 = _mm_sub_ps(iy0,jy1);
229 dz01 = _mm_sub_ps(iz0,jz1);
230 dx02 = _mm_sub_ps(ix0,jx2);
231 dy02 = _mm_sub_ps(iy0,jy2);
232 dz02 = _mm_sub_ps(iz0,jz2);
233 dx10 = _mm_sub_ps(ix1,jx0);
234 dy10 = _mm_sub_ps(iy1,jy0);
235 dz10 = _mm_sub_ps(iz1,jz0);
236 dx11 = _mm_sub_ps(ix1,jx1);
237 dy11 = _mm_sub_ps(iy1,jy1);
238 dz11 = _mm_sub_ps(iz1,jz1);
239 dx12 = _mm_sub_ps(ix1,jx2);
240 dy12 = _mm_sub_ps(iy1,jy2);
241 dz12 = _mm_sub_ps(iz1,jz2);
242 dx20 = _mm_sub_ps(ix2,jx0);
243 dy20 = _mm_sub_ps(iy2,jy0);
244 dz20 = _mm_sub_ps(iz2,jz0);
245 dx21 = _mm_sub_ps(ix2,jx1);
246 dy21 = _mm_sub_ps(iy2,jy1);
247 dz21 = _mm_sub_ps(iz2,jz1);
248 dx22 = _mm_sub_ps(ix2,jx2);
249 dy22 = _mm_sub_ps(iy2,jy2);
250 dz22 = _mm_sub_ps(iz2,jz2);
252 /* Calculate squared distance and things based on it */
253 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
254 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
255 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
256 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
257 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
258 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
259 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
260 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
261 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
263 rinv00 = gmx_mm_invsqrt_ps(rsq00);
264 rinv01 = gmx_mm_invsqrt_ps(rsq01);
265 rinv02 = gmx_mm_invsqrt_ps(rsq02);
266 rinv10 = gmx_mm_invsqrt_ps(rsq10);
267 rinv11 = gmx_mm_invsqrt_ps(rsq11);
268 rinv12 = gmx_mm_invsqrt_ps(rsq12);
269 rinv20 = gmx_mm_invsqrt_ps(rsq20);
270 rinv21 = gmx_mm_invsqrt_ps(rsq21);
271 rinv22 = gmx_mm_invsqrt_ps(rsq22);
273 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
274 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
275 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
276 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
277 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
278 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
279 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
280 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
281 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
283 fjx0 = _mm_setzero_ps();
284 fjy0 = _mm_setzero_ps();
285 fjz0 = _mm_setzero_ps();
286 fjx1 = _mm_setzero_ps();
287 fjy1 = _mm_setzero_ps();
288 fjz1 = _mm_setzero_ps();
289 fjx2 = _mm_setzero_ps();
290 fjy2 = _mm_setzero_ps();
291 fjz2 = _mm_setzero_ps();
293 /**************************
294 * CALCULATE INTERACTIONS *
295 **************************/
297 r00 = _mm_mul_ps(rsq00,rinv00);
299 /* Calculate table index by multiplying r with table scale and truncate to integer */
300 rt = _mm_mul_ps(r00,vftabscale);
301 vfitab = _mm_cvttps_epi32(rt);
302 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
303 vfitab = _mm_slli_epi32(vfitab,3);
305 /* EWALD ELECTROSTATICS */
307 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
308 ewrt = _mm_mul_ps(r00,ewtabscale);
309 ewitab = _mm_cvttps_epi32(ewrt);
310 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
311 ewitab = _mm_slli_epi32(ewitab,2);
312 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
313 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
314 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
315 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
316 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
317 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
318 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
319 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
320 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
322 /* CUBIC SPLINE TABLE DISPERSION */
323 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
324 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
325 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
326 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
327 _MM_TRANSPOSE4_PS(Y,F,G,H);
328 Heps = _mm_mul_ps(vfeps,H);
329 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
330 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
331 vvdw6 = _mm_mul_ps(c6_00,VV);
332 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
333 fvdw6 = _mm_mul_ps(c6_00,FF);
335 /* CUBIC SPLINE TABLE REPULSION */
336 vfitab = _mm_add_epi32(vfitab,ifour);
337 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
338 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
339 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
340 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
341 _MM_TRANSPOSE4_PS(Y,F,G,H);
342 Heps = _mm_mul_ps(vfeps,H);
343 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
344 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
345 vvdw12 = _mm_mul_ps(c12_00,VV);
346 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
347 fvdw12 = _mm_mul_ps(c12_00,FF);
348 vvdw = _mm_add_ps(vvdw12,vvdw6);
349 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
351 /* Update potential sum for this i atom from the interaction with this j atom. */
352 velecsum = _mm_add_ps(velecsum,velec);
353 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
355 fscal = _mm_add_ps(felec,fvdw);
357 /* Calculate temporary vectorial force */
358 tx = _mm_mul_ps(fscal,dx00);
359 ty = _mm_mul_ps(fscal,dy00);
360 tz = _mm_mul_ps(fscal,dz00);
362 /* Update vectorial force */
363 fix0 = _mm_add_ps(fix0,tx);
364 fiy0 = _mm_add_ps(fiy0,ty);
365 fiz0 = _mm_add_ps(fiz0,tz);
367 fjx0 = _mm_add_ps(fjx0,tx);
368 fjy0 = _mm_add_ps(fjy0,ty);
369 fjz0 = _mm_add_ps(fjz0,tz);
371 /**************************
372 * CALCULATE INTERACTIONS *
373 **************************/
375 r01 = _mm_mul_ps(rsq01,rinv01);
377 /* EWALD ELECTROSTATICS */
379 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
380 ewrt = _mm_mul_ps(r01,ewtabscale);
381 ewitab = _mm_cvttps_epi32(ewrt);
382 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
383 ewitab = _mm_slli_epi32(ewitab,2);
384 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
385 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
386 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
387 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
388 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
389 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
390 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
391 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
392 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
394 /* Update potential sum for this i atom from the interaction with this j atom. */
395 velecsum = _mm_add_ps(velecsum,velec);
399 /* Calculate temporary vectorial force */
400 tx = _mm_mul_ps(fscal,dx01);
401 ty = _mm_mul_ps(fscal,dy01);
402 tz = _mm_mul_ps(fscal,dz01);
404 /* Update vectorial force */
405 fix0 = _mm_add_ps(fix0,tx);
406 fiy0 = _mm_add_ps(fiy0,ty);
407 fiz0 = _mm_add_ps(fiz0,tz);
409 fjx1 = _mm_add_ps(fjx1,tx);
410 fjy1 = _mm_add_ps(fjy1,ty);
411 fjz1 = _mm_add_ps(fjz1,tz);
413 /**************************
414 * CALCULATE INTERACTIONS *
415 **************************/
417 r02 = _mm_mul_ps(rsq02,rinv02);
419 /* EWALD ELECTROSTATICS */
421 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
422 ewrt = _mm_mul_ps(r02,ewtabscale);
423 ewitab = _mm_cvttps_epi32(ewrt);
424 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
425 ewitab = _mm_slli_epi32(ewitab,2);
426 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
427 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
428 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
429 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
430 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
431 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
432 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
433 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
434 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
436 /* Update potential sum for this i atom from the interaction with this j atom. */
437 velecsum = _mm_add_ps(velecsum,velec);
441 /* Calculate temporary vectorial force */
442 tx = _mm_mul_ps(fscal,dx02);
443 ty = _mm_mul_ps(fscal,dy02);
444 tz = _mm_mul_ps(fscal,dz02);
446 /* Update vectorial force */
447 fix0 = _mm_add_ps(fix0,tx);
448 fiy0 = _mm_add_ps(fiy0,ty);
449 fiz0 = _mm_add_ps(fiz0,tz);
451 fjx2 = _mm_add_ps(fjx2,tx);
452 fjy2 = _mm_add_ps(fjy2,ty);
453 fjz2 = _mm_add_ps(fjz2,tz);
455 /**************************
456 * CALCULATE INTERACTIONS *
457 **************************/
459 r10 = _mm_mul_ps(rsq10,rinv10);
461 /* EWALD ELECTROSTATICS */
463 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
464 ewrt = _mm_mul_ps(r10,ewtabscale);
465 ewitab = _mm_cvttps_epi32(ewrt);
466 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
467 ewitab = _mm_slli_epi32(ewitab,2);
468 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
469 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
470 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
471 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
472 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
473 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
474 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
475 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
476 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
478 /* Update potential sum for this i atom from the interaction with this j atom. */
479 velecsum = _mm_add_ps(velecsum,velec);
483 /* Calculate temporary vectorial force */
484 tx = _mm_mul_ps(fscal,dx10);
485 ty = _mm_mul_ps(fscal,dy10);
486 tz = _mm_mul_ps(fscal,dz10);
488 /* Update vectorial force */
489 fix1 = _mm_add_ps(fix1,tx);
490 fiy1 = _mm_add_ps(fiy1,ty);
491 fiz1 = _mm_add_ps(fiz1,tz);
493 fjx0 = _mm_add_ps(fjx0,tx);
494 fjy0 = _mm_add_ps(fjy0,ty);
495 fjz0 = _mm_add_ps(fjz0,tz);
497 /**************************
498 * CALCULATE INTERACTIONS *
499 **************************/
501 r11 = _mm_mul_ps(rsq11,rinv11);
503 /* EWALD ELECTROSTATICS */
505 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
506 ewrt = _mm_mul_ps(r11,ewtabscale);
507 ewitab = _mm_cvttps_epi32(ewrt);
508 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
509 ewitab = _mm_slli_epi32(ewitab,2);
510 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
511 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
512 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
513 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
514 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
515 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
516 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
517 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
518 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
520 /* Update potential sum for this i atom from the interaction with this j atom. */
521 velecsum = _mm_add_ps(velecsum,velec);
525 /* Calculate temporary vectorial force */
526 tx = _mm_mul_ps(fscal,dx11);
527 ty = _mm_mul_ps(fscal,dy11);
528 tz = _mm_mul_ps(fscal,dz11);
530 /* Update vectorial force */
531 fix1 = _mm_add_ps(fix1,tx);
532 fiy1 = _mm_add_ps(fiy1,ty);
533 fiz1 = _mm_add_ps(fiz1,tz);
535 fjx1 = _mm_add_ps(fjx1,tx);
536 fjy1 = _mm_add_ps(fjy1,ty);
537 fjz1 = _mm_add_ps(fjz1,tz);
539 /**************************
540 * CALCULATE INTERACTIONS *
541 **************************/
543 r12 = _mm_mul_ps(rsq12,rinv12);
545 /* EWALD ELECTROSTATICS */
547 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
548 ewrt = _mm_mul_ps(r12,ewtabscale);
549 ewitab = _mm_cvttps_epi32(ewrt);
550 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
551 ewitab = _mm_slli_epi32(ewitab,2);
552 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
553 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
554 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
555 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
556 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
557 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
558 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
559 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
560 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
562 /* Update potential sum for this i atom from the interaction with this j atom. */
563 velecsum = _mm_add_ps(velecsum,velec);
567 /* Calculate temporary vectorial force */
568 tx = _mm_mul_ps(fscal,dx12);
569 ty = _mm_mul_ps(fscal,dy12);
570 tz = _mm_mul_ps(fscal,dz12);
572 /* Update vectorial force */
573 fix1 = _mm_add_ps(fix1,tx);
574 fiy1 = _mm_add_ps(fiy1,ty);
575 fiz1 = _mm_add_ps(fiz1,tz);
577 fjx2 = _mm_add_ps(fjx2,tx);
578 fjy2 = _mm_add_ps(fjy2,ty);
579 fjz2 = _mm_add_ps(fjz2,tz);
581 /**************************
582 * CALCULATE INTERACTIONS *
583 **************************/
585 r20 = _mm_mul_ps(rsq20,rinv20);
587 /* EWALD ELECTROSTATICS */
589 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
590 ewrt = _mm_mul_ps(r20,ewtabscale);
591 ewitab = _mm_cvttps_epi32(ewrt);
592 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
593 ewitab = _mm_slli_epi32(ewitab,2);
594 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
595 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
596 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
597 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
598 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
599 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
600 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
601 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
602 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
604 /* Update potential sum for this i atom from the interaction with this j atom. */
605 velecsum = _mm_add_ps(velecsum,velec);
609 /* Calculate temporary vectorial force */
610 tx = _mm_mul_ps(fscal,dx20);
611 ty = _mm_mul_ps(fscal,dy20);
612 tz = _mm_mul_ps(fscal,dz20);
614 /* Update vectorial force */
615 fix2 = _mm_add_ps(fix2,tx);
616 fiy2 = _mm_add_ps(fiy2,ty);
617 fiz2 = _mm_add_ps(fiz2,tz);
619 fjx0 = _mm_add_ps(fjx0,tx);
620 fjy0 = _mm_add_ps(fjy0,ty);
621 fjz0 = _mm_add_ps(fjz0,tz);
623 /**************************
624 * CALCULATE INTERACTIONS *
625 **************************/
627 r21 = _mm_mul_ps(rsq21,rinv21);
629 /* EWALD ELECTROSTATICS */
631 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
632 ewrt = _mm_mul_ps(r21,ewtabscale);
633 ewitab = _mm_cvttps_epi32(ewrt);
634 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
635 ewitab = _mm_slli_epi32(ewitab,2);
636 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
637 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
638 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
639 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
640 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
641 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
642 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
643 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
644 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
646 /* Update potential sum for this i atom from the interaction with this j atom. */
647 velecsum = _mm_add_ps(velecsum,velec);
651 /* Calculate temporary vectorial force */
652 tx = _mm_mul_ps(fscal,dx21);
653 ty = _mm_mul_ps(fscal,dy21);
654 tz = _mm_mul_ps(fscal,dz21);
656 /* Update vectorial force */
657 fix2 = _mm_add_ps(fix2,tx);
658 fiy2 = _mm_add_ps(fiy2,ty);
659 fiz2 = _mm_add_ps(fiz2,tz);
661 fjx1 = _mm_add_ps(fjx1,tx);
662 fjy1 = _mm_add_ps(fjy1,ty);
663 fjz1 = _mm_add_ps(fjz1,tz);
665 /**************************
666 * CALCULATE INTERACTIONS *
667 **************************/
669 r22 = _mm_mul_ps(rsq22,rinv22);
671 /* EWALD ELECTROSTATICS */
673 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
674 ewrt = _mm_mul_ps(r22,ewtabscale);
675 ewitab = _mm_cvttps_epi32(ewrt);
676 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
677 ewitab = _mm_slli_epi32(ewitab,2);
678 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
679 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
680 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
681 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
682 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
683 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
684 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
685 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
686 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
688 /* Update potential sum for this i atom from the interaction with this j atom. */
689 velecsum = _mm_add_ps(velecsum,velec);
693 /* Calculate temporary vectorial force */
694 tx = _mm_mul_ps(fscal,dx22);
695 ty = _mm_mul_ps(fscal,dy22);
696 tz = _mm_mul_ps(fscal,dz22);
698 /* Update vectorial force */
699 fix2 = _mm_add_ps(fix2,tx);
700 fiy2 = _mm_add_ps(fiy2,ty);
701 fiz2 = _mm_add_ps(fiz2,tz);
703 fjx2 = _mm_add_ps(fjx2,tx);
704 fjy2 = _mm_add_ps(fjy2,ty);
705 fjz2 = _mm_add_ps(fjz2,tz);
707 fjptrA = f+j_coord_offsetA;
708 fjptrB = f+j_coord_offsetB;
709 fjptrC = f+j_coord_offsetC;
710 fjptrD = f+j_coord_offsetD;
712 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
713 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
715 /* Inner loop uses 403 flops */
721 /* Get j neighbor index, and coordinate index */
722 jnrlistA = jjnr[jidx];
723 jnrlistB = jjnr[jidx+1];
724 jnrlistC = jjnr[jidx+2];
725 jnrlistD = jjnr[jidx+3];
726 /* Sign of each element will be negative for non-real atoms.
727 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
728 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
730 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
731 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
732 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
733 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
734 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
735 j_coord_offsetA = DIM*jnrA;
736 j_coord_offsetB = DIM*jnrB;
737 j_coord_offsetC = DIM*jnrC;
738 j_coord_offsetD = DIM*jnrD;
740 /* load j atom coordinates */
741 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
742 x+j_coord_offsetC,x+j_coord_offsetD,
743 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
745 /* Calculate displacement vector */
746 dx00 = _mm_sub_ps(ix0,jx0);
747 dy00 = _mm_sub_ps(iy0,jy0);
748 dz00 = _mm_sub_ps(iz0,jz0);
749 dx01 = _mm_sub_ps(ix0,jx1);
750 dy01 = _mm_sub_ps(iy0,jy1);
751 dz01 = _mm_sub_ps(iz0,jz1);
752 dx02 = _mm_sub_ps(ix0,jx2);
753 dy02 = _mm_sub_ps(iy0,jy2);
754 dz02 = _mm_sub_ps(iz0,jz2);
755 dx10 = _mm_sub_ps(ix1,jx0);
756 dy10 = _mm_sub_ps(iy1,jy0);
757 dz10 = _mm_sub_ps(iz1,jz0);
758 dx11 = _mm_sub_ps(ix1,jx1);
759 dy11 = _mm_sub_ps(iy1,jy1);
760 dz11 = _mm_sub_ps(iz1,jz1);
761 dx12 = _mm_sub_ps(ix1,jx2);
762 dy12 = _mm_sub_ps(iy1,jy2);
763 dz12 = _mm_sub_ps(iz1,jz2);
764 dx20 = _mm_sub_ps(ix2,jx0);
765 dy20 = _mm_sub_ps(iy2,jy0);
766 dz20 = _mm_sub_ps(iz2,jz0);
767 dx21 = _mm_sub_ps(ix2,jx1);
768 dy21 = _mm_sub_ps(iy2,jy1);
769 dz21 = _mm_sub_ps(iz2,jz1);
770 dx22 = _mm_sub_ps(ix2,jx2);
771 dy22 = _mm_sub_ps(iy2,jy2);
772 dz22 = _mm_sub_ps(iz2,jz2);
774 /* Calculate squared distance and things based on it */
775 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
776 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
777 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
778 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
779 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
780 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
781 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
782 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
783 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
785 rinv00 = gmx_mm_invsqrt_ps(rsq00);
786 rinv01 = gmx_mm_invsqrt_ps(rsq01);
787 rinv02 = gmx_mm_invsqrt_ps(rsq02);
788 rinv10 = gmx_mm_invsqrt_ps(rsq10);
789 rinv11 = gmx_mm_invsqrt_ps(rsq11);
790 rinv12 = gmx_mm_invsqrt_ps(rsq12);
791 rinv20 = gmx_mm_invsqrt_ps(rsq20);
792 rinv21 = gmx_mm_invsqrt_ps(rsq21);
793 rinv22 = gmx_mm_invsqrt_ps(rsq22);
795 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
796 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
797 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
798 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
799 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
800 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
801 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
802 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
803 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
805 fjx0 = _mm_setzero_ps();
806 fjy0 = _mm_setzero_ps();
807 fjz0 = _mm_setzero_ps();
808 fjx1 = _mm_setzero_ps();
809 fjy1 = _mm_setzero_ps();
810 fjz1 = _mm_setzero_ps();
811 fjx2 = _mm_setzero_ps();
812 fjy2 = _mm_setzero_ps();
813 fjz2 = _mm_setzero_ps();
815 /**************************
816 * CALCULATE INTERACTIONS *
817 **************************/
819 r00 = _mm_mul_ps(rsq00,rinv00);
820 r00 = _mm_andnot_ps(dummy_mask,r00);
822 /* Calculate table index by multiplying r with table scale and truncate to integer */
823 rt = _mm_mul_ps(r00,vftabscale);
824 vfitab = _mm_cvttps_epi32(rt);
825 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
826 vfitab = _mm_slli_epi32(vfitab,3);
828 /* EWALD ELECTROSTATICS */
830 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
831 ewrt = _mm_mul_ps(r00,ewtabscale);
832 ewitab = _mm_cvttps_epi32(ewrt);
833 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
834 ewitab = _mm_slli_epi32(ewitab,2);
835 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
836 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
837 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
838 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
839 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
840 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
841 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
842 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
843 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
845 /* CUBIC SPLINE TABLE DISPERSION */
846 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
847 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
848 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
849 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
850 _MM_TRANSPOSE4_PS(Y,F,G,H);
851 Heps = _mm_mul_ps(vfeps,H);
852 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
853 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
854 vvdw6 = _mm_mul_ps(c6_00,VV);
855 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
856 fvdw6 = _mm_mul_ps(c6_00,FF);
858 /* CUBIC SPLINE TABLE REPULSION */
859 vfitab = _mm_add_epi32(vfitab,ifour);
860 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
861 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
862 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
863 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
864 _MM_TRANSPOSE4_PS(Y,F,G,H);
865 Heps = _mm_mul_ps(vfeps,H);
866 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
867 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
868 vvdw12 = _mm_mul_ps(c12_00,VV);
869 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
870 fvdw12 = _mm_mul_ps(c12_00,FF);
871 vvdw = _mm_add_ps(vvdw12,vvdw6);
872 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
874 /* Update potential sum for this i atom from the interaction with this j atom. */
875 velec = _mm_andnot_ps(dummy_mask,velec);
876 velecsum = _mm_add_ps(velecsum,velec);
877 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
878 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
880 fscal = _mm_add_ps(felec,fvdw);
882 fscal = _mm_andnot_ps(dummy_mask,fscal);
884 /* Calculate temporary vectorial force */
885 tx = _mm_mul_ps(fscal,dx00);
886 ty = _mm_mul_ps(fscal,dy00);
887 tz = _mm_mul_ps(fscal,dz00);
889 /* Update vectorial force */
890 fix0 = _mm_add_ps(fix0,tx);
891 fiy0 = _mm_add_ps(fiy0,ty);
892 fiz0 = _mm_add_ps(fiz0,tz);
894 fjx0 = _mm_add_ps(fjx0,tx);
895 fjy0 = _mm_add_ps(fjy0,ty);
896 fjz0 = _mm_add_ps(fjz0,tz);
898 /**************************
899 * CALCULATE INTERACTIONS *
900 **************************/
902 r01 = _mm_mul_ps(rsq01,rinv01);
903 r01 = _mm_andnot_ps(dummy_mask,r01);
905 /* EWALD ELECTROSTATICS */
907 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
908 ewrt = _mm_mul_ps(r01,ewtabscale);
909 ewitab = _mm_cvttps_epi32(ewrt);
910 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
911 ewitab = _mm_slli_epi32(ewitab,2);
912 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
913 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
914 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
915 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
916 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
917 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
918 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
919 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
920 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
922 /* Update potential sum for this i atom from the interaction with this j atom. */
923 velec = _mm_andnot_ps(dummy_mask,velec);
924 velecsum = _mm_add_ps(velecsum,velec);
928 fscal = _mm_andnot_ps(dummy_mask,fscal);
930 /* Calculate temporary vectorial force */
931 tx = _mm_mul_ps(fscal,dx01);
932 ty = _mm_mul_ps(fscal,dy01);
933 tz = _mm_mul_ps(fscal,dz01);
935 /* Update vectorial force */
936 fix0 = _mm_add_ps(fix0,tx);
937 fiy0 = _mm_add_ps(fiy0,ty);
938 fiz0 = _mm_add_ps(fiz0,tz);
940 fjx1 = _mm_add_ps(fjx1,tx);
941 fjy1 = _mm_add_ps(fjy1,ty);
942 fjz1 = _mm_add_ps(fjz1,tz);
944 /**************************
945 * CALCULATE INTERACTIONS *
946 **************************/
948 r02 = _mm_mul_ps(rsq02,rinv02);
949 r02 = _mm_andnot_ps(dummy_mask,r02);
951 /* EWALD ELECTROSTATICS */
953 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
954 ewrt = _mm_mul_ps(r02,ewtabscale);
955 ewitab = _mm_cvttps_epi32(ewrt);
956 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
957 ewitab = _mm_slli_epi32(ewitab,2);
958 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
959 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
960 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
961 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
962 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
963 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
964 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
965 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
966 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
968 /* Update potential sum for this i atom from the interaction with this j atom. */
969 velec = _mm_andnot_ps(dummy_mask,velec);
970 velecsum = _mm_add_ps(velecsum,velec);
974 fscal = _mm_andnot_ps(dummy_mask,fscal);
976 /* Calculate temporary vectorial force */
977 tx = _mm_mul_ps(fscal,dx02);
978 ty = _mm_mul_ps(fscal,dy02);
979 tz = _mm_mul_ps(fscal,dz02);
981 /* Update vectorial force */
982 fix0 = _mm_add_ps(fix0,tx);
983 fiy0 = _mm_add_ps(fiy0,ty);
984 fiz0 = _mm_add_ps(fiz0,tz);
986 fjx2 = _mm_add_ps(fjx2,tx);
987 fjy2 = _mm_add_ps(fjy2,ty);
988 fjz2 = _mm_add_ps(fjz2,tz);
990 /**************************
991 * CALCULATE INTERACTIONS *
992 **************************/
994 r10 = _mm_mul_ps(rsq10,rinv10);
995 r10 = _mm_andnot_ps(dummy_mask,r10);
997 /* EWALD ELECTROSTATICS */
999 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1000 ewrt = _mm_mul_ps(r10,ewtabscale);
1001 ewitab = _mm_cvttps_epi32(ewrt);
1002 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1003 ewitab = _mm_slli_epi32(ewitab,2);
1004 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1005 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1006 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1007 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1008 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1009 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1010 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1011 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1012 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1014 /* Update potential sum for this i atom from the interaction with this j atom. */
1015 velec = _mm_andnot_ps(dummy_mask,velec);
1016 velecsum = _mm_add_ps(velecsum,velec);
1020 fscal = _mm_andnot_ps(dummy_mask,fscal);
1022 /* Calculate temporary vectorial force */
1023 tx = _mm_mul_ps(fscal,dx10);
1024 ty = _mm_mul_ps(fscal,dy10);
1025 tz = _mm_mul_ps(fscal,dz10);
1027 /* Update vectorial force */
1028 fix1 = _mm_add_ps(fix1,tx);
1029 fiy1 = _mm_add_ps(fiy1,ty);
1030 fiz1 = _mm_add_ps(fiz1,tz);
1032 fjx0 = _mm_add_ps(fjx0,tx);
1033 fjy0 = _mm_add_ps(fjy0,ty);
1034 fjz0 = _mm_add_ps(fjz0,tz);
1036 /**************************
1037 * CALCULATE INTERACTIONS *
1038 **************************/
1040 r11 = _mm_mul_ps(rsq11,rinv11);
1041 r11 = _mm_andnot_ps(dummy_mask,r11);
1043 /* EWALD ELECTROSTATICS */
1045 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1046 ewrt = _mm_mul_ps(r11,ewtabscale);
1047 ewitab = _mm_cvttps_epi32(ewrt);
1048 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1049 ewitab = _mm_slli_epi32(ewitab,2);
1050 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1051 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1052 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1053 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1054 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1055 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1056 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1057 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1058 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1060 /* Update potential sum for this i atom from the interaction with this j atom. */
1061 velec = _mm_andnot_ps(dummy_mask,velec);
1062 velecsum = _mm_add_ps(velecsum,velec);
1066 fscal = _mm_andnot_ps(dummy_mask,fscal);
1068 /* Calculate temporary vectorial force */
1069 tx = _mm_mul_ps(fscal,dx11);
1070 ty = _mm_mul_ps(fscal,dy11);
1071 tz = _mm_mul_ps(fscal,dz11);
1073 /* Update vectorial force */
1074 fix1 = _mm_add_ps(fix1,tx);
1075 fiy1 = _mm_add_ps(fiy1,ty);
1076 fiz1 = _mm_add_ps(fiz1,tz);
1078 fjx1 = _mm_add_ps(fjx1,tx);
1079 fjy1 = _mm_add_ps(fjy1,ty);
1080 fjz1 = _mm_add_ps(fjz1,tz);
1082 /**************************
1083 * CALCULATE INTERACTIONS *
1084 **************************/
1086 r12 = _mm_mul_ps(rsq12,rinv12);
1087 r12 = _mm_andnot_ps(dummy_mask,r12);
1089 /* EWALD ELECTROSTATICS */
1091 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1092 ewrt = _mm_mul_ps(r12,ewtabscale);
1093 ewitab = _mm_cvttps_epi32(ewrt);
1094 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1095 ewitab = _mm_slli_epi32(ewitab,2);
1096 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1097 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1098 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1099 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1100 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1101 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1102 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1103 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1104 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1106 /* Update potential sum for this i atom from the interaction with this j atom. */
1107 velec = _mm_andnot_ps(dummy_mask,velec);
1108 velecsum = _mm_add_ps(velecsum,velec);
1112 fscal = _mm_andnot_ps(dummy_mask,fscal);
1114 /* Calculate temporary vectorial force */
1115 tx = _mm_mul_ps(fscal,dx12);
1116 ty = _mm_mul_ps(fscal,dy12);
1117 tz = _mm_mul_ps(fscal,dz12);
1119 /* Update vectorial force */
1120 fix1 = _mm_add_ps(fix1,tx);
1121 fiy1 = _mm_add_ps(fiy1,ty);
1122 fiz1 = _mm_add_ps(fiz1,tz);
1124 fjx2 = _mm_add_ps(fjx2,tx);
1125 fjy2 = _mm_add_ps(fjy2,ty);
1126 fjz2 = _mm_add_ps(fjz2,tz);
1128 /**************************
1129 * CALCULATE INTERACTIONS *
1130 **************************/
1132 r20 = _mm_mul_ps(rsq20,rinv20);
1133 r20 = _mm_andnot_ps(dummy_mask,r20);
1135 /* EWALD ELECTROSTATICS */
1137 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1138 ewrt = _mm_mul_ps(r20,ewtabscale);
1139 ewitab = _mm_cvttps_epi32(ewrt);
1140 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1141 ewitab = _mm_slli_epi32(ewitab,2);
1142 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1143 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1144 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1145 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1146 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1147 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1148 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1149 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1150 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1152 /* Update potential sum for this i atom from the interaction with this j atom. */
1153 velec = _mm_andnot_ps(dummy_mask,velec);
1154 velecsum = _mm_add_ps(velecsum,velec);
1158 fscal = _mm_andnot_ps(dummy_mask,fscal);
1160 /* Calculate temporary vectorial force */
1161 tx = _mm_mul_ps(fscal,dx20);
1162 ty = _mm_mul_ps(fscal,dy20);
1163 tz = _mm_mul_ps(fscal,dz20);
1165 /* Update vectorial force */
1166 fix2 = _mm_add_ps(fix2,tx);
1167 fiy2 = _mm_add_ps(fiy2,ty);
1168 fiz2 = _mm_add_ps(fiz2,tz);
1170 fjx0 = _mm_add_ps(fjx0,tx);
1171 fjy0 = _mm_add_ps(fjy0,ty);
1172 fjz0 = _mm_add_ps(fjz0,tz);
1174 /**************************
1175 * CALCULATE INTERACTIONS *
1176 **************************/
1178 r21 = _mm_mul_ps(rsq21,rinv21);
1179 r21 = _mm_andnot_ps(dummy_mask,r21);
1181 /* EWALD ELECTROSTATICS */
1183 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1184 ewrt = _mm_mul_ps(r21,ewtabscale);
1185 ewitab = _mm_cvttps_epi32(ewrt);
1186 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1187 ewitab = _mm_slli_epi32(ewitab,2);
1188 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1189 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1190 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1191 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1192 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1193 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1194 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1195 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1196 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1198 /* Update potential sum for this i atom from the interaction with this j atom. */
1199 velec = _mm_andnot_ps(dummy_mask,velec);
1200 velecsum = _mm_add_ps(velecsum,velec);
1204 fscal = _mm_andnot_ps(dummy_mask,fscal);
1206 /* Calculate temporary vectorial force */
1207 tx = _mm_mul_ps(fscal,dx21);
1208 ty = _mm_mul_ps(fscal,dy21);
1209 tz = _mm_mul_ps(fscal,dz21);
1211 /* Update vectorial force */
1212 fix2 = _mm_add_ps(fix2,tx);
1213 fiy2 = _mm_add_ps(fiy2,ty);
1214 fiz2 = _mm_add_ps(fiz2,tz);
1216 fjx1 = _mm_add_ps(fjx1,tx);
1217 fjy1 = _mm_add_ps(fjy1,ty);
1218 fjz1 = _mm_add_ps(fjz1,tz);
1220 /**************************
1221 * CALCULATE INTERACTIONS *
1222 **************************/
1224 r22 = _mm_mul_ps(rsq22,rinv22);
1225 r22 = _mm_andnot_ps(dummy_mask,r22);
1227 /* EWALD ELECTROSTATICS */
1229 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1230 ewrt = _mm_mul_ps(r22,ewtabscale);
1231 ewitab = _mm_cvttps_epi32(ewrt);
1232 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1233 ewitab = _mm_slli_epi32(ewitab,2);
1234 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1235 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1236 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1237 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1238 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1239 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1240 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1241 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1242 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1244 /* Update potential sum for this i atom from the interaction with this j atom. */
1245 velec = _mm_andnot_ps(dummy_mask,velec);
1246 velecsum = _mm_add_ps(velecsum,velec);
1250 fscal = _mm_andnot_ps(dummy_mask,fscal);
1252 /* Calculate temporary vectorial force */
1253 tx = _mm_mul_ps(fscal,dx22);
1254 ty = _mm_mul_ps(fscal,dy22);
1255 tz = _mm_mul_ps(fscal,dz22);
1257 /* Update vectorial force */
1258 fix2 = _mm_add_ps(fix2,tx);
1259 fiy2 = _mm_add_ps(fiy2,ty);
1260 fiz2 = _mm_add_ps(fiz2,tz);
1262 fjx2 = _mm_add_ps(fjx2,tx);
1263 fjy2 = _mm_add_ps(fjy2,ty);
1264 fjz2 = _mm_add_ps(fjz2,tz);
1266 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1267 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1268 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1269 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1271 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1272 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1274 /* Inner loop uses 412 flops */
1277 /* End of innermost loop */
1279 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1280 f+i_coord_offset,fshift+i_shift_offset);
1283 /* Update potential energies */
1284 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1285 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1287 /* Increment number of inner iterations */
1288 inneriter += j_index_end - j_index_start;
1290 /* Outer loop uses 20 flops */
1293 /* Increment number of outer iterations */
1296 /* Update outer/inner flops */
1298 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*412);
1301 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single
1302 * Electrostatics interaction: Ewald
1303 * VdW interaction: CubicSplineTable
1304 * Geometry: Water3-Water3
1305 * Calculate force/pot: Force
1308 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single
1309 (t_nblist * gmx_restrict nlist,
1310 rvec * gmx_restrict xx,
1311 rvec * gmx_restrict ff,
1312 t_forcerec * gmx_restrict fr,
1313 t_mdatoms * gmx_restrict mdatoms,
1314 nb_kernel_data_t * gmx_restrict kernel_data,
1315 t_nrnb * gmx_restrict nrnb)
1317 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1318 * just 0 for non-waters.
1319 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1320 * jnr indices corresponding to data put in the four positions in the SIMD register.
1322 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1323 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1324 int jnrA,jnrB,jnrC,jnrD;
1325 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1326 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1327 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1328 real rcutoff_scalar;
1329 real *shiftvec,*fshift,*x,*f;
1330 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1331 real scratch[4*DIM];
1332 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1334 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1336 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1338 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1339 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1340 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1341 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1342 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1343 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1344 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1345 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1346 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1347 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1348 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1349 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1350 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1351 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1352 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1353 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1354 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1357 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1360 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1361 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1363 __m128i ifour = _mm_set1_epi32(4);
1364 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1367 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1369 __m128 dummy_mask,cutoff_mask;
1370 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1371 __m128 one = _mm_set1_ps(1.0);
1372 __m128 two = _mm_set1_ps(2.0);
1378 jindex = nlist->jindex;
1380 shiftidx = nlist->shift;
1382 shiftvec = fr->shift_vec[0];
1383 fshift = fr->fshift[0];
1384 facel = _mm_set1_ps(fr->epsfac);
1385 charge = mdatoms->chargeA;
1386 nvdwtype = fr->ntype;
1387 vdwparam = fr->nbfp;
1388 vdwtype = mdatoms->typeA;
1390 vftab = kernel_data->table_vdw->data;
1391 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
1393 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1394 ewtab = fr->ic->tabq_coul_F;
1395 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1396 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1398 /* Setup water-specific parameters */
1399 inr = nlist->iinr[0];
1400 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1401 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1402 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1403 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1405 jq0 = _mm_set1_ps(charge[inr+0]);
1406 jq1 = _mm_set1_ps(charge[inr+1]);
1407 jq2 = _mm_set1_ps(charge[inr+2]);
1408 vdwjidx0A = 2*vdwtype[inr+0];
1409 qq00 = _mm_mul_ps(iq0,jq0);
1410 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1411 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1412 qq01 = _mm_mul_ps(iq0,jq1);
1413 qq02 = _mm_mul_ps(iq0,jq2);
1414 qq10 = _mm_mul_ps(iq1,jq0);
1415 qq11 = _mm_mul_ps(iq1,jq1);
1416 qq12 = _mm_mul_ps(iq1,jq2);
1417 qq20 = _mm_mul_ps(iq2,jq0);
1418 qq21 = _mm_mul_ps(iq2,jq1);
1419 qq22 = _mm_mul_ps(iq2,jq2);
1421 /* Avoid stupid compiler warnings */
1422 jnrA = jnrB = jnrC = jnrD = 0;
1423 j_coord_offsetA = 0;
1424 j_coord_offsetB = 0;
1425 j_coord_offsetC = 0;
1426 j_coord_offsetD = 0;
1431 for(iidx=0;iidx<4*DIM;iidx++)
1433 scratch[iidx] = 0.0;
1436 /* Start outer loop over neighborlists */
1437 for(iidx=0; iidx<nri; iidx++)
1439 /* Load shift vector for this list */
1440 i_shift_offset = DIM*shiftidx[iidx];
1442 /* Load limits for loop over neighbors */
1443 j_index_start = jindex[iidx];
1444 j_index_end = jindex[iidx+1];
1446 /* Get outer coordinate index */
1448 i_coord_offset = DIM*inr;
1450 /* Load i particle coords and add shift vector */
1451 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1452 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1454 fix0 = _mm_setzero_ps();
1455 fiy0 = _mm_setzero_ps();
1456 fiz0 = _mm_setzero_ps();
1457 fix1 = _mm_setzero_ps();
1458 fiy1 = _mm_setzero_ps();
1459 fiz1 = _mm_setzero_ps();
1460 fix2 = _mm_setzero_ps();
1461 fiy2 = _mm_setzero_ps();
1462 fiz2 = _mm_setzero_ps();
1464 /* Start inner kernel loop */
1465 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1468 /* Get j neighbor index, and coordinate index */
1470 jnrB = jjnr[jidx+1];
1471 jnrC = jjnr[jidx+2];
1472 jnrD = jjnr[jidx+3];
1473 j_coord_offsetA = DIM*jnrA;
1474 j_coord_offsetB = DIM*jnrB;
1475 j_coord_offsetC = DIM*jnrC;
1476 j_coord_offsetD = DIM*jnrD;
1478 /* load j atom coordinates */
1479 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1480 x+j_coord_offsetC,x+j_coord_offsetD,
1481 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1483 /* Calculate displacement vector */
1484 dx00 = _mm_sub_ps(ix0,jx0);
1485 dy00 = _mm_sub_ps(iy0,jy0);
1486 dz00 = _mm_sub_ps(iz0,jz0);
1487 dx01 = _mm_sub_ps(ix0,jx1);
1488 dy01 = _mm_sub_ps(iy0,jy1);
1489 dz01 = _mm_sub_ps(iz0,jz1);
1490 dx02 = _mm_sub_ps(ix0,jx2);
1491 dy02 = _mm_sub_ps(iy0,jy2);
1492 dz02 = _mm_sub_ps(iz0,jz2);
1493 dx10 = _mm_sub_ps(ix1,jx0);
1494 dy10 = _mm_sub_ps(iy1,jy0);
1495 dz10 = _mm_sub_ps(iz1,jz0);
1496 dx11 = _mm_sub_ps(ix1,jx1);
1497 dy11 = _mm_sub_ps(iy1,jy1);
1498 dz11 = _mm_sub_ps(iz1,jz1);
1499 dx12 = _mm_sub_ps(ix1,jx2);
1500 dy12 = _mm_sub_ps(iy1,jy2);
1501 dz12 = _mm_sub_ps(iz1,jz2);
1502 dx20 = _mm_sub_ps(ix2,jx0);
1503 dy20 = _mm_sub_ps(iy2,jy0);
1504 dz20 = _mm_sub_ps(iz2,jz0);
1505 dx21 = _mm_sub_ps(ix2,jx1);
1506 dy21 = _mm_sub_ps(iy2,jy1);
1507 dz21 = _mm_sub_ps(iz2,jz1);
1508 dx22 = _mm_sub_ps(ix2,jx2);
1509 dy22 = _mm_sub_ps(iy2,jy2);
1510 dz22 = _mm_sub_ps(iz2,jz2);
1512 /* Calculate squared distance and things based on it */
1513 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1514 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1515 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1516 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1517 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1518 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1519 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1520 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1521 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1523 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1524 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1525 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1526 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1527 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1528 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1529 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1530 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1531 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1533 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1534 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1535 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1536 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1537 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1538 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1539 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1540 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1541 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1543 fjx0 = _mm_setzero_ps();
1544 fjy0 = _mm_setzero_ps();
1545 fjz0 = _mm_setzero_ps();
1546 fjx1 = _mm_setzero_ps();
1547 fjy1 = _mm_setzero_ps();
1548 fjz1 = _mm_setzero_ps();
1549 fjx2 = _mm_setzero_ps();
1550 fjy2 = _mm_setzero_ps();
1551 fjz2 = _mm_setzero_ps();
1553 /**************************
1554 * CALCULATE INTERACTIONS *
1555 **************************/
1557 r00 = _mm_mul_ps(rsq00,rinv00);
1559 /* Calculate table index by multiplying r with table scale and truncate to integer */
1560 rt = _mm_mul_ps(r00,vftabscale);
1561 vfitab = _mm_cvttps_epi32(rt);
1562 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1563 vfitab = _mm_slli_epi32(vfitab,3);
1565 /* EWALD ELECTROSTATICS */
1567 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1568 ewrt = _mm_mul_ps(r00,ewtabscale);
1569 ewitab = _mm_cvttps_epi32(ewrt);
1570 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1571 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1572 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1574 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1575 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1577 /* CUBIC SPLINE TABLE DISPERSION */
1578 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1579 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1580 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1581 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1582 _MM_TRANSPOSE4_PS(Y,F,G,H);
1583 Heps = _mm_mul_ps(vfeps,H);
1584 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1585 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1586 fvdw6 = _mm_mul_ps(c6_00,FF);
1588 /* CUBIC SPLINE TABLE REPULSION */
1589 vfitab = _mm_add_epi32(vfitab,ifour);
1590 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1591 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1592 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1593 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1594 _MM_TRANSPOSE4_PS(Y,F,G,H);
1595 Heps = _mm_mul_ps(vfeps,H);
1596 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1597 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1598 fvdw12 = _mm_mul_ps(c12_00,FF);
1599 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1601 fscal = _mm_add_ps(felec,fvdw);
1603 /* Calculate temporary vectorial force */
1604 tx = _mm_mul_ps(fscal,dx00);
1605 ty = _mm_mul_ps(fscal,dy00);
1606 tz = _mm_mul_ps(fscal,dz00);
1608 /* Update vectorial force */
1609 fix0 = _mm_add_ps(fix0,tx);
1610 fiy0 = _mm_add_ps(fiy0,ty);
1611 fiz0 = _mm_add_ps(fiz0,tz);
1613 fjx0 = _mm_add_ps(fjx0,tx);
1614 fjy0 = _mm_add_ps(fjy0,ty);
1615 fjz0 = _mm_add_ps(fjz0,tz);
1617 /**************************
1618 * CALCULATE INTERACTIONS *
1619 **************************/
1621 r01 = _mm_mul_ps(rsq01,rinv01);
1623 /* EWALD ELECTROSTATICS */
1625 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1626 ewrt = _mm_mul_ps(r01,ewtabscale);
1627 ewitab = _mm_cvttps_epi32(ewrt);
1628 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1629 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1630 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1632 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1633 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1637 /* Calculate temporary vectorial force */
1638 tx = _mm_mul_ps(fscal,dx01);
1639 ty = _mm_mul_ps(fscal,dy01);
1640 tz = _mm_mul_ps(fscal,dz01);
1642 /* Update vectorial force */
1643 fix0 = _mm_add_ps(fix0,tx);
1644 fiy0 = _mm_add_ps(fiy0,ty);
1645 fiz0 = _mm_add_ps(fiz0,tz);
1647 fjx1 = _mm_add_ps(fjx1,tx);
1648 fjy1 = _mm_add_ps(fjy1,ty);
1649 fjz1 = _mm_add_ps(fjz1,tz);
1651 /**************************
1652 * CALCULATE INTERACTIONS *
1653 **************************/
1655 r02 = _mm_mul_ps(rsq02,rinv02);
1657 /* EWALD ELECTROSTATICS */
1659 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1660 ewrt = _mm_mul_ps(r02,ewtabscale);
1661 ewitab = _mm_cvttps_epi32(ewrt);
1662 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1663 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1664 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1666 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1667 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1671 /* Calculate temporary vectorial force */
1672 tx = _mm_mul_ps(fscal,dx02);
1673 ty = _mm_mul_ps(fscal,dy02);
1674 tz = _mm_mul_ps(fscal,dz02);
1676 /* Update vectorial force */
1677 fix0 = _mm_add_ps(fix0,tx);
1678 fiy0 = _mm_add_ps(fiy0,ty);
1679 fiz0 = _mm_add_ps(fiz0,tz);
1681 fjx2 = _mm_add_ps(fjx2,tx);
1682 fjy2 = _mm_add_ps(fjy2,ty);
1683 fjz2 = _mm_add_ps(fjz2,tz);
1685 /**************************
1686 * CALCULATE INTERACTIONS *
1687 **************************/
1689 r10 = _mm_mul_ps(rsq10,rinv10);
1691 /* EWALD ELECTROSTATICS */
1693 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1694 ewrt = _mm_mul_ps(r10,ewtabscale);
1695 ewitab = _mm_cvttps_epi32(ewrt);
1696 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1697 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1698 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1700 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1701 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1705 /* Calculate temporary vectorial force */
1706 tx = _mm_mul_ps(fscal,dx10);
1707 ty = _mm_mul_ps(fscal,dy10);
1708 tz = _mm_mul_ps(fscal,dz10);
1710 /* Update vectorial force */
1711 fix1 = _mm_add_ps(fix1,tx);
1712 fiy1 = _mm_add_ps(fiy1,ty);
1713 fiz1 = _mm_add_ps(fiz1,tz);
1715 fjx0 = _mm_add_ps(fjx0,tx);
1716 fjy0 = _mm_add_ps(fjy0,ty);
1717 fjz0 = _mm_add_ps(fjz0,tz);
1719 /**************************
1720 * CALCULATE INTERACTIONS *
1721 **************************/
1723 r11 = _mm_mul_ps(rsq11,rinv11);
1725 /* EWALD ELECTROSTATICS */
1727 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1728 ewrt = _mm_mul_ps(r11,ewtabscale);
1729 ewitab = _mm_cvttps_epi32(ewrt);
1730 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1731 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1732 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1734 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1735 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1739 /* Calculate temporary vectorial force */
1740 tx = _mm_mul_ps(fscal,dx11);
1741 ty = _mm_mul_ps(fscal,dy11);
1742 tz = _mm_mul_ps(fscal,dz11);
1744 /* Update vectorial force */
1745 fix1 = _mm_add_ps(fix1,tx);
1746 fiy1 = _mm_add_ps(fiy1,ty);
1747 fiz1 = _mm_add_ps(fiz1,tz);
1749 fjx1 = _mm_add_ps(fjx1,tx);
1750 fjy1 = _mm_add_ps(fjy1,ty);
1751 fjz1 = _mm_add_ps(fjz1,tz);
1753 /**************************
1754 * CALCULATE INTERACTIONS *
1755 **************************/
1757 r12 = _mm_mul_ps(rsq12,rinv12);
1759 /* EWALD ELECTROSTATICS */
1761 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1762 ewrt = _mm_mul_ps(r12,ewtabscale);
1763 ewitab = _mm_cvttps_epi32(ewrt);
1764 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1765 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1766 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1768 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1769 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1773 /* Calculate temporary vectorial force */
1774 tx = _mm_mul_ps(fscal,dx12);
1775 ty = _mm_mul_ps(fscal,dy12);
1776 tz = _mm_mul_ps(fscal,dz12);
1778 /* Update vectorial force */
1779 fix1 = _mm_add_ps(fix1,tx);
1780 fiy1 = _mm_add_ps(fiy1,ty);
1781 fiz1 = _mm_add_ps(fiz1,tz);
1783 fjx2 = _mm_add_ps(fjx2,tx);
1784 fjy2 = _mm_add_ps(fjy2,ty);
1785 fjz2 = _mm_add_ps(fjz2,tz);
1787 /**************************
1788 * CALCULATE INTERACTIONS *
1789 **************************/
1791 r20 = _mm_mul_ps(rsq20,rinv20);
1793 /* EWALD ELECTROSTATICS */
1795 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1796 ewrt = _mm_mul_ps(r20,ewtabscale);
1797 ewitab = _mm_cvttps_epi32(ewrt);
1798 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1799 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1800 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1802 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1803 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1807 /* Calculate temporary vectorial force */
1808 tx = _mm_mul_ps(fscal,dx20);
1809 ty = _mm_mul_ps(fscal,dy20);
1810 tz = _mm_mul_ps(fscal,dz20);
1812 /* Update vectorial force */
1813 fix2 = _mm_add_ps(fix2,tx);
1814 fiy2 = _mm_add_ps(fiy2,ty);
1815 fiz2 = _mm_add_ps(fiz2,tz);
1817 fjx0 = _mm_add_ps(fjx0,tx);
1818 fjy0 = _mm_add_ps(fjy0,ty);
1819 fjz0 = _mm_add_ps(fjz0,tz);
1821 /**************************
1822 * CALCULATE INTERACTIONS *
1823 **************************/
1825 r21 = _mm_mul_ps(rsq21,rinv21);
1827 /* EWALD ELECTROSTATICS */
1829 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1830 ewrt = _mm_mul_ps(r21,ewtabscale);
1831 ewitab = _mm_cvttps_epi32(ewrt);
1832 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1833 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1834 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1836 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1837 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1841 /* Calculate temporary vectorial force */
1842 tx = _mm_mul_ps(fscal,dx21);
1843 ty = _mm_mul_ps(fscal,dy21);
1844 tz = _mm_mul_ps(fscal,dz21);
1846 /* Update vectorial force */
1847 fix2 = _mm_add_ps(fix2,tx);
1848 fiy2 = _mm_add_ps(fiy2,ty);
1849 fiz2 = _mm_add_ps(fiz2,tz);
1851 fjx1 = _mm_add_ps(fjx1,tx);
1852 fjy1 = _mm_add_ps(fjy1,ty);
1853 fjz1 = _mm_add_ps(fjz1,tz);
1855 /**************************
1856 * CALCULATE INTERACTIONS *
1857 **************************/
1859 r22 = _mm_mul_ps(rsq22,rinv22);
1861 /* EWALD ELECTROSTATICS */
1863 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1864 ewrt = _mm_mul_ps(r22,ewtabscale);
1865 ewitab = _mm_cvttps_epi32(ewrt);
1866 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1867 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1868 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1870 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1871 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1875 /* Calculate temporary vectorial force */
1876 tx = _mm_mul_ps(fscal,dx22);
1877 ty = _mm_mul_ps(fscal,dy22);
1878 tz = _mm_mul_ps(fscal,dz22);
1880 /* Update vectorial force */
1881 fix2 = _mm_add_ps(fix2,tx);
1882 fiy2 = _mm_add_ps(fiy2,ty);
1883 fiz2 = _mm_add_ps(fiz2,tz);
1885 fjx2 = _mm_add_ps(fjx2,tx);
1886 fjy2 = _mm_add_ps(fjy2,ty);
1887 fjz2 = _mm_add_ps(fjz2,tz);
1889 fjptrA = f+j_coord_offsetA;
1890 fjptrB = f+j_coord_offsetB;
1891 fjptrC = f+j_coord_offsetC;
1892 fjptrD = f+j_coord_offsetD;
1894 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1895 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1897 /* Inner loop uses 350 flops */
1900 if(jidx<j_index_end)
1903 /* Get j neighbor index, and coordinate index */
1904 jnrlistA = jjnr[jidx];
1905 jnrlistB = jjnr[jidx+1];
1906 jnrlistC = jjnr[jidx+2];
1907 jnrlistD = jjnr[jidx+3];
1908 /* Sign of each element will be negative for non-real atoms.
1909 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1910 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1912 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1913 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1914 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1915 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1916 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1917 j_coord_offsetA = DIM*jnrA;
1918 j_coord_offsetB = DIM*jnrB;
1919 j_coord_offsetC = DIM*jnrC;
1920 j_coord_offsetD = DIM*jnrD;
1922 /* load j atom coordinates */
1923 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1924 x+j_coord_offsetC,x+j_coord_offsetD,
1925 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1927 /* Calculate displacement vector */
1928 dx00 = _mm_sub_ps(ix0,jx0);
1929 dy00 = _mm_sub_ps(iy0,jy0);
1930 dz00 = _mm_sub_ps(iz0,jz0);
1931 dx01 = _mm_sub_ps(ix0,jx1);
1932 dy01 = _mm_sub_ps(iy0,jy1);
1933 dz01 = _mm_sub_ps(iz0,jz1);
1934 dx02 = _mm_sub_ps(ix0,jx2);
1935 dy02 = _mm_sub_ps(iy0,jy2);
1936 dz02 = _mm_sub_ps(iz0,jz2);
1937 dx10 = _mm_sub_ps(ix1,jx0);
1938 dy10 = _mm_sub_ps(iy1,jy0);
1939 dz10 = _mm_sub_ps(iz1,jz0);
1940 dx11 = _mm_sub_ps(ix1,jx1);
1941 dy11 = _mm_sub_ps(iy1,jy1);
1942 dz11 = _mm_sub_ps(iz1,jz1);
1943 dx12 = _mm_sub_ps(ix1,jx2);
1944 dy12 = _mm_sub_ps(iy1,jy2);
1945 dz12 = _mm_sub_ps(iz1,jz2);
1946 dx20 = _mm_sub_ps(ix2,jx0);
1947 dy20 = _mm_sub_ps(iy2,jy0);
1948 dz20 = _mm_sub_ps(iz2,jz0);
1949 dx21 = _mm_sub_ps(ix2,jx1);
1950 dy21 = _mm_sub_ps(iy2,jy1);
1951 dz21 = _mm_sub_ps(iz2,jz1);
1952 dx22 = _mm_sub_ps(ix2,jx2);
1953 dy22 = _mm_sub_ps(iy2,jy2);
1954 dz22 = _mm_sub_ps(iz2,jz2);
1956 /* Calculate squared distance and things based on it */
1957 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1958 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1959 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1960 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1961 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1962 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1963 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1964 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1965 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1967 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1968 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1969 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1970 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1971 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1972 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1973 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1974 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1975 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1977 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1978 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1979 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1980 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1981 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1982 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1983 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1984 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1985 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1987 fjx0 = _mm_setzero_ps();
1988 fjy0 = _mm_setzero_ps();
1989 fjz0 = _mm_setzero_ps();
1990 fjx1 = _mm_setzero_ps();
1991 fjy1 = _mm_setzero_ps();
1992 fjz1 = _mm_setzero_ps();
1993 fjx2 = _mm_setzero_ps();
1994 fjy2 = _mm_setzero_ps();
1995 fjz2 = _mm_setzero_ps();
1997 /**************************
1998 * CALCULATE INTERACTIONS *
1999 **************************/
2001 r00 = _mm_mul_ps(rsq00,rinv00);
2002 r00 = _mm_andnot_ps(dummy_mask,r00);
2004 /* Calculate table index by multiplying r with table scale and truncate to integer */
2005 rt = _mm_mul_ps(r00,vftabscale);
2006 vfitab = _mm_cvttps_epi32(rt);
2007 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2008 vfitab = _mm_slli_epi32(vfitab,3);
2010 /* EWALD ELECTROSTATICS */
2012 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2013 ewrt = _mm_mul_ps(r00,ewtabscale);
2014 ewitab = _mm_cvttps_epi32(ewrt);
2015 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2016 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2017 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2019 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2020 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2022 /* CUBIC SPLINE TABLE DISPERSION */
2023 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2024 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2025 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2026 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2027 _MM_TRANSPOSE4_PS(Y,F,G,H);
2028 Heps = _mm_mul_ps(vfeps,H);
2029 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2030 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2031 fvdw6 = _mm_mul_ps(c6_00,FF);
2033 /* CUBIC SPLINE TABLE REPULSION */
2034 vfitab = _mm_add_epi32(vfitab,ifour);
2035 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2036 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2037 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2038 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2039 _MM_TRANSPOSE4_PS(Y,F,G,H);
2040 Heps = _mm_mul_ps(vfeps,H);
2041 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2042 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2043 fvdw12 = _mm_mul_ps(c12_00,FF);
2044 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2046 fscal = _mm_add_ps(felec,fvdw);
2048 fscal = _mm_andnot_ps(dummy_mask,fscal);
2050 /* Calculate temporary vectorial force */
2051 tx = _mm_mul_ps(fscal,dx00);
2052 ty = _mm_mul_ps(fscal,dy00);
2053 tz = _mm_mul_ps(fscal,dz00);
2055 /* Update vectorial force */
2056 fix0 = _mm_add_ps(fix0,tx);
2057 fiy0 = _mm_add_ps(fiy0,ty);
2058 fiz0 = _mm_add_ps(fiz0,tz);
2060 fjx0 = _mm_add_ps(fjx0,tx);
2061 fjy0 = _mm_add_ps(fjy0,ty);
2062 fjz0 = _mm_add_ps(fjz0,tz);
2064 /**************************
2065 * CALCULATE INTERACTIONS *
2066 **************************/
2068 r01 = _mm_mul_ps(rsq01,rinv01);
2069 r01 = _mm_andnot_ps(dummy_mask,r01);
2071 /* EWALD ELECTROSTATICS */
2073 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2074 ewrt = _mm_mul_ps(r01,ewtabscale);
2075 ewitab = _mm_cvttps_epi32(ewrt);
2076 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2077 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2078 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2080 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2081 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2085 fscal = _mm_andnot_ps(dummy_mask,fscal);
2087 /* Calculate temporary vectorial force */
2088 tx = _mm_mul_ps(fscal,dx01);
2089 ty = _mm_mul_ps(fscal,dy01);
2090 tz = _mm_mul_ps(fscal,dz01);
2092 /* Update vectorial force */
2093 fix0 = _mm_add_ps(fix0,tx);
2094 fiy0 = _mm_add_ps(fiy0,ty);
2095 fiz0 = _mm_add_ps(fiz0,tz);
2097 fjx1 = _mm_add_ps(fjx1,tx);
2098 fjy1 = _mm_add_ps(fjy1,ty);
2099 fjz1 = _mm_add_ps(fjz1,tz);
2101 /**************************
2102 * CALCULATE INTERACTIONS *
2103 **************************/
2105 r02 = _mm_mul_ps(rsq02,rinv02);
2106 r02 = _mm_andnot_ps(dummy_mask,r02);
2108 /* EWALD ELECTROSTATICS */
2110 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2111 ewrt = _mm_mul_ps(r02,ewtabscale);
2112 ewitab = _mm_cvttps_epi32(ewrt);
2113 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2114 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2115 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2117 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2118 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2122 fscal = _mm_andnot_ps(dummy_mask,fscal);
2124 /* Calculate temporary vectorial force */
2125 tx = _mm_mul_ps(fscal,dx02);
2126 ty = _mm_mul_ps(fscal,dy02);
2127 tz = _mm_mul_ps(fscal,dz02);
2129 /* Update vectorial force */
2130 fix0 = _mm_add_ps(fix0,tx);
2131 fiy0 = _mm_add_ps(fiy0,ty);
2132 fiz0 = _mm_add_ps(fiz0,tz);
2134 fjx2 = _mm_add_ps(fjx2,tx);
2135 fjy2 = _mm_add_ps(fjy2,ty);
2136 fjz2 = _mm_add_ps(fjz2,tz);
2138 /**************************
2139 * CALCULATE INTERACTIONS *
2140 **************************/
2142 r10 = _mm_mul_ps(rsq10,rinv10);
2143 r10 = _mm_andnot_ps(dummy_mask,r10);
2145 /* EWALD ELECTROSTATICS */
2147 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2148 ewrt = _mm_mul_ps(r10,ewtabscale);
2149 ewitab = _mm_cvttps_epi32(ewrt);
2150 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2151 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2152 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2154 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2155 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2159 fscal = _mm_andnot_ps(dummy_mask,fscal);
2161 /* Calculate temporary vectorial force */
2162 tx = _mm_mul_ps(fscal,dx10);
2163 ty = _mm_mul_ps(fscal,dy10);
2164 tz = _mm_mul_ps(fscal,dz10);
2166 /* Update vectorial force */
2167 fix1 = _mm_add_ps(fix1,tx);
2168 fiy1 = _mm_add_ps(fiy1,ty);
2169 fiz1 = _mm_add_ps(fiz1,tz);
2171 fjx0 = _mm_add_ps(fjx0,tx);
2172 fjy0 = _mm_add_ps(fjy0,ty);
2173 fjz0 = _mm_add_ps(fjz0,tz);
2175 /**************************
2176 * CALCULATE INTERACTIONS *
2177 **************************/
2179 r11 = _mm_mul_ps(rsq11,rinv11);
2180 r11 = _mm_andnot_ps(dummy_mask,r11);
2182 /* EWALD ELECTROSTATICS */
2184 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2185 ewrt = _mm_mul_ps(r11,ewtabscale);
2186 ewitab = _mm_cvttps_epi32(ewrt);
2187 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2188 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2189 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2191 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2192 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2196 fscal = _mm_andnot_ps(dummy_mask,fscal);
2198 /* Calculate temporary vectorial force */
2199 tx = _mm_mul_ps(fscal,dx11);
2200 ty = _mm_mul_ps(fscal,dy11);
2201 tz = _mm_mul_ps(fscal,dz11);
2203 /* Update vectorial force */
2204 fix1 = _mm_add_ps(fix1,tx);
2205 fiy1 = _mm_add_ps(fiy1,ty);
2206 fiz1 = _mm_add_ps(fiz1,tz);
2208 fjx1 = _mm_add_ps(fjx1,tx);
2209 fjy1 = _mm_add_ps(fjy1,ty);
2210 fjz1 = _mm_add_ps(fjz1,tz);
2212 /**************************
2213 * CALCULATE INTERACTIONS *
2214 **************************/
2216 r12 = _mm_mul_ps(rsq12,rinv12);
2217 r12 = _mm_andnot_ps(dummy_mask,r12);
2219 /* EWALD ELECTROSTATICS */
2221 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2222 ewrt = _mm_mul_ps(r12,ewtabscale);
2223 ewitab = _mm_cvttps_epi32(ewrt);
2224 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2225 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2226 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2228 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2229 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2233 fscal = _mm_andnot_ps(dummy_mask,fscal);
2235 /* Calculate temporary vectorial force */
2236 tx = _mm_mul_ps(fscal,dx12);
2237 ty = _mm_mul_ps(fscal,dy12);
2238 tz = _mm_mul_ps(fscal,dz12);
2240 /* Update vectorial force */
2241 fix1 = _mm_add_ps(fix1,tx);
2242 fiy1 = _mm_add_ps(fiy1,ty);
2243 fiz1 = _mm_add_ps(fiz1,tz);
2245 fjx2 = _mm_add_ps(fjx2,tx);
2246 fjy2 = _mm_add_ps(fjy2,ty);
2247 fjz2 = _mm_add_ps(fjz2,tz);
2249 /**************************
2250 * CALCULATE INTERACTIONS *
2251 **************************/
2253 r20 = _mm_mul_ps(rsq20,rinv20);
2254 r20 = _mm_andnot_ps(dummy_mask,r20);
2256 /* EWALD ELECTROSTATICS */
2258 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2259 ewrt = _mm_mul_ps(r20,ewtabscale);
2260 ewitab = _mm_cvttps_epi32(ewrt);
2261 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2262 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2263 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2265 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2266 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2270 fscal = _mm_andnot_ps(dummy_mask,fscal);
2272 /* Calculate temporary vectorial force */
2273 tx = _mm_mul_ps(fscal,dx20);
2274 ty = _mm_mul_ps(fscal,dy20);
2275 tz = _mm_mul_ps(fscal,dz20);
2277 /* Update vectorial force */
2278 fix2 = _mm_add_ps(fix2,tx);
2279 fiy2 = _mm_add_ps(fiy2,ty);
2280 fiz2 = _mm_add_ps(fiz2,tz);
2282 fjx0 = _mm_add_ps(fjx0,tx);
2283 fjy0 = _mm_add_ps(fjy0,ty);
2284 fjz0 = _mm_add_ps(fjz0,tz);
2286 /**************************
2287 * CALCULATE INTERACTIONS *
2288 **************************/
2290 r21 = _mm_mul_ps(rsq21,rinv21);
2291 r21 = _mm_andnot_ps(dummy_mask,r21);
2293 /* EWALD ELECTROSTATICS */
2295 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2296 ewrt = _mm_mul_ps(r21,ewtabscale);
2297 ewitab = _mm_cvttps_epi32(ewrt);
2298 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2299 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2300 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2302 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2303 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2307 fscal = _mm_andnot_ps(dummy_mask,fscal);
2309 /* Calculate temporary vectorial force */
2310 tx = _mm_mul_ps(fscal,dx21);
2311 ty = _mm_mul_ps(fscal,dy21);
2312 tz = _mm_mul_ps(fscal,dz21);
2314 /* Update vectorial force */
2315 fix2 = _mm_add_ps(fix2,tx);
2316 fiy2 = _mm_add_ps(fiy2,ty);
2317 fiz2 = _mm_add_ps(fiz2,tz);
2319 fjx1 = _mm_add_ps(fjx1,tx);
2320 fjy1 = _mm_add_ps(fjy1,ty);
2321 fjz1 = _mm_add_ps(fjz1,tz);
2323 /**************************
2324 * CALCULATE INTERACTIONS *
2325 **************************/
2327 r22 = _mm_mul_ps(rsq22,rinv22);
2328 r22 = _mm_andnot_ps(dummy_mask,r22);
2330 /* EWALD ELECTROSTATICS */
2332 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2333 ewrt = _mm_mul_ps(r22,ewtabscale);
2334 ewitab = _mm_cvttps_epi32(ewrt);
2335 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2336 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2337 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2339 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2340 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2344 fscal = _mm_andnot_ps(dummy_mask,fscal);
2346 /* Calculate temporary vectorial force */
2347 tx = _mm_mul_ps(fscal,dx22);
2348 ty = _mm_mul_ps(fscal,dy22);
2349 tz = _mm_mul_ps(fscal,dz22);
2351 /* Update vectorial force */
2352 fix2 = _mm_add_ps(fix2,tx);
2353 fiy2 = _mm_add_ps(fiy2,ty);
2354 fiz2 = _mm_add_ps(fiz2,tz);
2356 fjx2 = _mm_add_ps(fjx2,tx);
2357 fjy2 = _mm_add_ps(fjy2,ty);
2358 fjz2 = _mm_add_ps(fjz2,tz);
2360 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2361 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2362 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2363 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2365 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2366 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2368 /* Inner loop uses 359 flops */
2371 /* End of innermost loop */
2373 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2374 f+i_coord_offset,fshift+i_shift_offset);
2376 /* Increment number of inner iterations */
2377 inneriter += j_index_end - j_index_start;
2379 /* Outer loop uses 18 flops */
2382 /* Increment number of outer iterations */
2385 /* Update outer/inner flops */
2387 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*359);