2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
99 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
101 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
103 __m128 dummy_mask,cutoff_mask;
104 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
105 __m128 one = _mm_set1_ps(1.0);
106 __m128 two = _mm_set1_ps(2.0);
112 jindex = nlist->jindex;
114 shiftidx = nlist->shift;
116 shiftvec = fr->shift_vec[0];
117 fshift = fr->fshift[0];
118 facel = _mm_set1_ps(fr->epsfac);
119 charge = mdatoms->chargeA;
120 nvdwtype = fr->ntype;
122 vdwtype = mdatoms->typeA;
124 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
125 ewtab = fr->ic->tabq_coul_FDV0;
126 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
127 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
129 /* Setup water-specific parameters */
130 inr = nlist->iinr[0];
131 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
132 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
133 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
134 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
136 jq1 = _mm_set1_ps(charge[inr+1]);
137 jq2 = _mm_set1_ps(charge[inr+2]);
138 jq3 = _mm_set1_ps(charge[inr+3]);
139 vdwjidx0A = 2*vdwtype[inr+0];
140 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
141 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
142 qq11 = _mm_mul_ps(iq1,jq1);
143 qq12 = _mm_mul_ps(iq1,jq2);
144 qq13 = _mm_mul_ps(iq1,jq3);
145 qq21 = _mm_mul_ps(iq2,jq1);
146 qq22 = _mm_mul_ps(iq2,jq2);
147 qq23 = _mm_mul_ps(iq2,jq3);
148 qq31 = _mm_mul_ps(iq3,jq1);
149 qq32 = _mm_mul_ps(iq3,jq2);
150 qq33 = _mm_mul_ps(iq3,jq3);
152 /* Avoid stupid compiler warnings */
153 jnrA = jnrB = jnrC = jnrD = 0;
162 /* Start outer loop over neighborlists */
163 for(iidx=0; iidx<nri; iidx++)
165 /* Load shift vector for this list */
166 i_shift_offset = DIM*shiftidx[iidx];
167 shX = shiftvec[i_shift_offset+XX];
168 shY = shiftvec[i_shift_offset+YY];
169 shZ = shiftvec[i_shift_offset+ZZ];
171 /* Load limits for loop over neighbors */
172 j_index_start = jindex[iidx];
173 j_index_end = jindex[iidx+1];
175 /* Get outer coordinate index */
177 i_coord_offset = DIM*inr;
179 /* Load i particle coords and add shift vector */
180 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
181 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
182 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
183 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
184 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
185 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
186 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
187 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
188 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
189 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
190 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
191 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
193 fix0 = _mm_setzero_ps();
194 fiy0 = _mm_setzero_ps();
195 fiz0 = _mm_setzero_ps();
196 fix1 = _mm_setzero_ps();
197 fiy1 = _mm_setzero_ps();
198 fiz1 = _mm_setzero_ps();
199 fix2 = _mm_setzero_ps();
200 fiy2 = _mm_setzero_ps();
201 fiz2 = _mm_setzero_ps();
202 fix3 = _mm_setzero_ps();
203 fiy3 = _mm_setzero_ps();
204 fiz3 = _mm_setzero_ps();
206 /* Reset potential sums */
207 velecsum = _mm_setzero_ps();
208 vvdwsum = _mm_setzero_ps();
210 /* Start inner kernel loop */
211 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
214 /* Get j neighbor index, and coordinate index */
220 j_coord_offsetA = DIM*jnrA;
221 j_coord_offsetB = DIM*jnrB;
222 j_coord_offsetC = DIM*jnrC;
223 j_coord_offsetD = DIM*jnrD;
225 /* load j atom coordinates */
226 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
227 x+j_coord_offsetC,x+j_coord_offsetD,
228 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
229 &jy2,&jz2,&jx3,&jy3,&jz3);
231 /* Calculate displacement vector */
232 dx00 = _mm_sub_ps(ix0,jx0);
233 dy00 = _mm_sub_ps(iy0,jy0);
234 dz00 = _mm_sub_ps(iz0,jz0);
235 dx11 = _mm_sub_ps(ix1,jx1);
236 dy11 = _mm_sub_ps(iy1,jy1);
237 dz11 = _mm_sub_ps(iz1,jz1);
238 dx12 = _mm_sub_ps(ix1,jx2);
239 dy12 = _mm_sub_ps(iy1,jy2);
240 dz12 = _mm_sub_ps(iz1,jz2);
241 dx13 = _mm_sub_ps(ix1,jx3);
242 dy13 = _mm_sub_ps(iy1,jy3);
243 dz13 = _mm_sub_ps(iz1,jz3);
244 dx21 = _mm_sub_ps(ix2,jx1);
245 dy21 = _mm_sub_ps(iy2,jy1);
246 dz21 = _mm_sub_ps(iz2,jz1);
247 dx22 = _mm_sub_ps(ix2,jx2);
248 dy22 = _mm_sub_ps(iy2,jy2);
249 dz22 = _mm_sub_ps(iz2,jz2);
250 dx23 = _mm_sub_ps(ix2,jx3);
251 dy23 = _mm_sub_ps(iy2,jy3);
252 dz23 = _mm_sub_ps(iz2,jz3);
253 dx31 = _mm_sub_ps(ix3,jx1);
254 dy31 = _mm_sub_ps(iy3,jy1);
255 dz31 = _mm_sub_ps(iz3,jz1);
256 dx32 = _mm_sub_ps(ix3,jx2);
257 dy32 = _mm_sub_ps(iy3,jy2);
258 dz32 = _mm_sub_ps(iz3,jz2);
259 dx33 = _mm_sub_ps(ix3,jx3);
260 dy33 = _mm_sub_ps(iy3,jy3);
261 dz33 = _mm_sub_ps(iz3,jz3);
263 /* Calculate squared distance and things based on it */
264 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
265 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
266 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
267 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
268 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
269 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
270 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
271 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
272 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
273 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
275 rinv11 = gmx_mm_invsqrt_ps(rsq11);
276 rinv12 = gmx_mm_invsqrt_ps(rsq12);
277 rinv13 = gmx_mm_invsqrt_ps(rsq13);
278 rinv21 = gmx_mm_invsqrt_ps(rsq21);
279 rinv22 = gmx_mm_invsqrt_ps(rsq22);
280 rinv23 = gmx_mm_invsqrt_ps(rsq23);
281 rinv31 = gmx_mm_invsqrt_ps(rsq31);
282 rinv32 = gmx_mm_invsqrt_ps(rsq32);
283 rinv33 = gmx_mm_invsqrt_ps(rsq33);
285 rinvsq00 = gmx_mm_inv_ps(rsq00);
286 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
287 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
288 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
289 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
290 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
291 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
292 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
293 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
294 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
296 fjx0 = _mm_setzero_ps();
297 fjy0 = _mm_setzero_ps();
298 fjz0 = _mm_setzero_ps();
299 fjx1 = _mm_setzero_ps();
300 fjy1 = _mm_setzero_ps();
301 fjz1 = _mm_setzero_ps();
302 fjx2 = _mm_setzero_ps();
303 fjy2 = _mm_setzero_ps();
304 fjz2 = _mm_setzero_ps();
305 fjx3 = _mm_setzero_ps();
306 fjy3 = _mm_setzero_ps();
307 fjz3 = _mm_setzero_ps();
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 /* LENNARD-JONES DISPERSION/REPULSION */
315 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
316 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
317 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
318 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
319 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
321 /* Update potential sum for this i atom from the interaction with this j atom. */
322 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
326 /* Calculate temporary vectorial force */
327 tx = _mm_mul_ps(fscal,dx00);
328 ty = _mm_mul_ps(fscal,dy00);
329 tz = _mm_mul_ps(fscal,dz00);
331 /* Update vectorial force */
332 fix0 = _mm_add_ps(fix0,tx);
333 fiy0 = _mm_add_ps(fiy0,ty);
334 fiz0 = _mm_add_ps(fiz0,tz);
336 fjx0 = _mm_add_ps(fjx0,tx);
337 fjy0 = _mm_add_ps(fjy0,ty);
338 fjz0 = _mm_add_ps(fjz0,tz);
340 /**************************
341 * CALCULATE INTERACTIONS *
342 **************************/
344 r11 = _mm_mul_ps(rsq11,rinv11);
346 /* EWALD ELECTROSTATICS */
348 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
349 ewrt = _mm_mul_ps(r11,ewtabscale);
350 ewitab = _mm_cvttps_epi32(ewrt);
351 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
352 ewitab = _mm_slli_epi32(ewitab,2);
353 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
354 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
355 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
356 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
357 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
358 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
359 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
360 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
361 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
363 /* Update potential sum for this i atom from the interaction with this j atom. */
364 velecsum = _mm_add_ps(velecsum,velec);
368 /* Calculate temporary vectorial force */
369 tx = _mm_mul_ps(fscal,dx11);
370 ty = _mm_mul_ps(fscal,dy11);
371 tz = _mm_mul_ps(fscal,dz11);
373 /* Update vectorial force */
374 fix1 = _mm_add_ps(fix1,tx);
375 fiy1 = _mm_add_ps(fiy1,ty);
376 fiz1 = _mm_add_ps(fiz1,tz);
378 fjx1 = _mm_add_ps(fjx1,tx);
379 fjy1 = _mm_add_ps(fjy1,ty);
380 fjz1 = _mm_add_ps(fjz1,tz);
382 /**************************
383 * CALCULATE INTERACTIONS *
384 **************************/
386 r12 = _mm_mul_ps(rsq12,rinv12);
388 /* EWALD ELECTROSTATICS */
390 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
391 ewrt = _mm_mul_ps(r12,ewtabscale);
392 ewitab = _mm_cvttps_epi32(ewrt);
393 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
394 ewitab = _mm_slli_epi32(ewitab,2);
395 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
396 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
397 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
398 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
399 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
400 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
401 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
402 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
403 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
405 /* Update potential sum for this i atom from the interaction with this j atom. */
406 velecsum = _mm_add_ps(velecsum,velec);
410 /* Calculate temporary vectorial force */
411 tx = _mm_mul_ps(fscal,dx12);
412 ty = _mm_mul_ps(fscal,dy12);
413 tz = _mm_mul_ps(fscal,dz12);
415 /* Update vectorial force */
416 fix1 = _mm_add_ps(fix1,tx);
417 fiy1 = _mm_add_ps(fiy1,ty);
418 fiz1 = _mm_add_ps(fiz1,tz);
420 fjx2 = _mm_add_ps(fjx2,tx);
421 fjy2 = _mm_add_ps(fjy2,ty);
422 fjz2 = _mm_add_ps(fjz2,tz);
424 /**************************
425 * CALCULATE INTERACTIONS *
426 **************************/
428 r13 = _mm_mul_ps(rsq13,rinv13);
430 /* EWALD ELECTROSTATICS */
432 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
433 ewrt = _mm_mul_ps(r13,ewtabscale);
434 ewitab = _mm_cvttps_epi32(ewrt);
435 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
436 ewitab = _mm_slli_epi32(ewitab,2);
437 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
438 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
439 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
440 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
441 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
442 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
443 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
444 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
445 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
447 /* Update potential sum for this i atom from the interaction with this j atom. */
448 velecsum = _mm_add_ps(velecsum,velec);
452 /* Calculate temporary vectorial force */
453 tx = _mm_mul_ps(fscal,dx13);
454 ty = _mm_mul_ps(fscal,dy13);
455 tz = _mm_mul_ps(fscal,dz13);
457 /* Update vectorial force */
458 fix1 = _mm_add_ps(fix1,tx);
459 fiy1 = _mm_add_ps(fiy1,ty);
460 fiz1 = _mm_add_ps(fiz1,tz);
462 fjx3 = _mm_add_ps(fjx3,tx);
463 fjy3 = _mm_add_ps(fjy3,ty);
464 fjz3 = _mm_add_ps(fjz3,tz);
466 /**************************
467 * CALCULATE INTERACTIONS *
468 **************************/
470 r21 = _mm_mul_ps(rsq21,rinv21);
472 /* EWALD ELECTROSTATICS */
474 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
475 ewrt = _mm_mul_ps(r21,ewtabscale);
476 ewitab = _mm_cvttps_epi32(ewrt);
477 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
478 ewitab = _mm_slli_epi32(ewitab,2);
479 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
480 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
481 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
482 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
483 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
484 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
485 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
486 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
487 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
489 /* Update potential sum for this i atom from the interaction with this j atom. */
490 velecsum = _mm_add_ps(velecsum,velec);
494 /* Calculate temporary vectorial force */
495 tx = _mm_mul_ps(fscal,dx21);
496 ty = _mm_mul_ps(fscal,dy21);
497 tz = _mm_mul_ps(fscal,dz21);
499 /* Update vectorial force */
500 fix2 = _mm_add_ps(fix2,tx);
501 fiy2 = _mm_add_ps(fiy2,ty);
502 fiz2 = _mm_add_ps(fiz2,tz);
504 fjx1 = _mm_add_ps(fjx1,tx);
505 fjy1 = _mm_add_ps(fjy1,ty);
506 fjz1 = _mm_add_ps(fjz1,tz);
508 /**************************
509 * CALCULATE INTERACTIONS *
510 **************************/
512 r22 = _mm_mul_ps(rsq22,rinv22);
514 /* EWALD ELECTROSTATICS */
516 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
517 ewrt = _mm_mul_ps(r22,ewtabscale);
518 ewitab = _mm_cvttps_epi32(ewrt);
519 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
520 ewitab = _mm_slli_epi32(ewitab,2);
521 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
522 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
523 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
524 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
525 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
526 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
527 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
528 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
529 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
531 /* Update potential sum for this i atom from the interaction with this j atom. */
532 velecsum = _mm_add_ps(velecsum,velec);
536 /* Calculate temporary vectorial force */
537 tx = _mm_mul_ps(fscal,dx22);
538 ty = _mm_mul_ps(fscal,dy22);
539 tz = _mm_mul_ps(fscal,dz22);
541 /* Update vectorial force */
542 fix2 = _mm_add_ps(fix2,tx);
543 fiy2 = _mm_add_ps(fiy2,ty);
544 fiz2 = _mm_add_ps(fiz2,tz);
546 fjx2 = _mm_add_ps(fjx2,tx);
547 fjy2 = _mm_add_ps(fjy2,ty);
548 fjz2 = _mm_add_ps(fjz2,tz);
550 /**************************
551 * CALCULATE INTERACTIONS *
552 **************************/
554 r23 = _mm_mul_ps(rsq23,rinv23);
556 /* EWALD ELECTROSTATICS */
558 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
559 ewrt = _mm_mul_ps(r23,ewtabscale);
560 ewitab = _mm_cvttps_epi32(ewrt);
561 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
562 ewitab = _mm_slli_epi32(ewitab,2);
563 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
564 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
565 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
566 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
567 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
568 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
569 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
570 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
571 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
573 /* Update potential sum for this i atom from the interaction with this j atom. */
574 velecsum = _mm_add_ps(velecsum,velec);
578 /* Calculate temporary vectorial force */
579 tx = _mm_mul_ps(fscal,dx23);
580 ty = _mm_mul_ps(fscal,dy23);
581 tz = _mm_mul_ps(fscal,dz23);
583 /* Update vectorial force */
584 fix2 = _mm_add_ps(fix2,tx);
585 fiy2 = _mm_add_ps(fiy2,ty);
586 fiz2 = _mm_add_ps(fiz2,tz);
588 fjx3 = _mm_add_ps(fjx3,tx);
589 fjy3 = _mm_add_ps(fjy3,ty);
590 fjz3 = _mm_add_ps(fjz3,tz);
592 /**************************
593 * CALCULATE INTERACTIONS *
594 **************************/
596 r31 = _mm_mul_ps(rsq31,rinv31);
598 /* EWALD ELECTROSTATICS */
600 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
601 ewrt = _mm_mul_ps(r31,ewtabscale);
602 ewitab = _mm_cvttps_epi32(ewrt);
603 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
604 ewitab = _mm_slli_epi32(ewitab,2);
605 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
606 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
607 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
608 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
609 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
610 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
611 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
612 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
613 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
615 /* Update potential sum for this i atom from the interaction with this j atom. */
616 velecsum = _mm_add_ps(velecsum,velec);
620 /* Calculate temporary vectorial force */
621 tx = _mm_mul_ps(fscal,dx31);
622 ty = _mm_mul_ps(fscal,dy31);
623 tz = _mm_mul_ps(fscal,dz31);
625 /* Update vectorial force */
626 fix3 = _mm_add_ps(fix3,tx);
627 fiy3 = _mm_add_ps(fiy3,ty);
628 fiz3 = _mm_add_ps(fiz3,tz);
630 fjx1 = _mm_add_ps(fjx1,tx);
631 fjy1 = _mm_add_ps(fjy1,ty);
632 fjz1 = _mm_add_ps(fjz1,tz);
634 /**************************
635 * CALCULATE INTERACTIONS *
636 **************************/
638 r32 = _mm_mul_ps(rsq32,rinv32);
640 /* EWALD ELECTROSTATICS */
642 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
643 ewrt = _mm_mul_ps(r32,ewtabscale);
644 ewitab = _mm_cvttps_epi32(ewrt);
645 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
646 ewitab = _mm_slli_epi32(ewitab,2);
647 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
648 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
649 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
650 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
651 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
652 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
653 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
654 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
655 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
657 /* Update potential sum for this i atom from the interaction with this j atom. */
658 velecsum = _mm_add_ps(velecsum,velec);
662 /* Calculate temporary vectorial force */
663 tx = _mm_mul_ps(fscal,dx32);
664 ty = _mm_mul_ps(fscal,dy32);
665 tz = _mm_mul_ps(fscal,dz32);
667 /* Update vectorial force */
668 fix3 = _mm_add_ps(fix3,tx);
669 fiy3 = _mm_add_ps(fiy3,ty);
670 fiz3 = _mm_add_ps(fiz3,tz);
672 fjx2 = _mm_add_ps(fjx2,tx);
673 fjy2 = _mm_add_ps(fjy2,ty);
674 fjz2 = _mm_add_ps(fjz2,tz);
676 /**************************
677 * CALCULATE INTERACTIONS *
678 **************************/
680 r33 = _mm_mul_ps(rsq33,rinv33);
682 /* EWALD ELECTROSTATICS */
684 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
685 ewrt = _mm_mul_ps(r33,ewtabscale);
686 ewitab = _mm_cvttps_epi32(ewrt);
687 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
688 ewitab = _mm_slli_epi32(ewitab,2);
689 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
690 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
691 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
692 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
693 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
694 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
695 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
696 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
697 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
699 /* Update potential sum for this i atom from the interaction with this j atom. */
700 velecsum = _mm_add_ps(velecsum,velec);
704 /* Calculate temporary vectorial force */
705 tx = _mm_mul_ps(fscal,dx33);
706 ty = _mm_mul_ps(fscal,dy33);
707 tz = _mm_mul_ps(fscal,dz33);
709 /* Update vectorial force */
710 fix3 = _mm_add_ps(fix3,tx);
711 fiy3 = _mm_add_ps(fiy3,ty);
712 fiz3 = _mm_add_ps(fiz3,tz);
714 fjx3 = _mm_add_ps(fjx3,tx);
715 fjy3 = _mm_add_ps(fjy3,ty);
716 fjz3 = _mm_add_ps(fjz3,tz);
718 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
719 f+j_coord_offsetC,f+j_coord_offsetD,
720 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
721 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
723 /* Inner loop uses 404 flops */
729 /* Get j neighbor index, and coordinate index */
735 /* Sign of each element will be negative for non-real atoms.
736 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
737 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
739 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
740 jnrA = (jnrA>=0) ? jnrA : 0;
741 jnrB = (jnrB>=0) ? jnrB : 0;
742 jnrC = (jnrC>=0) ? jnrC : 0;
743 jnrD = (jnrD>=0) ? jnrD : 0;
745 j_coord_offsetA = DIM*jnrA;
746 j_coord_offsetB = DIM*jnrB;
747 j_coord_offsetC = DIM*jnrC;
748 j_coord_offsetD = DIM*jnrD;
750 /* load j atom coordinates */
751 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
752 x+j_coord_offsetC,x+j_coord_offsetD,
753 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
754 &jy2,&jz2,&jx3,&jy3,&jz3);
756 /* Calculate displacement vector */
757 dx00 = _mm_sub_ps(ix0,jx0);
758 dy00 = _mm_sub_ps(iy0,jy0);
759 dz00 = _mm_sub_ps(iz0,jz0);
760 dx11 = _mm_sub_ps(ix1,jx1);
761 dy11 = _mm_sub_ps(iy1,jy1);
762 dz11 = _mm_sub_ps(iz1,jz1);
763 dx12 = _mm_sub_ps(ix1,jx2);
764 dy12 = _mm_sub_ps(iy1,jy2);
765 dz12 = _mm_sub_ps(iz1,jz2);
766 dx13 = _mm_sub_ps(ix1,jx3);
767 dy13 = _mm_sub_ps(iy1,jy3);
768 dz13 = _mm_sub_ps(iz1,jz3);
769 dx21 = _mm_sub_ps(ix2,jx1);
770 dy21 = _mm_sub_ps(iy2,jy1);
771 dz21 = _mm_sub_ps(iz2,jz1);
772 dx22 = _mm_sub_ps(ix2,jx2);
773 dy22 = _mm_sub_ps(iy2,jy2);
774 dz22 = _mm_sub_ps(iz2,jz2);
775 dx23 = _mm_sub_ps(ix2,jx3);
776 dy23 = _mm_sub_ps(iy2,jy3);
777 dz23 = _mm_sub_ps(iz2,jz3);
778 dx31 = _mm_sub_ps(ix3,jx1);
779 dy31 = _mm_sub_ps(iy3,jy1);
780 dz31 = _mm_sub_ps(iz3,jz1);
781 dx32 = _mm_sub_ps(ix3,jx2);
782 dy32 = _mm_sub_ps(iy3,jy2);
783 dz32 = _mm_sub_ps(iz3,jz2);
784 dx33 = _mm_sub_ps(ix3,jx3);
785 dy33 = _mm_sub_ps(iy3,jy3);
786 dz33 = _mm_sub_ps(iz3,jz3);
788 /* Calculate squared distance and things based on it */
789 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
790 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
791 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
792 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
793 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
794 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
795 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
796 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
797 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
798 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
800 rinv11 = gmx_mm_invsqrt_ps(rsq11);
801 rinv12 = gmx_mm_invsqrt_ps(rsq12);
802 rinv13 = gmx_mm_invsqrt_ps(rsq13);
803 rinv21 = gmx_mm_invsqrt_ps(rsq21);
804 rinv22 = gmx_mm_invsqrt_ps(rsq22);
805 rinv23 = gmx_mm_invsqrt_ps(rsq23);
806 rinv31 = gmx_mm_invsqrt_ps(rsq31);
807 rinv32 = gmx_mm_invsqrt_ps(rsq32);
808 rinv33 = gmx_mm_invsqrt_ps(rsq33);
810 rinvsq00 = gmx_mm_inv_ps(rsq00);
811 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
812 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
813 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
814 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
815 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
816 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
817 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
818 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
819 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
821 fjx0 = _mm_setzero_ps();
822 fjy0 = _mm_setzero_ps();
823 fjz0 = _mm_setzero_ps();
824 fjx1 = _mm_setzero_ps();
825 fjy1 = _mm_setzero_ps();
826 fjz1 = _mm_setzero_ps();
827 fjx2 = _mm_setzero_ps();
828 fjy2 = _mm_setzero_ps();
829 fjz2 = _mm_setzero_ps();
830 fjx3 = _mm_setzero_ps();
831 fjy3 = _mm_setzero_ps();
832 fjz3 = _mm_setzero_ps();
834 /**************************
835 * CALCULATE INTERACTIONS *
836 **************************/
838 /* LENNARD-JONES DISPERSION/REPULSION */
840 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
841 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
842 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
843 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
844 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
846 /* Update potential sum for this i atom from the interaction with this j atom. */
847 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
848 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
852 fscal = _mm_andnot_ps(dummy_mask,fscal);
854 /* Calculate temporary vectorial force */
855 tx = _mm_mul_ps(fscal,dx00);
856 ty = _mm_mul_ps(fscal,dy00);
857 tz = _mm_mul_ps(fscal,dz00);
859 /* Update vectorial force */
860 fix0 = _mm_add_ps(fix0,tx);
861 fiy0 = _mm_add_ps(fiy0,ty);
862 fiz0 = _mm_add_ps(fiz0,tz);
864 fjx0 = _mm_add_ps(fjx0,tx);
865 fjy0 = _mm_add_ps(fjy0,ty);
866 fjz0 = _mm_add_ps(fjz0,tz);
868 /**************************
869 * CALCULATE INTERACTIONS *
870 **************************/
872 r11 = _mm_mul_ps(rsq11,rinv11);
873 r11 = _mm_andnot_ps(dummy_mask,r11);
875 /* EWALD ELECTROSTATICS */
877 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
878 ewrt = _mm_mul_ps(r11,ewtabscale);
879 ewitab = _mm_cvttps_epi32(ewrt);
880 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
881 ewitab = _mm_slli_epi32(ewitab,2);
882 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
883 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
884 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
885 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
886 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
887 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
888 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
889 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
890 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
892 /* Update potential sum for this i atom from the interaction with this j atom. */
893 velec = _mm_andnot_ps(dummy_mask,velec);
894 velecsum = _mm_add_ps(velecsum,velec);
898 fscal = _mm_andnot_ps(dummy_mask,fscal);
900 /* Calculate temporary vectorial force */
901 tx = _mm_mul_ps(fscal,dx11);
902 ty = _mm_mul_ps(fscal,dy11);
903 tz = _mm_mul_ps(fscal,dz11);
905 /* Update vectorial force */
906 fix1 = _mm_add_ps(fix1,tx);
907 fiy1 = _mm_add_ps(fiy1,ty);
908 fiz1 = _mm_add_ps(fiz1,tz);
910 fjx1 = _mm_add_ps(fjx1,tx);
911 fjy1 = _mm_add_ps(fjy1,ty);
912 fjz1 = _mm_add_ps(fjz1,tz);
914 /**************************
915 * CALCULATE INTERACTIONS *
916 **************************/
918 r12 = _mm_mul_ps(rsq12,rinv12);
919 r12 = _mm_andnot_ps(dummy_mask,r12);
921 /* EWALD ELECTROSTATICS */
923 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
924 ewrt = _mm_mul_ps(r12,ewtabscale);
925 ewitab = _mm_cvttps_epi32(ewrt);
926 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
927 ewitab = _mm_slli_epi32(ewitab,2);
928 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
929 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
930 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
931 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
932 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
933 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
934 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
935 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
936 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
938 /* Update potential sum for this i atom from the interaction with this j atom. */
939 velec = _mm_andnot_ps(dummy_mask,velec);
940 velecsum = _mm_add_ps(velecsum,velec);
944 fscal = _mm_andnot_ps(dummy_mask,fscal);
946 /* Calculate temporary vectorial force */
947 tx = _mm_mul_ps(fscal,dx12);
948 ty = _mm_mul_ps(fscal,dy12);
949 tz = _mm_mul_ps(fscal,dz12);
951 /* Update vectorial force */
952 fix1 = _mm_add_ps(fix1,tx);
953 fiy1 = _mm_add_ps(fiy1,ty);
954 fiz1 = _mm_add_ps(fiz1,tz);
956 fjx2 = _mm_add_ps(fjx2,tx);
957 fjy2 = _mm_add_ps(fjy2,ty);
958 fjz2 = _mm_add_ps(fjz2,tz);
960 /**************************
961 * CALCULATE INTERACTIONS *
962 **************************/
964 r13 = _mm_mul_ps(rsq13,rinv13);
965 r13 = _mm_andnot_ps(dummy_mask,r13);
967 /* EWALD ELECTROSTATICS */
969 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
970 ewrt = _mm_mul_ps(r13,ewtabscale);
971 ewitab = _mm_cvttps_epi32(ewrt);
972 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
973 ewitab = _mm_slli_epi32(ewitab,2);
974 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
975 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
976 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
977 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
978 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
979 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
980 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
981 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
982 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
984 /* Update potential sum for this i atom from the interaction with this j atom. */
985 velec = _mm_andnot_ps(dummy_mask,velec);
986 velecsum = _mm_add_ps(velecsum,velec);
990 fscal = _mm_andnot_ps(dummy_mask,fscal);
992 /* Calculate temporary vectorial force */
993 tx = _mm_mul_ps(fscal,dx13);
994 ty = _mm_mul_ps(fscal,dy13);
995 tz = _mm_mul_ps(fscal,dz13);
997 /* Update vectorial force */
998 fix1 = _mm_add_ps(fix1,tx);
999 fiy1 = _mm_add_ps(fiy1,ty);
1000 fiz1 = _mm_add_ps(fiz1,tz);
1002 fjx3 = _mm_add_ps(fjx3,tx);
1003 fjy3 = _mm_add_ps(fjy3,ty);
1004 fjz3 = _mm_add_ps(fjz3,tz);
1006 /**************************
1007 * CALCULATE INTERACTIONS *
1008 **************************/
1010 r21 = _mm_mul_ps(rsq21,rinv21);
1011 r21 = _mm_andnot_ps(dummy_mask,r21);
1013 /* EWALD ELECTROSTATICS */
1015 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1016 ewrt = _mm_mul_ps(r21,ewtabscale);
1017 ewitab = _mm_cvttps_epi32(ewrt);
1018 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1019 ewitab = _mm_slli_epi32(ewitab,2);
1020 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1021 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1022 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1023 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1024 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1025 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1026 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1027 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1028 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1030 /* Update potential sum for this i atom from the interaction with this j atom. */
1031 velec = _mm_andnot_ps(dummy_mask,velec);
1032 velecsum = _mm_add_ps(velecsum,velec);
1036 fscal = _mm_andnot_ps(dummy_mask,fscal);
1038 /* Calculate temporary vectorial force */
1039 tx = _mm_mul_ps(fscal,dx21);
1040 ty = _mm_mul_ps(fscal,dy21);
1041 tz = _mm_mul_ps(fscal,dz21);
1043 /* Update vectorial force */
1044 fix2 = _mm_add_ps(fix2,tx);
1045 fiy2 = _mm_add_ps(fiy2,ty);
1046 fiz2 = _mm_add_ps(fiz2,tz);
1048 fjx1 = _mm_add_ps(fjx1,tx);
1049 fjy1 = _mm_add_ps(fjy1,ty);
1050 fjz1 = _mm_add_ps(fjz1,tz);
1052 /**************************
1053 * CALCULATE INTERACTIONS *
1054 **************************/
1056 r22 = _mm_mul_ps(rsq22,rinv22);
1057 r22 = _mm_andnot_ps(dummy_mask,r22);
1059 /* EWALD ELECTROSTATICS */
1061 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1062 ewrt = _mm_mul_ps(r22,ewtabscale);
1063 ewitab = _mm_cvttps_epi32(ewrt);
1064 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1065 ewitab = _mm_slli_epi32(ewitab,2);
1066 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1067 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1068 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1069 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1070 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1071 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1072 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1073 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1074 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1076 /* Update potential sum for this i atom from the interaction with this j atom. */
1077 velec = _mm_andnot_ps(dummy_mask,velec);
1078 velecsum = _mm_add_ps(velecsum,velec);
1082 fscal = _mm_andnot_ps(dummy_mask,fscal);
1084 /* Calculate temporary vectorial force */
1085 tx = _mm_mul_ps(fscal,dx22);
1086 ty = _mm_mul_ps(fscal,dy22);
1087 tz = _mm_mul_ps(fscal,dz22);
1089 /* Update vectorial force */
1090 fix2 = _mm_add_ps(fix2,tx);
1091 fiy2 = _mm_add_ps(fiy2,ty);
1092 fiz2 = _mm_add_ps(fiz2,tz);
1094 fjx2 = _mm_add_ps(fjx2,tx);
1095 fjy2 = _mm_add_ps(fjy2,ty);
1096 fjz2 = _mm_add_ps(fjz2,tz);
1098 /**************************
1099 * CALCULATE INTERACTIONS *
1100 **************************/
1102 r23 = _mm_mul_ps(rsq23,rinv23);
1103 r23 = _mm_andnot_ps(dummy_mask,r23);
1105 /* EWALD ELECTROSTATICS */
1107 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1108 ewrt = _mm_mul_ps(r23,ewtabscale);
1109 ewitab = _mm_cvttps_epi32(ewrt);
1110 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1111 ewitab = _mm_slli_epi32(ewitab,2);
1112 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1113 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1114 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1115 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1116 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1117 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1118 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1119 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1120 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1122 /* Update potential sum for this i atom from the interaction with this j atom. */
1123 velec = _mm_andnot_ps(dummy_mask,velec);
1124 velecsum = _mm_add_ps(velecsum,velec);
1128 fscal = _mm_andnot_ps(dummy_mask,fscal);
1130 /* Calculate temporary vectorial force */
1131 tx = _mm_mul_ps(fscal,dx23);
1132 ty = _mm_mul_ps(fscal,dy23);
1133 tz = _mm_mul_ps(fscal,dz23);
1135 /* Update vectorial force */
1136 fix2 = _mm_add_ps(fix2,tx);
1137 fiy2 = _mm_add_ps(fiy2,ty);
1138 fiz2 = _mm_add_ps(fiz2,tz);
1140 fjx3 = _mm_add_ps(fjx3,tx);
1141 fjy3 = _mm_add_ps(fjy3,ty);
1142 fjz3 = _mm_add_ps(fjz3,tz);
1144 /**************************
1145 * CALCULATE INTERACTIONS *
1146 **************************/
1148 r31 = _mm_mul_ps(rsq31,rinv31);
1149 r31 = _mm_andnot_ps(dummy_mask,r31);
1151 /* EWALD ELECTROSTATICS */
1153 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1154 ewrt = _mm_mul_ps(r31,ewtabscale);
1155 ewitab = _mm_cvttps_epi32(ewrt);
1156 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1157 ewitab = _mm_slli_epi32(ewitab,2);
1158 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1159 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1160 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1161 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1162 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1163 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1164 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1165 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1166 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1168 /* Update potential sum for this i atom from the interaction with this j atom. */
1169 velec = _mm_andnot_ps(dummy_mask,velec);
1170 velecsum = _mm_add_ps(velecsum,velec);
1174 fscal = _mm_andnot_ps(dummy_mask,fscal);
1176 /* Calculate temporary vectorial force */
1177 tx = _mm_mul_ps(fscal,dx31);
1178 ty = _mm_mul_ps(fscal,dy31);
1179 tz = _mm_mul_ps(fscal,dz31);
1181 /* Update vectorial force */
1182 fix3 = _mm_add_ps(fix3,tx);
1183 fiy3 = _mm_add_ps(fiy3,ty);
1184 fiz3 = _mm_add_ps(fiz3,tz);
1186 fjx1 = _mm_add_ps(fjx1,tx);
1187 fjy1 = _mm_add_ps(fjy1,ty);
1188 fjz1 = _mm_add_ps(fjz1,tz);
1190 /**************************
1191 * CALCULATE INTERACTIONS *
1192 **************************/
1194 r32 = _mm_mul_ps(rsq32,rinv32);
1195 r32 = _mm_andnot_ps(dummy_mask,r32);
1197 /* EWALD ELECTROSTATICS */
1199 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1200 ewrt = _mm_mul_ps(r32,ewtabscale);
1201 ewitab = _mm_cvttps_epi32(ewrt);
1202 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1203 ewitab = _mm_slli_epi32(ewitab,2);
1204 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1205 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1206 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1207 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1208 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1209 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1210 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1211 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1212 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1214 /* Update potential sum for this i atom from the interaction with this j atom. */
1215 velec = _mm_andnot_ps(dummy_mask,velec);
1216 velecsum = _mm_add_ps(velecsum,velec);
1220 fscal = _mm_andnot_ps(dummy_mask,fscal);
1222 /* Calculate temporary vectorial force */
1223 tx = _mm_mul_ps(fscal,dx32);
1224 ty = _mm_mul_ps(fscal,dy32);
1225 tz = _mm_mul_ps(fscal,dz32);
1227 /* Update vectorial force */
1228 fix3 = _mm_add_ps(fix3,tx);
1229 fiy3 = _mm_add_ps(fiy3,ty);
1230 fiz3 = _mm_add_ps(fiz3,tz);
1232 fjx2 = _mm_add_ps(fjx2,tx);
1233 fjy2 = _mm_add_ps(fjy2,ty);
1234 fjz2 = _mm_add_ps(fjz2,tz);
1236 /**************************
1237 * CALCULATE INTERACTIONS *
1238 **************************/
1240 r33 = _mm_mul_ps(rsq33,rinv33);
1241 r33 = _mm_andnot_ps(dummy_mask,r33);
1243 /* EWALD ELECTROSTATICS */
1245 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1246 ewrt = _mm_mul_ps(r33,ewtabscale);
1247 ewitab = _mm_cvttps_epi32(ewrt);
1248 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1249 ewitab = _mm_slli_epi32(ewitab,2);
1250 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1251 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1252 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1253 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1254 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1255 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1256 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1257 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1258 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1260 /* Update potential sum for this i atom from the interaction with this j atom. */
1261 velec = _mm_andnot_ps(dummy_mask,velec);
1262 velecsum = _mm_add_ps(velecsum,velec);
1266 fscal = _mm_andnot_ps(dummy_mask,fscal);
1268 /* Calculate temporary vectorial force */
1269 tx = _mm_mul_ps(fscal,dx33);
1270 ty = _mm_mul_ps(fscal,dy33);
1271 tz = _mm_mul_ps(fscal,dz33);
1273 /* Update vectorial force */
1274 fix3 = _mm_add_ps(fix3,tx);
1275 fiy3 = _mm_add_ps(fiy3,ty);
1276 fiz3 = _mm_add_ps(fiz3,tz);
1278 fjx3 = _mm_add_ps(fjx3,tx);
1279 fjy3 = _mm_add_ps(fjy3,ty);
1280 fjz3 = _mm_add_ps(fjz3,tz);
1282 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1283 f+j_coord_offsetC,f+j_coord_offsetD,
1284 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1285 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1287 /* Inner loop uses 413 flops */
1290 /* End of innermost loop */
1292 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1293 f+i_coord_offset,fshift+i_shift_offset);
1296 /* Update potential energies */
1297 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1298 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1300 /* Increment number of inner iterations */
1301 inneriter += j_index_end - j_index_start;
1303 /* Outer loop uses 38 flops */
1306 /* Increment number of outer iterations */
1309 /* Update outer/inner flops */
1311 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*413);
1314 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single
1315 * Electrostatics interaction: Ewald
1316 * VdW interaction: LennardJones
1317 * Geometry: Water4-Water4
1318 * Calculate force/pot: Force
1321 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_single
1322 (t_nblist * gmx_restrict nlist,
1323 rvec * gmx_restrict xx,
1324 rvec * gmx_restrict ff,
1325 t_forcerec * gmx_restrict fr,
1326 t_mdatoms * gmx_restrict mdatoms,
1327 nb_kernel_data_t * gmx_restrict kernel_data,
1328 t_nrnb * gmx_restrict nrnb)
1330 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1331 * just 0 for non-waters.
1332 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1333 * jnr indices corresponding to data put in the four positions in the SIMD register.
1335 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1336 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1337 int jnrA,jnrB,jnrC,jnrD;
1338 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1339 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1340 real shX,shY,shZ,rcutoff_scalar;
1341 real *shiftvec,*fshift,*x,*f;
1342 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1344 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1346 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1348 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1350 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1351 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1352 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1353 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1354 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1355 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1356 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1357 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1358 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1359 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1360 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1361 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1362 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1363 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1364 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1365 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1366 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1367 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1368 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1369 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1372 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1375 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1376 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1378 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1380 __m128 dummy_mask,cutoff_mask;
1381 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1382 __m128 one = _mm_set1_ps(1.0);
1383 __m128 two = _mm_set1_ps(2.0);
1389 jindex = nlist->jindex;
1391 shiftidx = nlist->shift;
1393 shiftvec = fr->shift_vec[0];
1394 fshift = fr->fshift[0];
1395 facel = _mm_set1_ps(fr->epsfac);
1396 charge = mdatoms->chargeA;
1397 nvdwtype = fr->ntype;
1398 vdwparam = fr->nbfp;
1399 vdwtype = mdatoms->typeA;
1401 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1402 ewtab = fr->ic->tabq_coul_F;
1403 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1404 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1406 /* Setup water-specific parameters */
1407 inr = nlist->iinr[0];
1408 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1409 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1410 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1411 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1413 jq1 = _mm_set1_ps(charge[inr+1]);
1414 jq2 = _mm_set1_ps(charge[inr+2]);
1415 jq3 = _mm_set1_ps(charge[inr+3]);
1416 vdwjidx0A = 2*vdwtype[inr+0];
1417 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1418 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1419 qq11 = _mm_mul_ps(iq1,jq1);
1420 qq12 = _mm_mul_ps(iq1,jq2);
1421 qq13 = _mm_mul_ps(iq1,jq3);
1422 qq21 = _mm_mul_ps(iq2,jq1);
1423 qq22 = _mm_mul_ps(iq2,jq2);
1424 qq23 = _mm_mul_ps(iq2,jq3);
1425 qq31 = _mm_mul_ps(iq3,jq1);
1426 qq32 = _mm_mul_ps(iq3,jq2);
1427 qq33 = _mm_mul_ps(iq3,jq3);
1429 /* Avoid stupid compiler warnings */
1430 jnrA = jnrB = jnrC = jnrD = 0;
1431 j_coord_offsetA = 0;
1432 j_coord_offsetB = 0;
1433 j_coord_offsetC = 0;
1434 j_coord_offsetD = 0;
1439 /* Start outer loop over neighborlists */
1440 for(iidx=0; iidx<nri; iidx++)
1442 /* Load shift vector for this list */
1443 i_shift_offset = DIM*shiftidx[iidx];
1444 shX = shiftvec[i_shift_offset+XX];
1445 shY = shiftvec[i_shift_offset+YY];
1446 shZ = shiftvec[i_shift_offset+ZZ];
1448 /* Load limits for loop over neighbors */
1449 j_index_start = jindex[iidx];
1450 j_index_end = jindex[iidx+1];
1452 /* Get outer coordinate index */
1454 i_coord_offset = DIM*inr;
1456 /* Load i particle coords and add shift vector */
1457 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1458 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1459 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1460 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1461 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1462 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1463 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1464 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1465 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1466 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1467 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1468 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1470 fix0 = _mm_setzero_ps();
1471 fiy0 = _mm_setzero_ps();
1472 fiz0 = _mm_setzero_ps();
1473 fix1 = _mm_setzero_ps();
1474 fiy1 = _mm_setzero_ps();
1475 fiz1 = _mm_setzero_ps();
1476 fix2 = _mm_setzero_ps();
1477 fiy2 = _mm_setzero_ps();
1478 fiz2 = _mm_setzero_ps();
1479 fix3 = _mm_setzero_ps();
1480 fiy3 = _mm_setzero_ps();
1481 fiz3 = _mm_setzero_ps();
1483 /* Start inner kernel loop */
1484 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1487 /* Get j neighbor index, and coordinate index */
1489 jnrB = jjnr[jidx+1];
1490 jnrC = jjnr[jidx+2];
1491 jnrD = jjnr[jidx+3];
1493 j_coord_offsetA = DIM*jnrA;
1494 j_coord_offsetB = DIM*jnrB;
1495 j_coord_offsetC = DIM*jnrC;
1496 j_coord_offsetD = DIM*jnrD;
1498 /* load j atom coordinates */
1499 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1500 x+j_coord_offsetC,x+j_coord_offsetD,
1501 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1502 &jy2,&jz2,&jx3,&jy3,&jz3);
1504 /* Calculate displacement vector */
1505 dx00 = _mm_sub_ps(ix0,jx0);
1506 dy00 = _mm_sub_ps(iy0,jy0);
1507 dz00 = _mm_sub_ps(iz0,jz0);
1508 dx11 = _mm_sub_ps(ix1,jx1);
1509 dy11 = _mm_sub_ps(iy1,jy1);
1510 dz11 = _mm_sub_ps(iz1,jz1);
1511 dx12 = _mm_sub_ps(ix1,jx2);
1512 dy12 = _mm_sub_ps(iy1,jy2);
1513 dz12 = _mm_sub_ps(iz1,jz2);
1514 dx13 = _mm_sub_ps(ix1,jx3);
1515 dy13 = _mm_sub_ps(iy1,jy3);
1516 dz13 = _mm_sub_ps(iz1,jz3);
1517 dx21 = _mm_sub_ps(ix2,jx1);
1518 dy21 = _mm_sub_ps(iy2,jy1);
1519 dz21 = _mm_sub_ps(iz2,jz1);
1520 dx22 = _mm_sub_ps(ix2,jx2);
1521 dy22 = _mm_sub_ps(iy2,jy2);
1522 dz22 = _mm_sub_ps(iz2,jz2);
1523 dx23 = _mm_sub_ps(ix2,jx3);
1524 dy23 = _mm_sub_ps(iy2,jy3);
1525 dz23 = _mm_sub_ps(iz2,jz3);
1526 dx31 = _mm_sub_ps(ix3,jx1);
1527 dy31 = _mm_sub_ps(iy3,jy1);
1528 dz31 = _mm_sub_ps(iz3,jz1);
1529 dx32 = _mm_sub_ps(ix3,jx2);
1530 dy32 = _mm_sub_ps(iy3,jy2);
1531 dz32 = _mm_sub_ps(iz3,jz2);
1532 dx33 = _mm_sub_ps(ix3,jx3);
1533 dy33 = _mm_sub_ps(iy3,jy3);
1534 dz33 = _mm_sub_ps(iz3,jz3);
1536 /* Calculate squared distance and things based on it */
1537 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1538 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1539 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1540 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1541 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1542 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1543 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1544 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1545 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1546 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1548 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1549 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1550 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1551 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1552 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1553 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1554 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1555 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1556 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1558 rinvsq00 = gmx_mm_inv_ps(rsq00);
1559 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1560 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1561 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1562 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1563 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1564 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1565 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1566 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1567 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1569 fjx0 = _mm_setzero_ps();
1570 fjy0 = _mm_setzero_ps();
1571 fjz0 = _mm_setzero_ps();
1572 fjx1 = _mm_setzero_ps();
1573 fjy1 = _mm_setzero_ps();
1574 fjz1 = _mm_setzero_ps();
1575 fjx2 = _mm_setzero_ps();
1576 fjy2 = _mm_setzero_ps();
1577 fjz2 = _mm_setzero_ps();
1578 fjx3 = _mm_setzero_ps();
1579 fjy3 = _mm_setzero_ps();
1580 fjz3 = _mm_setzero_ps();
1582 /**************************
1583 * CALCULATE INTERACTIONS *
1584 **************************/
1586 /* LENNARD-JONES DISPERSION/REPULSION */
1588 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1589 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1593 /* Calculate temporary vectorial force */
1594 tx = _mm_mul_ps(fscal,dx00);
1595 ty = _mm_mul_ps(fscal,dy00);
1596 tz = _mm_mul_ps(fscal,dz00);
1598 /* Update vectorial force */
1599 fix0 = _mm_add_ps(fix0,tx);
1600 fiy0 = _mm_add_ps(fiy0,ty);
1601 fiz0 = _mm_add_ps(fiz0,tz);
1603 fjx0 = _mm_add_ps(fjx0,tx);
1604 fjy0 = _mm_add_ps(fjy0,ty);
1605 fjz0 = _mm_add_ps(fjz0,tz);
1607 /**************************
1608 * CALCULATE INTERACTIONS *
1609 **************************/
1611 r11 = _mm_mul_ps(rsq11,rinv11);
1613 /* EWALD ELECTROSTATICS */
1615 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1616 ewrt = _mm_mul_ps(r11,ewtabscale);
1617 ewitab = _mm_cvttps_epi32(ewrt);
1618 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1619 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1620 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1622 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1623 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1627 /* Calculate temporary vectorial force */
1628 tx = _mm_mul_ps(fscal,dx11);
1629 ty = _mm_mul_ps(fscal,dy11);
1630 tz = _mm_mul_ps(fscal,dz11);
1632 /* Update vectorial force */
1633 fix1 = _mm_add_ps(fix1,tx);
1634 fiy1 = _mm_add_ps(fiy1,ty);
1635 fiz1 = _mm_add_ps(fiz1,tz);
1637 fjx1 = _mm_add_ps(fjx1,tx);
1638 fjy1 = _mm_add_ps(fjy1,ty);
1639 fjz1 = _mm_add_ps(fjz1,tz);
1641 /**************************
1642 * CALCULATE INTERACTIONS *
1643 **************************/
1645 r12 = _mm_mul_ps(rsq12,rinv12);
1647 /* EWALD ELECTROSTATICS */
1649 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1650 ewrt = _mm_mul_ps(r12,ewtabscale);
1651 ewitab = _mm_cvttps_epi32(ewrt);
1652 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1653 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1654 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1656 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1657 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1661 /* Calculate temporary vectorial force */
1662 tx = _mm_mul_ps(fscal,dx12);
1663 ty = _mm_mul_ps(fscal,dy12);
1664 tz = _mm_mul_ps(fscal,dz12);
1666 /* Update vectorial force */
1667 fix1 = _mm_add_ps(fix1,tx);
1668 fiy1 = _mm_add_ps(fiy1,ty);
1669 fiz1 = _mm_add_ps(fiz1,tz);
1671 fjx2 = _mm_add_ps(fjx2,tx);
1672 fjy2 = _mm_add_ps(fjy2,ty);
1673 fjz2 = _mm_add_ps(fjz2,tz);
1675 /**************************
1676 * CALCULATE INTERACTIONS *
1677 **************************/
1679 r13 = _mm_mul_ps(rsq13,rinv13);
1681 /* EWALD ELECTROSTATICS */
1683 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1684 ewrt = _mm_mul_ps(r13,ewtabscale);
1685 ewitab = _mm_cvttps_epi32(ewrt);
1686 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1687 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1688 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1690 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1691 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1695 /* Calculate temporary vectorial force */
1696 tx = _mm_mul_ps(fscal,dx13);
1697 ty = _mm_mul_ps(fscal,dy13);
1698 tz = _mm_mul_ps(fscal,dz13);
1700 /* Update vectorial force */
1701 fix1 = _mm_add_ps(fix1,tx);
1702 fiy1 = _mm_add_ps(fiy1,ty);
1703 fiz1 = _mm_add_ps(fiz1,tz);
1705 fjx3 = _mm_add_ps(fjx3,tx);
1706 fjy3 = _mm_add_ps(fjy3,ty);
1707 fjz3 = _mm_add_ps(fjz3,tz);
1709 /**************************
1710 * CALCULATE INTERACTIONS *
1711 **************************/
1713 r21 = _mm_mul_ps(rsq21,rinv21);
1715 /* EWALD ELECTROSTATICS */
1717 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1718 ewrt = _mm_mul_ps(r21,ewtabscale);
1719 ewitab = _mm_cvttps_epi32(ewrt);
1720 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1721 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1722 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1724 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1725 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1729 /* Calculate temporary vectorial force */
1730 tx = _mm_mul_ps(fscal,dx21);
1731 ty = _mm_mul_ps(fscal,dy21);
1732 tz = _mm_mul_ps(fscal,dz21);
1734 /* Update vectorial force */
1735 fix2 = _mm_add_ps(fix2,tx);
1736 fiy2 = _mm_add_ps(fiy2,ty);
1737 fiz2 = _mm_add_ps(fiz2,tz);
1739 fjx1 = _mm_add_ps(fjx1,tx);
1740 fjy1 = _mm_add_ps(fjy1,ty);
1741 fjz1 = _mm_add_ps(fjz1,tz);
1743 /**************************
1744 * CALCULATE INTERACTIONS *
1745 **************************/
1747 r22 = _mm_mul_ps(rsq22,rinv22);
1749 /* EWALD ELECTROSTATICS */
1751 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1752 ewrt = _mm_mul_ps(r22,ewtabscale);
1753 ewitab = _mm_cvttps_epi32(ewrt);
1754 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1755 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1756 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1758 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1759 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1763 /* Calculate temporary vectorial force */
1764 tx = _mm_mul_ps(fscal,dx22);
1765 ty = _mm_mul_ps(fscal,dy22);
1766 tz = _mm_mul_ps(fscal,dz22);
1768 /* Update vectorial force */
1769 fix2 = _mm_add_ps(fix2,tx);
1770 fiy2 = _mm_add_ps(fiy2,ty);
1771 fiz2 = _mm_add_ps(fiz2,tz);
1773 fjx2 = _mm_add_ps(fjx2,tx);
1774 fjy2 = _mm_add_ps(fjy2,ty);
1775 fjz2 = _mm_add_ps(fjz2,tz);
1777 /**************************
1778 * CALCULATE INTERACTIONS *
1779 **************************/
1781 r23 = _mm_mul_ps(rsq23,rinv23);
1783 /* EWALD ELECTROSTATICS */
1785 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1786 ewrt = _mm_mul_ps(r23,ewtabscale);
1787 ewitab = _mm_cvttps_epi32(ewrt);
1788 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1789 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1790 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1792 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1793 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1797 /* Calculate temporary vectorial force */
1798 tx = _mm_mul_ps(fscal,dx23);
1799 ty = _mm_mul_ps(fscal,dy23);
1800 tz = _mm_mul_ps(fscal,dz23);
1802 /* Update vectorial force */
1803 fix2 = _mm_add_ps(fix2,tx);
1804 fiy2 = _mm_add_ps(fiy2,ty);
1805 fiz2 = _mm_add_ps(fiz2,tz);
1807 fjx3 = _mm_add_ps(fjx3,tx);
1808 fjy3 = _mm_add_ps(fjy3,ty);
1809 fjz3 = _mm_add_ps(fjz3,tz);
1811 /**************************
1812 * CALCULATE INTERACTIONS *
1813 **************************/
1815 r31 = _mm_mul_ps(rsq31,rinv31);
1817 /* EWALD ELECTROSTATICS */
1819 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1820 ewrt = _mm_mul_ps(r31,ewtabscale);
1821 ewitab = _mm_cvttps_epi32(ewrt);
1822 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1823 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1824 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1826 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1827 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1831 /* Calculate temporary vectorial force */
1832 tx = _mm_mul_ps(fscal,dx31);
1833 ty = _mm_mul_ps(fscal,dy31);
1834 tz = _mm_mul_ps(fscal,dz31);
1836 /* Update vectorial force */
1837 fix3 = _mm_add_ps(fix3,tx);
1838 fiy3 = _mm_add_ps(fiy3,ty);
1839 fiz3 = _mm_add_ps(fiz3,tz);
1841 fjx1 = _mm_add_ps(fjx1,tx);
1842 fjy1 = _mm_add_ps(fjy1,ty);
1843 fjz1 = _mm_add_ps(fjz1,tz);
1845 /**************************
1846 * CALCULATE INTERACTIONS *
1847 **************************/
1849 r32 = _mm_mul_ps(rsq32,rinv32);
1851 /* EWALD ELECTROSTATICS */
1853 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1854 ewrt = _mm_mul_ps(r32,ewtabscale);
1855 ewitab = _mm_cvttps_epi32(ewrt);
1856 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1857 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1858 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1860 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1861 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1865 /* Calculate temporary vectorial force */
1866 tx = _mm_mul_ps(fscal,dx32);
1867 ty = _mm_mul_ps(fscal,dy32);
1868 tz = _mm_mul_ps(fscal,dz32);
1870 /* Update vectorial force */
1871 fix3 = _mm_add_ps(fix3,tx);
1872 fiy3 = _mm_add_ps(fiy3,ty);
1873 fiz3 = _mm_add_ps(fiz3,tz);
1875 fjx2 = _mm_add_ps(fjx2,tx);
1876 fjy2 = _mm_add_ps(fjy2,ty);
1877 fjz2 = _mm_add_ps(fjz2,tz);
1879 /**************************
1880 * CALCULATE INTERACTIONS *
1881 **************************/
1883 r33 = _mm_mul_ps(rsq33,rinv33);
1885 /* EWALD ELECTROSTATICS */
1887 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1888 ewrt = _mm_mul_ps(r33,ewtabscale);
1889 ewitab = _mm_cvttps_epi32(ewrt);
1890 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1891 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1892 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1894 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1895 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1899 /* Calculate temporary vectorial force */
1900 tx = _mm_mul_ps(fscal,dx33);
1901 ty = _mm_mul_ps(fscal,dy33);
1902 tz = _mm_mul_ps(fscal,dz33);
1904 /* Update vectorial force */
1905 fix3 = _mm_add_ps(fix3,tx);
1906 fiy3 = _mm_add_ps(fiy3,ty);
1907 fiz3 = _mm_add_ps(fiz3,tz);
1909 fjx3 = _mm_add_ps(fjx3,tx);
1910 fjy3 = _mm_add_ps(fjy3,ty);
1911 fjz3 = _mm_add_ps(fjz3,tz);
1913 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1914 f+j_coord_offsetC,f+j_coord_offsetD,
1915 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1916 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1918 /* Inner loop uses 354 flops */
1921 if(jidx<j_index_end)
1924 /* Get j neighbor index, and coordinate index */
1926 jnrB = jjnr[jidx+1];
1927 jnrC = jjnr[jidx+2];
1928 jnrD = jjnr[jidx+3];
1930 /* Sign of each element will be negative for non-real atoms.
1931 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1932 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1934 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1935 jnrA = (jnrA>=0) ? jnrA : 0;
1936 jnrB = (jnrB>=0) ? jnrB : 0;
1937 jnrC = (jnrC>=0) ? jnrC : 0;
1938 jnrD = (jnrD>=0) ? jnrD : 0;
1940 j_coord_offsetA = DIM*jnrA;
1941 j_coord_offsetB = DIM*jnrB;
1942 j_coord_offsetC = DIM*jnrC;
1943 j_coord_offsetD = DIM*jnrD;
1945 /* load j atom coordinates */
1946 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1947 x+j_coord_offsetC,x+j_coord_offsetD,
1948 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1949 &jy2,&jz2,&jx3,&jy3,&jz3);
1951 /* Calculate displacement vector */
1952 dx00 = _mm_sub_ps(ix0,jx0);
1953 dy00 = _mm_sub_ps(iy0,jy0);
1954 dz00 = _mm_sub_ps(iz0,jz0);
1955 dx11 = _mm_sub_ps(ix1,jx1);
1956 dy11 = _mm_sub_ps(iy1,jy1);
1957 dz11 = _mm_sub_ps(iz1,jz1);
1958 dx12 = _mm_sub_ps(ix1,jx2);
1959 dy12 = _mm_sub_ps(iy1,jy2);
1960 dz12 = _mm_sub_ps(iz1,jz2);
1961 dx13 = _mm_sub_ps(ix1,jx3);
1962 dy13 = _mm_sub_ps(iy1,jy3);
1963 dz13 = _mm_sub_ps(iz1,jz3);
1964 dx21 = _mm_sub_ps(ix2,jx1);
1965 dy21 = _mm_sub_ps(iy2,jy1);
1966 dz21 = _mm_sub_ps(iz2,jz1);
1967 dx22 = _mm_sub_ps(ix2,jx2);
1968 dy22 = _mm_sub_ps(iy2,jy2);
1969 dz22 = _mm_sub_ps(iz2,jz2);
1970 dx23 = _mm_sub_ps(ix2,jx3);
1971 dy23 = _mm_sub_ps(iy2,jy3);
1972 dz23 = _mm_sub_ps(iz2,jz3);
1973 dx31 = _mm_sub_ps(ix3,jx1);
1974 dy31 = _mm_sub_ps(iy3,jy1);
1975 dz31 = _mm_sub_ps(iz3,jz1);
1976 dx32 = _mm_sub_ps(ix3,jx2);
1977 dy32 = _mm_sub_ps(iy3,jy2);
1978 dz32 = _mm_sub_ps(iz3,jz2);
1979 dx33 = _mm_sub_ps(ix3,jx3);
1980 dy33 = _mm_sub_ps(iy3,jy3);
1981 dz33 = _mm_sub_ps(iz3,jz3);
1983 /* Calculate squared distance and things based on it */
1984 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1985 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1986 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1987 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1988 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1989 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1990 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1991 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1992 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1993 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1995 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1996 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1997 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1998 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1999 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2000 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2001 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2002 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2003 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2005 rinvsq00 = gmx_mm_inv_ps(rsq00);
2006 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2007 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2008 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2009 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2010 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2011 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2012 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2013 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2014 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2016 fjx0 = _mm_setzero_ps();
2017 fjy0 = _mm_setzero_ps();
2018 fjz0 = _mm_setzero_ps();
2019 fjx1 = _mm_setzero_ps();
2020 fjy1 = _mm_setzero_ps();
2021 fjz1 = _mm_setzero_ps();
2022 fjx2 = _mm_setzero_ps();
2023 fjy2 = _mm_setzero_ps();
2024 fjz2 = _mm_setzero_ps();
2025 fjx3 = _mm_setzero_ps();
2026 fjy3 = _mm_setzero_ps();
2027 fjz3 = _mm_setzero_ps();
2029 /**************************
2030 * CALCULATE INTERACTIONS *
2031 **************************/
2033 /* LENNARD-JONES DISPERSION/REPULSION */
2035 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2036 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2040 fscal = _mm_andnot_ps(dummy_mask,fscal);
2042 /* Calculate temporary vectorial force */
2043 tx = _mm_mul_ps(fscal,dx00);
2044 ty = _mm_mul_ps(fscal,dy00);
2045 tz = _mm_mul_ps(fscal,dz00);
2047 /* Update vectorial force */
2048 fix0 = _mm_add_ps(fix0,tx);
2049 fiy0 = _mm_add_ps(fiy0,ty);
2050 fiz0 = _mm_add_ps(fiz0,tz);
2052 fjx0 = _mm_add_ps(fjx0,tx);
2053 fjy0 = _mm_add_ps(fjy0,ty);
2054 fjz0 = _mm_add_ps(fjz0,tz);
2056 /**************************
2057 * CALCULATE INTERACTIONS *
2058 **************************/
2060 r11 = _mm_mul_ps(rsq11,rinv11);
2061 r11 = _mm_andnot_ps(dummy_mask,r11);
2063 /* EWALD ELECTROSTATICS */
2065 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2066 ewrt = _mm_mul_ps(r11,ewtabscale);
2067 ewitab = _mm_cvttps_epi32(ewrt);
2068 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2069 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2070 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2072 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2073 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2077 fscal = _mm_andnot_ps(dummy_mask,fscal);
2079 /* Calculate temporary vectorial force */
2080 tx = _mm_mul_ps(fscal,dx11);
2081 ty = _mm_mul_ps(fscal,dy11);
2082 tz = _mm_mul_ps(fscal,dz11);
2084 /* Update vectorial force */
2085 fix1 = _mm_add_ps(fix1,tx);
2086 fiy1 = _mm_add_ps(fiy1,ty);
2087 fiz1 = _mm_add_ps(fiz1,tz);
2089 fjx1 = _mm_add_ps(fjx1,tx);
2090 fjy1 = _mm_add_ps(fjy1,ty);
2091 fjz1 = _mm_add_ps(fjz1,tz);
2093 /**************************
2094 * CALCULATE INTERACTIONS *
2095 **************************/
2097 r12 = _mm_mul_ps(rsq12,rinv12);
2098 r12 = _mm_andnot_ps(dummy_mask,r12);
2100 /* EWALD ELECTROSTATICS */
2102 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2103 ewrt = _mm_mul_ps(r12,ewtabscale);
2104 ewitab = _mm_cvttps_epi32(ewrt);
2105 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2106 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2107 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2109 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2110 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2114 fscal = _mm_andnot_ps(dummy_mask,fscal);
2116 /* Calculate temporary vectorial force */
2117 tx = _mm_mul_ps(fscal,dx12);
2118 ty = _mm_mul_ps(fscal,dy12);
2119 tz = _mm_mul_ps(fscal,dz12);
2121 /* Update vectorial force */
2122 fix1 = _mm_add_ps(fix1,tx);
2123 fiy1 = _mm_add_ps(fiy1,ty);
2124 fiz1 = _mm_add_ps(fiz1,tz);
2126 fjx2 = _mm_add_ps(fjx2,tx);
2127 fjy2 = _mm_add_ps(fjy2,ty);
2128 fjz2 = _mm_add_ps(fjz2,tz);
2130 /**************************
2131 * CALCULATE INTERACTIONS *
2132 **************************/
2134 r13 = _mm_mul_ps(rsq13,rinv13);
2135 r13 = _mm_andnot_ps(dummy_mask,r13);
2137 /* EWALD ELECTROSTATICS */
2139 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2140 ewrt = _mm_mul_ps(r13,ewtabscale);
2141 ewitab = _mm_cvttps_epi32(ewrt);
2142 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2143 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2144 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2146 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2147 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2151 fscal = _mm_andnot_ps(dummy_mask,fscal);
2153 /* Calculate temporary vectorial force */
2154 tx = _mm_mul_ps(fscal,dx13);
2155 ty = _mm_mul_ps(fscal,dy13);
2156 tz = _mm_mul_ps(fscal,dz13);
2158 /* Update vectorial force */
2159 fix1 = _mm_add_ps(fix1,tx);
2160 fiy1 = _mm_add_ps(fiy1,ty);
2161 fiz1 = _mm_add_ps(fiz1,tz);
2163 fjx3 = _mm_add_ps(fjx3,tx);
2164 fjy3 = _mm_add_ps(fjy3,ty);
2165 fjz3 = _mm_add_ps(fjz3,tz);
2167 /**************************
2168 * CALCULATE INTERACTIONS *
2169 **************************/
2171 r21 = _mm_mul_ps(rsq21,rinv21);
2172 r21 = _mm_andnot_ps(dummy_mask,r21);
2174 /* EWALD ELECTROSTATICS */
2176 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2177 ewrt = _mm_mul_ps(r21,ewtabscale);
2178 ewitab = _mm_cvttps_epi32(ewrt);
2179 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2180 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2181 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2183 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2184 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2188 fscal = _mm_andnot_ps(dummy_mask,fscal);
2190 /* Calculate temporary vectorial force */
2191 tx = _mm_mul_ps(fscal,dx21);
2192 ty = _mm_mul_ps(fscal,dy21);
2193 tz = _mm_mul_ps(fscal,dz21);
2195 /* Update vectorial force */
2196 fix2 = _mm_add_ps(fix2,tx);
2197 fiy2 = _mm_add_ps(fiy2,ty);
2198 fiz2 = _mm_add_ps(fiz2,tz);
2200 fjx1 = _mm_add_ps(fjx1,tx);
2201 fjy1 = _mm_add_ps(fjy1,ty);
2202 fjz1 = _mm_add_ps(fjz1,tz);
2204 /**************************
2205 * CALCULATE INTERACTIONS *
2206 **************************/
2208 r22 = _mm_mul_ps(rsq22,rinv22);
2209 r22 = _mm_andnot_ps(dummy_mask,r22);
2211 /* EWALD ELECTROSTATICS */
2213 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2214 ewrt = _mm_mul_ps(r22,ewtabscale);
2215 ewitab = _mm_cvttps_epi32(ewrt);
2216 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2217 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2218 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2220 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2221 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2225 fscal = _mm_andnot_ps(dummy_mask,fscal);
2227 /* Calculate temporary vectorial force */
2228 tx = _mm_mul_ps(fscal,dx22);
2229 ty = _mm_mul_ps(fscal,dy22);
2230 tz = _mm_mul_ps(fscal,dz22);
2232 /* Update vectorial force */
2233 fix2 = _mm_add_ps(fix2,tx);
2234 fiy2 = _mm_add_ps(fiy2,ty);
2235 fiz2 = _mm_add_ps(fiz2,tz);
2237 fjx2 = _mm_add_ps(fjx2,tx);
2238 fjy2 = _mm_add_ps(fjy2,ty);
2239 fjz2 = _mm_add_ps(fjz2,tz);
2241 /**************************
2242 * CALCULATE INTERACTIONS *
2243 **************************/
2245 r23 = _mm_mul_ps(rsq23,rinv23);
2246 r23 = _mm_andnot_ps(dummy_mask,r23);
2248 /* EWALD ELECTROSTATICS */
2250 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2251 ewrt = _mm_mul_ps(r23,ewtabscale);
2252 ewitab = _mm_cvttps_epi32(ewrt);
2253 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2254 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2255 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2257 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2258 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2262 fscal = _mm_andnot_ps(dummy_mask,fscal);
2264 /* Calculate temporary vectorial force */
2265 tx = _mm_mul_ps(fscal,dx23);
2266 ty = _mm_mul_ps(fscal,dy23);
2267 tz = _mm_mul_ps(fscal,dz23);
2269 /* Update vectorial force */
2270 fix2 = _mm_add_ps(fix2,tx);
2271 fiy2 = _mm_add_ps(fiy2,ty);
2272 fiz2 = _mm_add_ps(fiz2,tz);
2274 fjx3 = _mm_add_ps(fjx3,tx);
2275 fjy3 = _mm_add_ps(fjy3,ty);
2276 fjz3 = _mm_add_ps(fjz3,tz);
2278 /**************************
2279 * CALCULATE INTERACTIONS *
2280 **************************/
2282 r31 = _mm_mul_ps(rsq31,rinv31);
2283 r31 = _mm_andnot_ps(dummy_mask,r31);
2285 /* EWALD ELECTROSTATICS */
2287 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2288 ewrt = _mm_mul_ps(r31,ewtabscale);
2289 ewitab = _mm_cvttps_epi32(ewrt);
2290 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2291 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2292 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2294 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2295 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2299 fscal = _mm_andnot_ps(dummy_mask,fscal);
2301 /* Calculate temporary vectorial force */
2302 tx = _mm_mul_ps(fscal,dx31);
2303 ty = _mm_mul_ps(fscal,dy31);
2304 tz = _mm_mul_ps(fscal,dz31);
2306 /* Update vectorial force */
2307 fix3 = _mm_add_ps(fix3,tx);
2308 fiy3 = _mm_add_ps(fiy3,ty);
2309 fiz3 = _mm_add_ps(fiz3,tz);
2311 fjx1 = _mm_add_ps(fjx1,tx);
2312 fjy1 = _mm_add_ps(fjy1,ty);
2313 fjz1 = _mm_add_ps(fjz1,tz);
2315 /**************************
2316 * CALCULATE INTERACTIONS *
2317 **************************/
2319 r32 = _mm_mul_ps(rsq32,rinv32);
2320 r32 = _mm_andnot_ps(dummy_mask,r32);
2322 /* EWALD ELECTROSTATICS */
2324 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2325 ewrt = _mm_mul_ps(r32,ewtabscale);
2326 ewitab = _mm_cvttps_epi32(ewrt);
2327 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2328 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2329 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2331 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2332 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2336 fscal = _mm_andnot_ps(dummy_mask,fscal);
2338 /* Calculate temporary vectorial force */
2339 tx = _mm_mul_ps(fscal,dx32);
2340 ty = _mm_mul_ps(fscal,dy32);
2341 tz = _mm_mul_ps(fscal,dz32);
2343 /* Update vectorial force */
2344 fix3 = _mm_add_ps(fix3,tx);
2345 fiy3 = _mm_add_ps(fiy3,ty);
2346 fiz3 = _mm_add_ps(fiz3,tz);
2348 fjx2 = _mm_add_ps(fjx2,tx);
2349 fjy2 = _mm_add_ps(fjy2,ty);
2350 fjz2 = _mm_add_ps(fjz2,tz);
2352 /**************************
2353 * CALCULATE INTERACTIONS *
2354 **************************/
2356 r33 = _mm_mul_ps(rsq33,rinv33);
2357 r33 = _mm_andnot_ps(dummy_mask,r33);
2359 /* EWALD ELECTROSTATICS */
2361 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2362 ewrt = _mm_mul_ps(r33,ewtabscale);
2363 ewitab = _mm_cvttps_epi32(ewrt);
2364 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2365 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2366 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2368 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2369 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2373 fscal = _mm_andnot_ps(dummy_mask,fscal);
2375 /* Calculate temporary vectorial force */
2376 tx = _mm_mul_ps(fscal,dx33);
2377 ty = _mm_mul_ps(fscal,dy33);
2378 tz = _mm_mul_ps(fscal,dz33);
2380 /* Update vectorial force */
2381 fix3 = _mm_add_ps(fix3,tx);
2382 fiy3 = _mm_add_ps(fiy3,ty);
2383 fiz3 = _mm_add_ps(fiz3,tz);
2385 fjx3 = _mm_add_ps(fjx3,tx);
2386 fjy3 = _mm_add_ps(fjy3,ty);
2387 fjz3 = _mm_add_ps(fjz3,tz);
2389 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2390 f+j_coord_offsetC,f+j_coord_offsetD,
2391 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2392 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2394 /* Inner loop uses 363 flops */
2397 /* End of innermost loop */
2399 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2400 f+i_coord_offset,fshift+i_shift_offset);
2402 /* Increment number of inner iterations */
2403 inneriter += j_index_end - j_index_start;
2405 /* Outer loop uses 36 flops */
2408 /* Increment number of outer iterations */
2411 /* Update outer/inner flops */
2413 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*363);