2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwNone_GeomW4W4_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
69 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
71 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
72 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
73 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
74 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
75 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
76 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
77 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
78 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
79 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
80 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
81 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
82 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
83 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
84 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
85 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
86 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
87 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
92 __m128 dummy_mask,cutoff_mask;
93 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
94 __m128 one = _mm_set1_ps(1.0);
95 __m128 two = _mm_set1_ps(2.0);
101 jindex = nlist->jindex;
103 shiftidx = nlist->shift;
105 shiftvec = fr->shift_vec[0];
106 fshift = fr->fshift[0];
107 facel = _mm_set1_ps(fr->epsfac);
108 charge = mdatoms->chargeA;
110 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
111 ewtab = fr->ic->tabq_coul_FDV0;
112 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
113 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
115 /* Setup water-specific parameters */
116 inr = nlist->iinr[0];
117 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
118 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
119 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
121 jq1 = _mm_set1_ps(charge[inr+1]);
122 jq2 = _mm_set1_ps(charge[inr+2]);
123 jq3 = _mm_set1_ps(charge[inr+3]);
124 qq11 = _mm_mul_ps(iq1,jq1);
125 qq12 = _mm_mul_ps(iq1,jq2);
126 qq13 = _mm_mul_ps(iq1,jq3);
127 qq21 = _mm_mul_ps(iq2,jq1);
128 qq22 = _mm_mul_ps(iq2,jq2);
129 qq23 = _mm_mul_ps(iq2,jq3);
130 qq31 = _mm_mul_ps(iq3,jq1);
131 qq32 = _mm_mul_ps(iq3,jq2);
132 qq33 = _mm_mul_ps(iq3,jq3);
134 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
135 rcutoff_scalar = fr->rcoulomb;
136 rcutoff = _mm_set1_ps(rcutoff_scalar);
137 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
139 /* Avoid stupid compiler warnings */
140 jnrA = jnrB = jnrC = jnrD = 0;
149 /* Start outer loop over neighborlists */
150 for(iidx=0; iidx<nri; iidx++)
152 /* Load shift vector for this list */
153 i_shift_offset = DIM*shiftidx[iidx];
154 shX = shiftvec[i_shift_offset+XX];
155 shY = shiftvec[i_shift_offset+YY];
156 shZ = shiftvec[i_shift_offset+ZZ];
158 /* Load limits for loop over neighbors */
159 j_index_start = jindex[iidx];
160 j_index_end = jindex[iidx+1];
162 /* Get outer coordinate index */
164 i_coord_offset = DIM*inr;
166 /* Load i particle coords and add shift vector */
167 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
168 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
169 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
170 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
171 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
172 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
173 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
174 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
175 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
177 fix1 = _mm_setzero_ps();
178 fiy1 = _mm_setzero_ps();
179 fiz1 = _mm_setzero_ps();
180 fix2 = _mm_setzero_ps();
181 fiy2 = _mm_setzero_ps();
182 fiz2 = _mm_setzero_ps();
183 fix3 = _mm_setzero_ps();
184 fiy3 = _mm_setzero_ps();
185 fiz3 = _mm_setzero_ps();
187 /* Reset potential sums */
188 velecsum = _mm_setzero_ps();
190 /* Start inner kernel loop */
191 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
194 /* Get j neighbor index, and coordinate index */
200 j_coord_offsetA = DIM*jnrA;
201 j_coord_offsetB = DIM*jnrB;
202 j_coord_offsetC = DIM*jnrC;
203 j_coord_offsetD = DIM*jnrD;
205 /* load j atom coordinates */
206 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
207 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
208 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
210 /* Calculate displacement vector */
211 dx11 = _mm_sub_ps(ix1,jx1);
212 dy11 = _mm_sub_ps(iy1,jy1);
213 dz11 = _mm_sub_ps(iz1,jz1);
214 dx12 = _mm_sub_ps(ix1,jx2);
215 dy12 = _mm_sub_ps(iy1,jy2);
216 dz12 = _mm_sub_ps(iz1,jz2);
217 dx13 = _mm_sub_ps(ix1,jx3);
218 dy13 = _mm_sub_ps(iy1,jy3);
219 dz13 = _mm_sub_ps(iz1,jz3);
220 dx21 = _mm_sub_ps(ix2,jx1);
221 dy21 = _mm_sub_ps(iy2,jy1);
222 dz21 = _mm_sub_ps(iz2,jz1);
223 dx22 = _mm_sub_ps(ix2,jx2);
224 dy22 = _mm_sub_ps(iy2,jy2);
225 dz22 = _mm_sub_ps(iz2,jz2);
226 dx23 = _mm_sub_ps(ix2,jx3);
227 dy23 = _mm_sub_ps(iy2,jy3);
228 dz23 = _mm_sub_ps(iz2,jz3);
229 dx31 = _mm_sub_ps(ix3,jx1);
230 dy31 = _mm_sub_ps(iy3,jy1);
231 dz31 = _mm_sub_ps(iz3,jz1);
232 dx32 = _mm_sub_ps(ix3,jx2);
233 dy32 = _mm_sub_ps(iy3,jy2);
234 dz32 = _mm_sub_ps(iz3,jz2);
235 dx33 = _mm_sub_ps(ix3,jx3);
236 dy33 = _mm_sub_ps(iy3,jy3);
237 dz33 = _mm_sub_ps(iz3,jz3);
239 /* Calculate squared distance and things based on it */
240 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
241 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
242 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
243 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
244 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
245 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
246 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
247 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
248 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
250 rinv11 = gmx_mm_invsqrt_ps(rsq11);
251 rinv12 = gmx_mm_invsqrt_ps(rsq12);
252 rinv13 = gmx_mm_invsqrt_ps(rsq13);
253 rinv21 = gmx_mm_invsqrt_ps(rsq21);
254 rinv22 = gmx_mm_invsqrt_ps(rsq22);
255 rinv23 = gmx_mm_invsqrt_ps(rsq23);
256 rinv31 = gmx_mm_invsqrt_ps(rsq31);
257 rinv32 = gmx_mm_invsqrt_ps(rsq32);
258 rinv33 = gmx_mm_invsqrt_ps(rsq33);
260 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
261 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
262 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
263 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
264 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
265 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
266 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
267 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
268 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
270 fjx1 = _mm_setzero_ps();
271 fjy1 = _mm_setzero_ps();
272 fjz1 = _mm_setzero_ps();
273 fjx2 = _mm_setzero_ps();
274 fjy2 = _mm_setzero_ps();
275 fjz2 = _mm_setzero_ps();
276 fjx3 = _mm_setzero_ps();
277 fjy3 = _mm_setzero_ps();
278 fjz3 = _mm_setzero_ps();
280 /**************************
281 * CALCULATE INTERACTIONS *
282 **************************/
284 if (gmx_mm_any_lt(rsq11,rcutoff2))
287 r11 = _mm_mul_ps(rsq11,rinv11);
289 /* EWALD ELECTROSTATICS */
291 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
292 ewrt = _mm_mul_ps(r11,ewtabscale);
293 ewitab = _mm_cvttps_epi32(ewrt);
294 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
295 ewitab = _mm_slli_epi32(ewitab,2);
296 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
297 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
298 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
299 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
300 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
301 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
302 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
303 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
304 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
306 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
308 /* Update potential sum for this i atom from the interaction with this j atom. */
309 velec = _mm_and_ps(velec,cutoff_mask);
310 velecsum = _mm_add_ps(velecsum,velec);
314 fscal = _mm_and_ps(fscal,cutoff_mask);
316 /* Calculate temporary vectorial force */
317 tx = _mm_mul_ps(fscal,dx11);
318 ty = _mm_mul_ps(fscal,dy11);
319 tz = _mm_mul_ps(fscal,dz11);
321 /* Update vectorial force */
322 fix1 = _mm_add_ps(fix1,tx);
323 fiy1 = _mm_add_ps(fiy1,ty);
324 fiz1 = _mm_add_ps(fiz1,tz);
326 fjx1 = _mm_add_ps(fjx1,tx);
327 fjy1 = _mm_add_ps(fjy1,ty);
328 fjz1 = _mm_add_ps(fjz1,tz);
332 /**************************
333 * CALCULATE INTERACTIONS *
334 **************************/
336 if (gmx_mm_any_lt(rsq12,rcutoff2))
339 r12 = _mm_mul_ps(rsq12,rinv12);
341 /* EWALD ELECTROSTATICS */
343 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
344 ewrt = _mm_mul_ps(r12,ewtabscale);
345 ewitab = _mm_cvttps_epi32(ewrt);
346 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
347 ewitab = _mm_slli_epi32(ewitab,2);
348 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
349 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
350 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
351 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
352 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
353 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
354 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
355 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
356 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
358 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
360 /* Update potential sum for this i atom from the interaction with this j atom. */
361 velec = _mm_and_ps(velec,cutoff_mask);
362 velecsum = _mm_add_ps(velecsum,velec);
366 fscal = _mm_and_ps(fscal,cutoff_mask);
368 /* Calculate temporary vectorial force */
369 tx = _mm_mul_ps(fscal,dx12);
370 ty = _mm_mul_ps(fscal,dy12);
371 tz = _mm_mul_ps(fscal,dz12);
373 /* Update vectorial force */
374 fix1 = _mm_add_ps(fix1,tx);
375 fiy1 = _mm_add_ps(fiy1,ty);
376 fiz1 = _mm_add_ps(fiz1,tz);
378 fjx2 = _mm_add_ps(fjx2,tx);
379 fjy2 = _mm_add_ps(fjy2,ty);
380 fjz2 = _mm_add_ps(fjz2,tz);
384 /**************************
385 * CALCULATE INTERACTIONS *
386 **************************/
388 if (gmx_mm_any_lt(rsq13,rcutoff2))
391 r13 = _mm_mul_ps(rsq13,rinv13);
393 /* EWALD ELECTROSTATICS */
395 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
396 ewrt = _mm_mul_ps(r13,ewtabscale);
397 ewitab = _mm_cvttps_epi32(ewrt);
398 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
399 ewitab = _mm_slli_epi32(ewitab,2);
400 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
401 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
402 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
403 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
404 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
405 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
406 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
407 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
408 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
410 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
412 /* Update potential sum for this i atom from the interaction with this j atom. */
413 velec = _mm_and_ps(velec,cutoff_mask);
414 velecsum = _mm_add_ps(velecsum,velec);
418 fscal = _mm_and_ps(fscal,cutoff_mask);
420 /* Calculate temporary vectorial force */
421 tx = _mm_mul_ps(fscal,dx13);
422 ty = _mm_mul_ps(fscal,dy13);
423 tz = _mm_mul_ps(fscal,dz13);
425 /* Update vectorial force */
426 fix1 = _mm_add_ps(fix1,tx);
427 fiy1 = _mm_add_ps(fiy1,ty);
428 fiz1 = _mm_add_ps(fiz1,tz);
430 fjx3 = _mm_add_ps(fjx3,tx);
431 fjy3 = _mm_add_ps(fjy3,ty);
432 fjz3 = _mm_add_ps(fjz3,tz);
436 /**************************
437 * CALCULATE INTERACTIONS *
438 **************************/
440 if (gmx_mm_any_lt(rsq21,rcutoff2))
443 r21 = _mm_mul_ps(rsq21,rinv21);
445 /* EWALD ELECTROSTATICS */
447 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
448 ewrt = _mm_mul_ps(r21,ewtabscale);
449 ewitab = _mm_cvttps_epi32(ewrt);
450 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
451 ewitab = _mm_slli_epi32(ewitab,2);
452 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
453 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
454 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
455 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
456 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
457 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
458 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
459 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
460 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
462 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
464 /* Update potential sum for this i atom from the interaction with this j atom. */
465 velec = _mm_and_ps(velec,cutoff_mask);
466 velecsum = _mm_add_ps(velecsum,velec);
470 fscal = _mm_and_ps(fscal,cutoff_mask);
472 /* Calculate temporary vectorial force */
473 tx = _mm_mul_ps(fscal,dx21);
474 ty = _mm_mul_ps(fscal,dy21);
475 tz = _mm_mul_ps(fscal,dz21);
477 /* Update vectorial force */
478 fix2 = _mm_add_ps(fix2,tx);
479 fiy2 = _mm_add_ps(fiy2,ty);
480 fiz2 = _mm_add_ps(fiz2,tz);
482 fjx1 = _mm_add_ps(fjx1,tx);
483 fjy1 = _mm_add_ps(fjy1,ty);
484 fjz1 = _mm_add_ps(fjz1,tz);
488 /**************************
489 * CALCULATE INTERACTIONS *
490 **************************/
492 if (gmx_mm_any_lt(rsq22,rcutoff2))
495 r22 = _mm_mul_ps(rsq22,rinv22);
497 /* EWALD ELECTROSTATICS */
499 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
500 ewrt = _mm_mul_ps(r22,ewtabscale);
501 ewitab = _mm_cvttps_epi32(ewrt);
502 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
503 ewitab = _mm_slli_epi32(ewitab,2);
504 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
505 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
506 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
507 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
508 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
509 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
510 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
511 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
512 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
514 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
516 /* Update potential sum for this i atom from the interaction with this j atom. */
517 velec = _mm_and_ps(velec,cutoff_mask);
518 velecsum = _mm_add_ps(velecsum,velec);
522 fscal = _mm_and_ps(fscal,cutoff_mask);
524 /* Calculate temporary vectorial force */
525 tx = _mm_mul_ps(fscal,dx22);
526 ty = _mm_mul_ps(fscal,dy22);
527 tz = _mm_mul_ps(fscal,dz22);
529 /* Update vectorial force */
530 fix2 = _mm_add_ps(fix2,tx);
531 fiy2 = _mm_add_ps(fiy2,ty);
532 fiz2 = _mm_add_ps(fiz2,tz);
534 fjx2 = _mm_add_ps(fjx2,tx);
535 fjy2 = _mm_add_ps(fjy2,ty);
536 fjz2 = _mm_add_ps(fjz2,tz);
540 /**************************
541 * CALCULATE INTERACTIONS *
542 **************************/
544 if (gmx_mm_any_lt(rsq23,rcutoff2))
547 r23 = _mm_mul_ps(rsq23,rinv23);
549 /* EWALD ELECTROSTATICS */
551 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
552 ewrt = _mm_mul_ps(r23,ewtabscale);
553 ewitab = _mm_cvttps_epi32(ewrt);
554 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
555 ewitab = _mm_slli_epi32(ewitab,2);
556 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
557 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
558 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
559 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
560 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
561 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
562 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
563 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
564 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
566 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velec = _mm_and_ps(velec,cutoff_mask);
570 velecsum = _mm_add_ps(velecsum,velec);
574 fscal = _mm_and_ps(fscal,cutoff_mask);
576 /* Calculate temporary vectorial force */
577 tx = _mm_mul_ps(fscal,dx23);
578 ty = _mm_mul_ps(fscal,dy23);
579 tz = _mm_mul_ps(fscal,dz23);
581 /* Update vectorial force */
582 fix2 = _mm_add_ps(fix2,tx);
583 fiy2 = _mm_add_ps(fiy2,ty);
584 fiz2 = _mm_add_ps(fiz2,tz);
586 fjx3 = _mm_add_ps(fjx3,tx);
587 fjy3 = _mm_add_ps(fjy3,ty);
588 fjz3 = _mm_add_ps(fjz3,tz);
592 /**************************
593 * CALCULATE INTERACTIONS *
594 **************************/
596 if (gmx_mm_any_lt(rsq31,rcutoff2))
599 r31 = _mm_mul_ps(rsq31,rinv31);
601 /* EWALD ELECTROSTATICS */
603 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
604 ewrt = _mm_mul_ps(r31,ewtabscale);
605 ewitab = _mm_cvttps_epi32(ewrt);
606 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
607 ewitab = _mm_slli_epi32(ewitab,2);
608 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
609 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
610 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
611 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
612 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
613 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
614 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
615 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
616 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
618 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
620 /* Update potential sum for this i atom from the interaction with this j atom. */
621 velec = _mm_and_ps(velec,cutoff_mask);
622 velecsum = _mm_add_ps(velecsum,velec);
626 fscal = _mm_and_ps(fscal,cutoff_mask);
628 /* Calculate temporary vectorial force */
629 tx = _mm_mul_ps(fscal,dx31);
630 ty = _mm_mul_ps(fscal,dy31);
631 tz = _mm_mul_ps(fscal,dz31);
633 /* Update vectorial force */
634 fix3 = _mm_add_ps(fix3,tx);
635 fiy3 = _mm_add_ps(fiy3,ty);
636 fiz3 = _mm_add_ps(fiz3,tz);
638 fjx1 = _mm_add_ps(fjx1,tx);
639 fjy1 = _mm_add_ps(fjy1,ty);
640 fjz1 = _mm_add_ps(fjz1,tz);
644 /**************************
645 * CALCULATE INTERACTIONS *
646 **************************/
648 if (gmx_mm_any_lt(rsq32,rcutoff2))
651 r32 = _mm_mul_ps(rsq32,rinv32);
653 /* EWALD ELECTROSTATICS */
655 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
656 ewrt = _mm_mul_ps(r32,ewtabscale);
657 ewitab = _mm_cvttps_epi32(ewrt);
658 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
659 ewitab = _mm_slli_epi32(ewitab,2);
660 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
661 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
662 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
663 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
664 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
665 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
666 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
667 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
668 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
670 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
672 /* Update potential sum for this i atom from the interaction with this j atom. */
673 velec = _mm_and_ps(velec,cutoff_mask);
674 velecsum = _mm_add_ps(velecsum,velec);
678 fscal = _mm_and_ps(fscal,cutoff_mask);
680 /* Calculate temporary vectorial force */
681 tx = _mm_mul_ps(fscal,dx32);
682 ty = _mm_mul_ps(fscal,dy32);
683 tz = _mm_mul_ps(fscal,dz32);
685 /* Update vectorial force */
686 fix3 = _mm_add_ps(fix3,tx);
687 fiy3 = _mm_add_ps(fiy3,ty);
688 fiz3 = _mm_add_ps(fiz3,tz);
690 fjx2 = _mm_add_ps(fjx2,tx);
691 fjy2 = _mm_add_ps(fjy2,ty);
692 fjz2 = _mm_add_ps(fjz2,tz);
696 /**************************
697 * CALCULATE INTERACTIONS *
698 **************************/
700 if (gmx_mm_any_lt(rsq33,rcutoff2))
703 r33 = _mm_mul_ps(rsq33,rinv33);
705 /* EWALD ELECTROSTATICS */
707 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
708 ewrt = _mm_mul_ps(r33,ewtabscale);
709 ewitab = _mm_cvttps_epi32(ewrt);
710 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
711 ewitab = _mm_slli_epi32(ewitab,2);
712 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
713 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
714 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
715 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
716 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
717 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
718 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
719 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
720 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
722 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
724 /* Update potential sum for this i atom from the interaction with this j atom. */
725 velec = _mm_and_ps(velec,cutoff_mask);
726 velecsum = _mm_add_ps(velecsum,velec);
730 fscal = _mm_and_ps(fscal,cutoff_mask);
732 /* Calculate temporary vectorial force */
733 tx = _mm_mul_ps(fscal,dx33);
734 ty = _mm_mul_ps(fscal,dy33);
735 tz = _mm_mul_ps(fscal,dz33);
737 /* Update vectorial force */
738 fix3 = _mm_add_ps(fix3,tx);
739 fiy3 = _mm_add_ps(fiy3,ty);
740 fiz3 = _mm_add_ps(fiz3,tz);
742 fjx3 = _mm_add_ps(fjx3,tx);
743 fjy3 = _mm_add_ps(fjy3,ty);
744 fjz3 = _mm_add_ps(fjz3,tz);
748 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
749 f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
750 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
752 /* Inner loop uses 414 flops */
758 /* Get j neighbor index, and coordinate index */
764 /* Sign of each element will be negative for non-real atoms.
765 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
766 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
768 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
769 jnrA = (jnrA>=0) ? jnrA : 0;
770 jnrB = (jnrB>=0) ? jnrB : 0;
771 jnrC = (jnrC>=0) ? jnrC : 0;
772 jnrD = (jnrD>=0) ? jnrD : 0;
774 j_coord_offsetA = DIM*jnrA;
775 j_coord_offsetB = DIM*jnrB;
776 j_coord_offsetC = DIM*jnrC;
777 j_coord_offsetD = DIM*jnrD;
779 /* load j atom coordinates */
780 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
781 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
782 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
784 /* Calculate displacement vector */
785 dx11 = _mm_sub_ps(ix1,jx1);
786 dy11 = _mm_sub_ps(iy1,jy1);
787 dz11 = _mm_sub_ps(iz1,jz1);
788 dx12 = _mm_sub_ps(ix1,jx2);
789 dy12 = _mm_sub_ps(iy1,jy2);
790 dz12 = _mm_sub_ps(iz1,jz2);
791 dx13 = _mm_sub_ps(ix1,jx3);
792 dy13 = _mm_sub_ps(iy1,jy3);
793 dz13 = _mm_sub_ps(iz1,jz3);
794 dx21 = _mm_sub_ps(ix2,jx1);
795 dy21 = _mm_sub_ps(iy2,jy1);
796 dz21 = _mm_sub_ps(iz2,jz1);
797 dx22 = _mm_sub_ps(ix2,jx2);
798 dy22 = _mm_sub_ps(iy2,jy2);
799 dz22 = _mm_sub_ps(iz2,jz2);
800 dx23 = _mm_sub_ps(ix2,jx3);
801 dy23 = _mm_sub_ps(iy2,jy3);
802 dz23 = _mm_sub_ps(iz2,jz3);
803 dx31 = _mm_sub_ps(ix3,jx1);
804 dy31 = _mm_sub_ps(iy3,jy1);
805 dz31 = _mm_sub_ps(iz3,jz1);
806 dx32 = _mm_sub_ps(ix3,jx2);
807 dy32 = _mm_sub_ps(iy3,jy2);
808 dz32 = _mm_sub_ps(iz3,jz2);
809 dx33 = _mm_sub_ps(ix3,jx3);
810 dy33 = _mm_sub_ps(iy3,jy3);
811 dz33 = _mm_sub_ps(iz3,jz3);
813 /* Calculate squared distance and things based on it */
814 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
815 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
816 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
817 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
818 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
819 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
820 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
821 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
822 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
824 rinv11 = gmx_mm_invsqrt_ps(rsq11);
825 rinv12 = gmx_mm_invsqrt_ps(rsq12);
826 rinv13 = gmx_mm_invsqrt_ps(rsq13);
827 rinv21 = gmx_mm_invsqrt_ps(rsq21);
828 rinv22 = gmx_mm_invsqrt_ps(rsq22);
829 rinv23 = gmx_mm_invsqrt_ps(rsq23);
830 rinv31 = gmx_mm_invsqrt_ps(rsq31);
831 rinv32 = gmx_mm_invsqrt_ps(rsq32);
832 rinv33 = gmx_mm_invsqrt_ps(rsq33);
834 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
835 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
836 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
837 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
838 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
839 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
840 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
841 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
842 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
844 fjx1 = _mm_setzero_ps();
845 fjy1 = _mm_setzero_ps();
846 fjz1 = _mm_setzero_ps();
847 fjx2 = _mm_setzero_ps();
848 fjy2 = _mm_setzero_ps();
849 fjz2 = _mm_setzero_ps();
850 fjx3 = _mm_setzero_ps();
851 fjy3 = _mm_setzero_ps();
852 fjz3 = _mm_setzero_ps();
854 /**************************
855 * CALCULATE INTERACTIONS *
856 **************************/
858 if (gmx_mm_any_lt(rsq11,rcutoff2))
861 r11 = _mm_mul_ps(rsq11,rinv11);
862 r11 = _mm_andnot_ps(dummy_mask,r11);
864 /* EWALD ELECTROSTATICS */
866 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
867 ewrt = _mm_mul_ps(r11,ewtabscale);
868 ewitab = _mm_cvttps_epi32(ewrt);
869 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
870 ewitab = _mm_slli_epi32(ewitab,2);
871 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
872 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
873 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
874 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
875 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
876 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
877 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
878 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
879 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
881 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
883 /* Update potential sum for this i atom from the interaction with this j atom. */
884 velec = _mm_and_ps(velec,cutoff_mask);
885 velec = _mm_andnot_ps(dummy_mask,velec);
886 velecsum = _mm_add_ps(velecsum,velec);
890 fscal = _mm_and_ps(fscal,cutoff_mask);
892 fscal = _mm_andnot_ps(dummy_mask,fscal);
894 /* Calculate temporary vectorial force */
895 tx = _mm_mul_ps(fscal,dx11);
896 ty = _mm_mul_ps(fscal,dy11);
897 tz = _mm_mul_ps(fscal,dz11);
899 /* Update vectorial force */
900 fix1 = _mm_add_ps(fix1,tx);
901 fiy1 = _mm_add_ps(fiy1,ty);
902 fiz1 = _mm_add_ps(fiz1,tz);
904 fjx1 = _mm_add_ps(fjx1,tx);
905 fjy1 = _mm_add_ps(fjy1,ty);
906 fjz1 = _mm_add_ps(fjz1,tz);
910 /**************************
911 * CALCULATE INTERACTIONS *
912 **************************/
914 if (gmx_mm_any_lt(rsq12,rcutoff2))
917 r12 = _mm_mul_ps(rsq12,rinv12);
918 r12 = _mm_andnot_ps(dummy_mask,r12);
920 /* EWALD ELECTROSTATICS */
922 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
923 ewrt = _mm_mul_ps(r12,ewtabscale);
924 ewitab = _mm_cvttps_epi32(ewrt);
925 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
926 ewitab = _mm_slli_epi32(ewitab,2);
927 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
928 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
929 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
930 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
931 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
932 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
933 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
934 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
935 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
937 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
939 /* Update potential sum for this i atom from the interaction with this j atom. */
940 velec = _mm_and_ps(velec,cutoff_mask);
941 velec = _mm_andnot_ps(dummy_mask,velec);
942 velecsum = _mm_add_ps(velecsum,velec);
946 fscal = _mm_and_ps(fscal,cutoff_mask);
948 fscal = _mm_andnot_ps(dummy_mask,fscal);
950 /* Calculate temporary vectorial force */
951 tx = _mm_mul_ps(fscal,dx12);
952 ty = _mm_mul_ps(fscal,dy12);
953 tz = _mm_mul_ps(fscal,dz12);
955 /* Update vectorial force */
956 fix1 = _mm_add_ps(fix1,tx);
957 fiy1 = _mm_add_ps(fiy1,ty);
958 fiz1 = _mm_add_ps(fiz1,tz);
960 fjx2 = _mm_add_ps(fjx2,tx);
961 fjy2 = _mm_add_ps(fjy2,ty);
962 fjz2 = _mm_add_ps(fjz2,tz);
966 /**************************
967 * CALCULATE INTERACTIONS *
968 **************************/
970 if (gmx_mm_any_lt(rsq13,rcutoff2))
973 r13 = _mm_mul_ps(rsq13,rinv13);
974 r13 = _mm_andnot_ps(dummy_mask,r13);
976 /* EWALD ELECTROSTATICS */
978 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
979 ewrt = _mm_mul_ps(r13,ewtabscale);
980 ewitab = _mm_cvttps_epi32(ewrt);
981 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
982 ewitab = _mm_slli_epi32(ewitab,2);
983 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
984 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
985 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
986 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
987 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
988 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
989 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
990 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
991 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
993 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
995 /* Update potential sum for this i atom from the interaction with this j atom. */
996 velec = _mm_and_ps(velec,cutoff_mask);
997 velec = _mm_andnot_ps(dummy_mask,velec);
998 velecsum = _mm_add_ps(velecsum,velec);
1002 fscal = _mm_and_ps(fscal,cutoff_mask);
1004 fscal = _mm_andnot_ps(dummy_mask,fscal);
1006 /* Calculate temporary vectorial force */
1007 tx = _mm_mul_ps(fscal,dx13);
1008 ty = _mm_mul_ps(fscal,dy13);
1009 tz = _mm_mul_ps(fscal,dz13);
1011 /* Update vectorial force */
1012 fix1 = _mm_add_ps(fix1,tx);
1013 fiy1 = _mm_add_ps(fiy1,ty);
1014 fiz1 = _mm_add_ps(fiz1,tz);
1016 fjx3 = _mm_add_ps(fjx3,tx);
1017 fjy3 = _mm_add_ps(fjy3,ty);
1018 fjz3 = _mm_add_ps(fjz3,tz);
1022 /**************************
1023 * CALCULATE INTERACTIONS *
1024 **************************/
1026 if (gmx_mm_any_lt(rsq21,rcutoff2))
1029 r21 = _mm_mul_ps(rsq21,rinv21);
1030 r21 = _mm_andnot_ps(dummy_mask,r21);
1032 /* EWALD ELECTROSTATICS */
1034 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1035 ewrt = _mm_mul_ps(r21,ewtabscale);
1036 ewitab = _mm_cvttps_epi32(ewrt);
1037 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1038 ewitab = _mm_slli_epi32(ewitab,2);
1039 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1040 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1041 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1042 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1043 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1044 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1045 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1046 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1047 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1049 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1051 /* Update potential sum for this i atom from the interaction with this j atom. */
1052 velec = _mm_and_ps(velec,cutoff_mask);
1053 velec = _mm_andnot_ps(dummy_mask,velec);
1054 velecsum = _mm_add_ps(velecsum,velec);
1058 fscal = _mm_and_ps(fscal,cutoff_mask);
1060 fscal = _mm_andnot_ps(dummy_mask,fscal);
1062 /* Calculate temporary vectorial force */
1063 tx = _mm_mul_ps(fscal,dx21);
1064 ty = _mm_mul_ps(fscal,dy21);
1065 tz = _mm_mul_ps(fscal,dz21);
1067 /* Update vectorial force */
1068 fix2 = _mm_add_ps(fix2,tx);
1069 fiy2 = _mm_add_ps(fiy2,ty);
1070 fiz2 = _mm_add_ps(fiz2,tz);
1072 fjx1 = _mm_add_ps(fjx1,tx);
1073 fjy1 = _mm_add_ps(fjy1,ty);
1074 fjz1 = _mm_add_ps(fjz1,tz);
1078 /**************************
1079 * CALCULATE INTERACTIONS *
1080 **************************/
1082 if (gmx_mm_any_lt(rsq22,rcutoff2))
1085 r22 = _mm_mul_ps(rsq22,rinv22);
1086 r22 = _mm_andnot_ps(dummy_mask,r22);
1088 /* EWALD ELECTROSTATICS */
1090 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1091 ewrt = _mm_mul_ps(r22,ewtabscale);
1092 ewitab = _mm_cvttps_epi32(ewrt);
1093 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1094 ewitab = _mm_slli_epi32(ewitab,2);
1095 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1096 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1097 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1098 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1099 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1100 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1101 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1102 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1103 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1105 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1107 /* Update potential sum for this i atom from the interaction with this j atom. */
1108 velec = _mm_and_ps(velec,cutoff_mask);
1109 velec = _mm_andnot_ps(dummy_mask,velec);
1110 velecsum = _mm_add_ps(velecsum,velec);
1114 fscal = _mm_and_ps(fscal,cutoff_mask);
1116 fscal = _mm_andnot_ps(dummy_mask,fscal);
1118 /* Calculate temporary vectorial force */
1119 tx = _mm_mul_ps(fscal,dx22);
1120 ty = _mm_mul_ps(fscal,dy22);
1121 tz = _mm_mul_ps(fscal,dz22);
1123 /* Update vectorial force */
1124 fix2 = _mm_add_ps(fix2,tx);
1125 fiy2 = _mm_add_ps(fiy2,ty);
1126 fiz2 = _mm_add_ps(fiz2,tz);
1128 fjx2 = _mm_add_ps(fjx2,tx);
1129 fjy2 = _mm_add_ps(fjy2,ty);
1130 fjz2 = _mm_add_ps(fjz2,tz);
1134 /**************************
1135 * CALCULATE INTERACTIONS *
1136 **************************/
1138 if (gmx_mm_any_lt(rsq23,rcutoff2))
1141 r23 = _mm_mul_ps(rsq23,rinv23);
1142 r23 = _mm_andnot_ps(dummy_mask,r23);
1144 /* EWALD ELECTROSTATICS */
1146 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1147 ewrt = _mm_mul_ps(r23,ewtabscale);
1148 ewitab = _mm_cvttps_epi32(ewrt);
1149 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1150 ewitab = _mm_slli_epi32(ewitab,2);
1151 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1152 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1153 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1154 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1155 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1156 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1157 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1158 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
1159 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1161 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1163 /* Update potential sum for this i atom from the interaction with this j atom. */
1164 velec = _mm_and_ps(velec,cutoff_mask);
1165 velec = _mm_andnot_ps(dummy_mask,velec);
1166 velecsum = _mm_add_ps(velecsum,velec);
1170 fscal = _mm_and_ps(fscal,cutoff_mask);
1172 fscal = _mm_andnot_ps(dummy_mask,fscal);
1174 /* Calculate temporary vectorial force */
1175 tx = _mm_mul_ps(fscal,dx23);
1176 ty = _mm_mul_ps(fscal,dy23);
1177 tz = _mm_mul_ps(fscal,dz23);
1179 /* Update vectorial force */
1180 fix2 = _mm_add_ps(fix2,tx);
1181 fiy2 = _mm_add_ps(fiy2,ty);
1182 fiz2 = _mm_add_ps(fiz2,tz);
1184 fjx3 = _mm_add_ps(fjx3,tx);
1185 fjy3 = _mm_add_ps(fjy3,ty);
1186 fjz3 = _mm_add_ps(fjz3,tz);
1190 /**************************
1191 * CALCULATE INTERACTIONS *
1192 **************************/
1194 if (gmx_mm_any_lt(rsq31,rcutoff2))
1197 r31 = _mm_mul_ps(rsq31,rinv31);
1198 r31 = _mm_andnot_ps(dummy_mask,r31);
1200 /* EWALD ELECTROSTATICS */
1202 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1203 ewrt = _mm_mul_ps(r31,ewtabscale);
1204 ewitab = _mm_cvttps_epi32(ewrt);
1205 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1206 ewitab = _mm_slli_epi32(ewitab,2);
1207 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1208 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1209 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1210 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1211 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1212 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1213 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1214 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
1215 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1217 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1219 /* Update potential sum for this i atom from the interaction with this j atom. */
1220 velec = _mm_and_ps(velec,cutoff_mask);
1221 velec = _mm_andnot_ps(dummy_mask,velec);
1222 velecsum = _mm_add_ps(velecsum,velec);
1226 fscal = _mm_and_ps(fscal,cutoff_mask);
1228 fscal = _mm_andnot_ps(dummy_mask,fscal);
1230 /* Calculate temporary vectorial force */
1231 tx = _mm_mul_ps(fscal,dx31);
1232 ty = _mm_mul_ps(fscal,dy31);
1233 tz = _mm_mul_ps(fscal,dz31);
1235 /* Update vectorial force */
1236 fix3 = _mm_add_ps(fix3,tx);
1237 fiy3 = _mm_add_ps(fiy3,ty);
1238 fiz3 = _mm_add_ps(fiz3,tz);
1240 fjx1 = _mm_add_ps(fjx1,tx);
1241 fjy1 = _mm_add_ps(fjy1,ty);
1242 fjz1 = _mm_add_ps(fjz1,tz);
1246 /**************************
1247 * CALCULATE INTERACTIONS *
1248 **************************/
1250 if (gmx_mm_any_lt(rsq32,rcutoff2))
1253 r32 = _mm_mul_ps(rsq32,rinv32);
1254 r32 = _mm_andnot_ps(dummy_mask,r32);
1256 /* EWALD ELECTROSTATICS */
1258 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1259 ewrt = _mm_mul_ps(r32,ewtabscale);
1260 ewitab = _mm_cvttps_epi32(ewrt);
1261 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1262 ewitab = _mm_slli_epi32(ewitab,2);
1263 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1264 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1265 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1266 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1267 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1268 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1269 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1270 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
1271 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1273 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1275 /* Update potential sum for this i atom from the interaction with this j atom. */
1276 velec = _mm_and_ps(velec,cutoff_mask);
1277 velec = _mm_andnot_ps(dummy_mask,velec);
1278 velecsum = _mm_add_ps(velecsum,velec);
1282 fscal = _mm_and_ps(fscal,cutoff_mask);
1284 fscal = _mm_andnot_ps(dummy_mask,fscal);
1286 /* Calculate temporary vectorial force */
1287 tx = _mm_mul_ps(fscal,dx32);
1288 ty = _mm_mul_ps(fscal,dy32);
1289 tz = _mm_mul_ps(fscal,dz32);
1291 /* Update vectorial force */
1292 fix3 = _mm_add_ps(fix3,tx);
1293 fiy3 = _mm_add_ps(fiy3,ty);
1294 fiz3 = _mm_add_ps(fiz3,tz);
1296 fjx2 = _mm_add_ps(fjx2,tx);
1297 fjy2 = _mm_add_ps(fjy2,ty);
1298 fjz2 = _mm_add_ps(fjz2,tz);
1302 /**************************
1303 * CALCULATE INTERACTIONS *
1304 **************************/
1306 if (gmx_mm_any_lt(rsq33,rcutoff2))
1309 r33 = _mm_mul_ps(rsq33,rinv33);
1310 r33 = _mm_andnot_ps(dummy_mask,r33);
1312 /* EWALD ELECTROSTATICS */
1314 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1315 ewrt = _mm_mul_ps(r33,ewtabscale);
1316 ewitab = _mm_cvttps_epi32(ewrt);
1317 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1318 ewitab = _mm_slli_epi32(ewitab,2);
1319 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1320 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1321 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1322 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1323 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1324 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1325 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1326 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
1327 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1329 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1331 /* Update potential sum for this i atom from the interaction with this j atom. */
1332 velec = _mm_and_ps(velec,cutoff_mask);
1333 velec = _mm_andnot_ps(dummy_mask,velec);
1334 velecsum = _mm_add_ps(velecsum,velec);
1338 fscal = _mm_and_ps(fscal,cutoff_mask);
1340 fscal = _mm_andnot_ps(dummy_mask,fscal);
1342 /* Calculate temporary vectorial force */
1343 tx = _mm_mul_ps(fscal,dx33);
1344 ty = _mm_mul_ps(fscal,dy33);
1345 tz = _mm_mul_ps(fscal,dz33);
1347 /* Update vectorial force */
1348 fix3 = _mm_add_ps(fix3,tx);
1349 fiy3 = _mm_add_ps(fiy3,ty);
1350 fiz3 = _mm_add_ps(fiz3,tz);
1352 fjx3 = _mm_add_ps(fjx3,tx);
1353 fjy3 = _mm_add_ps(fjy3,ty);
1354 fjz3 = _mm_add_ps(fjz3,tz);
1358 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
1359 f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
1360 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1362 /* Inner loop uses 423 flops */
1365 /* End of innermost loop */
1367 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1368 f+i_coord_offset+DIM,fshift+i_shift_offset);
1371 /* Update potential energies */
1372 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1374 /* Increment number of inner iterations */
1375 inneriter += j_index_end - j_index_start;
1377 /* Outer loop uses 28 flops */
1380 /* Increment number of outer iterations */
1383 /* Update outer/inner flops */
1385 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*28 + inneriter*423);
1388 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single
1389 * Electrostatics interaction: Ewald
1390 * VdW interaction: None
1391 * Geometry: Water4-Water4
1392 * Calculate force/pot: Force
1395 nb_kernel_ElecEwSh_VdwNone_GeomW4W4_F_sse2_single
1396 (t_nblist * gmx_restrict nlist,
1397 rvec * gmx_restrict xx,
1398 rvec * gmx_restrict ff,
1399 t_forcerec * gmx_restrict fr,
1400 t_mdatoms * gmx_restrict mdatoms,
1401 nb_kernel_data_t * gmx_restrict kernel_data,
1402 t_nrnb * gmx_restrict nrnb)
1404 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1405 * just 0 for non-waters.
1406 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1407 * jnr indices corresponding to data put in the four positions in the SIMD register.
1409 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1410 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1411 int jnrA,jnrB,jnrC,jnrD;
1412 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1413 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1414 real shX,shY,shZ,rcutoff_scalar;
1415 real *shiftvec,*fshift,*x,*f;
1416 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1418 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1420 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1422 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1423 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1424 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1425 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1426 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1427 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1428 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1429 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1430 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1431 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1432 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1433 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1434 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1435 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1436 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1437 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1438 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1441 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1443 __m128 dummy_mask,cutoff_mask;
1444 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1445 __m128 one = _mm_set1_ps(1.0);
1446 __m128 two = _mm_set1_ps(2.0);
1452 jindex = nlist->jindex;
1454 shiftidx = nlist->shift;
1456 shiftvec = fr->shift_vec[0];
1457 fshift = fr->fshift[0];
1458 facel = _mm_set1_ps(fr->epsfac);
1459 charge = mdatoms->chargeA;
1461 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1462 ewtab = fr->ic->tabq_coul_F;
1463 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1464 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1466 /* Setup water-specific parameters */
1467 inr = nlist->iinr[0];
1468 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1469 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1470 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1472 jq1 = _mm_set1_ps(charge[inr+1]);
1473 jq2 = _mm_set1_ps(charge[inr+2]);
1474 jq3 = _mm_set1_ps(charge[inr+3]);
1475 qq11 = _mm_mul_ps(iq1,jq1);
1476 qq12 = _mm_mul_ps(iq1,jq2);
1477 qq13 = _mm_mul_ps(iq1,jq3);
1478 qq21 = _mm_mul_ps(iq2,jq1);
1479 qq22 = _mm_mul_ps(iq2,jq2);
1480 qq23 = _mm_mul_ps(iq2,jq3);
1481 qq31 = _mm_mul_ps(iq3,jq1);
1482 qq32 = _mm_mul_ps(iq3,jq2);
1483 qq33 = _mm_mul_ps(iq3,jq3);
1485 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1486 rcutoff_scalar = fr->rcoulomb;
1487 rcutoff = _mm_set1_ps(rcutoff_scalar);
1488 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1490 /* Avoid stupid compiler warnings */
1491 jnrA = jnrB = jnrC = jnrD = 0;
1492 j_coord_offsetA = 0;
1493 j_coord_offsetB = 0;
1494 j_coord_offsetC = 0;
1495 j_coord_offsetD = 0;
1500 /* Start outer loop over neighborlists */
1501 for(iidx=0; iidx<nri; iidx++)
1503 /* Load shift vector for this list */
1504 i_shift_offset = DIM*shiftidx[iidx];
1505 shX = shiftvec[i_shift_offset+XX];
1506 shY = shiftvec[i_shift_offset+YY];
1507 shZ = shiftvec[i_shift_offset+ZZ];
1509 /* Load limits for loop over neighbors */
1510 j_index_start = jindex[iidx];
1511 j_index_end = jindex[iidx+1];
1513 /* Get outer coordinate index */
1515 i_coord_offset = DIM*inr;
1517 /* Load i particle coords and add shift vector */
1518 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1519 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1520 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1521 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1522 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1523 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1524 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1525 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1526 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1528 fix1 = _mm_setzero_ps();
1529 fiy1 = _mm_setzero_ps();
1530 fiz1 = _mm_setzero_ps();
1531 fix2 = _mm_setzero_ps();
1532 fiy2 = _mm_setzero_ps();
1533 fiz2 = _mm_setzero_ps();
1534 fix3 = _mm_setzero_ps();
1535 fiy3 = _mm_setzero_ps();
1536 fiz3 = _mm_setzero_ps();
1538 /* Start inner kernel loop */
1539 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1542 /* Get j neighbor index, and coordinate index */
1544 jnrB = jjnr[jidx+1];
1545 jnrC = jjnr[jidx+2];
1546 jnrD = jjnr[jidx+3];
1548 j_coord_offsetA = DIM*jnrA;
1549 j_coord_offsetB = DIM*jnrB;
1550 j_coord_offsetC = DIM*jnrC;
1551 j_coord_offsetD = DIM*jnrD;
1553 /* load j atom coordinates */
1554 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1555 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1556 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1558 /* Calculate displacement vector */
1559 dx11 = _mm_sub_ps(ix1,jx1);
1560 dy11 = _mm_sub_ps(iy1,jy1);
1561 dz11 = _mm_sub_ps(iz1,jz1);
1562 dx12 = _mm_sub_ps(ix1,jx2);
1563 dy12 = _mm_sub_ps(iy1,jy2);
1564 dz12 = _mm_sub_ps(iz1,jz2);
1565 dx13 = _mm_sub_ps(ix1,jx3);
1566 dy13 = _mm_sub_ps(iy1,jy3);
1567 dz13 = _mm_sub_ps(iz1,jz3);
1568 dx21 = _mm_sub_ps(ix2,jx1);
1569 dy21 = _mm_sub_ps(iy2,jy1);
1570 dz21 = _mm_sub_ps(iz2,jz1);
1571 dx22 = _mm_sub_ps(ix2,jx2);
1572 dy22 = _mm_sub_ps(iy2,jy2);
1573 dz22 = _mm_sub_ps(iz2,jz2);
1574 dx23 = _mm_sub_ps(ix2,jx3);
1575 dy23 = _mm_sub_ps(iy2,jy3);
1576 dz23 = _mm_sub_ps(iz2,jz3);
1577 dx31 = _mm_sub_ps(ix3,jx1);
1578 dy31 = _mm_sub_ps(iy3,jy1);
1579 dz31 = _mm_sub_ps(iz3,jz1);
1580 dx32 = _mm_sub_ps(ix3,jx2);
1581 dy32 = _mm_sub_ps(iy3,jy2);
1582 dz32 = _mm_sub_ps(iz3,jz2);
1583 dx33 = _mm_sub_ps(ix3,jx3);
1584 dy33 = _mm_sub_ps(iy3,jy3);
1585 dz33 = _mm_sub_ps(iz3,jz3);
1587 /* Calculate squared distance and things based on it */
1588 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1589 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1590 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1591 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1592 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1593 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1594 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1595 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1596 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1598 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1599 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1600 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1601 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1602 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1603 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1604 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1605 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1606 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1608 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1609 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1610 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1611 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1612 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1613 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1614 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1615 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1616 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1618 fjx1 = _mm_setzero_ps();
1619 fjy1 = _mm_setzero_ps();
1620 fjz1 = _mm_setzero_ps();
1621 fjx2 = _mm_setzero_ps();
1622 fjy2 = _mm_setzero_ps();
1623 fjz2 = _mm_setzero_ps();
1624 fjx3 = _mm_setzero_ps();
1625 fjy3 = _mm_setzero_ps();
1626 fjz3 = _mm_setzero_ps();
1628 /**************************
1629 * CALCULATE INTERACTIONS *
1630 **************************/
1632 if (gmx_mm_any_lt(rsq11,rcutoff2))
1635 r11 = _mm_mul_ps(rsq11,rinv11);
1637 /* EWALD ELECTROSTATICS */
1639 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1640 ewrt = _mm_mul_ps(r11,ewtabscale);
1641 ewitab = _mm_cvttps_epi32(ewrt);
1642 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1643 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1644 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1646 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1647 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1649 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1653 fscal = _mm_and_ps(fscal,cutoff_mask);
1655 /* Calculate temporary vectorial force */
1656 tx = _mm_mul_ps(fscal,dx11);
1657 ty = _mm_mul_ps(fscal,dy11);
1658 tz = _mm_mul_ps(fscal,dz11);
1660 /* Update vectorial force */
1661 fix1 = _mm_add_ps(fix1,tx);
1662 fiy1 = _mm_add_ps(fiy1,ty);
1663 fiz1 = _mm_add_ps(fiz1,tz);
1665 fjx1 = _mm_add_ps(fjx1,tx);
1666 fjy1 = _mm_add_ps(fjy1,ty);
1667 fjz1 = _mm_add_ps(fjz1,tz);
1671 /**************************
1672 * CALCULATE INTERACTIONS *
1673 **************************/
1675 if (gmx_mm_any_lt(rsq12,rcutoff2))
1678 r12 = _mm_mul_ps(rsq12,rinv12);
1680 /* EWALD ELECTROSTATICS */
1682 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1683 ewrt = _mm_mul_ps(r12,ewtabscale);
1684 ewitab = _mm_cvttps_epi32(ewrt);
1685 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1686 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1687 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1689 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1690 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1692 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1696 fscal = _mm_and_ps(fscal,cutoff_mask);
1698 /* Calculate temporary vectorial force */
1699 tx = _mm_mul_ps(fscal,dx12);
1700 ty = _mm_mul_ps(fscal,dy12);
1701 tz = _mm_mul_ps(fscal,dz12);
1703 /* Update vectorial force */
1704 fix1 = _mm_add_ps(fix1,tx);
1705 fiy1 = _mm_add_ps(fiy1,ty);
1706 fiz1 = _mm_add_ps(fiz1,tz);
1708 fjx2 = _mm_add_ps(fjx2,tx);
1709 fjy2 = _mm_add_ps(fjy2,ty);
1710 fjz2 = _mm_add_ps(fjz2,tz);
1714 /**************************
1715 * CALCULATE INTERACTIONS *
1716 **************************/
1718 if (gmx_mm_any_lt(rsq13,rcutoff2))
1721 r13 = _mm_mul_ps(rsq13,rinv13);
1723 /* EWALD ELECTROSTATICS */
1725 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1726 ewrt = _mm_mul_ps(r13,ewtabscale);
1727 ewitab = _mm_cvttps_epi32(ewrt);
1728 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1729 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1730 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1732 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1733 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1735 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1739 fscal = _mm_and_ps(fscal,cutoff_mask);
1741 /* Calculate temporary vectorial force */
1742 tx = _mm_mul_ps(fscal,dx13);
1743 ty = _mm_mul_ps(fscal,dy13);
1744 tz = _mm_mul_ps(fscal,dz13);
1746 /* Update vectorial force */
1747 fix1 = _mm_add_ps(fix1,tx);
1748 fiy1 = _mm_add_ps(fiy1,ty);
1749 fiz1 = _mm_add_ps(fiz1,tz);
1751 fjx3 = _mm_add_ps(fjx3,tx);
1752 fjy3 = _mm_add_ps(fjy3,ty);
1753 fjz3 = _mm_add_ps(fjz3,tz);
1757 /**************************
1758 * CALCULATE INTERACTIONS *
1759 **************************/
1761 if (gmx_mm_any_lt(rsq21,rcutoff2))
1764 r21 = _mm_mul_ps(rsq21,rinv21);
1766 /* EWALD ELECTROSTATICS */
1768 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1769 ewrt = _mm_mul_ps(r21,ewtabscale);
1770 ewitab = _mm_cvttps_epi32(ewrt);
1771 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1772 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1773 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1775 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1776 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1778 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1782 fscal = _mm_and_ps(fscal,cutoff_mask);
1784 /* Calculate temporary vectorial force */
1785 tx = _mm_mul_ps(fscal,dx21);
1786 ty = _mm_mul_ps(fscal,dy21);
1787 tz = _mm_mul_ps(fscal,dz21);
1789 /* Update vectorial force */
1790 fix2 = _mm_add_ps(fix2,tx);
1791 fiy2 = _mm_add_ps(fiy2,ty);
1792 fiz2 = _mm_add_ps(fiz2,tz);
1794 fjx1 = _mm_add_ps(fjx1,tx);
1795 fjy1 = _mm_add_ps(fjy1,ty);
1796 fjz1 = _mm_add_ps(fjz1,tz);
1800 /**************************
1801 * CALCULATE INTERACTIONS *
1802 **************************/
1804 if (gmx_mm_any_lt(rsq22,rcutoff2))
1807 r22 = _mm_mul_ps(rsq22,rinv22);
1809 /* EWALD ELECTROSTATICS */
1811 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1812 ewrt = _mm_mul_ps(r22,ewtabscale);
1813 ewitab = _mm_cvttps_epi32(ewrt);
1814 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1815 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1816 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1818 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1819 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1821 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1825 fscal = _mm_and_ps(fscal,cutoff_mask);
1827 /* Calculate temporary vectorial force */
1828 tx = _mm_mul_ps(fscal,dx22);
1829 ty = _mm_mul_ps(fscal,dy22);
1830 tz = _mm_mul_ps(fscal,dz22);
1832 /* Update vectorial force */
1833 fix2 = _mm_add_ps(fix2,tx);
1834 fiy2 = _mm_add_ps(fiy2,ty);
1835 fiz2 = _mm_add_ps(fiz2,tz);
1837 fjx2 = _mm_add_ps(fjx2,tx);
1838 fjy2 = _mm_add_ps(fjy2,ty);
1839 fjz2 = _mm_add_ps(fjz2,tz);
1843 /**************************
1844 * CALCULATE INTERACTIONS *
1845 **************************/
1847 if (gmx_mm_any_lt(rsq23,rcutoff2))
1850 r23 = _mm_mul_ps(rsq23,rinv23);
1852 /* EWALD ELECTROSTATICS */
1854 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1855 ewrt = _mm_mul_ps(r23,ewtabscale);
1856 ewitab = _mm_cvttps_epi32(ewrt);
1857 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1858 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1859 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1861 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1862 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1864 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1868 fscal = _mm_and_ps(fscal,cutoff_mask);
1870 /* Calculate temporary vectorial force */
1871 tx = _mm_mul_ps(fscal,dx23);
1872 ty = _mm_mul_ps(fscal,dy23);
1873 tz = _mm_mul_ps(fscal,dz23);
1875 /* Update vectorial force */
1876 fix2 = _mm_add_ps(fix2,tx);
1877 fiy2 = _mm_add_ps(fiy2,ty);
1878 fiz2 = _mm_add_ps(fiz2,tz);
1880 fjx3 = _mm_add_ps(fjx3,tx);
1881 fjy3 = _mm_add_ps(fjy3,ty);
1882 fjz3 = _mm_add_ps(fjz3,tz);
1886 /**************************
1887 * CALCULATE INTERACTIONS *
1888 **************************/
1890 if (gmx_mm_any_lt(rsq31,rcutoff2))
1893 r31 = _mm_mul_ps(rsq31,rinv31);
1895 /* EWALD ELECTROSTATICS */
1897 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1898 ewrt = _mm_mul_ps(r31,ewtabscale);
1899 ewitab = _mm_cvttps_epi32(ewrt);
1900 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1901 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1902 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1904 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1905 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1907 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1911 fscal = _mm_and_ps(fscal,cutoff_mask);
1913 /* Calculate temporary vectorial force */
1914 tx = _mm_mul_ps(fscal,dx31);
1915 ty = _mm_mul_ps(fscal,dy31);
1916 tz = _mm_mul_ps(fscal,dz31);
1918 /* Update vectorial force */
1919 fix3 = _mm_add_ps(fix3,tx);
1920 fiy3 = _mm_add_ps(fiy3,ty);
1921 fiz3 = _mm_add_ps(fiz3,tz);
1923 fjx1 = _mm_add_ps(fjx1,tx);
1924 fjy1 = _mm_add_ps(fjy1,ty);
1925 fjz1 = _mm_add_ps(fjz1,tz);
1929 /**************************
1930 * CALCULATE INTERACTIONS *
1931 **************************/
1933 if (gmx_mm_any_lt(rsq32,rcutoff2))
1936 r32 = _mm_mul_ps(rsq32,rinv32);
1938 /* EWALD ELECTROSTATICS */
1940 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1941 ewrt = _mm_mul_ps(r32,ewtabscale);
1942 ewitab = _mm_cvttps_epi32(ewrt);
1943 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1944 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1945 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1947 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1948 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1950 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1954 fscal = _mm_and_ps(fscal,cutoff_mask);
1956 /* Calculate temporary vectorial force */
1957 tx = _mm_mul_ps(fscal,dx32);
1958 ty = _mm_mul_ps(fscal,dy32);
1959 tz = _mm_mul_ps(fscal,dz32);
1961 /* Update vectorial force */
1962 fix3 = _mm_add_ps(fix3,tx);
1963 fiy3 = _mm_add_ps(fiy3,ty);
1964 fiz3 = _mm_add_ps(fiz3,tz);
1966 fjx2 = _mm_add_ps(fjx2,tx);
1967 fjy2 = _mm_add_ps(fjy2,ty);
1968 fjz2 = _mm_add_ps(fjz2,tz);
1972 /**************************
1973 * CALCULATE INTERACTIONS *
1974 **************************/
1976 if (gmx_mm_any_lt(rsq33,rcutoff2))
1979 r33 = _mm_mul_ps(rsq33,rinv33);
1981 /* EWALD ELECTROSTATICS */
1983 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1984 ewrt = _mm_mul_ps(r33,ewtabscale);
1985 ewitab = _mm_cvttps_epi32(ewrt);
1986 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1987 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1988 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1990 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1991 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1993 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1997 fscal = _mm_and_ps(fscal,cutoff_mask);
1999 /* Calculate temporary vectorial force */
2000 tx = _mm_mul_ps(fscal,dx33);
2001 ty = _mm_mul_ps(fscal,dy33);
2002 tz = _mm_mul_ps(fscal,dz33);
2004 /* Update vectorial force */
2005 fix3 = _mm_add_ps(fix3,tx);
2006 fiy3 = _mm_add_ps(fiy3,ty);
2007 fiz3 = _mm_add_ps(fiz3,tz);
2009 fjx3 = _mm_add_ps(fjx3,tx);
2010 fjy3 = _mm_add_ps(fjy3,ty);
2011 fjz3 = _mm_add_ps(fjz3,tz);
2015 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
2016 f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
2017 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2019 /* Inner loop uses 351 flops */
2022 if(jidx<j_index_end)
2025 /* Get j neighbor index, and coordinate index */
2027 jnrB = jjnr[jidx+1];
2028 jnrC = jjnr[jidx+2];
2029 jnrD = jjnr[jidx+3];
2031 /* Sign of each element will be negative for non-real atoms.
2032 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2033 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2035 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2036 jnrA = (jnrA>=0) ? jnrA : 0;
2037 jnrB = (jnrB>=0) ? jnrB : 0;
2038 jnrC = (jnrC>=0) ? jnrC : 0;
2039 jnrD = (jnrD>=0) ? jnrD : 0;
2041 j_coord_offsetA = DIM*jnrA;
2042 j_coord_offsetB = DIM*jnrB;
2043 j_coord_offsetC = DIM*jnrC;
2044 j_coord_offsetD = DIM*jnrD;
2046 /* load j atom coordinates */
2047 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2048 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2049 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2051 /* Calculate displacement vector */
2052 dx11 = _mm_sub_ps(ix1,jx1);
2053 dy11 = _mm_sub_ps(iy1,jy1);
2054 dz11 = _mm_sub_ps(iz1,jz1);
2055 dx12 = _mm_sub_ps(ix1,jx2);
2056 dy12 = _mm_sub_ps(iy1,jy2);
2057 dz12 = _mm_sub_ps(iz1,jz2);
2058 dx13 = _mm_sub_ps(ix1,jx3);
2059 dy13 = _mm_sub_ps(iy1,jy3);
2060 dz13 = _mm_sub_ps(iz1,jz3);
2061 dx21 = _mm_sub_ps(ix2,jx1);
2062 dy21 = _mm_sub_ps(iy2,jy1);
2063 dz21 = _mm_sub_ps(iz2,jz1);
2064 dx22 = _mm_sub_ps(ix2,jx2);
2065 dy22 = _mm_sub_ps(iy2,jy2);
2066 dz22 = _mm_sub_ps(iz2,jz2);
2067 dx23 = _mm_sub_ps(ix2,jx3);
2068 dy23 = _mm_sub_ps(iy2,jy3);
2069 dz23 = _mm_sub_ps(iz2,jz3);
2070 dx31 = _mm_sub_ps(ix3,jx1);
2071 dy31 = _mm_sub_ps(iy3,jy1);
2072 dz31 = _mm_sub_ps(iz3,jz1);
2073 dx32 = _mm_sub_ps(ix3,jx2);
2074 dy32 = _mm_sub_ps(iy3,jy2);
2075 dz32 = _mm_sub_ps(iz3,jz2);
2076 dx33 = _mm_sub_ps(ix3,jx3);
2077 dy33 = _mm_sub_ps(iy3,jy3);
2078 dz33 = _mm_sub_ps(iz3,jz3);
2080 /* Calculate squared distance and things based on it */
2081 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2082 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2083 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2084 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2085 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2086 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2087 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2088 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2089 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2091 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2092 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2093 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2094 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2095 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2096 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2097 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2098 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2099 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2101 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2102 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2103 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2104 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2105 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2106 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2107 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2108 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2109 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2111 fjx1 = _mm_setzero_ps();
2112 fjy1 = _mm_setzero_ps();
2113 fjz1 = _mm_setzero_ps();
2114 fjx2 = _mm_setzero_ps();
2115 fjy2 = _mm_setzero_ps();
2116 fjz2 = _mm_setzero_ps();
2117 fjx3 = _mm_setzero_ps();
2118 fjy3 = _mm_setzero_ps();
2119 fjz3 = _mm_setzero_ps();
2121 /**************************
2122 * CALCULATE INTERACTIONS *
2123 **************************/
2125 if (gmx_mm_any_lt(rsq11,rcutoff2))
2128 r11 = _mm_mul_ps(rsq11,rinv11);
2129 r11 = _mm_andnot_ps(dummy_mask,r11);
2131 /* EWALD ELECTROSTATICS */
2133 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2134 ewrt = _mm_mul_ps(r11,ewtabscale);
2135 ewitab = _mm_cvttps_epi32(ewrt);
2136 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2137 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2138 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2140 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2141 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2143 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2147 fscal = _mm_and_ps(fscal,cutoff_mask);
2149 fscal = _mm_andnot_ps(dummy_mask,fscal);
2151 /* Calculate temporary vectorial force */
2152 tx = _mm_mul_ps(fscal,dx11);
2153 ty = _mm_mul_ps(fscal,dy11);
2154 tz = _mm_mul_ps(fscal,dz11);
2156 /* Update vectorial force */
2157 fix1 = _mm_add_ps(fix1,tx);
2158 fiy1 = _mm_add_ps(fiy1,ty);
2159 fiz1 = _mm_add_ps(fiz1,tz);
2161 fjx1 = _mm_add_ps(fjx1,tx);
2162 fjy1 = _mm_add_ps(fjy1,ty);
2163 fjz1 = _mm_add_ps(fjz1,tz);
2167 /**************************
2168 * CALCULATE INTERACTIONS *
2169 **************************/
2171 if (gmx_mm_any_lt(rsq12,rcutoff2))
2174 r12 = _mm_mul_ps(rsq12,rinv12);
2175 r12 = _mm_andnot_ps(dummy_mask,r12);
2177 /* EWALD ELECTROSTATICS */
2179 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2180 ewrt = _mm_mul_ps(r12,ewtabscale);
2181 ewitab = _mm_cvttps_epi32(ewrt);
2182 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2183 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2184 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2186 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2187 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2189 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2193 fscal = _mm_and_ps(fscal,cutoff_mask);
2195 fscal = _mm_andnot_ps(dummy_mask,fscal);
2197 /* Calculate temporary vectorial force */
2198 tx = _mm_mul_ps(fscal,dx12);
2199 ty = _mm_mul_ps(fscal,dy12);
2200 tz = _mm_mul_ps(fscal,dz12);
2202 /* Update vectorial force */
2203 fix1 = _mm_add_ps(fix1,tx);
2204 fiy1 = _mm_add_ps(fiy1,ty);
2205 fiz1 = _mm_add_ps(fiz1,tz);
2207 fjx2 = _mm_add_ps(fjx2,tx);
2208 fjy2 = _mm_add_ps(fjy2,ty);
2209 fjz2 = _mm_add_ps(fjz2,tz);
2213 /**************************
2214 * CALCULATE INTERACTIONS *
2215 **************************/
2217 if (gmx_mm_any_lt(rsq13,rcutoff2))
2220 r13 = _mm_mul_ps(rsq13,rinv13);
2221 r13 = _mm_andnot_ps(dummy_mask,r13);
2223 /* EWALD ELECTROSTATICS */
2225 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2226 ewrt = _mm_mul_ps(r13,ewtabscale);
2227 ewitab = _mm_cvttps_epi32(ewrt);
2228 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2229 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2230 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2232 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2233 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2235 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2239 fscal = _mm_and_ps(fscal,cutoff_mask);
2241 fscal = _mm_andnot_ps(dummy_mask,fscal);
2243 /* Calculate temporary vectorial force */
2244 tx = _mm_mul_ps(fscal,dx13);
2245 ty = _mm_mul_ps(fscal,dy13);
2246 tz = _mm_mul_ps(fscal,dz13);
2248 /* Update vectorial force */
2249 fix1 = _mm_add_ps(fix1,tx);
2250 fiy1 = _mm_add_ps(fiy1,ty);
2251 fiz1 = _mm_add_ps(fiz1,tz);
2253 fjx3 = _mm_add_ps(fjx3,tx);
2254 fjy3 = _mm_add_ps(fjy3,ty);
2255 fjz3 = _mm_add_ps(fjz3,tz);
2259 /**************************
2260 * CALCULATE INTERACTIONS *
2261 **************************/
2263 if (gmx_mm_any_lt(rsq21,rcutoff2))
2266 r21 = _mm_mul_ps(rsq21,rinv21);
2267 r21 = _mm_andnot_ps(dummy_mask,r21);
2269 /* EWALD ELECTROSTATICS */
2271 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2272 ewrt = _mm_mul_ps(r21,ewtabscale);
2273 ewitab = _mm_cvttps_epi32(ewrt);
2274 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2275 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2276 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2278 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2279 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2281 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2285 fscal = _mm_and_ps(fscal,cutoff_mask);
2287 fscal = _mm_andnot_ps(dummy_mask,fscal);
2289 /* Calculate temporary vectorial force */
2290 tx = _mm_mul_ps(fscal,dx21);
2291 ty = _mm_mul_ps(fscal,dy21);
2292 tz = _mm_mul_ps(fscal,dz21);
2294 /* Update vectorial force */
2295 fix2 = _mm_add_ps(fix2,tx);
2296 fiy2 = _mm_add_ps(fiy2,ty);
2297 fiz2 = _mm_add_ps(fiz2,tz);
2299 fjx1 = _mm_add_ps(fjx1,tx);
2300 fjy1 = _mm_add_ps(fjy1,ty);
2301 fjz1 = _mm_add_ps(fjz1,tz);
2305 /**************************
2306 * CALCULATE INTERACTIONS *
2307 **************************/
2309 if (gmx_mm_any_lt(rsq22,rcutoff2))
2312 r22 = _mm_mul_ps(rsq22,rinv22);
2313 r22 = _mm_andnot_ps(dummy_mask,r22);
2315 /* EWALD ELECTROSTATICS */
2317 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2318 ewrt = _mm_mul_ps(r22,ewtabscale);
2319 ewitab = _mm_cvttps_epi32(ewrt);
2320 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2321 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2322 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2324 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2325 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2327 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2331 fscal = _mm_and_ps(fscal,cutoff_mask);
2333 fscal = _mm_andnot_ps(dummy_mask,fscal);
2335 /* Calculate temporary vectorial force */
2336 tx = _mm_mul_ps(fscal,dx22);
2337 ty = _mm_mul_ps(fscal,dy22);
2338 tz = _mm_mul_ps(fscal,dz22);
2340 /* Update vectorial force */
2341 fix2 = _mm_add_ps(fix2,tx);
2342 fiy2 = _mm_add_ps(fiy2,ty);
2343 fiz2 = _mm_add_ps(fiz2,tz);
2345 fjx2 = _mm_add_ps(fjx2,tx);
2346 fjy2 = _mm_add_ps(fjy2,ty);
2347 fjz2 = _mm_add_ps(fjz2,tz);
2351 /**************************
2352 * CALCULATE INTERACTIONS *
2353 **************************/
2355 if (gmx_mm_any_lt(rsq23,rcutoff2))
2358 r23 = _mm_mul_ps(rsq23,rinv23);
2359 r23 = _mm_andnot_ps(dummy_mask,r23);
2361 /* EWALD ELECTROSTATICS */
2363 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2364 ewrt = _mm_mul_ps(r23,ewtabscale);
2365 ewitab = _mm_cvttps_epi32(ewrt);
2366 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2367 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2368 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2370 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2371 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2373 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2377 fscal = _mm_and_ps(fscal,cutoff_mask);
2379 fscal = _mm_andnot_ps(dummy_mask,fscal);
2381 /* Calculate temporary vectorial force */
2382 tx = _mm_mul_ps(fscal,dx23);
2383 ty = _mm_mul_ps(fscal,dy23);
2384 tz = _mm_mul_ps(fscal,dz23);
2386 /* Update vectorial force */
2387 fix2 = _mm_add_ps(fix2,tx);
2388 fiy2 = _mm_add_ps(fiy2,ty);
2389 fiz2 = _mm_add_ps(fiz2,tz);
2391 fjx3 = _mm_add_ps(fjx3,tx);
2392 fjy3 = _mm_add_ps(fjy3,ty);
2393 fjz3 = _mm_add_ps(fjz3,tz);
2397 /**************************
2398 * CALCULATE INTERACTIONS *
2399 **************************/
2401 if (gmx_mm_any_lt(rsq31,rcutoff2))
2404 r31 = _mm_mul_ps(rsq31,rinv31);
2405 r31 = _mm_andnot_ps(dummy_mask,r31);
2407 /* EWALD ELECTROSTATICS */
2409 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2410 ewrt = _mm_mul_ps(r31,ewtabscale);
2411 ewitab = _mm_cvttps_epi32(ewrt);
2412 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2413 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2414 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2416 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2417 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2419 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2423 fscal = _mm_and_ps(fscal,cutoff_mask);
2425 fscal = _mm_andnot_ps(dummy_mask,fscal);
2427 /* Calculate temporary vectorial force */
2428 tx = _mm_mul_ps(fscal,dx31);
2429 ty = _mm_mul_ps(fscal,dy31);
2430 tz = _mm_mul_ps(fscal,dz31);
2432 /* Update vectorial force */
2433 fix3 = _mm_add_ps(fix3,tx);
2434 fiy3 = _mm_add_ps(fiy3,ty);
2435 fiz3 = _mm_add_ps(fiz3,tz);
2437 fjx1 = _mm_add_ps(fjx1,tx);
2438 fjy1 = _mm_add_ps(fjy1,ty);
2439 fjz1 = _mm_add_ps(fjz1,tz);
2443 /**************************
2444 * CALCULATE INTERACTIONS *
2445 **************************/
2447 if (gmx_mm_any_lt(rsq32,rcutoff2))
2450 r32 = _mm_mul_ps(rsq32,rinv32);
2451 r32 = _mm_andnot_ps(dummy_mask,r32);
2453 /* EWALD ELECTROSTATICS */
2455 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2456 ewrt = _mm_mul_ps(r32,ewtabscale);
2457 ewitab = _mm_cvttps_epi32(ewrt);
2458 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2459 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2460 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2462 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2463 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2465 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2469 fscal = _mm_and_ps(fscal,cutoff_mask);
2471 fscal = _mm_andnot_ps(dummy_mask,fscal);
2473 /* Calculate temporary vectorial force */
2474 tx = _mm_mul_ps(fscal,dx32);
2475 ty = _mm_mul_ps(fscal,dy32);
2476 tz = _mm_mul_ps(fscal,dz32);
2478 /* Update vectorial force */
2479 fix3 = _mm_add_ps(fix3,tx);
2480 fiy3 = _mm_add_ps(fiy3,ty);
2481 fiz3 = _mm_add_ps(fiz3,tz);
2483 fjx2 = _mm_add_ps(fjx2,tx);
2484 fjy2 = _mm_add_ps(fjy2,ty);
2485 fjz2 = _mm_add_ps(fjz2,tz);
2489 /**************************
2490 * CALCULATE INTERACTIONS *
2491 **************************/
2493 if (gmx_mm_any_lt(rsq33,rcutoff2))
2496 r33 = _mm_mul_ps(rsq33,rinv33);
2497 r33 = _mm_andnot_ps(dummy_mask,r33);
2499 /* EWALD ELECTROSTATICS */
2501 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2502 ewrt = _mm_mul_ps(r33,ewtabscale);
2503 ewitab = _mm_cvttps_epi32(ewrt);
2504 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2505 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2506 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2508 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2509 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2511 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2515 fscal = _mm_and_ps(fscal,cutoff_mask);
2517 fscal = _mm_andnot_ps(dummy_mask,fscal);
2519 /* Calculate temporary vectorial force */
2520 tx = _mm_mul_ps(fscal,dx33);
2521 ty = _mm_mul_ps(fscal,dy33);
2522 tz = _mm_mul_ps(fscal,dz33);
2524 /* Update vectorial force */
2525 fix3 = _mm_add_ps(fix3,tx);
2526 fiy3 = _mm_add_ps(fiy3,ty);
2527 fiz3 = _mm_add_ps(fiz3,tz);
2529 fjx3 = _mm_add_ps(fjx3,tx);
2530 fjy3 = _mm_add_ps(fjy3,ty);
2531 fjz3 = _mm_add_ps(fjz3,tz);
2535 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,
2536 f+j_coord_offsetC+DIM,f+j_coord_offsetD+DIM,
2537 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2539 /* Inner loop uses 360 flops */
2542 /* End of innermost loop */
2544 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2545 f+i_coord_offset+DIM,fshift+i_shift_offset);
2547 /* Increment number of inner iterations */
2548 inneriter += j_index_end - j_index_start;
2550 /* Outer loop uses 27 flops */
2553 /* Increment number of outer iterations */
2556 /* Update outer/inner flops */
2558 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*27 + inneriter*360);