2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
92 __m128 dummy_mask,cutoff_mask;
93 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
94 __m128 one = _mm_set1_ps(1.0);
95 __m128 two = _mm_set1_ps(2.0);
101 jindex = nlist->jindex;
103 shiftidx = nlist->shift;
105 shiftvec = fr->shift_vec[0];
106 fshift = fr->fshift[0];
107 facel = _mm_set1_ps(fr->epsfac);
108 charge = mdatoms->chargeA;
110 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
111 ewtab = fr->ic->tabq_coul_FDV0;
112 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
113 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
115 /* Setup water-specific parameters */
116 inr = nlist->iinr[0];
117 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
118 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
119 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
121 jq0 = _mm_set1_ps(charge[inr+0]);
122 jq1 = _mm_set1_ps(charge[inr+1]);
123 jq2 = _mm_set1_ps(charge[inr+2]);
124 qq00 = _mm_mul_ps(iq0,jq0);
125 qq01 = _mm_mul_ps(iq0,jq1);
126 qq02 = _mm_mul_ps(iq0,jq2);
127 qq10 = _mm_mul_ps(iq1,jq0);
128 qq11 = _mm_mul_ps(iq1,jq1);
129 qq12 = _mm_mul_ps(iq1,jq2);
130 qq20 = _mm_mul_ps(iq2,jq0);
131 qq21 = _mm_mul_ps(iq2,jq1);
132 qq22 = _mm_mul_ps(iq2,jq2);
134 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
135 rcutoff_scalar = fr->rcoulomb;
136 rcutoff = _mm_set1_ps(rcutoff_scalar);
137 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
139 /* Avoid stupid compiler warnings */
140 jnrA = jnrB = jnrC = jnrD = 0;
149 /* Start outer loop over neighborlists */
150 for(iidx=0; iidx<nri; iidx++)
152 /* Load shift vector for this list */
153 i_shift_offset = DIM*shiftidx[iidx];
154 shX = shiftvec[i_shift_offset+XX];
155 shY = shiftvec[i_shift_offset+YY];
156 shZ = shiftvec[i_shift_offset+ZZ];
158 /* Load limits for loop over neighbors */
159 j_index_start = jindex[iidx];
160 j_index_end = jindex[iidx+1];
162 /* Get outer coordinate index */
164 i_coord_offset = DIM*inr;
166 /* Load i particle coords and add shift vector */
167 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
168 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
169 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
170 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
171 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
172 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
173 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
174 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
175 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
177 fix0 = _mm_setzero_ps();
178 fiy0 = _mm_setzero_ps();
179 fiz0 = _mm_setzero_ps();
180 fix1 = _mm_setzero_ps();
181 fiy1 = _mm_setzero_ps();
182 fiz1 = _mm_setzero_ps();
183 fix2 = _mm_setzero_ps();
184 fiy2 = _mm_setzero_ps();
185 fiz2 = _mm_setzero_ps();
187 /* Reset potential sums */
188 velecsum = _mm_setzero_ps();
190 /* Start inner kernel loop */
191 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
194 /* Get j neighbor index, and coordinate index */
200 j_coord_offsetA = DIM*jnrA;
201 j_coord_offsetB = DIM*jnrB;
202 j_coord_offsetC = DIM*jnrC;
203 j_coord_offsetD = DIM*jnrD;
205 /* load j atom coordinates */
206 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
207 x+j_coord_offsetC,x+j_coord_offsetD,
208 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
210 /* Calculate displacement vector */
211 dx00 = _mm_sub_ps(ix0,jx0);
212 dy00 = _mm_sub_ps(iy0,jy0);
213 dz00 = _mm_sub_ps(iz0,jz0);
214 dx01 = _mm_sub_ps(ix0,jx1);
215 dy01 = _mm_sub_ps(iy0,jy1);
216 dz01 = _mm_sub_ps(iz0,jz1);
217 dx02 = _mm_sub_ps(ix0,jx2);
218 dy02 = _mm_sub_ps(iy0,jy2);
219 dz02 = _mm_sub_ps(iz0,jz2);
220 dx10 = _mm_sub_ps(ix1,jx0);
221 dy10 = _mm_sub_ps(iy1,jy0);
222 dz10 = _mm_sub_ps(iz1,jz0);
223 dx11 = _mm_sub_ps(ix1,jx1);
224 dy11 = _mm_sub_ps(iy1,jy1);
225 dz11 = _mm_sub_ps(iz1,jz1);
226 dx12 = _mm_sub_ps(ix1,jx2);
227 dy12 = _mm_sub_ps(iy1,jy2);
228 dz12 = _mm_sub_ps(iz1,jz2);
229 dx20 = _mm_sub_ps(ix2,jx0);
230 dy20 = _mm_sub_ps(iy2,jy0);
231 dz20 = _mm_sub_ps(iz2,jz0);
232 dx21 = _mm_sub_ps(ix2,jx1);
233 dy21 = _mm_sub_ps(iy2,jy1);
234 dz21 = _mm_sub_ps(iz2,jz1);
235 dx22 = _mm_sub_ps(ix2,jx2);
236 dy22 = _mm_sub_ps(iy2,jy2);
237 dz22 = _mm_sub_ps(iz2,jz2);
239 /* Calculate squared distance and things based on it */
240 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
241 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
242 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
243 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
244 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
245 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
246 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
247 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
248 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
250 rinv00 = gmx_mm_invsqrt_ps(rsq00);
251 rinv01 = gmx_mm_invsqrt_ps(rsq01);
252 rinv02 = gmx_mm_invsqrt_ps(rsq02);
253 rinv10 = gmx_mm_invsqrt_ps(rsq10);
254 rinv11 = gmx_mm_invsqrt_ps(rsq11);
255 rinv12 = gmx_mm_invsqrt_ps(rsq12);
256 rinv20 = gmx_mm_invsqrt_ps(rsq20);
257 rinv21 = gmx_mm_invsqrt_ps(rsq21);
258 rinv22 = gmx_mm_invsqrt_ps(rsq22);
260 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
261 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
262 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
263 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
264 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
265 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
266 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
267 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
268 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
270 fjx0 = _mm_setzero_ps();
271 fjy0 = _mm_setzero_ps();
272 fjz0 = _mm_setzero_ps();
273 fjx1 = _mm_setzero_ps();
274 fjy1 = _mm_setzero_ps();
275 fjz1 = _mm_setzero_ps();
276 fjx2 = _mm_setzero_ps();
277 fjy2 = _mm_setzero_ps();
278 fjz2 = _mm_setzero_ps();
280 /**************************
281 * CALCULATE INTERACTIONS *
282 **************************/
284 if (gmx_mm_any_lt(rsq00,rcutoff2))
287 r00 = _mm_mul_ps(rsq00,rinv00);
289 /* EWALD ELECTROSTATICS */
291 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
292 ewrt = _mm_mul_ps(r00,ewtabscale);
293 ewitab = _mm_cvttps_epi32(ewrt);
294 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
295 ewitab = _mm_slli_epi32(ewitab,2);
296 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
297 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
298 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
299 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
300 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
301 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
302 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
303 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
304 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
306 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
308 /* Update potential sum for this i atom from the interaction with this j atom. */
309 velec = _mm_and_ps(velec,cutoff_mask);
310 velecsum = _mm_add_ps(velecsum,velec);
314 fscal = _mm_and_ps(fscal,cutoff_mask);
316 /* Calculate temporary vectorial force */
317 tx = _mm_mul_ps(fscal,dx00);
318 ty = _mm_mul_ps(fscal,dy00);
319 tz = _mm_mul_ps(fscal,dz00);
321 /* Update vectorial force */
322 fix0 = _mm_add_ps(fix0,tx);
323 fiy0 = _mm_add_ps(fiy0,ty);
324 fiz0 = _mm_add_ps(fiz0,tz);
326 fjx0 = _mm_add_ps(fjx0,tx);
327 fjy0 = _mm_add_ps(fjy0,ty);
328 fjz0 = _mm_add_ps(fjz0,tz);
332 /**************************
333 * CALCULATE INTERACTIONS *
334 **************************/
336 if (gmx_mm_any_lt(rsq01,rcutoff2))
339 r01 = _mm_mul_ps(rsq01,rinv01);
341 /* EWALD ELECTROSTATICS */
343 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
344 ewrt = _mm_mul_ps(r01,ewtabscale);
345 ewitab = _mm_cvttps_epi32(ewrt);
346 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
347 ewitab = _mm_slli_epi32(ewitab,2);
348 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
349 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
350 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
351 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
352 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
353 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
354 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
355 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
356 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
358 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
360 /* Update potential sum for this i atom from the interaction with this j atom. */
361 velec = _mm_and_ps(velec,cutoff_mask);
362 velecsum = _mm_add_ps(velecsum,velec);
366 fscal = _mm_and_ps(fscal,cutoff_mask);
368 /* Calculate temporary vectorial force */
369 tx = _mm_mul_ps(fscal,dx01);
370 ty = _mm_mul_ps(fscal,dy01);
371 tz = _mm_mul_ps(fscal,dz01);
373 /* Update vectorial force */
374 fix0 = _mm_add_ps(fix0,tx);
375 fiy0 = _mm_add_ps(fiy0,ty);
376 fiz0 = _mm_add_ps(fiz0,tz);
378 fjx1 = _mm_add_ps(fjx1,tx);
379 fjy1 = _mm_add_ps(fjy1,ty);
380 fjz1 = _mm_add_ps(fjz1,tz);
384 /**************************
385 * CALCULATE INTERACTIONS *
386 **************************/
388 if (gmx_mm_any_lt(rsq02,rcutoff2))
391 r02 = _mm_mul_ps(rsq02,rinv02);
393 /* EWALD ELECTROSTATICS */
395 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
396 ewrt = _mm_mul_ps(r02,ewtabscale);
397 ewitab = _mm_cvttps_epi32(ewrt);
398 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
399 ewitab = _mm_slli_epi32(ewitab,2);
400 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
401 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
402 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
403 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
404 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
405 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
406 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
407 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
408 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
410 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
412 /* Update potential sum for this i atom from the interaction with this j atom. */
413 velec = _mm_and_ps(velec,cutoff_mask);
414 velecsum = _mm_add_ps(velecsum,velec);
418 fscal = _mm_and_ps(fscal,cutoff_mask);
420 /* Calculate temporary vectorial force */
421 tx = _mm_mul_ps(fscal,dx02);
422 ty = _mm_mul_ps(fscal,dy02);
423 tz = _mm_mul_ps(fscal,dz02);
425 /* Update vectorial force */
426 fix0 = _mm_add_ps(fix0,tx);
427 fiy0 = _mm_add_ps(fiy0,ty);
428 fiz0 = _mm_add_ps(fiz0,tz);
430 fjx2 = _mm_add_ps(fjx2,tx);
431 fjy2 = _mm_add_ps(fjy2,ty);
432 fjz2 = _mm_add_ps(fjz2,tz);
436 /**************************
437 * CALCULATE INTERACTIONS *
438 **************************/
440 if (gmx_mm_any_lt(rsq10,rcutoff2))
443 r10 = _mm_mul_ps(rsq10,rinv10);
445 /* EWALD ELECTROSTATICS */
447 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
448 ewrt = _mm_mul_ps(r10,ewtabscale);
449 ewitab = _mm_cvttps_epi32(ewrt);
450 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
451 ewitab = _mm_slli_epi32(ewitab,2);
452 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
453 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
454 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
455 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
456 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
457 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
458 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
459 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
460 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
462 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
464 /* Update potential sum for this i atom from the interaction with this j atom. */
465 velec = _mm_and_ps(velec,cutoff_mask);
466 velecsum = _mm_add_ps(velecsum,velec);
470 fscal = _mm_and_ps(fscal,cutoff_mask);
472 /* Calculate temporary vectorial force */
473 tx = _mm_mul_ps(fscal,dx10);
474 ty = _mm_mul_ps(fscal,dy10);
475 tz = _mm_mul_ps(fscal,dz10);
477 /* Update vectorial force */
478 fix1 = _mm_add_ps(fix1,tx);
479 fiy1 = _mm_add_ps(fiy1,ty);
480 fiz1 = _mm_add_ps(fiz1,tz);
482 fjx0 = _mm_add_ps(fjx0,tx);
483 fjy0 = _mm_add_ps(fjy0,ty);
484 fjz0 = _mm_add_ps(fjz0,tz);
488 /**************************
489 * CALCULATE INTERACTIONS *
490 **************************/
492 if (gmx_mm_any_lt(rsq11,rcutoff2))
495 r11 = _mm_mul_ps(rsq11,rinv11);
497 /* EWALD ELECTROSTATICS */
499 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
500 ewrt = _mm_mul_ps(r11,ewtabscale);
501 ewitab = _mm_cvttps_epi32(ewrt);
502 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
503 ewitab = _mm_slli_epi32(ewitab,2);
504 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
505 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
506 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
507 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
508 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
509 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
510 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
511 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
512 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
514 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
516 /* Update potential sum for this i atom from the interaction with this j atom. */
517 velec = _mm_and_ps(velec,cutoff_mask);
518 velecsum = _mm_add_ps(velecsum,velec);
522 fscal = _mm_and_ps(fscal,cutoff_mask);
524 /* Calculate temporary vectorial force */
525 tx = _mm_mul_ps(fscal,dx11);
526 ty = _mm_mul_ps(fscal,dy11);
527 tz = _mm_mul_ps(fscal,dz11);
529 /* Update vectorial force */
530 fix1 = _mm_add_ps(fix1,tx);
531 fiy1 = _mm_add_ps(fiy1,ty);
532 fiz1 = _mm_add_ps(fiz1,tz);
534 fjx1 = _mm_add_ps(fjx1,tx);
535 fjy1 = _mm_add_ps(fjy1,ty);
536 fjz1 = _mm_add_ps(fjz1,tz);
540 /**************************
541 * CALCULATE INTERACTIONS *
542 **************************/
544 if (gmx_mm_any_lt(rsq12,rcutoff2))
547 r12 = _mm_mul_ps(rsq12,rinv12);
549 /* EWALD ELECTROSTATICS */
551 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
552 ewrt = _mm_mul_ps(r12,ewtabscale);
553 ewitab = _mm_cvttps_epi32(ewrt);
554 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
555 ewitab = _mm_slli_epi32(ewitab,2);
556 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
557 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
558 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
559 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
560 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
561 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
562 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
563 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
564 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
566 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velec = _mm_and_ps(velec,cutoff_mask);
570 velecsum = _mm_add_ps(velecsum,velec);
574 fscal = _mm_and_ps(fscal,cutoff_mask);
576 /* Calculate temporary vectorial force */
577 tx = _mm_mul_ps(fscal,dx12);
578 ty = _mm_mul_ps(fscal,dy12);
579 tz = _mm_mul_ps(fscal,dz12);
581 /* Update vectorial force */
582 fix1 = _mm_add_ps(fix1,tx);
583 fiy1 = _mm_add_ps(fiy1,ty);
584 fiz1 = _mm_add_ps(fiz1,tz);
586 fjx2 = _mm_add_ps(fjx2,tx);
587 fjy2 = _mm_add_ps(fjy2,ty);
588 fjz2 = _mm_add_ps(fjz2,tz);
592 /**************************
593 * CALCULATE INTERACTIONS *
594 **************************/
596 if (gmx_mm_any_lt(rsq20,rcutoff2))
599 r20 = _mm_mul_ps(rsq20,rinv20);
601 /* EWALD ELECTROSTATICS */
603 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
604 ewrt = _mm_mul_ps(r20,ewtabscale);
605 ewitab = _mm_cvttps_epi32(ewrt);
606 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
607 ewitab = _mm_slli_epi32(ewitab,2);
608 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
609 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
610 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
611 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
612 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
613 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
614 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
615 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
616 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
618 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
620 /* Update potential sum for this i atom from the interaction with this j atom. */
621 velec = _mm_and_ps(velec,cutoff_mask);
622 velecsum = _mm_add_ps(velecsum,velec);
626 fscal = _mm_and_ps(fscal,cutoff_mask);
628 /* Calculate temporary vectorial force */
629 tx = _mm_mul_ps(fscal,dx20);
630 ty = _mm_mul_ps(fscal,dy20);
631 tz = _mm_mul_ps(fscal,dz20);
633 /* Update vectorial force */
634 fix2 = _mm_add_ps(fix2,tx);
635 fiy2 = _mm_add_ps(fiy2,ty);
636 fiz2 = _mm_add_ps(fiz2,tz);
638 fjx0 = _mm_add_ps(fjx0,tx);
639 fjy0 = _mm_add_ps(fjy0,ty);
640 fjz0 = _mm_add_ps(fjz0,tz);
644 /**************************
645 * CALCULATE INTERACTIONS *
646 **************************/
648 if (gmx_mm_any_lt(rsq21,rcutoff2))
651 r21 = _mm_mul_ps(rsq21,rinv21);
653 /* EWALD ELECTROSTATICS */
655 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
656 ewrt = _mm_mul_ps(r21,ewtabscale);
657 ewitab = _mm_cvttps_epi32(ewrt);
658 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
659 ewitab = _mm_slli_epi32(ewitab,2);
660 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
661 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
662 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
663 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
664 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
665 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
666 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
667 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
668 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
670 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
672 /* Update potential sum for this i atom from the interaction with this j atom. */
673 velec = _mm_and_ps(velec,cutoff_mask);
674 velecsum = _mm_add_ps(velecsum,velec);
678 fscal = _mm_and_ps(fscal,cutoff_mask);
680 /* Calculate temporary vectorial force */
681 tx = _mm_mul_ps(fscal,dx21);
682 ty = _mm_mul_ps(fscal,dy21);
683 tz = _mm_mul_ps(fscal,dz21);
685 /* Update vectorial force */
686 fix2 = _mm_add_ps(fix2,tx);
687 fiy2 = _mm_add_ps(fiy2,ty);
688 fiz2 = _mm_add_ps(fiz2,tz);
690 fjx1 = _mm_add_ps(fjx1,tx);
691 fjy1 = _mm_add_ps(fjy1,ty);
692 fjz1 = _mm_add_ps(fjz1,tz);
696 /**************************
697 * CALCULATE INTERACTIONS *
698 **************************/
700 if (gmx_mm_any_lt(rsq22,rcutoff2))
703 r22 = _mm_mul_ps(rsq22,rinv22);
705 /* EWALD ELECTROSTATICS */
707 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
708 ewrt = _mm_mul_ps(r22,ewtabscale);
709 ewitab = _mm_cvttps_epi32(ewrt);
710 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
711 ewitab = _mm_slli_epi32(ewitab,2);
712 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
713 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
714 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
715 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
716 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
717 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
718 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
719 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
720 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
722 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
724 /* Update potential sum for this i atom from the interaction with this j atom. */
725 velec = _mm_and_ps(velec,cutoff_mask);
726 velecsum = _mm_add_ps(velecsum,velec);
730 fscal = _mm_and_ps(fscal,cutoff_mask);
732 /* Calculate temporary vectorial force */
733 tx = _mm_mul_ps(fscal,dx22);
734 ty = _mm_mul_ps(fscal,dy22);
735 tz = _mm_mul_ps(fscal,dz22);
737 /* Update vectorial force */
738 fix2 = _mm_add_ps(fix2,tx);
739 fiy2 = _mm_add_ps(fiy2,ty);
740 fiz2 = _mm_add_ps(fiz2,tz);
742 fjx2 = _mm_add_ps(fjx2,tx);
743 fjy2 = _mm_add_ps(fjy2,ty);
744 fjz2 = _mm_add_ps(fjz2,tz);
748 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
749 f+j_coord_offsetC,f+j_coord_offsetD,
750 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
752 /* Inner loop uses 414 flops */
758 /* Get j neighbor index, and coordinate index */
764 /* Sign of each element will be negative for non-real atoms.
765 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
766 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
768 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
769 jnrA = (jnrA>=0) ? jnrA : 0;
770 jnrB = (jnrB>=0) ? jnrB : 0;
771 jnrC = (jnrC>=0) ? jnrC : 0;
772 jnrD = (jnrD>=0) ? jnrD : 0;
774 j_coord_offsetA = DIM*jnrA;
775 j_coord_offsetB = DIM*jnrB;
776 j_coord_offsetC = DIM*jnrC;
777 j_coord_offsetD = DIM*jnrD;
779 /* load j atom coordinates */
780 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
781 x+j_coord_offsetC,x+j_coord_offsetD,
782 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
784 /* Calculate displacement vector */
785 dx00 = _mm_sub_ps(ix0,jx0);
786 dy00 = _mm_sub_ps(iy0,jy0);
787 dz00 = _mm_sub_ps(iz0,jz0);
788 dx01 = _mm_sub_ps(ix0,jx1);
789 dy01 = _mm_sub_ps(iy0,jy1);
790 dz01 = _mm_sub_ps(iz0,jz1);
791 dx02 = _mm_sub_ps(ix0,jx2);
792 dy02 = _mm_sub_ps(iy0,jy2);
793 dz02 = _mm_sub_ps(iz0,jz2);
794 dx10 = _mm_sub_ps(ix1,jx0);
795 dy10 = _mm_sub_ps(iy1,jy0);
796 dz10 = _mm_sub_ps(iz1,jz0);
797 dx11 = _mm_sub_ps(ix1,jx1);
798 dy11 = _mm_sub_ps(iy1,jy1);
799 dz11 = _mm_sub_ps(iz1,jz1);
800 dx12 = _mm_sub_ps(ix1,jx2);
801 dy12 = _mm_sub_ps(iy1,jy2);
802 dz12 = _mm_sub_ps(iz1,jz2);
803 dx20 = _mm_sub_ps(ix2,jx0);
804 dy20 = _mm_sub_ps(iy2,jy0);
805 dz20 = _mm_sub_ps(iz2,jz0);
806 dx21 = _mm_sub_ps(ix2,jx1);
807 dy21 = _mm_sub_ps(iy2,jy1);
808 dz21 = _mm_sub_ps(iz2,jz1);
809 dx22 = _mm_sub_ps(ix2,jx2);
810 dy22 = _mm_sub_ps(iy2,jy2);
811 dz22 = _mm_sub_ps(iz2,jz2);
813 /* Calculate squared distance and things based on it */
814 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
815 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
816 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
817 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
818 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
819 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
820 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
821 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
822 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
824 rinv00 = gmx_mm_invsqrt_ps(rsq00);
825 rinv01 = gmx_mm_invsqrt_ps(rsq01);
826 rinv02 = gmx_mm_invsqrt_ps(rsq02);
827 rinv10 = gmx_mm_invsqrt_ps(rsq10);
828 rinv11 = gmx_mm_invsqrt_ps(rsq11);
829 rinv12 = gmx_mm_invsqrt_ps(rsq12);
830 rinv20 = gmx_mm_invsqrt_ps(rsq20);
831 rinv21 = gmx_mm_invsqrt_ps(rsq21);
832 rinv22 = gmx_mm_invsqrt_ps(rsq22);
834 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
835 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
836 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
837 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
838 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
839 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
840 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
841 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
842 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
844 fjx0 = _mm_setzero_ps();
845 fjy0 = _mm_setzero_ps();
846 fjz0 = _mm_setzero_ps();
847 fjx1 = _mm_setzero_ps();
848 fjy1 = _mm_setzero_ps();
849 fjz1 = _mm_setzero_ps();
850 fjx2 = _mm_setzero_ps();
851 fjy2 = _mm_setzero_ps();
852 fjz2 = _mm_setzero_ps();
854 /**************************
855 * CALCULATE INTERACTIONS *
856 **************************/
858 if (gmx_mm_any_lt(rsq00,rcutoff2))
861 r00 = _mm_mul_ps(rsq00,rinv00);
862 r00 = _mm_andnot_ps(dummy_mask,r00);
864 /* EWALD ELECTROSTATICS */
866 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
867 ewrt = _mm_mul_ps(r00,ewtabscale);
868 ewitab = _mm_cvttps_epi32(ewrt);
869 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
870 ewitab = _mm_slli_epi32(ewitab,2);
871 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
872 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
873 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
874 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
875 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
876 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
877 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
878 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
879 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
881 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
883 /* Update potential sum for this i atom from the interaction with this j atom. */
884 velec = _mm_and_ps(velec,cutoff_mask);
885 velec = _mm_andnot_ps(dummy_mask,velec);
886 velecsum = _mm_add_ps(velecsum,velec);
890 fscal = _mm_and_ps(fscal,cutoff_mask);
892 fscal = _mm_andnot_ps(dummy_mask,fscal);
894 /* Calculate temporary vectorial force */
895 tx = _mm_mul_ps(fscal,dx00);
896 ty = _mm_mul_ps(fscal,dy00);
897 tz = _mm_mul_ps(fscal,dz00);
899 /* Update vectorial force */
900 fix0 = _mm_add_ps(fix0,tx);
901 fiy0 = _mm_add_ps(fiy0,ty);
902 fiz0 = _mm_add_ps(fiz0,tz);
904 fjx0 = _mm_add_ps(fjx0,tx);
905 fjy0 = _mm_add_ps(fjy0,ty);
906 fjz0 = _mm_add_ps(fjz0,tz);
910 /**************************
911 * CALCULATE INTERACTIONS *
912 **************************/
914 if (gmx_mm_any_lt(rsq01,rcutoff2))
917 r01 = _mm_mul_ps(rsq01,rinv01);
918 r01 = _mm_andnot_ps(dummy_mask,r01);
920 /* EWALD ELECTROSTATICS */
922 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
923 ewrt = _mm_mul_ps(r01,ewtabscale);
924 ewitab = _mm_cvttps_epi32(ewrt);
925 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
926 ewitab = _mm_slli_epi32(ewitab,2);
927 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
928 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
929 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
930 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
931 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
932 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
933 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
934 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
935 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
937 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
939 /* Update potential sum for this i atom from the interaction with this j atom. */
940 velec = _mm_and_ps(velec,cutoff_mask);
941 velec = _mm_andnot_ps(dummy_mask,velec);
942 velecsum = _mm_add_ps(velecsum,velec);
946 fscal = _mm_and_ps(fscal,cutoff_mask);
948 fscal = _mm_andnot_ps(dummy_mask,fscal);
950 /* Calculate temporary vectorial force */
951 tx = _mm_mul_ps(fscal,dx01);
952 ty = _mm_mul_ps(fscal,dy01);
953 tz = _mm_mul_ps(fscal,dz01);
955 /* Update vectorial force */
956 fix0 = _mm_add_ps(fix0,tx);
957 fiy0 = _mm_add_ps(fiy0,ty);
958 fiz0 = _mm_add_ps(fiz0,tz);
960 fjx1 = _mm_add_ps(fjx1,tx);
961 fjy1 = _mm_add_ps(fjy1,ty);
962 fjz1 = _mm_add_ps(fjz1,tz);
966 /**************************
967 * CALCULATE INTERACTIONS *
968 **************************/
970 if (gmx_mm_any_lt(rsq02,rcutoff2))
973 r02 = _mm_mul_ps(rsq02,rinv02);
974 r02 = _mm_andnot_ps(dummy_mask,r02);
976 /* EWALD ELECTROSTATICS */
978 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
979 ewrt = _mm_mul_ps(r02,ewtabscale);
980 ewitab = _mm_cvttps_epi32(ewrt);
981 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
982 ewitab = _mm_slli_epi32(ewitab,2);
983 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
984 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
985 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
986 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
987 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
988 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
989 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
990 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
991 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
993 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
995 /* Update potential sum for this i atom from the interaction with this j atom. */
996 velec = _mm_and_ps(velec,cutoff_mask);
997 velec = _mm_andnot_ps(dummy_mask,velec);
998 velecsum = _mm_add_ps(velecsum,velec);
1002 fscal = _mm_and_ps(fscal,cutoff_mask);
1004 fscal = _mm_andnot_ps(dummy_mask,fscal);
1006 /* Calculate temporary vectorial force */
1007 tx = _mm_mul_ps(fscal,dx02);
1008 ty = _mm_mul_ps(fscal,dy02);
1009 tz = _mm_mul_ps(fscal,dz02);
1011 /* Update vectorial force */
1012 fix0 = _mm_add_ps(fix0,tx);
1013 fiy0 = _mm_add_ps(fiy0,ty);
1014 fiz0 = _mm_add_ps(fiz0,tz);
1016 fjx2 = _mm_add_ps(fjx2,tx);
1017 fjy2 = _mm_add_ps(fjy2,ty);
1018 fjz2 = _mm_add_ps(fjz2,tz);
1022 /**************************
1023 * CALCULATE INTERACTIONS *
1024 **************************/
1026 if (gmx_mm_any_lt(rsq10,rcutoff2))
1029 r10 = _mm_mul_ps(rsq10,rinv10);
1030 r10 = _mm_andnot_ps(dummy_mask,r10);
1032 /* EWALD ELECTROSTATICS */
1034 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1035 ewrt = _mm_mul_ps(r10,ewtabscale);
1036 ewitab = _mm_cvttps_epi32(ewrt);
1037 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1038 ewitab = _mm_slli_epi32(ewitab,2);
1039 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1040 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1041 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1042 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1043 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1044 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1045 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1046 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1047 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1049 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1051 /* Update potential sum for this i atom from the interaction with this j atom. */
1052 velec = _mm_and_ps(velec,cutoff_mask);
1053 velec = _mm_andnot_ps(dummy_mask,velec);
1054 velecsum = _mm_add_ps(velecsum,velec);
1058 fscal = _mm_and_ps(fscal,cutoff_mask);
1060 fscal = _mm_andnot_ps(dummy_mask,fscal);
1062 /* Calculate temporary vectorial force */
1063 tx = _mm_mul_ps(fscal,dx10);
1064 ty = _mm_mul_ps(fscal,dy10);
1065 tz = _mm_mul_ps(fscal,dz10);
1067 /* Update vectorial force */
1068 fix1 = _mm_add_ps(fix1,tx);
1069 fiy1 = _mm_add_ps(fiy1,ty);
1070 fiz1 = _mm_add_ps(fiz1,tz);
1072 fjx0 = _mm_add_ps(fjx0,tx);
1073 fjy0 = _mm_add_ps(fjy0,ty);
1074 fjz0 = _mm_add_ps(fjz0,tz);
1078 /**************************
1079 * CALCULATE INTERACTIONS *
1080 **************************/
1082 if (gmx_mm_any_lt(rsq11,rcutoff2))
1085 r11 = _mm_mul_ps(rsq11,rinv11);
1086 r11 = _mm_andnot_ps(dummy_mask,r11);
1088 /* EWALD ELECTROSTATICS */
1090 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1091 ewrt = _mm_mul_ps(r11,ewtabscale);
1092 ewitab = _mm_cvttps_epi32(ewrt);
1093 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1094 ewitab = _mm_slli_epi32(ewitab,2);
1095 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1096 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1097 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1098 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1099 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1100 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1101 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1102 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1103 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1105 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1107 /* Update potential sum for this i atom from the interaction with this j atom. */
1108 velec = _mm_and_ps(velec,cutoff_mask);
1109 velec = _mm_andnot_ps(dummy_mask,velec);
1110 velecsum = _mm_add_ps(velecsum,velec);
1114 fscal = _mm_and_ps(fscal,cutoff_mask);
1116 fscal = _mm_andnot_ps(dummy_mask,fscal);
1118 /* Calculate temporary vectorial force */
1119 tx = _mm_mul_ps(fscal,dx11);
1120 ty = _mm_mul_ps(fscal,dy11);
1121 tz = _mm_mul_ps(fscal,dz11);
1123 /* Update vectorial force */
1124 fix1 = _mm_add_ps(fix1,tx);
1125 fiy1 = _mm_add_ps(fiy1,ty);
1126 fiz1 = _mm_add_ps(fiz1,tz);
1128 fjx1 = _mm_add_ps(fjx1,tx);
1129 fjy1 = _mm_add_ps(fjy1,ty);
1130 fjz1 = _mm_add_ps(fjz1,tz);
1134 /**************************
1135 * CALCULATE INTERACTIONS *
1136 **************************/
1138 if (gmx_mm_any_lt(rsq12,rcutoff2))
1141 r12 = _mm_mul_ps(rsq12,rinv12);
1142 r12 = _mm_andnot_ps(dummy_mask,r12);
1144 /* EWALD ELECTROSTATICS */
1146 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1147 ewrt = _mm_mul_ps(r12,ewtabscale);
1148 ewitab = _mm_cvttps_epi32(ewrt);
1149 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1150 ewitab = _mm_slli_epi32(ewitab,2);
1151 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1152 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1153 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1154 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1155 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1156 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1157 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1158 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1159 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1161 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1163 /* Update potential sum for this i atom from the interaction with this j atom. */
1164 velec = _mm_and_ps(velec,cutoff_mask);
1165 velec = _mm_andnot_ps(dummy_mask,velec);
1166 velecsum = _mm_add_ps(velecsum,velec);
1170 fscal = _mm_and_ps(fscal,cutoff_mask);
1172 fscal = _mm_andnot_ps(dummy_mask,fscal);
1174 /* Calculate temporary vectorial force */
1175 tx = _mm_mul_ps(fscal,dx12);
1176 ty = _mm_mul_ps(fscal,dy12);
1177 tz = _mm_mul_ps(fscal,dz12);
1179 /* Update vectorial force */
1180 fix1 = _mm_add_ps(fix1,tx);
1181 fiy1 = _mm_add_ps(fiy1,ty);
1182 fiz1 = _mm_add_ps(fiz1,tz);
1184 fjx2 = _mm_add_ps(fjx2,tx);
1185 fjy2 = _mm_add_ps(fjy2,ty);
1186 fjz2 = _mm_add_ps(fjz2,tz);
1190 /**************************
1191 * CALCULATE INTERACTIONS *
1192 **************************/
1194 if (gmx_mm_any_lt(rsq20,rcutoff2))
1197 r20 = _mm_mul_ps(rsq20,rinv20);
1198 r20 = _mm_andnot_ps(dummy_mask,r20);
1200 /* EWALD ELECTROSTATICS */
1202 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1203 ewrt = _mm_mul_ps(r20,ewtabscale);
1204 ewitab = _mm_cvttps_epi32(ewrt);
1205 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1206 ewitab = _mm_slli_epi32(ewitab,2);
1207 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1208 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1209 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1210 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1211 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1212 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1213 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1214 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1215 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1217 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1219 /* Update potential sum for this i atom from the interaction with this j atom. */
1220 velec = _mm_and_ps(velec,cutoff_mask);
1221 velec = _mm_andnot_ps(dummy_mask,velec);
1222 velecsum = _mm_add_ps(velecsum,velec);
1226 fscal = _mm_and_ps(fscal,cutoff_mask);
1228 fscal = _mm_andnot_ps(dummy_mask,fscal);
1230 /* Calculate temporary vectorial force */
1231 tx = _mm_mul_ps(fscal,dx20);
1232 ty = _mm_mul_ps(fscal,dy20);
1233 tz = _mm_mul_ps(fscal,dz20);
1235 /* Update vectorial force */
1236 fix2 = _mm_add_ps(fix2,tx);
1237 fiy2 = _mm_add_ps(fiy2,ty);
1238 fiz2 = _mm_add_ps(fiz2,tz);
1240 fjx0 = _mm_add_ps(fjx0,tx);
1241 fjy0 = _mm_add_ps(fjy0,ty);
1242 fjz0 = _mm_add_ps(fjz0,tz);
1246 /**************************
1247 * CALCULATE INTERACTIONS *
1248 **************************/
1250 if (gmx_mm_any_lt(rsq21,rcutoff2))
1253 r21 = _mm_mul_ps(rsq21,rinv21);
1254 r21 = _mm_andnot_ps(dummy_mask,r21);
1256 /* EWALD ELECTROSTATICS */
1258 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1259 ewrt = _mm_mul_ps(r21,ewtabscale);
1260 ewitab = _mm_cvttps_epi32(ewrt);
1261 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1262 ewitab = _mm_slli_epi32(ewitab,2);
1263 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1264 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1265 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1266 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1267 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1268 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1269 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1270 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1271 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1273 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1275 /* Update potential sum for this i atom from the interaction with this j atom. */
1276 velec = _mm_and_ps(velec,cutoff_mask);
1277 velec = _mm_andnot_ps(dummy_mask,velec);
1278 velecsum = _mm_add_ps(velecsum,velec);
1282 fscal = _mm_and_ps(fscal,cutoff_mask);
1284 fscal = _mm_andnot_ps(dummy_mask,fscal);
1286 /* Calculate temporary vectorial force */
1287 tx = _mm_mul_ps(fscal,dx21);
1288 ty = _mm_mul_ps(fscal,dy21);
1289 tz = _mm_mul_ps(fscal,dz21);
1291 /* Update vectorial force */
1292 fix2 = _mm_add_ps(fix2,tx);
1293 fiy2 = _mm_add_ps(fiy2,ty);
1294 fiz2 = _mm_add_ps(fiz2,tz);
1296 fjx1 = _mm_add_ps(fjx1,tx);
1297 fjy1 = _mm_add_ps(fjy1,ty);
1298 fjz1 = _mm_add_ps(fjz1,tz);
1302 /**************************
1303 * CALCULATE INTERACTIONS *
1304 **************************/
1306 if (gmx_mm_any_lt(rsq22,rcutoff2))
1309 r22 = _mm_mul_ps(rsq22,rinv22);
1310 r22 = _mm_andnot_ps(dummy_mask,r22);
1312 /* EWALD ELECTROSTATICS */
1314 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1315 ewrt = _mm_mul_ps(r22,ewtabscale);
1316 ewitab = _mm_cvttps_epi32(ewrt);
1317 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1318 ewitab = _mm_slli_epi32(ewitab,2);
1319 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1320 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1321 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1322 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1323 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1324 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1325 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1326 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1327 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1329 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1331 /* Update potential sum for this i atom from the interaction with this j atom. */
1332 velec = _mm_and_ps(velec,cutoff_mask);
1333 velec = _mm_andnot_ps(dummy_mask,velec);
1334 velecsum = _mm_add_ps(velecsum,velec);
1338 fscal = _mm_and_ps(fscal,cutoff_mask);
1340 fscal = _mm_andnot_ps(dummy_mask,fscal);
1342 /* Calculate temporary vectorial force */
1343 tx = _mm_mul_ps(fscal,dx22);
1344 ty = _mm_mul_ps(fscal,dy22);
1345 tz = _mm_mul_ps(fscal,dz22);
1347 /* Update vectorial force */
1348 fix2 = _mm_add_ps(fix2,tx);
1349 fiy2 = _mm_add_ps(fiy2,ty);
1350 fiz2 = _mm_add_ps(fiz2,tz);
1352 fjx2 = _mm_add_ps(fjx2,tx);
1353 fjy2 = _mm_add_ps(fjy2,ty);
1354 fjz2 = _mm_add_ps(fjz2,tz);
1358 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1359 f+j_coord_offsetC,f+j_coord_offsetD,
1360 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1362 /* Inner loop uses 423 flops */
1365 /* End of innermost loop */
1367 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1368 f+i_coord_offset,fshift+i_shift_offset);
1371 /* Update potential energies */
1372 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1374 /* Increment number of inner iterations */
1375 inneriter += j_index_end - j_index_start;
1377 /* Outer loop uses 28 flops */
1380 /* Increment number of outer iterations */
1383 /* Update outer/inner flops */
1385 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*28 + inneriter*423);
1388 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single
1389 * Electrostatics interaction: Ewald
1390 * VdW interaction: None
1391 * Geometry: Water3-Water3
1392 * Calculate force/pot: Force
1395 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_sse2_single
1396 (t_nblist * gmx_restrict nlist,
1397 rvec * gmx_restrict xx,
1398 rvec * gmx_restrict ff,
1399 t_forcerec * gmx_restrict fr,
1400 t_mdatoms * gmx_restrict mdatoms,
1401 nb_kernel_data_t * gmx_restrict kernel_data,
1402 t_nrnb * gmx_restrict nrnb)
1404 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1405 * just 0 for non-waters.
1406 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1407 * jnr indices corresponding to data put in the four positions in the SIMD register.
1409 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1410 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1411 int jnrA,jnrB,jnrC,jnrD;
1412 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1413 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1414 real shX,shY,shZ,rcutoff_scalar;
1415 real *shiftvec,*fshift,*x,*f;
1416 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1418 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1420 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1422 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1423 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1424 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1425 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1426 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1427 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1428 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1429 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1430 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1431 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1432 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1433 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1434 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1435 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1436 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1437 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1438 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1441 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1443 __m128 dummy_mask,cutoff_mask;
1444 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1445 __m128 one = _mm_set1_ps(1.0);
1446 __m128 two = _mm_set1_ps(2.0);
1452 jindex = nlist->jindex;
1454 shiftidx = nlist->shift;
1456 shiftvec = fr->shift_vec[0];
1457 fshift = fr->fshift[0];
1458 facel = _mm_set1_ps(fr->epsfac);
1459 charge = mdatoms->chargeA;
1461 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1462 ewtab = fr->ic->tabq_coul_F;
1463 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1464 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1466 /* Setup water-specific parameters */
1467 inr = nlist->iinr[0];
1468 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1469 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1470 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1472 jq0 = _mm_set1_ps(charge[inr+0]);
1473 jq1 = _mm_set1_ps(charge[inr+1]);
1474 jq2 = _mm_set1_ps(charge[inr+2]);
1475 qq00 = _mm_mul_ps(iq0,jq0);
1476 qq01 = _mm_mul_ps(iq0,jq1);
1477 qq02 = _mm_mul_ps(iq0,jq2);
1478 qq10 = _mm_mul_ps(iq1,jq0);
1479 qq11 = _mm_mul_ps(iq1,jq1);
1480 qq12 = _mm_mul_ps(iq1,jq2);
1481 qq20 = _mm_mul_ps(iq2,jq0);
1482 qq21 = _mm_mul_ps(iq2,jq1);
1483 qq22 = _mm_mul_ps(iq2,jq2);
1485 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1486 rcutoff_scalar = fr->rcoulomb;
1487 rcutoff = _mm_set1_ps(rcutoff_scalar);
1488 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1490 /* Avoid stupid compiler warnings */
1491 jnrA = jnrB = jnrC = jnrD = 0;
1492 j_coord_offsetA = 0;
1493 j_coord_offsetB = 0;
1494 j_coord_offsetC = 0;
1495 j_coord_offsetD = 0;
1500 /* Start outer loop over neighborlists */
1501 for(iidx=0; iidx<nri; iidx++)
1503 /* Load shift vector for this list */
1504 i_shift_offset = DIM*shiftidx[iidx];
1505 shX = shiftvec[i_shift_offset+XX];
1506 shY = shiftvec[i_shift_offset+YY];
1507 shZ = shiftvec[i_shift_offset+ZZ];
1509 /* Load limits for loop over neighbors */
1510 j_index_start = jindex[iidx];
1511 j_index_end = jindex[iidx+1];
1513 /* Get outer coordinate index */
1515 i_coord_offset = DIM*inr;
1517 /* Load i particle coords and add shift vector */
1518 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1519 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1520 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1521 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1522 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1523 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1524 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1525 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1526 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1528 fix0 = _mm_setzero_ps();
1529 fiy0 = _mm_setzero_ps();
1530 fiz0 = _mm_setzero_ps();
1531 fix1 = _mm_setzero_ps();
1532 fiy1 = _mm_setzero_ps();
1533 fiz1 = _mm_setzero_ps();
1534 fix2 = _mm_setzero_ps();
1535 fiy2 = _mm_setzero_ps();
1536 fiz2 = _mm_setzero_ps();
1538 /* Start inner kernel loop */
1539 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1542 /* Get j neighbor index, and coordinate index */
1544 jnrB = jjnr[jidx+1];
1545 jnrC = jjnr[jidx+2];
1546 jnrD = jjnr[jidx+3];
1548 j_coord_offsetA = DIM*jnrA;
1549 j_coord_offsetB = DIM*jnrB;
1550 j_coord_offsetC = DIM*jnrC;
1551 j_coord_offsetD = DIM*jnrD;
1553 /* load j atom coordinates */
1554 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1555 x+j_coord_offsetC,x+j_coord_offsetD,
1556 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1558 /* Calculate displacement vector */
1559 dx00 = _mm_sub_ps(ix0,jx0);
1560 dy00 = _mm_sub_ps(iy0,jy0);
1561 dz00 = _mm_sub_ps(iz0,jz0);
1562 dx01 = _mm_sub_ps(ix0,jx1);
1563 dy01 = _mm_sub_ps(iy0,jy1);
1564 dz01 = _mm_sub_ps(iz0,jz1);
1565 dx02 = _mm_sub_ps(ix0,jx2);
1566 dy02 = _mm_sub_ps(iy0,jy2);
1567 dz02 = _mm_sub_ps(iz0,jz2);
1568 dx10 = _mm_sub_ps(ix1,jx0);
1569 dy10 = _mm_sub_ps(iy1,jy0);
1570 dz10 = _mm_sub_ps(iz1,jz0);
1571 dx11 = _mm_sub_ps(ix1,jx1);
1572 dy11 = _mm_sub_ps(iy1,jy1);
1573 dz11 = _mm_sub_ps(iz1,jz1);
1574 dx12 = _mm_sub_ps(ix1,jx2);
1575 dy12 = _mm_sub_ps(iy1,jy2);
1576 dz12 = _mm_sub_ps(iz1,jz2);
1577 dx20 = _mm_sub_ps(ix2,jx0);
1578 dy20 = _mm_sub_ps(iy2,jy0);
1579 dz20 = _mm_sub_ps(iz2,jz0);
1580 dx21 = _mm_sub_ps(ix2,jx1);
1581 dy21 = _mm_sub_ps(iy2,jy1);
1582 dz21 = _mm_sub_ps(iz2,jz1);
1583 dx22 = _mm_sub_ps(ix2,jx2);
1584 dy22 = _mm_sub_ps(iy2,jy2);
1585 dz22 = _mm_sub_ps(iz2,jz2);
1587 /* Calculate squared distance and things based on it */
1588 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1589 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1590 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1591 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1592 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1593 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1594 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1595 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1596 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1598 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1599 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1600 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1601 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1602 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1603 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1604 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1605 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1606 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1608 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1609 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1610 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1611 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1612 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1613 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1614 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1615 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1616 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1618 fjx0 = _mm_setzero_ps();
1619 fjy0 = _mm_setzero_ps();
1620 fjz0 = _mm_setzero_ps();
1621 fjx1 = _mm_setzero_ps();
1622 fjy1 = _mm_setzero_ps();
1623 fjz1 = _mm_setzero_ps();
1624 fjx2 = _mm_setzero_ps();
1625 fjy2 = _mm_setzero_ps();
1626 fjz2 = _mm_setzero_ps();
1628 /**************************
1629 * CALCULATE INTERACTIONS *
1630 **************************/
1632 if (gmx_mm_any_lt(rsq00,rcutoff2))
1635 r00 = _mm_mul_ps(rsq00,rinv00);
1637 /* EWALD ELECTROSTATICS */
1639 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1640 ewrt = _mm_mul_ps(r00,ewtabscale);
1641 ewitab = _mm_cvttps_epi32(ewrt);
1642 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1643 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1644 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1646 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1647 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1649 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1653 fscal = _mm_and_ps(fscal,cutoff_mask);
1655 /* Calculate temporary vectorial force */
1656 tx = _mm_mul_ps(fscal,dx00);
1657 ty = _mm_mul_ps(fscal,dy00);
1658 tz = _mm_mul_ps(fscal,dz00);
1660 /* Update vectorial force */
1661 fix0 = _mm_add_ps(fix0,tx);
1662 fiy0 = _mm_add_ps(fiy0,ty);
1663 fiz0 = _mm_add_ps(fiz0,tz);
1665 fjx0 = _mm_add_ps(fjx0,tx);
1666 fjy0 = _mm_add_ps(fjy0,ty);
1667 fjz0 = _mm_add_ps(fjz0,tz);
1671 /**************************
1672 * CALCULATE INTERACTIONS *
1673 **************************/
1675 if (gmx_mm_any_lt(rsq01,rcutoff2))
1678 r01 = _mm_mul_ps(rsq01,rinv01);
1680 /* EWALD ELECTROSTATICS */
1682 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1683 ewrt = _mm_mul_ps(r01,ewtabscale);
1684 ewitab = _mm_cvttps_epi32(ewrt);
1685 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1686 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1687 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1689 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1690 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1692 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1696 fscal = _mm_and_ps(fscal,cutoff_mask);
1698 /* Calculate temporary vectorial force */
1699 tx = _mm_mul_ps(fscal,dx01);
1700 ty = _mm_mul_ps(fscal,dy01);
1701 tz = _mm_mul_ps(fscal,dz01);
1703 /* Update vectorial force */
1704 fix0 = _mm_add_ps(fix0,tx);
1705 fiy0 = _mm_add_ps(fiy0,ty);
1706 fiz0 = _mm_add_ps(fiz0,tz);
1708 fjx1 = _mm_add_ps(fjx1,tx);
1709 fjy1 = _mm_add_ps(fjy1,ty);
1710 fjz1 = _mm_add_ps(fjz1,tz);
1714 /**************************
1715 * CALCULATE INTERACTIONS *
1716 **************************/
1718 if (gmx_mm_any_lt(rsq02,rcutoff2))
1721 r02 = _mm_mul_ps(rsq02,rinv02);
1723 /* EWALD ELECTROSTATICS */
1725 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1726 ewrt = _mm_mul_ps(r02,ewtabscale);
1727 ewitab = _mm_cvttps_epi32(ewrt);
1728 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1729 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1730 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1732 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1733 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1735 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1739 fscal = _mm_and_ps(fscal,cutoff_mask);
1741 /* Calculate temporary vectorial force */
1742 tx = _mm_mul_ps(fscal,dx02);
1743 ty = _mm_mul_ps(fscal,dy02);
1744 tz = _mm_mul_ps(fscal,dz02);
1746 /* Update vectorial force */
1747 fix0 = _mm_add_ps(fix0,tx);
1748 fiy0 = _mm_add_ps(fiy0,ty);
1749 fiz0 = _mm_add_ps(fiz0,tz);
1751 fjx2 = _mm_add_ps(fjx2,tx);
1752 fjy2 = _mm_add_ps(fjy2,ty);
1753 fjz2 = _mm_add_ps(fjz2,tz);
1757 /**************************
1758 * CALCULATE INTERACTIONS *
1759 **************************/
1761 if (gmx_mm_any_lt(rsq10,rcutoff2))
1764 r10 = _mm_mul_ps(rsq10,rinv10);
1766 /* EWALD ELECTROSTATICS */
1768 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1769 ewrt = _mm_mul_ps(r10,ewtabscale);
1770 ewitab = _mm_cvttps_epi32(ewrt);
1771 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1772 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1773 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1775 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1776 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1778 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1782 fscal = _mm_and_ps(fscal,cutoff_mask);
1784 /* Calculate temporary vectorial force */
1785 tx = _mm_mul_ps(fscal,dx10);
1786 ty = _mm_mul_ps(fscal,dy10);
1787 tz = _mm_mul_ps(fscal,dz10);
1789 /* Update vectorial force */
1790 fix1 = _mm_add_ps(fix1,tx);
1791 fiy1 = _mm_add_ps(fiy1,ty);
1792 fiz1 = _mm_add_ps(fiz1,tz);
1794 fjx0 = _mm_add_ps(fjx0,tx);
1795 fjy0 = _mm_add_ps(fjy0,ty);
1796 fjz0 = _mm_add_ps(fjz0,tz);
1800 /**************************
1801 * CALCULATE INTERACTIONS *
1802 **************************/
1804 if (gmx_mm_any_lt(rsq11,rcutoff2))
1807 r11 = _mm_mul_ps(rsq11,rinv11);
1809 /* EWALD ELECTROSTATICS */
1811 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1812 ewrt = _mm_mul_ps(r11,ewtabscale);
1813 ewitab = _mm_cvttps_epi32(ewrt);
1814 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1815 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1816 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1818 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1819 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1821 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1825 fscal = _mm_and_ps(fscal,cutoff_mask);
1827 /* Calculate temporary vectorial force */
1828 tx = _mm_mul_ps(fscal,dx11);
1829 ty = _mm_mul_ps(fscal,dy11);
1830 tz = _mm_mul_ps(fscal,dz11);
1832 /* Update vectorial force */
1833 fix1 = _mm_add_ps(fix1,tx);
1834 fiy1 = _mm_add_ps(fiy1,ty);
1835 fiz1 = _mm_add_ps(fiz1,tz);
1837 fjx1 = _mm_add_ps(fjx1,tx);
1838 fjy1 = _mm_add_ps(fjy1,ty);
1839 fjz1 = _mm_add_ps(fjz1,tz);
1843 /**************************
1844 * CALCULATE INTERACTIONS *
1845 **************************/
1847 if (gmx_mm_any_lt(rsq12,rcutoff2))
1850 r12 = _mm_mul_ps(rsq12,rinv12);
1852 /* EWALD ELECTROSTATICS */
1854 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1855 ewrt = _mm_mul_ps(r12,ewtabscale);
1856 ewitab = _mm_cvttps_epi32(ewrt);
1857 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1858 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1859 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1861 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1862 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1864 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1868 fscal = _mm_and_ps(fscal,cutoff_mask);
1870 /* Calculate temporary vectorial force */
1871 tx = _mm_mul_ps(fscal,dx12);
1872 ty = _mm_mul_ps(fscal,dy12);
1873 tz = _mm_mul_ps(fscal,dz12);
1875 /* Update vectorial force */
1876 fix1 = _mm_add_ps(fix1,tx);
1877 fiy1 = _mm_add_ps(fiy1,ty);
1878 fiz1 = _mm_add_ps(fiz1,tz);
1880 fjx2 = _mm_add_ps(fjx2,tx);
1881 fjy2 = _mm_add_ps(fjy2,ty);
1882 fjz2 = _mm_add_ps(fjz2,tz);
1886 /**************************
1887 * CALCULATE INTERACTIONS *
1888 **************************/
1890 if (gmx_mm_any_lt(rsq20,rcutoff2))
1893 r20 = _mm_mul_ps(rsq20,rinv20);
1895 /* EWALD ELECTROSTATICS */
1897 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1898 ewrt = _mm_mul_ps(r20,ewtabscale);
1899 ewitab = _mm_cvttps_epi32(ewrt);
1900 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1901 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1902 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1904 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1905 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1907 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1911 fscal = _mm_and_ps(fscal,cutoff_mask);
1913 /* Calculate temporary vectorial force */
1914 tx = _mm_mul_ps(fscal,dx20);
1915 ty = _mm_mul_ps(fscal,dy20);
1916 tz = _mm_mul_ps(fscal,dz20);
1918 /* Update vectorial force */
1919 fix2 = _mm_add_ps(fix2,tx);
1920 fiy2 = _mm_add_ps(fiy2,ty);
1921 fiz2 = _mm_add_ps(fiz2,tz);
1923 fjx0 = _mm_add_ps(fjx0,tx);
1924 fjy0 = _mm_add_ps(fjy0,ty);
1925 fjz0 = _mm_add_ps(fjz0,tz);
1929 /**************************
1930 * CALCULATE INTERACTIONS *
1931 **************************/
1933 if (gmx_mm_any_lt(rsq21,rcutoff2))
1936 r21 = _mm_mul_ps(rsq21,rinv21);
1938 /* EWALD ELECTROSTATICS */
1940 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1941 ewrt = _mm_mul_ps(r21,ewtabscale);
1942 ewitab = _mm_cvttps_epi32(ewrt);
1943 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1944 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1945 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1947 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1948 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1950 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1954 fscal = _mm_and_ps(fscal,cutoff_mask);
1956 /* Calculate temporary vectorial force */
1957 tx = _mm_mul_ps(fscal,dx21);
1958 ty = _mm_mul_ps(fscal,dy21);
1959 tz = _mm_mul_ps(fscal,dz21);
1961 /* Update vectorial force */
1962 fix2 = _mm_add_ps(fix2,tx);
1963 fiy2 = _mm_add_ps(fiy2,ty);
1964 fiz2 = _mm_add_ps(fiz2,tz);
1966 fjx1 = _mm_add_ps(fjx1,tx);
1967 fjy1 = _mm_add_ps(fjy1,ty);
1968 fjz1 = _mm_add_ps(fjz1,tz);
1972 /**************************
1973 * CALCULATE INTERACTIONS *
1974 **************************/
1976 if (gmx_mm_any_lt(rsq22,rcutoff2))
1979 r22 = _mm_mul_ps(rsq22,rinv22);
1981 /* EWALD ELECTROSTATICS */
1983 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1984 ewrt = _mm_mul_ps(r22,ewtabscale);
1985 ewitab = _mm_cvttps_epi32(ewrt);
1986 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1987 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1988 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1990 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1991 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1993 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1997 fscal = _mm_and_ps(fscal,cutoff_mask);
1999 /* Calculate temporary vectorial force */
2000 tx = _mm_mul_ps(fscal,dx22);
2001 ty = _mm_mul_ps(fscal,dy22);
2002 tz = _mm_mul_ps(fscal,dz22);
2004 /* Update vectorial force */
2005 fix2 = _mm_add_ps(fix2,tx);
2006 fiy2 = _mm_add_ps(fiy2,ty);
2007 fiz2 = _mm_add_ps(fiz2,tz);
2009 fjx2 = _mm_add_ps(fjx2,tx);
2010 fjy2 = _mm_add_ps(fjy2,ty);
2011 fjz2 = _mm_add_ps(fjz2,tz);
2015 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2016 f+j_coord_offsetC,f+j_coord_offsetD,
2017 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2019 /* Inner loop uses 351 flops */
2022 if(jidx<j_index_end)
2025 /* Get j neighbor index, and coordinate index */
2027 jnrB = jjnr[jidx+1];
2028 jnrC = jjnr[jidx+2];
2029 jnrD = jjnr[jidx+3];
2031 /* Sign of each element will be negative for non-real atoms.
2032 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2033 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2035 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2036 jnrA = (jnrA>=0) ? jnrA : 0;
2037 jnrB = (jnrB>=0) ? jnrB : 0;
2038 jnrC = (jnrC>=0) ? jnrC : 0;
2039 jnrD = (jnrD>=0) ? jnrD : 0;
2041 j_coord_offsetA = DIM*jnrA;
2042 j_coord_offsetB = DIM*jnrB;
2043 j_coord_offsetC = DIM*jnrC;
2044 j_coord_offsetD = DIM*jnrD;
2046 /* load j atom coordinates */
2047 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2048 x+j_coord_offsetC,x+j_coord_offsetD,
2049 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2051 /* Calculate displacement vector */
2052 dx00 = _mm_sub_ps(ix0,jx0);
2053 dy00 = _mm_sub_ps(iy0,jy0);
2054 dz00 = _mm_sub_ps(iz0,jz0);
2055 dx01 = _mm_sub_ps(ix0,jx1);
2056 dy01 = _mm_sub_ps(iy0,jy1);
2057 dz01 = _mm_sub_ps(iz0,jz1);
2058 dx02 = _mm_sub_ps(ix0,jx2);
2059 dy02 = _mm_sub_ps(iy0,jy2);
2060 dz02 = _mm_sub_ps(iz0,jz2);
2061 dx10 = _mm_sub_ps(ix1,jx0);
2062 dy10 = _mm_sub_ps(iy1,jy0);
2063 dz10 = _mm_sub_ps(iz1,jz0);
2064 dx11 = _mm_sub_ps(ix1,jx1);
2065 dy11 = _mm_sub_ps(iy1,jy1);
2066 dz11 = _mm_sub_ps(iz1,jz1);
2067 dx12 = _mm_sub_ps(ix1,jx2);
2068 dy12 = _mm_sub_ps(iy1,jy2);
2069 dz12 = _mm_sub_ps(iz1,jz2);
2070 dx20 = _mm_sub_ps(ix2,jx0);
2071 dy20 = _mm_sub_ps(iy2,jy0);
2072 dz20 = _mm_sub_ps(iz2,jz0);
2073 dx21 = _mm_sub_ps(ix2,jx1);
2074 dy21 = _mm_sub_ps(iy2,jy1);
2075 dz21 = _mm_sub_ps(iz2,jz1);
2076 dx22 = _mm_sub_ps(ix2,jx2);
2077 dy22 = _mm_sub_ps(iy2,jy2);
2078 dz22 = _mm_sub_ps(iz2,jz2);
2080 /* Calculate squared distance and things based on it */
2081 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2082 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2083 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2084 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2085 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2086 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2087 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2088 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2089 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2091 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2092 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2093 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2094 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2095 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2096 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2097 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2098 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2099 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2101 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2102 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2103 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2104 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2105 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2106 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2107 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2108 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2109 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2111 fjx0 = _mm_setzero_ps();
2112 fjy0 = _mm_setzero_ps();
2113 fjz0 = _mm_setzero_ps();
2114 fjx1 = _mm_setzero_ps();
2115 fjy1 = _mm_setzero_ps();
2116 fjz1 = _mm_setzero_ps();
2117 fjx2 = _mm_setzero_ps();
2118 fjy2 = _mm_setzero_ps();
2119 fjz2 = _mm_setzero_ps();
2121 /**************************
2122 * CALCULATE INTERACTIONS *
2123 **************************/
2125 if (gmx_mm_any_lt(rsq00,rcutoff2))
2128 r00 = _mm_mul_ps(rsq00,rinv00);
2129 r00 = _mm_andnot_ps(dummy_mask,r00);
2131 /* EWALD ELECTROSTATICS */
2133 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2134 ewrt = _mm_mul_ps(r00,ewtabscale);
2135 ewitab = _mm_cvttps_epi32(ewrt);
2136 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2137 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2138 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2140 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2141 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2143 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2147 fscal = _mm_and_ps(fscal,cutoff_mask);
2149 fscal = _mm_andnot_ps(dummy_mask,fscal);
2151 /* Calculate temporary vectorial force */
2152 tx = _mm_mul_ps(fscal,dx00);
2153 ty = _mm_mul_ps(fscal,dy00);
2154 tz = _mm_mul_ps(fscal,dz00);
2156 /* Update vectorial force */
2157 fix0 = _mm_add_ps(fix0,tx);
2158 fiy0 = _mm_add_ps(fiy0,ty);
2159 fiz0 = _mm_add_ps(fiz0,tz);
2161 fjx0 = _mm_add_ps(fjx0,tx);
2162 fjy0 = _mm_add_ps(fjy0,ty);
2163 fjz0 = _mm_add_ps(fjz0,tz);
2167 /**************************
2168 * CALCULATE INTERACTIONS *
2169 **************************/
2171 if (gmx_mm_any_lt(rsq01,rcutoff2))
2174 r01 = _mm_mul_ps(rsq01,rinv01);
2175 r01 = _mm_andnot_ps(dummy_mask,r01);
2177 /* EWALD ELECTROSTATICS */
2179 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2180 ewrt = _mm_mul_ps(r01,ewtabscale);
2181 ewitab = _mm_cvttps_epi32(ewrt);
2182 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2183 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2184 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2186 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2187 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2189 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2193 fscal = _mm_and_ps(fscal,cutoff_mask);
2195 fscal = _mm_andnot_ps(dummy_mask,fscal);
2197 /* Calculate temporary vectorial force */
2198 tx = _mm_mul_ps(fscal,dx01);
2199 ty = _mm_mul_ps(fscal,dy01);
2200 tz = _mm_mul_ps(fscal,dz01);
2202 /* Update vectorial force */
2203 fix0 = _mm_add_ps(fix0,tx);
2204 fiy0 = _mm_add_ps(fiy0,ty);
2205 fiz0 = _mm_add_ps(fiz0,tz);
2207 fjx1 = _mm_add_ps(fjx1,tx);
2208 fjy1 = _mm_add_ps(fjy1,ty);
2209 fjz1 = _mm_add_ps(fjz1,tz);
2213 /**************************
2214 * CALCULATE INTERACTIONS *
2215 **************************/
2217 if (gmx_mm_any_lt(rsq02,rcutoff2))
2220 r02 = _mm_mul_ps(rsq02,rinv02);
2221 r02 = _mm_andnot_ps(dummy_mask,r02);
2223 /* EWALD ELECTROSTATICS */
2225 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2226 ewrt = _mm_mul_ps(r02,ewtabscale);
2227 ewitab = _mm_cvttps_epi32(ewrt);
2228 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2229 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2230 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2232 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2233 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2235 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2239 fscal = _mm_and_ps(fscal,cutoff_mask);
2241 fscal = _mm_andnot_ps(dummy_mask,fscal);
2243 /* Calculate temporary vectorial force */
2244 tx = _mm_mul_ps(fscal,dx02);
2245 ty = _mm_mul_ps(fscal,dy02);
2246 tz = _mm_mul_ps(fscal,dz02);
2248 /* Update vectorial force */
2249 fix0 = _mm_add_ps(fix0,tx);
2250 fiy0 = _mm_add_ps(fiy0,ty);
2251 fiz0 = _mm_add_ps(fiz0,tz);
2253 fjx2 = _mm_add_ps(fjx2,tx);
2254 fjy2 = _mm_add_ps(fjy2,ty);
2255 fjz2 = _mm_add_ps(fjz2,tz);
2259 /**************************
2260 * CALCULATE INTERACTIONS *
2261 **************************/
2263 if (gmx_mm_any_lt(rsq10,rcutoff2))
2266 r10 = _mm_mul_ps(rsq10,rinv10);
2267 r10 = _mm_andnot_ps(dummy_mask,r10);
2269 /* EWALD ELECTROSTATICS */
2271 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2272 ewrt = _mm_mul_ps(r10,ewtabscale);
2273 ewitab = _mm_cvttps_epi32(ewrt);
2274 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2275 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2276 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2278 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2279 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2281 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2285 fscal = _mm_and_ps(fscal,cutoff_mask);
2287 fscal = _mm_andnot_ps(dummy_mask,fscal);
2289 /* Calculate temporary vectorial force */
2290 tx = _mm_mul_ps(fscal,dx10);
2291 ty = _mm_mul_ps(fscal,dy10);
2292 tz = _mm_mul_ps(fscal,dz10);
2294 /* Update vectorial force */
2295 fix1 = _mm_add_ps(fix1,tx);
2296 fiy1 = _mm_add_ps(fiy1,ty);
2297 fiz1 = _mm_add_ps(fiz1,tz);
2299 fjx0 = _mm_add_ps(fjx0,tx);
2300 fjy0 = _mm_add_ps(fjy0,ty);
2301 fjz0 = _mm_add_ps(fjz0,tz);
2305 /**************************
2306 * CALCULATE INTERACTIONS *
2307 **************************/
2309 if (gmx_mm_any_lt(rsq11,rcutoff2))
2312 r11 = _mm_mul_ps(rsq11,rinv11);
2313 r11 = _mm_andnot_ps(dummy_mask,r11);
2315 /* EWALD ELECTROSTATICS */
2317 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2318 ewrt = _mm_mul_ps(r11,ewtabscale);
2319 ewitab = _mm_cvttps_epi32(ewrt);
2320 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2321 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2322 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2324 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2325 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2327 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2331 fscal = _mm_and_ps(fscal,cutoff_mask);
2333 fscal = _mm_andnot_ps(dummy_mask,fscal);
2335 /* Calculate temporary vectorial force */
2336 tx = _mm_mul_ps(fscal,dx11);
2337 ty = _mm_mul_ps(fscal,dy11);
2338 tz = _mm_mul_ps(fscal,dz11);
2340 /* Update vectorial force */
2341 fix1 = _mm_add_ps(fix1,tx);
2342 fiy1 = _mm_add_ps(fiy1,ty);
2343 fiz1 = _mm_add_ps(fiz1,tz);
2345 fjx1 = _mm_add_ps(fjx1,tx);
2346 fjy1 = _mm_add_ps(fjy1,ty);
2347 fjz1 = _mm_add_ps(fjz1,tz);
2351 /**************************
2352 * CALCULATE INTERACTIONS *
2353 **************************/
2355 if (gmx_mm_any_lt(rsq12,rcutoff2))
2358 r12 = _mm_mul_ps(rsq12,rinv12);
2359 r12 = _mm_andnot_ps(dummy_mask,r12);
2361 /* EWALD ELECTROSTATICS */
2363 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2364 ewrt = _mm_mul_ps(r12,ewtabscale);
2365 ewitab = _mm_cvttps_epi32(ewrt);
2366 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2367 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2368 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2370 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2371 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2373 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2377 fscal = _mm_and_ps(fscal,cutoff_mask);
2379 fscal = _mm_andnot_ps(dummy_mask,fscal);
2381 /* Calculate temporary vectorial force */
2382 tx = _mm_mul_ps(fscal,dx12);
2383 ty = _mm_mul_ps(fscal,dy12);
2384 tz = _mm_mul_ps(fscal,dz12);
2386 /* Update vectorial force */
2387 fix1 = _mm_add_ps(fix1,tx);
2388 fiy1 = _mm_add_ps(fiy1,ty);
2389 fiz1 = _mm_add_ps(fiz1,tz);
2391 fjx2 = _mm_add_ps(fjx2,tx);
2392 fjy2 = _mm_add_ps(fjy2,ty);
2393 fjz2 = _mm_add_ps(fjz2,tz);
2397 /**************************
2398 * CALCULATE INTERACTIONS *
2399 **************************/
2401 if (gmx_mm_any_lt(rsq20,rcutoff2))
2404 r20 = _mm_mul_ps(rsq20,rinv20);
2405 r20 = _mm_andnot_ps(dummy_mask,r20);
2407 /* EWALD ELECTROSTATICS */
2409 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2410 ewrt = _mm_mul_ps(r20,ewtabscale);
2411 ewitab = _mm_cvttps_epi32(ewrt);
2412 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2413 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2414 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2416 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2417 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2419 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2423 fscal = _mm_and_ps(fscal,cutoff_mask);
2425 fscal = _mm_andnot_ps(dummy_mask,fscal);
2427 /* Calculate temporary vectorial force */
2428 tx = _mm_mul_ps(fscal,dx20);
2429 ty = _mm_mul_ps(fscal,dy20);
2430 tz = _mm_mul_ps(fscal,dz20);
2432 /* Update vectorial force */
2433 fix2 = _mm_add_ps(fix2,tx);
2434 fiy2 = _mm_add_ps(fiy2,ty);
2435 fiz2 = _mm_add_ps(fiz2,tz);
2437 fjx0 = _mm_add_ps(fjx0,tx);
2438 fjy0 = _mm_add_ps(fjy0,ty);
2439 fjz0 = _mm_add_ps(fjz0,tz);
2443 /**************************
2444 * CALCULATE INTERACTIONS *
2445 **************************/
2447 if (gmx_mm_any_lt(rsq21,rcutoff2))
2450 r21 = _mm_mul_ps(rsq21,rinv21);
2451 r21 = _mm_andnot_ps(dummy_mask,r21);
2453 /* EWALD ELECTROSTATICS */
2455 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2456 ewrt = _mm_mul_ps(r21,ewtabscale);
2457 ewitab = _mm_cvttps_epi32(ewrt);
2458 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2459 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2460 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2462 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2463 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2465 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2469 fscal = _mm_and_ps(fscal,cutoff_mask);
2471 fscal = _mm_andnot_ps(dummy_mask,fscal);
2473 /* Calculate temporary vectorial force */
2474 tx = _mm_mul_ps(fscal,dx21);
2475 ty = _mm_mul_ps(fscal,dy21);
2476 tz = _mm_mul_ps(fscal,dz21);
2478 /* Update vectorial force */
2479 fix2 = _mm_add_ps(fix2,tx);
2480 fiy2 = _mm_add_ps(fiy2,ty);
2481 fiz2 = _mm_add_ps(fiz2,tz);
2483 fjx1 = _mm_add_ps(fjx1,tx);
2484 fjy1 = _mm_add_ps(fjy1,ty);
2485 fjz1 = _mm_add_ps(fjz1,tz);
2489 /**************************
2490 * CALCULATE INTERACTIONS *
2491 **************************/
2493 if (gmx_mm_any_lt(rsq22,rcutoff2))
2496 r22 = _mm_mul_ps(rsq22,rinv22);
2497 r22 = _mm_andnot_ps(dummy_mask,r22);
2499 /* EWALD ELECTROSTATICS */
2501 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2502 ewrt = _mm_mul_ps(r22,ewtabscale);
2503 ewitab = _mm_cvttps_epi32(ewrt);
2504 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2505 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2506 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2508 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2509 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2511 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2515 fscal = _mm_and_ps(fscal,cutoff_mask);
2517 fscal = _mm_andnot_ps(dummy_mask,fscal);
2519 /* Calculate temporary vectorial force */
2520 tx = _mm_mul_ps(fscal,dx22);
2521 ty = _mm_mul_ps(fscal,dy22);
2522 tz = _mm_mul_ps(fscal,dz22);
2524 /* Update vectorial force */
2525 fix2 = _mm_add_ps(fix2,tx);
2526 fiy2 = _mm_add_ps(fiy2,ty);
2527 fiz2 = _mm_add_ps(fiz2,tz);
2529 fjx2 = _mm_add_ps(fjx2,tx);
2530 fjy2 = _mm_add_ps(fjy2,ty);
2531 fjz2 = _mm_add_ps(fjz2,tz);
2535 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2536 f+j_coord_offsetC,f+j_coord_offsetD,
2537 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2539 /* Inner loop uses 360 flops */
2542 /* End of innermost loop */
2544 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2545 f+i_coord_offset,fshift+i_shift_offset);
2547 /* Increment number of inner iterations */
2548 inneriter += j_index_end - j_index_start;
2550 /* Outer loop uses 27 flops */
2553 /* Increment number of outer iterations */
2556 /* Update outer/inner flops */
2558 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*27 + inneriter*360);