2 * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
99 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
101 __m128 dummy_mask,cutoff_mask;
102 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
103 __m128 one = _mm_set1_ps(1.0);
104 __m128 two = _mm_set1_ps(2.0);
110 jindex = nlist->jindex;
112 shiftidx = nlist->shift;
114 shiftvec = fr->shift_vec[0];
115 fshift = fr->fshift[0];
116 facel = _mm_set1_ps(fr->epsfac);
117 charge = mdatoms->chargeA;
118 nvdwtype = fr->ntype;
120 vdwtype = mdatoms->typeA;
122 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
123 ewtab = fr->ic->tabq_coul_FDV0;
124 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
125 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
127 /* Setup water-specific parameters */
128 inr = nlist->iinr[0];
129 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
130 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
131 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
132 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
134 jq0 = _mm_set1_ps(charge[inr+0]);
135 jq1 = _mm_set1_ps(charge[inr+1]);
136 jq2 = _mm_set1_ps(charge[inr+2]);
137 vdwjidx0A = 2*vdwtype[inr+0];
138 qq00 = _mm_mul_ps(iq0,jq0);
139 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
140 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
141 qq01 = _mm_mul_ps(iq0,jq1);
142 qq02 = _mm_mul_ps(iq0,jq2);
143 qq10 = _mm_mul_ps(iq1,jq0);
144 qq11 = _mm_mul_ps(iq1,jq1);
145 qq12 = _mm_mul_ps(iq1,jq2);
146 qq20 = _mm_mul_ps(iq2,jq0);
147 qq21 = _mm_mul_ps(iq2,jq1);
148 qq22 = _mm_mul_ps(iq2,jq2);
150 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
151 rcutoff_scalar = fr->rcoulomb;
152 rcutoff = _mm_set1_ps(rcutoff_scalar);
153 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
155 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
156 rvdw = _mm_set1_ps(fr->rvdw);
158 /* Avoid stupid compiler warnings */
159 jnrA = jnrB = jnrC = jnrD = 0;
168 for(iidx=0;iidx<4*DIM;iidx++)
173 /* Start outer loop over neighborlists */
174 for(iidx=0; iidx<nri; iidx++)
176 /* Load shift vector for this list */
177 i_shift_offset = DIM*shiftidx[iidx];
179 /* Load limits for loop over neighbors */
180 j_index_start = jindex[iidx];
181 j_index_end = jindex[iidx+1];
183 /* Get outer coordinate index */
185 i_coord_offset = DIM*inr;
187 /* Load i particle coords and add shift vector */
188 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
189 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
191 fix0 = _mm_setzero_ps();
192 fiy0 = _mm_setzero_ps();
193 fiz0 = _mm_setzero_ps();
194 fix1 = _mm_setzero_ps();
195 fiy1 = _mm_setzero_ps();
196 fiz1 = _mm_setzero_ps();
197 fix2 = _mm_setzero_ps();
198 fiy2 = _mm_setzero_ps();
199 fiz2 = _mm_setzero_ps();
201 /* Reset potential sums */
202 velecsum = _mm_setzero_ps();
203 vvdwsum = _mm_setzero_ps();
205 /* Start inner kernel loop */
206 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
209 /* Get j neighbor index, and coordinate index */
214 j_coord_offsetA = DIM*jnrA;
215 j_coord_offsetB = DIM*jnrB;
216 j_coord_offsetC = DIM*jnrC;
217 j_coord_offsetD = DIM*jnrD;
219 /* load j atom coordinates */
220 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
221 x+j_coord_offsetC,x+j_coord_offsetD,
222 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
224 /* Calculate displacement vector */
225 dx00 = _mm_sub_ps(ix0,jx0);
226 dy00 = _mm_sub_ps(iy0,jy0);
227 dz00 = _mm_sub_ps(iz0,jz0);
228 dx01 = _mm_sub_ps(ix0,jx1);
229 dy01 = _mm_sub_ps(iy0,jy1);
230 dz01 = _mm_sub_ps(iz0,jz1);
231 dx02 = _mm_sub_ps(ix0,jx2);
232 dy02 = _mm_sub_ps(iy0,jy2);
233 dz02 = _mm_sub_ps(iz0,jz2);
234 dx10 = _mm_sub_ps(ix1,jx0);
235 dy10 = _mm_sub_ps(iy1,jy0);
236 dz10 = _mm_sub_ps(iz1,jz0);
237 dx11 = _mm_sub_ps(ix1,jx1);
238 dy11 = _mm_sub_ps(iy1,jy1);
239 dz11 = _mm_sub_ps(iz1,jz1);
240 dx12 = _mm_sub_ps(ix1,jx2);
241 dy12 = _mm_sub_ps(iy1,jy2);
242 dz12 = _mm_sub_ps(iz1,jz2);
243 dx20 = _mm_sub_ps(ix2,jx0);
244 dy20 = _mm_sub_ps(iy2,jy0);
245 dz20 = _mm_sub_ps(iz2,jz0);
246 dx21 = _mm_sub_ps(ix2,jx1);
247 dy21 = _mm_sub_ps(iy2,jy1);
248 dz21 = _mm_sub_ps(iz2,jz1);
249 dx22 = _mm_sub_ps(ix2,jx2);
250 dy22 = _mm_sub_ps(iy2,jy2);
251 dz22 = _mm_sub_ps(iz2,jz2);
253 /* Calculate squared distance and things based on it */
254 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
255 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
256 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
257 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
258 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
259 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
260 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
261 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
262 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
264 rinv00 = gmx_mm_invsqrt_ps(rsq00);
265 rinv01 = gmx_mm_invsqrt_ps(rsq01);
266 rinv02 = gmx_mm_invsqrt_ps(rsq02);
267 rinv10 = gmx_mm_invsqrt_ps(rsq10);
268 rinv11 = gmx_mm_invsqrt_ps(rsq11);
269 rinv12 = gmx_mm_invsqrt_ps(rsq12);
270 rinv20 = gmx_mm_invsqrt_ps(rsq20);
271 rinv21 = gmx_mm_invsqrt_ps(rsq21);
272 rinv22 = gmx_mm_invsqrt_ps(rsq22);
274 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
275 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
276 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
277 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
278 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
279 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
280 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
281 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
282 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
284 fjx0 = _mm_setzero_ps();
285 fjy0 = _mm_setzero_ps();
286 fjz0 = _mm_setzero_ps();
287 fjx1 = _mm_setzero_ps();
288 fjy1 = _mm_setzero_ps();
289 fjz1 = _mm_setzero_ps();
290 fjx2 = _mm_setzero_ps();
291 fjy2 = _mm_setzero_ps();
292 fjz2 = _mm_setzero_ps();
294 /**************************
295 * CALCULATE INTERACTIONS *
296 **************************/
298 if (gmx_mm_any_lt(rsq00,rcutoff2))
301 r00 = _mm_mul_ps(rsq00,rinv00);
303 /* EWALD ELECTROSTATICS */
305 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
306 ewrt = _mm_mul_ps(r00,ewtabscale);
307 ewitab = _mm_cvttps_epi32(ewrt);
308 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
309 ewitab = _mm_slli_epi32(ewitab,2);
310 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
311 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
312 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
313 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
314 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
315 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
316 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
317 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
318 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
320 /* LENNARD-JONES DISPERSION/REPULSION */
322 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
323 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
324 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
325 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
326 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
327 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
329 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
331 /* Update potential sum for this i atom from the interaction with this j atom. */
332 velec = _mm_and_ps(velec,cutoff_mask);
333 velecsum = _mm_add_ps(velecsum,velec);
334 vvdw = _mm_and_ps(vvdw,cutoff_mask);
335 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
337 fscal = _mm_add_ps(felec,fvdw);
339 fscal = _mm_and_ps(fscal,cutoff_mask);
341 /* Calculate temporary vectorial force */
342 tx = _mm_mul_ps(fscal,dx00);
343 ty = _mm_mul_ps(fscal,dy00);
344 tz = _mm_mul_ps(fscal,dz00);
346 /* Update vectorial force */
347 fix0 = _mm_add_ps(fix0,tx);
348 fiy0 = _mm_add_ps(fiy0,ty);
349 fiz0 = _mm_add_ps(fiz0,tz);
351 fjx0 = _mm_add_ps(fjx0,tx);
352 fjy0 = _mm_add_ps(fjy0,ty);
353 fjz0 = _mm_add_ps(fjz0,tz);
357 /**************************
358 * CALCULATE INTERACTIONS *
359 **************************/
361 if (gmx_mm_any_lt(rsq01,rcutoff2))
364 r01 = _mm_mul_ps(rsq01,rinv01);
366 /* EWALD ELECTROSTATICS */
368 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
369 ewrt = _mm_mul_ps(r01,ewtabscale);
370 ewitab = _mm_cvttps_epi32(ewrt);
371 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
372 ewitab = _mm_slli_epi32(ewitab,2);
373 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
374 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
375 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
376 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
377 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
378 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
379 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
380 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
381 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
383 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
385 /* Update potential sum for this i atom from the interaction with this j atom. */
386 velec = _mm_and_ps(velec,cutoff_mask);
387 velecsum = _mm_add_ps(velecsum,velec);
391 fscal = _mm_and_ps(fscal,cutoff_mask);
393 /* Calculate temporary vectorial force */
394 tx = _mm_mul_ps(fscal,dx01);
395 ty = _mm_mul_ps(fscal,dy01);
396 tz = _mm_mul_ps(fscal,dz01);
398 /* Update vectorial force */
399 fix0 = _mm_add_ps(fix0,tx);
400 fiy0 = _mm_add_ps(fiy0,ty);
401 fiz0 = _mm_add_ps(fiz0,tz);
403 fjx1 = _mm_add_ps(fjx1,tx);
404 fjy1 = _mm_add_ps(fjy1,ty);
405 fjz1 = _mm_add_ps(fjz1,tz);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 if (gmx_mm_any_lt(rsq02,rcutoff2))
416 r02 = _mm_mul_ps(rsq02,rinv02);
418 /* EWALD ELECTROSTATICS */
420 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
421 ewrt = _mm_mul_ps(r02,ewtabscale);
422 ewitab = _mm_cvttps_epi32(ewrt);
423 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
424 ewitab = _mm_slli_epi32(ewitab,2);
425 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
426 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
427 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
428 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
429 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
430 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
431 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
432 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
433 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
435 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
437 /* Update potential sum for this i atom from the interaction with this j atom. */
438 velec = _mm_and_ps(velec,cutoff_mask);
439 velecsum = _mm_add_ps(velecsum,velec);
443 fscal = _mm_and_ps(fscal,cutoff_mask);
445 /* Calculate temporary vectorial force */
446 tx = _mm_mul_ps(fscal,dx02);
447 ty = _mm_mul_ps(fscal,dy02);
448 tz = _mm_mul_ps(fscal,dz02);
450 /* Update vectorial force */
451 fix0 = _mm_add_ps(fix0,tx);
452 fiy0 = _mm_add_ps(fiy0,ty);
453 fiz0 = _mm_add_ps(fiz0,tz);
455 fjx2 = _mm_add_ps(fjx2,tx);
456 fjy2 = _mm_add_ps(fjy2,ty);
457 fjz2 = _mm_add_ps(fjz2,tz);
461 /**************************
462 * CALCULATE INTERACTIONS *
463 **************************/
465 if (gmx_mm_any_lt(rsq10,rcutoff2))
468 r10 = _mm_mul_ps(rsq10,rinv10);
470 /* EWALD ELECTROSTATICS */
472 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
473 ewrt = _mm_mul_ps(r10,ewtabscale);
474 ewitab = _mm_cvttps_epi32(ewrt);
475 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
476 ewitab = _mm_slli_epi32(ewitab,2);
477 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
478 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
479 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
480 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
481 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
482 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
483 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
484 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
485 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
487 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
489 /* Update potential sum for this i atom from the interaction with this j atom. */
490 velec = _mm_and_ps(velec,cutoff_mask);
491 velecsum = _mm_add_ps(velecsum,velec);
495 fscal = _mm_and_ps(fscal,cutoff_mask);
497 /* Calculate temporary vectorial force */
498 tx = _mm_mul_ps(fscal,dx10);
499 ty = _mm_mul_ps(fscal,dy10);
500 tz = _mm_mul_ps(fscal,dz10);
502 /* Update vectorial force */
503 fix1 = _mm_add_ps(fix1,tx);
504 fiy1 = _mm_add_ps(fiy1,ty);
505 fiz1 = _mm_add_ps(fiz1,tz);
507 fjx0 = _mm_add_ps(fjx0,tx);
508 fjy0 = _mm_add_ps(fjy0,ty);
509 fjz0 = _mm_add_ps(fjz0,tz);
513 /**************************
514 * CALCULATE INTERACTIONS *
515 **************************/
517 if (gmx_mm_any_lt(rsq11,rcutoff2))
520 r11 = _mm_mul_ps(rsq11,rinv11);
522 /* EWALD ELECTROSTATICS */
524 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
525 ewrt = _mm_mul_ps(r11,ewtabscale);
526 ewitab = _mm_cvttps_epi32(ewrt);
527 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
528 ewitab = _mm_slli_epi32(ewitab,2);
529 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
530 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
531 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
532 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
533 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
534 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
535 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
536 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
537 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
539 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
541 /* Update potential sum for this i atom from the interaction with this j atom. */
542 velec = _mm_and_ps(velec,cutoff_mask);
543 velecsum = _mm_add_ps(velecsum,velec);
547 fscal = _mm_and_ps(fscal,cutoff_mask);
549 /* Calculate temporary vectorial force */
550 tx = _mm_mul_ps(fscal,dx11);
551 ty = _mm_mul_ps(fscal,dy11);
552 tz = _mm_mul_ps(fscal,dz11);
554 /* Update vectorial force */
555 fix1 = _mm_add_ps(fix1,tx);
556 fiy1 = _mm_add_ps(fiy1,ty);
557 fiz1 = _mm_add_ps(fiz1,tz);
559 fjx1 = _mm_add_ps(fjx1,tx);
560 fjy1 = _mm_add_ps(fjy1,ty);
561 fjz1 = _mm_add_ps(fjz1,tz);
565 /**************************
566 * CALCULATE INTERACTIONS *
567 **************************/
569 if (gmx_mm_any_lt(rsq12,rcutoff2))
572 r12 = _mm_mul_ps(rsq12,rinv12);
574 /* EWALD ELECTROSTATICS */
576 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
577 ewrt = _mm_mul_ps(r12,ewtabscale);
578 ewitab = _mm_cvttps_epi32(ewrt);
579 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
580 ewitab = _mm_slli_epi32(ewitab,2);
581 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
582 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
583 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
584 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
585 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
586 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
587 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
588 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
589 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
591 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
593 /* Update potential sum for this i atom from the interaction with this j atom. */
594 velec = _mm_and_ps(velec,cutoff_mask);
595 velecsum = _mm_add_ps(velecsum,velec);
599 fscal = _mm_and_ps(fscal,cutoff_mask);
601 /* Calculate temporary vectorial force */
602 tx = _mm_mul_ps(fscal,dx12);
603 ty = _mm_mul_ps(fscal,dy12);
604 tz = _mm_mul_ps(fscal,dz12);
606 /* Update vectorial force */
607 fix1 = _mm_add_ps(fix1,tx);
608 fiy1 = _mm_add_ps(fiy1,ty);
609 fiz1 = _mm_add_ps(fiz1,tz);
611 fjx2 = _mm_add_ps(fjx2,tx);
612 fjy2 = _mm_add_ps(fjy2,ty);
613 fjz2 = _mm_add_ps(fjz2,tz);
617 /**************************
618 * CALCULATE INTERACTIONS *
619 **************************/
621 if (gmx_mm_any_lt(rsq20,rcutoff2))
624 r20 = _mm_mul_ps(rsq20,rinv20);
626 /* EWALD ELECTROSTATICS */
628 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
629 ewrt = _mm_mul_ps(r20,ewtabscale);
630 ewitab = _mm_cvttps_epi32(ewrt);
631 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
632 ewitab = _mm_slli_epi32(ewitab,2);
633 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
634 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
635 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
636 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
637 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
638 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
639 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
640 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
641 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
643 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
645 /* Update potential sum for this i atom from the interaction with this j atom. */
646 velec = _mm_and_ps(velec,cutoff_mask);
647 velecsum = _mm_add_ps(velecsum,velec);
651 fscal = _mm_and_ps(fscal,cutoff_mask);
653 /* Calculate temporary vectorial force */
654 tx = _mm_mul_ps(fscal,dx20);
655 ty = _mm_mul_ps(fscal,dy20);
656 tz = _mm_mul_ps(fscal,dz20);
658 /* Update vectorial force */
659 fix2 = _mm_add_ps(fix2,tx);
660 fiy2 = _mm_add_ps(fiy2,ty);
661 fiz2 = _mm_add_ps(fiz2,tz);
663 fjx0 = _mm_add_ps(fjx0,tx);
664 fjy0 = _mm_add_ps(fjy0,ty);
665 fjz0 = _mm_add_ps(fjz0,tz);
669 /**************************
670 * CALCULATE INTERACTIONS *
671 **************************/
673 if (gmx_mm_any_lt(rsq21,rcutoff2))
676 r21 = _mm_mul_ps(rsq21,rinv21);
678 /* EWALD ELECTROSTATICS */
680 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
681 ewrt = _mm_mul_ps(r21,ewtabscale);
682 ewitab = _mm_cvttps_epi32(ewrt);
683 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
684 ewitab = _mm_slli_epi32(ewitab,2);
685 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
686 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
687 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
688 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
689 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
690 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
691 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
692 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
693 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
695 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
697 /* Update potential sum for this i atom from the interaction with this j atom. */
698 velec = _mm_and_ps(velec,cutoff_mask);
699 velecsum = _mm_add_ps(velecsum,velec);
703 fscal = _mm_and_ps(fscal,cutoff_mask);
705 /* Calculate temporary vectorial force */
706 tx = _mm_mul_ps(fscal,dx21);
707 ty = _mm_mul_ps(fscal,dy21);
708 tz = _mm_mul_ps(fscal,dz21);
710 /* Update vectorial force */
711 fix2 = _mm_add_ps(fix2,tx);
712 fiy2 = _mm_add_ps(fiy2,ty);
713 fiz2 = _mm_add_ps(fiz2,tz);
715 fjx1 = _mm_add_ps(fjx1,tx);
716 fjy1 = _mm_add_ps(fjy1,ty);
717 fjz1 = _mm_add_ps(fjz1,tz);
721 /**************************
722 * CALCULATE INTERACTIONS *
723 **************************/
725 if (gmx_mm_any_lt(rsq22,rcutoff2))
728 r22 = _mm_mul_ps(rsq22,rinv22);
730 /* EWALD ELECTROSTATICS */
732 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
733 ewrt = _mm_mul_ps(r22,ewtabscale);
734 ewitab = _mm_cvttps_epi32(ewrt);
735 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
736 ewitab = _mm_slli_epi32(ewitab,2);
737 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
738 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
739 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
740 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
741 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
742 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
743 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
744 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
745 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
747 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
749 /* Update potential sum for this i atom from the interaction with this j atom. */
750 velec = _mm_and_ps(velec,cutoff_mask);
751 velecsum = _mm_add_ps(velecsum,velec);
755 fscal = _mm_and_ps(fscal,cutoff_mask);
757 /* Calculate temporary vectorial force */
758 tx = _mm_mul_ps(fscal,dx22);
759 ty = _mm_mul_ps(fscal,dy22);
760 tz = _mm_mul_ps(fscal,dz22);
762 /* Update vectorial force */
763 fix2 = _mm_add_ps(fix2,tx);
764 fiy2 = _mm_add_ps(fiy2,ty);
765 fiz2 = _mm_add_ps(fiz2,tz);
767 fjx2 = _mm_add_ps(fjx2,tx);
768 fjy2 = _mm_add_ps(fjy2,ty);
769 fjz2 = _mm_add_ps(fjz2,tz);
773 fjptrA = f+j_coord_offsetA;
774 fjptrB = f+j_coord_offsetB;
775 fjptrC = f+j_coord_offsetC;
776 fjptrD = f+j_coord_offsetD;
778 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
779 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
781 /* Inner loop uses 432 flops */
787 /* Get j neighbor index, and coordinate index */
788 jnrlistA = jjnr[jidx];
789 jnrlistB = jjnr[jidx+1];
790 jnrlistC = jjnr[jidx+2];
791 jnrlistD = jjnr[jidx+3];
792 /* Sign of each element will be negative for non-real atoms.
793 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
794 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
796 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
797 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
798 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
799 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
800 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
801 j_coord_offsetA = DIM*jnrA;
802 j_coord_offsetB = DIM*jnrB;
803 j_coord_offsetC = DIM*jnrC;
804 j_coord_offsetD = DIM*jnrD;
806 /* load j atom coordinates */
807 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
808 x+j_coord_offsetC,x+j_coord_offsetD,
809 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
811 /* Calculate displacement vector */
812 dx00 = _mm_sub_ps(ix0,jx0);
813 dy00 = _mm_sub_ps(iy0,jy0);
814 dz00 = _mm_sub_ps(iz0,jz0);
815 dx01 = _mm_sub_ps(ix0,jx1);
816 dy01 = _mm_sub_ps(iy0,jy1);
817 dz01 = _mm_sub_ps(iz0,jz1);
818 dx02 = _mm_sub_ps(ix0,jx2);
819 dy02 = _mm_sub_ps(iy0,jy2);
820 dz02 = _mm_sub_ps(iz0,jz2);
821 dx10 = _mm_sub_ps(ix1,jx0);
822 dy10 = _mm_sub_ps(iy1,jy0);
823 dz10 = _mm_sub_ps(iz1,jz0);
824 dx11 = _mm_sub_ps(ix1,jx1);
825 dy11 = _mm_sub_ps(iy1,jy1);
826 dz11 = _mm_sub_ps(iz1,jz1);
827 dx12 = _mm_sub_ps(ix1,jx2);
828 dy12 = _mm_sub_ps(iy1,jy2);
829 dz12 = _mm_sub_ps(iz1,jz2);
830 dx20 = _mm_sub_ps(ix2,jx0);
831 dy20 = _mm_sub_ps(iy2,jy0);
832 dz20 = _mm_sub_ps(iz2,jz0);
833 dx21 = _mm_sub_ps(ix2,jx1);
834 dy21 = _mm_sub_ps(iy2,jy1);
835 dz21 = _mm_sub_ps(iz2,jz1);
836 dx22 = _mm_sub_ps(ix2,jx2);
837 dy22 = _mm_sub_ps(iy2,jy2);
838 dz22 = _mm_sub_ps(iz2,jz2);
840 /* Calculate squared distance and things based on it */
841 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
842 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
843 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
844 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
845 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
846 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
847 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
848 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
849 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
851 rinv00 = gmx_mm_invsqrt_ps(rsq00);
852 rinv01 = gmx_mm_invsqrt_ps(rsq01);
853 rinv02 = gmx_mm_invsqrt_ps(rsq02);
854 rinv10 = gmx_mm_invsqrt_ps(rsq10);
855 rinv11 = gmx_mm_invsqrt_ps(rsq11);
856 rinv12 = gmx_mm_invsqrt_ps(rsq12);
857 rinv20 = gmx_mm_invsqrt_ps(rsq20);
858 rinv21 = gmx_mm_invsqrt_ps(rsq21);
859 rinv22 = gmx_mm_invsqrt_ps(rsq22);
861 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
862 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
863 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
864 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
865 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
866 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
867 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
868 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
869 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
871 fjx0 = _mm_setzero_ps();
872 fjy0 = _mm_setzero_ps();
873 fjz0 = _mm_setzero_ps();
874 fjx1 = _mm_setzero_ps();
875 fjy1 = _mm_setzero_ps();
876 fjz1 = _mm_setzero_ps();
877 fjx2 = _mm_setzero_ps();
878 fjy2 = _mm_setzero_ps();
879 fjz2 = _mm_setzero_ps();
881 /**************************
882 * CALCULATE INTERACTIONS *
883 **************************/
885 if (gmx_mm_any_lt(rsq00,rcutoff2))
888 r00 = _mm_mul_ps(rsq00,rinv00);
889 r00 = _mm_andnot_ps(dummy_mask,r00);
891 /* EWALD ELECTROSTATICS */
893 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
894 ewrt = _mm_mul_ps(r00,ewtabscale);
895 ewitab = _mm_cvttps_epi32(ewrt);
896 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
897 ewitab = _mm_slli_epi32(ewitab,2);
898 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
899 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
900 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
901 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
902 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
903 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
904 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
905 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
906 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
908 /* LENNARD-JONES DISPERSION/REPULSION */
910 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
911 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
912 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
913 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
914 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
915 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
917 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
919 /* Update potential sum for this i atom from the interaction with this j atom. */
920 velec = _mm_and_ps(velec,cutoff_mask);
921 velec = _mm_andnot_ps(dummy_mask,velec);
922 velecsum = _mm_add_ps(velecsum,velec);
923 vvdw = _mm_and_ps(vvdw,cutoff_mask);
924 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
925 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
927 fscal = _mm_add_ps(felec,fvdw);
929 fscal = _mm_and_ps(fscal,cutoff_mask);
931 fscal = _mm_andnot_ps(dummy_mask,fscal);
933 /* Calculate temporary vectorial force */
934 tx = _mm_mul_ps(fscal,dx00);
935 ty = _mm_mul_ps(fscal,dy00);
936 tz = _mm_mul_ps(fscal,dz00);
938 /* Update vectorial force */
939 fix0 = _mm_add_ps(fix0,tx);
940 fiy0 = _mm_add_ps(fiy0,ty);
941 fiz0 = _mm_add_ps(fiz0,tz);
943 fjx0 = _mm_add_ps(fjx0,tx);
944 fjy0 = _mm_add_ps(fjy0,ty);
945 fjz0 = _mm_add_ps(fjz0,tz);
949 /**************************
950 * CALCULATE INTERACTIONS *
951 **************************/
953 if (gmx_mm_any_lt(rsq01,rcutoff2))
956 r01 = _mm_mul_ps(rsq01,rinv01);
957 r01 = _mm_andnot_ps(dummy_mask,r01);
959 /* EWALD ELECTROSTATICS */
961 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
962 ewrt = _mm_mul_ps(r01,ewtabscale);
963 ewitab = _mm_cvttps_epi32(ewrt);
964 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
965 ewitab = _mm_slli_epi32(ewitab,2);
966 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
967 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
968 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
969 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
970 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
971 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
972 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
973 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
974 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
976 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
978 /* Update potential sum for this i atom from the interaction with this j atom. */
979 velec = _mm_and_ps(velec,cutoff_mask);
980 velec = _mm_andnot_ps(dummy_mask,velec);
981 velecsum = _mm_add_ps(velecsum,velec);
985 fscal = _mm_and_ps(fscal,cutoff_mask);
987 fscal = _mm_andnot_ps(dummy_mask,fscal);
989 /* Calculate temporary vectorial force */
990 tx = _mm_mul_ps(fscal,dx01);
991 ty = _mm_mul_ps(fscal,dy01);
992 tz = _mm_mul_ps(fscal,dz01);
994 /* Update vectorial force */
995 fix0 = _mm_add_ps(fix0,tx);
996 fiy0 = _mm_add_ps(fiy0,ty);
997 fiz0 = _mm_add_ps(fiz0,tz);
999 fjx1 = _mm_add_ps(fjx1,tx);
1000 fjy1 = _mm_add_ps(fjy1,ty);
1001 fjz1 = _mm_add_ps(fjz1,tz);
1005 /**************************
1006 * CALCULATE INTERACTIONS *
1007 **************************/
1009 if (gmx_mm_any_lt(rsq02,rcutoff2))
1012 r02 = _mm_mul_ps(rsq02,rinv02);
1013 r02 = _mm_andnot_ps(dummy_mask,r02);
1015 /* EWALD ELECTROSTATICS */
1017 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1018 ewrt = _mm_mul_ps(r02,ewtabscale);
1019 ewitab = _mm_cvttps_epi32(ewrt);
1020 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1021 ewitab = _mm_slli_epi32(ewitab,2);
1022 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1023 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1024 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1025 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1026 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1027 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1028 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1029 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
1030 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1032 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1034 /* Update potential sum for this i atom from the interaction with this j atom. */
1035 velec = _mm_and_ps(velec,cutoff_mask);
1036 velec = _mm_andnot_ps(dummy_mask,velec);
1037 velecsum = _mm_add_ps(velecsum,velec);
1041 fscal = _mm_and_ps(fscal,cutoff_mask);
1043 fscal = _mm_andnot_ps(dummy_mask,fscal);
1045 /* Calculate temporary vectorial force */
1046 tx = _mm_mul_ps(fscal,dx02);
1047 ty = _mm_mul_ps(fscal,dy02);
1048 tz = _mm_mul_ps(fscal,dz02);
1050 /* Update vectorial force */
1051 fix0 = _mm_add_ps(fix0,tx);
1052 fiy0 = _mm_add_ps(fiy0,ty);
1053 fiz0 = _mm_add_ps(fiz0,tz);
1055 fjx2 = _mm_add_ps(fjx2,tx);
1056 fjy2 = _mm_add_ps(fjy2,ty);
1057 fjz2 = _mm_add_ps(fjz2,tz);
1061 /**************************
1062 * CALCULATE INTERACTIONS *
1063 **************************/
1065 if (gmx_mm_any_lt(rsq10,rcutoff2))
1068 r10 = _mm_mul_ps(rsq10,rinv10);
1069 r10 = _mm_andnot_ps(dummy_mask,r10);
1071 /* EWALD ELECTROSTATICS */
1073 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1074 ewrt = _mm_mul_ps(r10,ewtabscale);
1075 ewitab = _mm_cvttps_epi32(ewrt);
1076 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1077 ewitab = _mm_slli_epi32(ewitab,2);
1078 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1079 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1080 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1081 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1082 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1083 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1084 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1085 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1086 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1088 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1090 /* Update potential sum for this i atom from the interaction with this j atom. */
1091 velec = _mm_and_ps(velec,cutoff_mask);
1092 velec = _mm_andnot_ps(dummy_mask,velec);
1093 velecsum = _mm_add_ps(velecsum,velec);
1097 fscal = _mm_and_ps(fscal,cutoff_mask);
1099 fscal = _mm_andnot_ps(dummy_mask,fscal);
1101 /* Calculate temporary vectorial force */
1102 tx = _mm_mul_ps(fscal,dx10);
1103 ty = _mm_mul_ps(fscal,dy10);
1104 tz = _mm_mul_ps(fscal,dz10);
1106 /* Update vectorial force */
1107 fix1 = _mm_add_ps(fix1,tx);
1108 fiy1 = _mm_add_ps(fiy1,ty);
1109 fiz1 = _mm_add_ps(fiz1,tz);
1111 fjx0 = _mm_add_ps(fjx0,tx);
1112 fjy0 = _mm_add_ps(fjy0,ty);
1113 fjz0 = _mm_add_ps(fjz0,tz);
1117 /**************************
1118 * CALCULATE INTERACTIONS *
1119 **************************/
1121 if (gmx_mm_any_lt(rsq11,rcutoff2))
1124 r11 = _mm_mul_ps(rsq11,rinv11);
1125 r11 = _mm_andnot_ps(dummy_mask,r11);
1127 /* EWALD ELECTROSTATICS */
1129 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1130 ewrt = _mm_mul_ps(r11,ewtabscale);
1131 ewitab = _mm_cvttps_epi32(ewrt);
1132 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1133 ewitab = _mm_slli_epi32(ewitab,2);
1134 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1135 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1136 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1137 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1138 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1139 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1140 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1141 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1142 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1144 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1146 /* Update potential sum for this i atom from the interaction with this j atom. */
1147 velec = _mm_and_ps(velec,cutoff_mask);
1148 velec = _mm_andnot_ps(dummy_mask,velec);
1149 velecsum = _mm_add_ps(velecsum,velec);
1153 fscal = _mm_and_ps(fscal,cutoff_mask);
1155 fscal = _mm_andnot_ps(dummy_mask,fscal);
1157 /* Calculate temporary vectorial force */
1158 tx = _mm_mul_ps(fscal,dx11);
1159 ty = _mm_mul_ps(fscal,dy11);
1160 tz = _mm_mul_ps(fscal,dz11);
1162 /* Update vectorial force */
1163 fix1 = _mm_add_ps(fix1,tx);
1164 fiy1 = _mm_add_ps(fiy1,ty);
1165 fiz1 = _mm_add_ps(fiz1,tz);
1167 fjx1 = _mm_add_ps(fjx1,tx);
1168 fjy1 = _mm_add_ps(fjy1,ty);
1169 fjz1 = _mm_add_ps(fjz1,tz);
1173 /**************************
1174 * CALCULATE INTERACTIONS *
1175 **************************/
1177 if (gmx_mm_any_lt(rsq12,rcutoff2))
1180 r12 = _mm_mul_ps(rsq12,rinv12);
1181 r12 = _mm_andnot_ps(dummy_mask,r12);
1183 /* EWALD ELECTROSTATICS */
1185 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1186 ewrt = _mm_mul_ps(r12,ewtabscale);
1187 ewitab = _mm_cvttps_epi32(ewrt);
1188 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1189 ewitab = _mm_slli_epi32(ewitab,2);
1190 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1191 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1192 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1193 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1194 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1195 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1196 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1197 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1198 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1200 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1202 /* Update potential sum for this i atom from the interaction with this j atom. */
1203 velec = _mm_and_ps(velec,cutoff_mask);
1204 velec = _mm_andnot_ps(dummy_mask,velec);
1205 velecsum = _mm_add_ps(velecsum,velec);
1209 fscal = _mm_and_ps(fscal,cutoff_mask);
1211 fscal = _mm_andnot_ps(dummy_mask,fscal);
1213 /* Calculate temporary vectorial force */
1214 tx = _mm_mul_ps(fscal,dx12);
1215 ty = _mm_mul_ps(fscal,dy12);
1216 tz = _mm_mul_ps(fscal,dz12);
1218 /* Update vectorial force */
1219 fix1 = _mm_add_ps(fix1,tx);
1220 fiy1 = _mm_add_ps(fiy1,ty);
1221 fiz1 = _mm_add_ps(fiz1,tz);
1223 fjx2 = _mm_add_ps(fjx2,tx);
1224 fjy2 = _mm_add_ps(fjy2,ty);
1225 fjz2 = _mm_add_ps(fjz2,tz);
1229 /**************************
1230 * CALCULATE INTERACTIONS *
1231 **************************/
1233 if (gmx_mm_any_lt(rsq20,rcutoff2))
1236 r20 = _mm_mul_ps(rsq20,rinv20);
1237 r20 = _mm_andnot_ps(dummy_mask,r20);
1239 /* EWALD ELECTROSTATICS */
1241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1242 ewrt = _mm_mul_ps(r20,ewtabscale);
1243 ewitab = _mm_cvttps_epi32(ewrt);
1244 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1245 ewitab = _mm_slli_epi32(ewitab,2);
1246 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1247 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1248 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1249 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1250 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1251 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1252 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1253 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1254 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1256 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1258 /* Update potential sum for this i atom from the interaction with this j atom. */
1259 velec = _mm_and_ps(velec,cutoff_mask);
1260 velec = _mm_andnot_ps(dummy_mask,velec);
1261 velecsum = _mm_add_ps(velecsum,velec);
1265 fscal = _mm_and_ps(fscal,cutoff_mask);
1267 fscal = _mm_andnot_ps(dummy_mask,fscal);
1269 /* Calculate temporary vectorial force */
1270 tx = _mm_mul_ps(fscal,dx20);
1271 ty = _mm_mul_ps(fscal,dy20);
1272 tz = _mm_mul_ps(fscal,dz20);
1274 /* Update vectorial force */
1275 fix2 = _mm_add_ps(fix2,tx);
1276 fiy2 = _mm_add_ps(fiy2,ty);
1277 fiz2 = _mm_add_ps(fiz2,tz);
1279 fjx0 = _mm_add_ps(fjx0,tx);
1280 fjy0 = _mm_add_ps(fjy0,ty);
1281 fjz0 = _mm_add_ps(fjz0,tz);
1285 /**************************
1286 * CALCULATE INTERACTIONS *
1287 **************************/
1289 if (gmx_mm_any_lt(rsq21,rcutoff2))
1292 r21 = _mm_mul_ps(rsq21,rinv21);
1293 r21 = _mm_andnot_ps(dummy_mask,r21);
1295 /* EWALD ELECTROSTATICS */
1297 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1298 ewrt = _mm_mul_ps(r21,ewtabscale);
1299 ewitab = _mm_cvttps_epi32(ewrt);
1300 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1301 ewitab = _mm_slli_epi32(ewitab,2);
1302 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1303 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1304 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1305 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1306 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1307 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1308 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1309 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1310 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1312 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1314 /* Update potential sum for this i atom from the interaction with this j atom. */
1315 velec = _mm_and_ps(velec,cutoff_mask);
1316 velec = _mm_andnot_ps(dummy_mask,velec);
1317 velecsum = _mm_add_ps(velecsum,velec);
1321 fscal = _mm_and_ps(fscal,cutoff_mask);
1323 fscal = _mm_andnot_ps(dummy_mask,fscal);
1325 /* Calculate temporary vectorial force */
1326 tx = _mm_mul_ps(fscal,dx21);
1327 ty = _mm_mul_ps(fscal,dy21);
1328 tz = _mm_mul_ps(fscal,dz21);
1330 /* Update vectorial force */
1331 fix2 = _mm_add_ps(fix2,tx);
1332 fiy2 = _mm_add_ps(fiy2,ty);
1333 fiz2 = _mm_add_ps(fiz2,tz);
1335 fjx1 = _mm_add_ps(fjx1,tx);
1336 fjy1 = _mm_add_ps(fjy1,ty);
1337 fjz1 = _mm_add_ps(fjz1,tz);
1341 /**************************
1342 * CALCULATE INTERACTIONS *
1343 **************************/
1345 if (gmx_mm_any_lt(rsq22,rcutoff2))
1348 r22 = _mm_mul_ps(rsq22,rinv22);
1349 r22 = _mm_andnot_ps(dummy_mask,r22);
1351 /* EWALD ELECTROSTATICS */
1353 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1354 ewrt = _mm_mul_ps(r22,ewtabscale);
1355 ewitab = _mm_cvttps_epi32(ewrt);
1356 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1357 ewitab = _mm_slli_epi32(ewitab,2);
1358 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1359 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1360 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1361 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1362 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1363 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1364 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1365 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1366 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1368 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1370 /* Update potential sum for this i atom from the interaction with this j atom. */
1371 velec = _mm_and_ps(velec,cutoff_mask);
1372 velec = _mm_andnot_ps(dummy_mask,velec);
1373 velecsum = _mm_add_ps(velecsum,velec);
1377 fscal = _mm_and_ps(fscal,cutoff_mask);
1379 fscal = _mm_andnot_ps(dummy_mask,fscal);
1381 /* Calculate temporary vectorial force */
1382 tx = _mm_mul_ps(fscal,dx22);
1383 ty = _mm_mul_ps(fscal,dy22);
1384 tz = _mm_mul_ps(fscal,dz22);
1386 /* Update vectorial force */
1387 fix2 = _mm_add_ps(fix2,tx);
1388 fiy2 = _mm_add_ps(fiy2,ty);
1389 fiz2 = _mm_add_ps(fiz2,tz);
1391 fjx2 = _mm_add_ps(fjx2,tx);
1392 fjy2 = _mm_add_ps(fjy2,ty);
1393 fjz2 = _mm_add_ps(fjz2,tz);
1397 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1398 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1399 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1400 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1402 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1403 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1405 /* Inner loop uses 441 flops */
1408 /* End of innermost loop */
1410 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1411 f+i_coord_offset,fshift+i_shift_offset);
1414 /* Update potential energies */
1415 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1416 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1418 /* Increment number of inner iterations */
1419 inneriter += j_index_end - j_index_start;
1421 /* Outer loop uses 20 flops */
1424 /* Increment number of outer iterations */
1427 /* Update outer/inner flops */
1429 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
1432 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single
1433 * Electrostatics interaction: Ewald
1434 * VdW interaction: LennardJones
1435 * Geometry: Water3-Water3
1436 * Calculate force/pot: Force
1439 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single
1440 (t_nblist * gmx_restrict nlist,
1441 rvec * gmx_restrict xx,
1442 rvec * gmx_restrict ff,
1443 t_forcerec * gmx_restrict fr,
1444 t_mdatoms * gmx_restrict mdatoms,
1445 nb_kernel_data_t * gmx_restrict kernel_data,
1446 t_nrnb * gmx_restrict nrnb)
1448 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1449 * just 0 for non-waters.
1450 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1451 * jnr indices corresponding to data put in the four positions in the SIMD register.
1453 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1454 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1455 int jnrA,jnrB,jnrC,jnrD;
1456 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1457 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1458 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1459 real rcutoff_scalar;
1460 real *shiftvec,*fshift,*x,*f;
1461 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1462 real scratch[4*DIM];
1463 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1465 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1467 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1469 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1470 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1471 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1472 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1473 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1474 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1475 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1476 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1477 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1478 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1479 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1480 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1481 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1482 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1483 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1484 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1485 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1488 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1491 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1492 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1494 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1496 __m128 dummy_mask,cutoff_mask;
1497 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1498 __m128 one = _mm_set1_ps(1.0);
1499 __m128 two = _mm_set1_ps(2.0);
1505 jindex = nlist->jindex;
1507 shiftidx = nlist->shift;
1509 shiftvec = fr->shift_vec[0];
1510 fshift = fr->fshift[0];
1511 facel = _mm_set1_ps(fr->epsfac);
1512 charge = mdatoms->chargeA;
1513 nvdwtype = fr->ntype;
1514 vdwparam = fr->nbfp;
1515 vdwtype = mdatoms->typeA;
1517 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1518 ewtab = fr->ic->tabq_coul_F;
1519 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1520 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1522 /* Setup water-specific parameters */
1523 inr = nlist->iinr[0];
1524 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1525 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1526 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1527 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1529 jq0 = _mm_set1_ps(charge[inr+0]);
1530 jq1 = _mm_set1_ps(charge[inr+1]);
1531 jq2 = _mm_set1_ps(charge[inr+2]);
1532 vdwjidx0A = 2*vdwtype[inr+0];
1533 qq00 = _mm_mul_ps(iq0,jq0);
1534 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1535 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1536 qq01 = _mm_mul_ps(iq0,jq1);
1537 qq02 = _mm_mul_ps(iq0,jq2);
1538 qq10 = _mm_mul_ps(iq1,jq0);
1539 qq11 = _mm_mul_ps(iq1,jq1);
1540 qq12 = _mm_mul_ps(iq1,jq2);
1541 qq20 = _mm_mul_ps(iq2,jq0);
1542 qq21 = _mm_mul_ps(iq2,jq1);
1543 qq22 = _mm_mul_ps(iq2,jq2);
1545 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1546 rcutoff_scalar = fr->rcoulomb;
1547 rcutoff = _mm_set1_ps(rcutoff_scalar);
1548 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1550 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1551 rvdw = _mm_set1_ps(fr->rvdw);
1553 /* Avoid stupid compiler warnings */
1554 jnrA = jnrB = jnrC = jnrD = 0;
1555 j_coord_offsetA = 0;
1556 j_coord_offsetB = 0;
1557 j_coord_offsetC = 0;
1558 j_coord_offsetD = 0;
1563 for(iidx=0;iidx<4*DIM;iidx++)
1565 scratch[iidx] = 0.0;
1568 /* Start outer loop over neighborlists */
1569 for(iidx=0; iidx<nri; iidx++)
1571 /* Load shift vector for this list */
1572 i_shift_offset = DIM*shiftidx[iidx];
1574 /* Load limits for loop over neighbors */
1575 j_index_start = jindex[iidx];
1576 j_index_end = jindex[iidx+1];
1578 /* Get outer coordinate index */
1580 i_coord_offset = DIM*inr;
1582 /* Load i particle coords and add shift vector */
1583 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1584 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1586 fix0 = _mm_setzero_ps();
1587 fiy0 = _mm_setzero_ps();
1588 fiz0 = _mm_setzero_ps();
1589 fix1 = _mm_setzero_ps();
1590 fiy1 = _mm_setzero_ps();
1591 fiz1 = _mm_setzero_ps();
1592 fix2 = _mm_setzero_ps();
1593 fiy2 = _mm_setzero_ps();
1594 fiz2 = _mm_setzero_ps();
1596 /* Start inner kernel loop */
1597 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1600 /* Get j neighbor index, and coordinate index */
1602 jnrB = jjnr[jidx+1];
1603 jnrC = jjnr[jidx+2];
1604 jnrD = jjnr[jidx+3];
1605 j_coord_offsetA = DIM*jnrA;
1606 j_coord_offsetB = DIM*jnrB;
1607 j_coord_offsetC = DIM*jnrC;
1608 j_coord_offsetD = DIM*jnrD;
1610 /* load j atom coordinates */
1611 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1612 x+j_coord_offsetC,x+j_coord_offsetD,
1613 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1615 /* Calculate displacement vector */
1616 dx00 = _mm_sub_ps(ix0,jx0);
1617 dy00 = _mm_sub_ps(iy0,jy0);
1618 dz00 = _mm_sub_ps(iz0,jz0);
1619 dx01 = _mm_sub_ps(ix0,jx1);
1620 dy01 = _mm_sub_ps(iy0,jy1);
1621 dz01 = _mm_sub_ps(iz0,jz1);
1622 dx02 = _mm_sub_ps(ix0,jx2);
1623 dy02 = _mm_sub_ps(iy0,jy2);
1624 dz02 = _mm_sub_ps(iz0,jz2);
1625 dx10 = _mm_sub_ps(ix1,jx0);
1626 dy10 = _mm_sub_ps(iy1,jy0);
1627 dz10 = _mm_sub_ps(iz1,jz0);
1628 dx11 = _mm_sub_ps(ix1,jx1);
1629 dy11 = _mm_sub_ps(iy1,jy1);
1630 dz11 = _mm_sub_ps(iz1,jz1);
1631 dx12 = _mm_sub_ps(ix1,jx2);
1632 dy12 = _mm_sub_ps(iy1,jy2);
1633 dz12 = _mm_sub_ps(iz1,jz2);
1634 dx20 = _mm_sub_ps(ix2,jx0);
1635 dy20 = _mm_sub_ps(iy2,jy0);
1636 dz20 = _mm_sub_ps(iz2,jz0);
1637 dx21 = _mm_sub_ps(ix2,jx1);
1638 dy21 = _mm_sub_ps(iy2,jy1);
1639 dz21 = _mm_sub_ps(iz2,jz1);
1640 dx22 = _mm_sub_ps(ix2,jx2);
1641 dy22 = _mm_sub_ps(iy2,jy2);
1642 dz22 = _mm_sub_ps(iz2,jz2);
1644 /* Calculate squared distance and things based on it */
1645 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1646 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1647 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1648 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1649 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1650 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1651 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1652 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1653 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1655 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1656 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1657 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1658 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1659 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1660 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1661 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1662 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1663 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1665 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1666 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1667 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1668 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1669 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1670 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1671 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1672 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1673 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1675 fjx0 = _mm_setzero_ps();
1676 fjy0 = _mm_setzero_ps();
1677 fjz0 = _mm_setzero_ps();
1678 fjx1 = _mm_setzero_ps();
1679 fjy1 = _mm_setzero_ps();
1680 fjz1 = _mm_setzero_ps();
1681 fjx2 = _mm_setzero_ps();
1682 fjy2 = _mm_setzero_ps();
1683 fjz2 = _mm_setzero_ps();
1685 /**************************
1686 * CALCULATE INTERACTIONS *
1687 **************************/
1689 if (gmx_mm_any_lt(rsq00,rcutoff2))
1692 r00 = _mm_mul_ps(rsq00,rinv00);
1694 /* EWALD ELECTROSTATICS */
1696 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1697 ewrt = _mm_mul_ps(r00,ewtabscale);
1698 ewitab = _mm_cvttps_epi32(ewrt);
1699 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1700 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1701 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1703 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1704 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1706 /* LENNARD-JONES DISPERSION/REPULSION */
1708 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1709 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1711 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1713 fscal = _mm_add_ps(felec,fvdw);
1715 fscal = _mm_and_ps(fscal,cutoff_mask);
1717 /* Calculate temporary vectorial force */
1718 tx = _mm_mul_ps(fscal,dx00);
1719 ty = _mm_mul_ps(fscal,dy00);
1720 tz = _mm_mul_ps(fscal,dz00);
1722 /* Update vectorial force */
1723 fix0 = _mm_add_ps(fix0,tx);
1724 fiy0 = _mm_add_ps(fiy0,ty);
1725 fiz0 = _mm_add_ps(fiz0,tz);
1727 fjx0 = _mm_add_ps(fjx0,tx);
1728 fjy0 = _mm_add_ps(fjy0,ty);
1729 fjz0 = _mm_add_ps(fjz0,tz);
1733 /**************************
1734 * CALCULATE INTERACTIONS *
1735 **************************/
1737 if (gmx_mm_any_lt(rsq01,rcutoff2))
1740 r01 = _mm_mul_ps(rsq01,rinv01);
1742 /* EWALD ELECTROSTATICS */
1744 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1745 ewrt = _mm_mul_ps(r01,ewtabscale);
1746 ewitab = _mm_cvttps_epi32(ewrt);
1747 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1748 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1749 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1751 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1752 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1754 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1758 fscal = _mm_and_ps(fscal,cutoff_mask);
1760 /* Calculate temporary vectorial force */
1761 tx = _mm_mul_ps(fscal,dx01);
1762 ty = _mm_mul_ps(fscal,dy01);
1763 tz = _mm_mul_ps(fscal,dz01);
1765 /* Update vectorial force */
1766 fix0 = _mm_add_ps(fix0,tx);
1767 fiy0 = _mm_add_ps(fiy0,ty);
1768 fiz0 = _mm_add_ps(fiz0,tz);
1770 fjx1 = _mm_add_ps(fjx1,tx);
1771 fjy1 = _mm_add_ps(fjy1,ty);
1772 fjz1 = _mm_add_ps(fjz1,tz);
1776 /**************************
1777 * CALCULATE INTERACTIONS *
1778 **************************/
1780 if (gmx_mm_any_lt(rsq02,rcutoff2))
1783 r02 = _mm_mul_ps(rsq02,rinv02);
1785 /* EWALD ELECTROSTATICS */
1787 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1788 ewrt = _mm_mul_ps(r02,ewtabscale);
1789 ewitab = _mm_cvttps_epi32(ewrt);
1790 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1791 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1792 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1794 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1795 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1797 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1801 fscal = _mm_and_ps(fscal,cutoff_mask);
1803 /* Calculate temporary vectorial force */
1804 tx = _mm_mul_ps(fscal,dx02);
1805 ty = _mm_mul_ps(fscal,dy02);
1806 tz = _mm_mul_ps(fscal,dz02);
1808 /* Update vectorial force */
1809 fix0 = _mm_add_ps(fix0,tx);
1810 fiy0 = _mm_add_ps(fiy0,ty);
1811 fiz0 = _mm_add_ps(fiz0,tz);
1813 fjx2 = _mm_add_ps(fjx2,tx);
1814 fjy2 = _mm_add_ps(fjy2,ty);
1815 fjz2 = _mm_add_ps(fjz2,tz);
1819 /**************************
1820 * CALCULATE INTERACTIONS *
1821 **************************/
1823 if (gmx_mm_any_lt(rsq10,rcutoff2))
1826 r10 = _mm_mul_ps(rsq10,rinv10);
1828 /* EWALD ELECTROSTATICS */
1830 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1831 ewrt = _mm_mul_ps(r10,ewtabscale);
1832 ewitab = _mm_cvttps_epi32(ewrt);
1833 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1834 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1835 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1837 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1838 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1840 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1844 fscal = _mm_and_ps(fscal,cutoff_mask);
1846 /* Calculate temporary vectorial force */
1847 tx = _mm_mul_ps(fscal,dx10);
1848 ty = _mm_mul_ps(fscal,dy10);
1849 tz = _mm_mul_ps(fscal,dz10);
1851 /* Update vectorial force */
1852 fix1 = _mm_add_ps(fix1,tx);
1853 fiy1 = _mm_add_ps(fiy1,ty);
1854 fiz1 = _mm_add_ps(fiz1,tz);
1856 fjx0 = _mm_add_ps(fjx0,tx);
1857 fjy0 = _mm_add_ps(fjy0,ty);
1858 fjz0 = _mm_add_ps(fjz0,tz);
1862 /**************************
1863 * CALCULATE INTERACTIONS *
1864 **************************/
1866 if (gmx_mm_any_lt(rsq11,rcutoff2))
1869 r11 = _mm_mul_ps(rsq11,rinv11);
1871 /* EWALD ELECTROSTATICS */
1873 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1874 ewrt = _mm_mul_ps(r11,ewtabscale);
1875 ewitab = _mm_cvttps_epi32(ewrt);
1876 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1877 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1878 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1880 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1881 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1883 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1887 fscal = _mm_and_ps(fscal,cutoff_mask);
1889 /* Calculate temporary vectorial force */
1890 tx = _mm_mul_ps(fscal,dx11);
1891 ty = _mm_mul_ps(fscal,dy11);
1892 tz = _mm_mul_ps(fscal,dz11);
1894 /* Update vectorial force */
1895 fix1 = _mm_add_ps(fix1,tx);
1896 fiy1 = _mm_add_ps(fiy1,ty);
1897 fiz1 = _mm_add_ps(fiz1,tz);
1899 fjx1 = _mm_add_ps(fjx1,tx);
1900 fjy1 = _mm_add_ps(fjy1,ty);
1901 fjz1 = _mm_add_ps(fjz1,tz);
1905 /**************************
1906 * CALCULATE INTERACTIONS *
1907 **************************/
1909 if (gmx_mm_any_lt(rsq12,rcutoff2))
1912 r12 = _mm_mul_ps(rsq12,rinv12);
1914 /* EWALD ELECTROSTATICS */
1916 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1917 ewrt = _mm_mul_ps(r12,ewtabscale);
1918 ewitab = _mm_cvttps_epi32(ewrt);
1919 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1920 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1921 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1923 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1924 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1926 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1930 fscal = _mm_and_ps(fscal,cutoff_mask);
1932 /* Calculate temporary vectorial force */
1933 tx = _mm_mul_ps(fscal,dx12);
1934 ty = _mm_mul_ps(fscal,dy12);
1935 tz = _mm_mul_ps(fscal,dz12);
1937 /* Update vectorial force */
1938 fix1 = _mm_add_ps(fix1,tx);
1939 fiy1 = _mm_add_ps(fiy1,ty);
1940 fiz1 = _mm_add_ps(fiz1,tz);
1942 fjx2 = _mm_add_ps(fjx2,tx);
1943 fjy2 = _mm_add_ps(fjy2,ty);
1944 fjz2 = _mm_add_ps(fjz2,tz);
1948 /**************************
1949 * CALCULATE INTERACTIONS *
1950 **************************/
1952 if (gmx_mm_any_lt(rsq20,rcutoff2))
1955 r20 = _mm_mul_ps(rsq20,rinv20);
1957 /* EWALD ELECTROSTATICS */
1959 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1960 ewrt = _mm_mul_ps(r20,ewtabscale);
1961 ewitab = _mm_cvttps_epi32(ewrt);
1962 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1963 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1964 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1966 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1967 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1969 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1973 fscal = _mm_and_ps(fscal,cutoff_mask);
1975 /* Calculate temporary vectorial force */
1976 tx = _mm_mul_ps(fscal,dx20);
1977 ty = _mm_mul_ps(fscal,dy20);
1978 tz = _mm_mul_ps(fscal,dz20);
1980 /* Update vectorial force */
1981 fix2 = _mm_add_ps(fix2,tx);
1982 fiy2 = _mm_add_ps(fiy2,ty);
1983 fiz2 = _mm_add_ps(fiz2,tz);
1985 fjx0 = _mm_add_ps(fjx0,tx);
1986 fjy0 = _mm_add_ps(fjy0,ty);
1987 fjz0 = _mm_add_ps(fjz0,tz);
1991 /**************************
1992 * CALCULATE INTERACTIONS *
1993 **************************/
1995 if (gmx_mm_any_lt(rsq21,rcutoff2))
1998 r21 = _mm_mul_ps(rsq21,rinv21);
2000 /* EWALD ELECTROSTATICS */
2002 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2003 ewrt = _mm_mul_ps(r21,ewtabscale);
2004 ewitab = _mm_cvttps_epi32(ewrt);
2005 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2006 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2007 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2009 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2010 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2012 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2016 fscal = _mm_and_ps(fscal,cutoff_mask);
2018 /* Calculate temporary vectorial force */
2019 tx = _mm_mul_ps(fscal,dx21);
2020 ty = _mm_mul_ps(fscal,dy21);
2021 tz = _mm_mul_ps(fscal,dz21);
2023 /* Update vectorial force */
2024 fix2 = _mm_add_ps(fix2,tx);
2025 fiy2 = _mm_add_ps(fiy2,ty);
2026 fiz2 = _mm_add_ps(fiz2,tz);
2028 fjx1 = _mm_add_ps(fjx1,tx);
2029 fjy1 = _mm_add_ps(fjy1,ty);
2030 fjz1 = _mm_add_ps(fjz1,tz);
2034 /**************************
2035 * CALCULATE INTERACTIONS *
2036 **************************/
2038 if (gmx_mm_any_lt(rsq22,rcutoff2))
2041 r22 = _mm_mul_ps(rsq22,rinv22);
2043 /* EWALD ELECTROSTATICS */
2045 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2046 ewrt = _mm_mul_ps(r22,ewtabscale);
2047 ewitab = _mm_cvttps_epi32(ewrt);
2048 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2049 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2050 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2052 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2053 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2055 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2059 fscal = _mm_and_ps(fscal,cutoff_mask);
2061 /* Calculate temporary vectorial force */
2062 tx = _mm_mul_ps(fscal,dx22);
2063 ty = _mm_mul_ps(fscal,dy22);
2064 tz = _mm_mul_ps(fscal,dz22);
2066 /* Update vectorial force */
2067 fix2 = _mm_add_ps(fix2,tx);
2068 fiy2 = _mm_add_ps(fiy2,ty);
2069 fiz2 = _mm_add_ps(fiz2,tz);
2071 fjx2 = _mm_add_ps(fjx2,tx);
2072 fjy2 = _mm_add_ps(fjy2,ty);
2073 fjz2 = _mm_add_ps(fjz2,tz);
2077 fjptrA = f+j_coord_offsetA;
2078 fjptrB = f+j_coord_offsetB;
2079 fjptrC = f+j_coord_offsetC;
2080 fjptrD = f+j_coord_offsetD;
2082 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2083 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2085 /* Inner loop uses 358 flops */
2088 if(jidx<j_index_end)
2091 /* Get j neighbor index, and coordinate index */
2092 jnrlistA = jjnr[jidx];
2093 jnrlistB = jjnr[jidx+1];
2094 jnrlistC = jjnr[jidx+2];
2095 jnrlistD = jjnr[jidx+3];
2096 /* Sign of each element will be negative for non-real atoms.
2097 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2098 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2100 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2101 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2102 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2103 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2104 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2105 j_coord_offsetA = DIM*jnrA;
2106 j_coord_offsetB = DIM*jnrB;
2107 j_coord_offsetC = DIM*jnrC;
2108 j_coord_offsetD = DIM*jnrD;
2110 /* load j atom coordinates */
2111 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2112 x+j_coord_offsetC,x+j_coord_offsetD,
2113 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2115 /* Calculate displacement vector */
2116 dx00 = _mm_sub_ps(ix0,jx0);
2117 dy00 = _mm_sub_ps(iy0,jy0);
2118 dz00 = _mm_sub_ps(iz0,jz0);
2119 dx01 = _mm_sub_ps(ix0,jx1);
2120 dy01 = _mm_sub_ps(iy0,jy1);
2121 dz01 = _mm_sub_ps(iz0,jz1);
2122 dx02 = _mm_sub_ps(ix0,jx2);
2123 dy02 = _mm_sub_ps(iy0,jy2);
2124 dz02 = _mm_sub_ps(iz0,jz2);
2125 dx10 = _mm_sub_ps(ix1,jx0);
2126 dy10 = _mm_sub_ps(iy1,jy0);
2127 dz10 = _mm_sub_ps(iz1,jz0);
2128 dx11 = _mm_sub_ps(ix1,jx1);
2129 dy11 = _mm_sub_ps(iy1,jy1);
2130 dz11 = _mm_sub_ps(iz1,jz1);
2131 dx12 = _mm_sub_ps(ix1,jx2);
2132 dy12 = _mm_sub_ps(iy1,jy2);
2133 dz12 = _mm_sub_ps(iz1,jz2);
2134 dx20 = _mm_sub_ps(ix2,jx0);
2135 dy20 = _mm_sub_ps(iy2,jy0);
2136 dz20 = _mm_sub_ps(iz2,jz0);
2137 dx21 = _mm_sub_ps(ix2,jx1);
2138 dy21 = _mm_sub_ps(iy2,jy1);
2139 dz21 = _mm_sub_ps(iz2,jz1);
2140 dx22 = _mm_sub_ps(ix2,jx2);
2141 dy22 = _mm_sub_ps(iy2,jy2);
2142 dz22 = _mm_sub_ps(iz2,jz2);
2144 /* Calculate squared distance and things based on it */
2145 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2146 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2147 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2148 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2149 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2150 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2151 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2152 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2153 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2155 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2156 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2157 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2158 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2159 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2160 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2161 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2162 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2163 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2165 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2166 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2167 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2168 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2169 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2170 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2171 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2172 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2173 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2175 fjx0 = _mm_setzero_ps();
2176 fjy0 = _mm_setzero_ps();
2177 fjz0 = _mm_setzero_ps();
2178 fjx1 = _mm_setzero_ps();
2179 fjy1 = _mm_setzero_ps();
2180 fjz1 = _mm_setzero_ps();
2181 fjx2 = _mm_setzero_ps();
2182 fjy2 = _mm_setzero_ps();
2183 fjz2 = _mm_setzero_ps();
2185 /**************************
2186 * CALCULATE INTERACTIONS *
2187 **************************/
2189 if (gmx_mm_any_lt(rsq00,rcutoff2))
2192 r00 = _mm_mul_ps(rsq00,rinv00);
2193 r00 = _mm_andnot_ps(dummy_mask,r00);
2195 /* EWALD ELECTROSTATICS */
2197 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2198 ewrt = _mm_mul_ps(r00,ewtabscale);
2199 ewitab = _mm_cvttps_epi32(ewrt);
2200 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2201 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2202 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2204 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2205 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2207 /* LENNARD-JONES DISPERSION/REPULSION */
2209 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2210 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2212 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2214 fscal = _mm_add_ps(felec,fvdw);
2216 fscal = _mm_and_ps(fscal,cutoff_mask);
2218 fscal = _mm_andnot_ps(dummy_mask,fscal);
2220 /* Calculate temporary vectorial force */
2221 tx = _mm_mul_ps(fscal,dx00);
2222 ty = _mm_mul_ps(fscal,dy00);
2223 tz = _mm_mul_ps(fscal,dz00);
2225 /* Update vectorial force */
2226 fix0 = _mm_add_ps(fix0,tx);
2227 fiy0 = _mm_add_ps(fiy0,ty);
2228 fiz0 = _mm_add_ps(fiz0,tz);
2230 fjx0 = _mm_add_ps(fjx0,tx);
2231 fjy0 = _mm_add_ps(fjy0,ty);
2232 fjz0 = _mm_add_ps(fjz0,tz);
2236 /**************************
2237 * CALCULATE INTERACTIONS *
2238 **************************/
2240 if (gmx_mm_any_lt(rsq01,rcutoff2))
2243 r01 = _mm_mul_ps(rsq01,rinv01);
2244 r01 = _mm_andnot_ps(dummy_mask,r01);
2246 /* EWALD ELECTROSTATICS */
2248 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2249 ewrt = _mm_mul_ps(r01,ewtabscale);
2250 ewitab = _mm_cvttps_epi32(ewrt);
2251 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2252 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2253 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2255 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2256 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2258 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2262 fscal = _mm_and_ps(fscal,cutoff_mask);
2264 fscal = _mm_andnot_ps(dummy_mask,fscal);
2266 /* Calculate temporary vectorial force */
2267 tx = _mm_mul_ps(fscal,dx01);
2268 ty = _mm_mul_ps(fscal,dy01);
2269 tz = _mm_mul_ps(fscal,dz01);
2271 /* Update vectorial force */
2272 fix0 = _mm_add_ps(fix0,tx);
2273 fiy0 = _mm_add_ps(fiy0,ty);
2274 fiz0 = _mm_add_ps(fiz0,tz);
2276 fjx1 = _mm_add_ps(fjx1,tx);
2277 fjy1 = _mm_add_ps(fjy1,ty);
2278 fjz1 = _mm_add_ps(fjz1,tz);
2282 /**************************
2283 * CALCULATE INTERACTIONS *
2284 **************************/
2286 if (gmx_mm_any_lt(rsq02,rcutoff2))
2289 r02 = _mm_mul_ps(rsq02,rinv02);
2290 r02 = _mm_andnot_ps(dummy_mask,r02);
2292 /* EWALD ELECTROSTATICS */
2294 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2295 ewrt = _mm_mul_ps(r02,ewtabscale);
2296 ewitab = _mm_cvttps_epi32(ewrt);
2297 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2298 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2299 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2301 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2302 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2304 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2308 fscal = _mm_and_ps(fscal,cutoff_mask);
2310 fscal = _mm_andnot_ps(dummy_mask,fscal);
2312 /* Calculate temporary vectorial force */
2313 tx = _mm_mul_ps(fscal,dx02);
2314 ty = _mm_mul_ps(fscal,dy02);
2315 tz = _mm_mul_ps(fscal,dz02);
2317 /* Update vectorial force */
2318 fix0 = _mm_add_ps(fix0,tx);
2319 fiy0 = _mm_add_ps(fiy0,ty);
2320 fiz0 = _mm_add_ps(fiz0,tz);
2322 fjx2 = _mm_add_ps(fjx2,tx);
2323 fjy2 = _mm_add_ps(fjy2,ty);
2324 fjz2 = _mm_add_ps(fjz2,tz);
2328 /**************************
2329 * CALCULATE INTERACTIONS *
2330 **************************/
2332 if (gmx_mm_any_lt(rsq10,rcutoff2))
2335 r10 = _mm_mul_ps(rsq10,rinv10);
2336 r10 = _mm_andnot_ps(dummy_mask,r10);
2338 /* EWALD ELECTROSTATICS */
2340 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2341 ewrt = _mm_mul_ps(r10,ewtabscale);
2342 ewitab = _mm_cvttps_epi32(ewrt);
2343 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2344 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2345 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2347 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2348 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2350 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2354 fscal = _mm_and_ps(fscal,cutoff_mask);
2356 fscal = _mm_andnot_ps(dummy_mask,fscal);
2358 /* Calculate temporary vectorial force */
2359 tx = _mm_mul_ps(fscal,dx10);
2360 ty = _mm_mul_ps(fscal,dy10);
2361 tz = _mm_mul_ps(fscal,dz10);
2363 /* Update vectorial force */
2364 fix1 = _mm_add_ps(fix1,tx);
2365 fiy1 = _mm_add_ps(fiy1,ty);
2366 fiz1 = _mm_add_ps(fiz1,tz);
2368 fjx0 = _mm_add_ps(fjx0,tx);
2369 fjy0 = _mm_add_ps(fjy0,ty);
2370 fjz0 = _mm_add_ps(fjz0,tz);
2374 /**************************
2375 * CALCULATE INTERACTIONS *
2376 **************************/
2378 if (gmx_mm_any_lt(rsq11,rcutoff2))
2381 r11 = _mm_mul_ps(rsq11,rinv11);
2382 r11 = _mm_andnot_ps(dummy_mask,r11);
2384 /* EWALD ELECTROSTATICS */
2386 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2387 ewrt = _mm_mul_ps(r11,ewtabscale);
2388 ewitab = _mm_cvttps_epi32(ewrt);
2389 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2390 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2391 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2393 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2394 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2396 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2400 fscal = _mm_and_ps(fscal,cutoff_mask);
2402 fscal = _mm_andnot_ps(dummy_mask,fscal);
2404 /* Calculate temporary vectorial force */
2405 tx = _mm_mul_ps(fscal,dx11);
2406 ty = _mm_mul_ps(fscal,dy11);
2407 tz = _mm_mul_ps(fscal,dz11);
2409 /* Update vectorial force */
2410 fix1 = _mm_add_ps(fix1,tx);
2411 fiy1 = _mm_add_ps(fiy1,ty);
2412 fiz1 = _mm_add_ps(fiz1,tz);
2414 fjx1 = _mm_add_ps(fjx1,tx);
2415 fjy1 = _mm_add_ps(fjy1,ty);
2416 fjz1 = _mm_add_ps(fjz1,tz);
2420 /**************************
2421 * CALCULATE INTERACTIONS *
2422 **************************/
2424 if (gmx_mm_any_lt(rsq12,rcutoff2))
2427 r12 = _mm_mul_ps(rsq12,rinv12);
2428 r12 = _mm_andnot_ps(dummy_mask,r12);
2430 /* EWALD ELECTROSTATICS */
2432 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2433 ewrt = _mm_mul_ps(r12,ewtabscale);
2434 ewitab = _mm_cvttps_epi32(ewrt);
2435 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2436 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2437 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2439 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2440 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2442 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2446 fscal = _mm_and_ps(fscal,cutoff_mask);
2448 fscal = _mm_andnot_ps(dummy_mask,fscal);
2450 /* Calculate temporary vectorial force */
2451 tx = _mm_mul_ps(fscal,dx12);
2452 ty = _mm_mul_ps(fscal,dy12);
2453 tz = _mm_mul_ps(fscal,dz12);
2455 /* Update vectorial force */
2456 fix1 = _mm_add_ps(fix1,tx);
2457 fiy1 = _mm_add_ps(fiy1,ty);
2458 fiz1 = _mm_add_ps(fiz1,tz);
2460 fjx2 = _mm_add_ps(fjx2,tx);
2461 fjy2 = _mm_add_ps(fjy2,ty);
2462 fjz2 = _mm_add_ps(fjz2,tz);
2466 /**************************
2467 * CALCULATE INTERACTIONS *
2468 **************************/
2470 if (gmx_mm_any_lt(rsq20,rcutoff2))
2473 r20 = _mm_mul_ps(rsq20,rinv20);
2474 r20 = _mm_andnot_ps(dummy_mask,r20);
2476 /* EWALD ELECTROSTATICS */
2478 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2479 ewrt = _mm_mul_ps(r20,ewtabscale);
2480 ewitab = _mm_cvttps_epi32(ewrt);
2481 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2482 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2483 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2485 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2486 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2488 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2492 fscal = _mm_and_ps(fscal,cutoff_mask);
2494 fscal = _mm_andnot_ps(dummy_mask,fscal);
2496 /* Calculate temporary vectorial force */
2497 tx = _mm_mul_ps(fscal,dx20);
2498 ty = _mm_mul_ps(fscal,dy20);
2499 tz = _mm_mul_ps(fscal,dz20);
2501 /* Update vectorial force */
2502 fix2 = _mm_add_ps(fix2,tx);
2503 fiy2 = _mm_add_ps(fiy2,ty);
2504 fiz2 = _mm_add_ps(fiz2,tz);
2506 fjx0 = _mm_add_ps(fjx0,tx);
2507 fjy0 = _mm_add_ps(fjy0,ty);
2508 fjz0 = _mm_add_ps(fjz0,tz);
2512 /**************************
2513 * CALCULATE INTERACTIONS *
2514 **************************/
2516 if (gmx_mm_any_lt(rsq21,rcutoff2))
2519 r21 = _mm_mul_ps(rsq21,rinv21);
2520 r21 = _mm_andnot_ps(dummy_mask,r21);
2522 /* EWALD ELECTROSTATICS */
2524 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2525 ewrt = _mm_mul_ps(r21,ewtabscale);
2526 ewitab = _mm_cvttps_epi32(ewrt);
2527 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2528 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2529 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2531 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2532 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2534 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2538 fscal = _mm_and_ps(fscal,cutoff_mask);
2540 fscal = _mm_andnot_ps(dummy_mask,fscal);
2542 /* Calculate temporary vectorial force */
2543 tx = _mm_mul_ps(fscal,dx21);
2544 ty = _mm_mul_ps(fscal,dy21);
2545 tz = _mm_mul_ps(fscal,dz21);
2547 /* Update vectorial force */
2548 fix2 = _mm_add_ps(fix2,tx);
2549 fiy2 = _mm_add_ps(fiy2,ty);
2550 fiz2 = _mm_add_ps(fiz2,tz);
2552 fjx1 = _mm_add_ps(fjx1,tx);
2553 fjy1 = _mm_add_ps(fjy1,ty);
2554 fjz1 = _mm_add_ps(fjz1,tz);
2558 /**************************
2559 * CALCULATE INTERACTIONS *
2560 **************************/
2562 if (gmx_mm_any_lt(rsq22,rcutoff2))
2565 r22 = _mm_mul_ps(rsq22,rinv22);
2566 r22 = _mm_andnot_ps(dummy_mask,r22);
2568 /* EWALD ELECTROSTATICS */
2570 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2571 ewrt = _mm_mul_ps(r22,ewtabscale);
2572 ewitab = _mm_cvttps_epi32(ewrt);
2573 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2574 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2575 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2577 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2578 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2580 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2584 fscal = _mm_and_ps(fscal,cutoff_mask);
2586 fscal = _mm_andnot_ps(dummy_mask,fscal);
2588 /* Calculate temporary vectorial force */
2589 tx = _mm_mul_ps(fscal,dx22);
2590 ty = _mm_mul_ps(fscal,dy22);
2591 tz = _mm_mul_ps(fscal,dz22);
2593 /* Update vectorial force */
2594 fix2 = _mm_add_ps(fix2,tx);
2595 fiy2 = _mm_add_ps(fiy2,ty);
2596 fiz2 = _mm_add_ps(fiz2,tz);
2598 fjx2 = _mm_add_ps(fjx2,tx);
2599 fjy2 = _mm_add_ps(fjy2,ty);
2600 fjz2 = _mm_add_ps(fjz2,tz);
2604 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2605 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2606 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2607 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2609 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2610 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2612 /* Inner loop uses 367 flops */
2615 /* End of innermost loop */
2617 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2618 f+i_coord_offset,fshift+i_shift_offset);
2620 /* Increment number of inner iterations */
2621 inneriter += j_index_end - j_index_start;
2623 /* Outer loop uses 18 flops */
2626 /* Increment number of outer iterations */
2629 /* Update outer/inner flops */
2631 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);