2 * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse4_1_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_sse4_1_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
99 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
101 __m128 dummy_mask,cutoff_mask;
102 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
103 __m128 one = _mm_set1_ps(1.0);
104 __m128 two = _mm_set1_ps(2.0);
110 jindex = nlist->jindex;
112 shiftidx = nlist->shift;
114 shiftvec = fr->shift_vec[0];
115 fshift = fr->fshift[0];
116 facel = _mm_set1_ps(fr->epsfac);
117 charge = mdatoms->chargeA;
118 nvdwtype = fr->ntype;
120 vdwtype = mdatoms->typeA;
122 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
123 ewtab = fr->ic->tabq_coul_FDV0;
124 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
125 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
127 /* Setup water-specific parameters */
128 inr = nlist->iinr[0];
129 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
130 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
131 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
132 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
134 jq0 = _mm_set1_ps(charge[inr+0]);
135 jq1 = _mm_set1_ps(charge[inr+1]);
136 jq2 = _mm_set1_ps(charge[inr+2]);
137 vdwjidx0A = 2*vdwtype[inr+0];
138 qq00 = _mm_mul_ps(iq0,jq0);
139 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
140 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
141 qq01 = _mm_mul_ps(iq0,jq1);
142 qq02 = _mm_mul_ps(iq0,jq2);
143 qq10 = _mm_mul_ps(iq1,jq0);
144 qq11 = _mm_mul_ps(iq1,jq1);
145 qq12 = _mm_mul_ps(iq1,jq2);
146 qq20 = _mm_mul_ps(iq2,jq0);
147 qq21 = _mm_mul_ps(iq2,jq1);
148 qq22 = _mm_mul_ps(iq2,jq2);
150 /* Avoid stupid compiler warnings */
151 jnrA = jnrB = jnrC = jnrD = 0;
160 for(iidx=0;iidx<4*DIM;iidx++)
165 /* Start outer loop over neighborlists */
166 for(iidx=0; iidx<nri; iidx++)
168 /* Load shift vector for this list */
169 i_shift_offset = DIM*shiftidx[iidx];
171 /* Load limits for loop over neighbors */
172 j_index_start = jindex[iidx];
173 j_index_end = jindex[iidx+1];
175 /* Get outer coordinate index */
177 i_coord_offset = DIM*inr;
179 /* Load i particle coords and add shift vector */
180 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
181 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
183 fix0 = _mm_setzero_ps();
184 fiy0 = _mm_setzero_ps();
185 fiz0 = _mm_setzero_ps();
186 fix1 = _mm_setzero_ps();
187 fiy1 = _mm_setzero_ps();
188 fiz1 = _mm_setzero_ps();
189 fix2 = _mm_setzero_ps();
190 fiy2 = _mm_setzero_ps();
191 fiz2 = _mm_setzero_ps();
193 /* Reset potential sums */
194 velecsum = _mm_setzero_ps();
195 vvdwsum = _mm_setzero_ps();
197 /* Start inner kernel loop */
198 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
201 /* Get j neighbor index, and coordinate index */
206 j_coord_offsetA = DIM*jnrA;
207 j_coord_offsetB = DIM*jnrB;
208 j_coord_offsetC = DIM*jnrC;
209 j_coord_offsetD = DIM*jnrD;
211 /* load j atom coordinates */
212 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
213 x+j_coord_offsetC,x+j_coord_offsetD,
214 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
216 /* Calculate displacement vector */
217 dx00 = _mm_sub_ps(ix0,jx0);
218 dy00 = _mm_sub_ps(iy0,jy0);
219 dz00 = _mm_sub_ps(iz0,jz0);
220 dx01 = _mm_sub_ps(ix0,jx1);
221 dy01 = _mm_sub_ps(iy0,jy1);
222 dz01 = _mm_sub_ps(iz0,jz1);
223 dx02 = _mm_sub_ps(ix0,jx2);
224 dy02 = _mm_sub_ps(iy0,jy2);
225 dz02 = _mm_sub_ps(iz0,jz2);
226 dx10 = _mm_sub_ps(ix1,jx0);
227 dy10 = _mm_sub_ps(iy1,jy0);
228 dz10 = _mm_sub_ps(iz1,jz0);
229 dx11 = _mm_sub_ps(ix1,jx1);
230 dy11 = _mm_sub_ps(iy1,jy1);
231 dz11 = _mm_sub_ps(iz1,jz1);
232 dx12 = _mm_sub_ps(ix1,jx2);
233 dy12 = _mm_sub_ps(iy1,jy2);
234 dz12 = _mm_sub_ps(iz1,jz2);
235 dx20 = _mm_sub_ps(ix2,jx0);
236 dy20 = _mm_sub_ps(iy2,jy0);
237 dz20 = _mm_sub_ps(iz2,jz0);
238 dx21 = _mm_sub_ps(ix2,jx1);
239 dy21 = _mm_sub_ps(iy2,jy1);
240 dz21 = _mm_sub_ps(iz2,jz1);
241 dx22 = _mm_sub_ps(ix2,jx2);
242 dy22 = _mm_sub_ps(iy2,jy2);
243 dz22 = _mm_sub_ps(iz2,jz2);
245 /* Calculate squared distance and things based on it */
246 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
247 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
248 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
249 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
250 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
251 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
252 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
253 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
254 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
256 rinv00 = gmx_mm_invsqrt_ps(rsq00);
257 rinv01 = gmx_mm_invsqrt_ps(rsq01);
258 rinv02 = gmx_mm_invsqrt_ps(rsq02);
259 rinv10 = gmx_mm_invsqrt_ps(rsq10);
260 rinv11 = gmx_mm_invsqrt_ps(rsq11);
261 rinv12 = gmx_mm_invsqrt_ps(rsq12);
262 rinv20 = gmx_mm_invsqrt_ps(rsq20);
263 rinv21 = gmx_mm_invsqrt_ps(rsq21);
264 rinv22 = gmx_mm_invsqrt_ps(rsq22);
266 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
267 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
268 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
269 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
270 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
271 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
272 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
273 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
274 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
276 fjx0 = _mm_setzero_ps();
277 fjy0 = _mm_setzero_ps();
278 fjz0 = _mm_setzero_ps();
279 fjx1 = _mm_setzero_ps();
280 fjy1 = _mm_setzero_ps();
281 fjz1 = _mm_setzero_ps();
282 fjx2 = _mm_setzero_ps();
283 fjy2 = _mm_setzero_ps();
284 fjz2 = _mm_setzero_ps();
286 /**************************
287 * CALCULATE INTERACTIONS *
288 **************************/
290 r00 = _mm_mul_ps(rsq00,rinv00);
292 /* EWALD ELECTROSTATICS */
294 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
295 ewrt = _mm_mul_ps(r00,ewtabscale);
296 ewitab = _mm_cvttps_epi32(ewrt);
297 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
298 ewitab = _mm_slli_epi32(ewitab,2);
299 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
300 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
301 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
302 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
303 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
304 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
305 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
306 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
307 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
309 /* LENNARD-JONES DISPERSION/REPULSION */
311 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
312 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
313 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
314 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
315 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
317 /* Update potential sum for this i atom from the interaction with this j atom. */
318 velecsum = _mm_add_ps(velecsum,velec);
319 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
321 fscal = _mm_add_ps(felec,fvdw);
323 /* Calculate temporary vectorial force */
324 tx = _mm_mul_ps(fscal,dx00);
325 ty = _mm_mul_ps(fscal,dy00);
326 tz = _mm_mul_ps(fscal,dz00);
328 /* Update vectorial force */
329 fix0 = _mm_add_ps(fix0,tx);
330 fiy0 = _mm_add_ps(fiy0,ty);
331 fiz0 = _mm_add_ps(fiz0,tz);
333 fjx0 = _mm_add_ps(fjx0,tx);
334 fjy0 = _mm_add_ps(fjy0,ty);
335 fjz0 = _mm_add_ps(fjz0,tz);
337 /**************************
338 * CALCULATE INTERACTIONS *
339 **************************/
341 r01 = _mm_mul_ps(rsq01,rinv01);
343 /* EWALD ELECTROSTATICS */
345 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
346 ewrt = _mm_mul_ps(r01,ewtabscale);
347 ewitab = _mm_cvttps_epi32(ewrt);
348 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
349 ewitab = _mm_slli_epi32(ewitab,2);
350 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
351 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
352 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
353 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
354 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
355 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
356 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
357 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
358 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
360 /* Update potential sum for this i atom from the interaction with this j atom. */
361 velecsum = _mm_add_ps(velecsum,velec);
365 /* Calculate temporary vectorial force */
366 tx = _mm_mul_ps(fscal,dx01);
367 ty = _mm_mul_ps(fscal,dy01);
368 tz = _mm_mul_ps(fscal,dz01);
370 /* Update vectorial force */
371 fix0 = _mm_add_ps(fix0,tx);
372 fiy0 = _mm_add_ps(fiy0,ty);
373 fiz0 = _mm_add_ps(fiz0,tz);
375 fjx1 = _mm_add_ps(fjx1,tx);
376 fjy1 = _mm_add_ps(fjy1,ty);
377 fjz1 = _mm_add_ps(fjz1,tz);
379 /**************************
380 * CALCULATE INTERACTIONS *
381 **************************/
383 r02 = _mm_mul_ps(rsq02,rinv02);
385 /* EWALD ELECTROSTATICS */
387 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
388 ewrt = _mm_mul_ps(r02,ewtabscale);
389 ewitab = _mm_cvttps_epi32(ewrt);
390 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
391 ewitab = _mm_slli_epi32(ewitab,2);
392 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
393 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
394 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
395 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
396 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
397 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
398 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
399 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
400 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
402 /* Update potential sum for this i atom from the interaction with this j atom. */
403 velecsum = _mm_add_ps(velecsum,velec);
407 /* Calculate temporary vectorial force */
408 tx = _mm_mul_ps(fscal,dx02);
409 ty = _mm_mul_ps(fscal,dy02);
410 tz = _mm_mul_ps(fscal,dz02);
412 /* Update vectorial force */
413 fix0 = _mm_add_ps(fix0,tx);
414 fiy0 = _mm_add_ps(fiy0,ty);
415 fiz0 = _mm_add_ps(fiz0,tz);
417 fjx2 = _mm_add_ps(fjx2,tx);
418 fjy2 = _mm_add_ps(fjy2,ty);
419 fjz2 = _mm_add_ps(fjz2,tz);
421 /**************************
422 * CALCULATE INTERACTIONS *
423 **************************/
425 r10 = _mm_mul_ps(rsq10,rinv10);
427 /* EWALD ELECTROSTATICS */
429 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
430 ewrt = _mm_mul_ps(r10,ewtabscale);
431 ewitab = _mm_cvttps_epi32(ewrt);
432 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
433 ewitab = _mm_slli_epi32(ewitab,2);
434 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
435 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
436 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
437 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
438 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
439 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
440 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
441 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
442 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
444 /* Update potential sum for this i atom from the interaction with this j atom. */
445 velecsum = _mm_add_ps(velecsum,velec);
449 /* Calculate temporary vectorial force */
450 tx = _mm_mul_ps(fscal,dx10);
451 ty = _mm_mul_ps(fscal,dy10);
452 tz = _mm_mul_ps(fscal,dz10);
454 /* Update vectorial force */
455 fix1 = _mm_add_ps(fix1,tx);
456 fiy1 = _mm_add_ps(fiy1,ty);
457 fiz1 = _mm_add_ps(fiz1,tz);
459 fjx0 = _mm_add_ps(fjx0,tx);
460 fjy0 = _mm_add_ps(fjy0,ty);
461 fjz0 = _mm_add_ps(fjz0,tz);
463 /**************************
464 * CALCULATE INTERACTIONS *
465 **************************/
467 r11 = _mm_mul_ps(rsq11,rinv11);
469 /* EWALD ELECTROSTATICS */
471 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
472 ewrt = _mm_mul_ps(r11,ewtabscale);
473 ewitab = _mm_cvttps_epi32(ewrt);
474 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
475 ewitab = _mm_slli_epi32(ewitab,2);
476 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
477 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
478 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
479 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
480 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
481 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
482 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
483 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
484 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
486 /* Update potential sum for this i atom from the interaction with this j atom. */
487 velecsum = _mm_add_ps(velecsum,velec);
491 /* Calculate temporary vectorial force */
492 tx = _mm_mul_ps(fscal,dx11);
493 ty = _mm_mul_ps(fscal,dy11);
494 tz = _mm_mul_ps(fscal,dz11);
496 /* Update vectorial force */
497 fix1 = _mm_add_ps(fix1,tx);
498 fiy1 = _mm_add_ps(fiy1,ty);
499 fiz1 = _mm_add_ps(fiz1,tz);
501 fjx1 = _mm_add_ps(fjx1,tx);
502 fjy1 = _mm_add_ps(fjy1,ty);
503 fjz1 = _mm_add_ps(fjz1,tz);
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 r12 = _mm_mul_ps(rsq12,rinv12);
511 /* EWALD ELECTROSTATICS */
513 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
514 ewrt = _mm_mul_ps(r12,ewtabscale);
515 ewitab = _mm_cvttps_epi32(ewrt);
516 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
517 ewitab = _mm_slli_epi32(ewitab,2);
518 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
519 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
520 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
521 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
522 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
523 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
524 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
525 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
526 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
528 /* Update potential sum for this i atom from the interaction with this j atom. */
529 velecsum = _mm_add_ps(velecsum,velec);
533 /* Calculate temporary vectorial force */
534 tx = _mm_mul_ps(fscal,dx12);
535 ty = _mm_mul_ps(fscal,dy12);
536 tz = _mm_mul_ps(fscal,dz12);
538 /* Update vectorial force */
539 fix1 = _mm_add_ps(fix1,tx);
540 fiy1 = _mm_add_ps(fiy1,ty);
541 fiz1 = _mm_add_ps(fiz1,tz);
543 fjx2 = _mm_add_ps(fjx2,tx);
544 fjy2 = _mm_add_ps(fjy2,ty);
545 fjz2 = _mm_add_ps(fjz2,tz);
547 /**************************
548 * CALCULATE INTERACTIONS *
549 **************************/
551 r20 = _mm_mul_ps(rsq20,rinv20);
553 /* EWALD ELECTROSTATICS */
555 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
556 ewrt = _mm_mul_ps(r20,ewtabscale);
557 ewitab = _mm_cvttps_epi32(ewrt);
558 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
559 ewitab = _mm_slli_epi32(ewitab,2);
560 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
561 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
562 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
563 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
564 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
565 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
566 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
567 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
568 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
570 /* Update potential sum for this i atom from the interaction with this j atom. */
571 velecsum = _mm_add_ps(velecsum,velec);
575 /* Calculate temporary vectorial force */
576 tx = _mm_mul_ps(fscal,dx20);
577 ty = _mm_mul_ps(fscal,dy20);
578 tz = _mm_mul_ps(fscal,dz20);
580 /* Update vectorial force */
581 fix2 = _mm_add_ps(fix2,tx);
582 fiy2 = _mm_add_ps(fiy2,ty);
583 fiz2 = _mm_add_ps(fiz2,tz);
585 fjx0 = _mm_add_ps(fjx0,tx);
586 fjy0 = _mm_add_ps(fjy0,ty);
587 fjz0 = _mm_add_ps(fjz0,tz);
589 /**************************
590 * CALCULATE INTERACTIONS *
591 **************************/
593 r21 = _mm_mul_ps(rsq21,rinv21);
595 /* EWALD ELECTROSTATICS */
597 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
598 ewrt = _mm_mul_ps(r21,ewtabscale);
599 ewitab = _mm_cvttps_epi32(ewrt);
600 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
601 ewitab = _mm_slli_epi32(ewitab,2);
602 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
603 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
604 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
605 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
606 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
607 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
608 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
609 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
610 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
612 /* Update potential sum for this i atom from the interaction with this j atom. */
613 velecsum = _mm_add_ps(velecsum,velec);
617 /* Calculate temporary vectorial force */
618 tx = _mm_mul_ps(fscal,dx21);
619 ty = _mm_mul_ps(fscal,dy21);
620 tz = _mm_mul_ps(fscal,dz21);
622 /* Update vectorial force */
623 fix2 = _mm_add_ps(fix2,tx);
624 fiy2 = _mm_add_ps(fiy2,ty);
625 fiz2 = _mm_add_ps(fiz2,tz);
627 fjx1 = _mm_add_ps(fjx1,tx);
628 fjy1 = _mm_add_ps(fjy1,ty);
629 fjz1 = _mm_add_ps(fjz1,tz);
631 /**************************
632 * CALCULATE INTERACTIONS *
633 **************************/
635 r22 = _mm_mul_ps(rsq22,rinv22);
637 /* EWALD ELECTROSTATICS */
639 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
640 ewrt = _mm_mul_ps(r22,ewtabscale);
641 ewitab = _mm_cvttps_epi32(ewrt);
642 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
643 ewitab = _mm_slli_epi32(ewitab,2);
644 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
645 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
646 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
647 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
648 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
649 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
650 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
651 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
652 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
654 /* Update potential sum for this i atom from the interaction with this j atom. */
655 velecsum = _mm_add_ps(velecsum,velec);
659 /* Calculate temporary vectorial force */
660 tx = _mm_mul_ps(fscal,dx22);
661 ty = _mm_mul_ps(fscal,dy22);
662 tz = _mm_mul_ps(fscal,dz22);
664 /* Update vectorial force */
665 fix2 = _mm_add_ps(fix2,tx);
666 fiy2 = _mm_add_ps(fiy2,ty);
667 fiz2 = _mm_add_ps(fiz2,tz);
669 fjx2 = _mm_add_ps(fjx2,tx);
670 fjy2 = _mm_add_ps(fjy2,ty);
671 fjz2 = _mm_add_ps(fjz2,tz);
673 fjptrA = f+j_coord_offsetA;
674 fjptrB = f+j_coord_offsetB;
675 fjptrC = f+j_coord_offsetC;
676 fjptrD = f+j_coord_offsetD;
678 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
679 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
681 /* Inner loop uses 381 flops */
687 /* Get j neighbor index, and coordinate index */
688 jnrlistA = jjnr[jidx];
689 jnrlistB = jjnr[jidx+1];
690 jnrlistC = jjnr[jidx+2];
691 jnrlistD = jjnr[jidx+3];
692 /* Sign of each element will be negative for non-real atoms.
693 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
694 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
696 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
697 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
698 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
699 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
700 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
701 j_coord_offsetA = DIM*jnrA;
702 j_coord_offsetB = DIM*jnrB;
703 j_coord_offsetC = DIM*jnrC;
704 j_coord_offsetD = DIM*jnrD;
706 /* load j atom coordinates */
707 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
708 x+j_coord_offsetC,x+j_coord_offsetD,
709 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
711 /* Calculate displacement vector */
712 dx00 = _mm_sub_ps(ix0,jx0);
713 dy00 = _mm_sub_ps(iy0,jy0);
714 dz00 = _mm_sub_ps(iz0,jz0);
715 dx01 = _mm_sub_ps(ix0,jx1);
716 dy01 = _mm_sub_ps(iy0,jy1);
717 dz01 = _mm_sub_ps(iz0,jz1);
718 dx02 = _mm_sub_ps(ix0,jx2);
719 dy02 = _mm_sub_ps(iy0,jy2);
720 dz02 = _mm_sub_ps(iz0,jz2);
721 dx10 = _mm_sub_ps(ix1,jx0);
722 dy10 = _mm_sub_ps(iy1,jy0);
723 dz10 = _mm_sub_ps(iz1,jz0);
724 dx11 = _mm_sub_ps(ix1,jx1);
725 dy11 = _mm_sub_ps(iy1,jy1);
726 dz11 = _mm_sub_ps(iz1,jz1);
727 dx12 = _mm_sub_ps(ix1,jx2);
728 dy12 = _mm_sub_ps(iy1,jy2);
729 dz12 = _mm_sub_ps(iz1,jz2);
730 dx20 = _mm_sub_ps(ix2,jx0);
731 dy20 = _mm_sub_ps(iy2,jy0);
732 dz20 = _mm_sub_ps(iz2,jz0);
733 dx21 = _mm_sub_ps(ix2,jx1);
734 dy21 = _mm_sub_ps(iy2,jy1);
735 dz21 = _mm_sub_ps(iz2,jz1);
736 dx22 = _mm_sub_ps(ix2,jx2);
737 dy22 = _mm_sub_ps(iy2,jy2);
738 dz22 = _mm_sub_ps(iz2,jz2);
740 /* Calculate squared distance and things based on it */
741 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
742 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
743 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
744 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
745 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
746 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
747 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
748 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
749 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
751 rinv00 = gmx_mm_invsqrt_ps(rsq00);
752 rinv01 = gmx_mm_invsqrt_ps(rsq01);
753 rinv02 = gmx_mm_invsqrt_ps(rsq02);
754 rinv10 = gmx_mm_invsqrt_ps(rsq10);
755 rinv11 = gmx_mm_invsqrt_ps(rsq11);
756 rinv12 = gmx_mm_invsqrt_ps(rsq12);
757 rinv20 = gmx_mm_invsqrt_ps(rsq20);
758 rinv21 = gmx_mm_invsqrt_ps(rsq21);
759 rinv22 = gmx_mm_invsqrt_ps(rsq22);
761 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
762 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
763 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
764 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
765 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
766 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
767 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
768 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
769 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
771 fjx0 = _mm_setzero_ps();
772 fjy0 = _mm_setzero_ps();
773 fjz0 = _mm_setzero_ps();
774 fjx1 = _mm_setzero_ps();
775 fjy1 = _mm_setzero_ps();
776 fjz1 = _mm_setzero_ps();
777 fjx2 = _mm_setzero_ps();
778 fjy2 = _mm_setzero_ps();
779 fjz2 = _mm_setzero_ps();
781 /**************************
782 * CALCULATE INTERACTIONS *
783 **************************/
785 r00 = _mm_mul_ps(rsq00,rinv00);
786 r00 = _mm_andnot_ps(dummy_mask,r00);
788 /* EWALD ELECTROSTATICS */
790 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
791 ewrt = _mm_mul_ps(r00,ewtabscale);
792 ewitab = _mm_cvttps_epi32(ewrt);
793 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
794 ewitab = _mm_slli_epi32(ewitab,2);
795 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
796 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
797 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
798 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
799 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
800 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
801 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
802 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
803 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
805 /* LENNARD-JONES DISPERSION/REPULSION */
807 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
808 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
809 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
810 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
811 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
813 /* Update potential sum for this i atom from the interaction with this j atom. */
814 velec = _mm_andnot_ps(dummy_mask,velec);
815 velecsum = _mm_add_ps(velecsum,velec);
816 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
817 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
819 fscal = _mm_add_ps(felec,fvdw);
821 fscal = _mm_andnot_ps(dummy_mask,fscal);
823 /* Calculate temporary vectorial force */
824 tx = _mm_mul_ps(fscal,dx00);
825 ty = _mm_mul_ps(fscal,dy00);
826 tz = _mm_mul_ps(fscal,dz00);
828 /* Update vectorial force */
829 fix0 = _mm_add_ps(fix0,tx);
830 fiy0 = _mm_add_ps(fiy0,ty);
831 fiz0 = _mm_add_ps(fiz0,tz);
833 fjx0 = _mm_add_ps(fjx0,tx);
834 fjy0 = _mm_add_ps(fjy0,ty);
835 fjz0 = _mm_add_ps(fjz0,tz);
837 /**************************
838 * CALCULATE INTERACTIONS *
839 **************************/
841 r01 = _mm_mul_ps(rsq01,rinv01);
842 r01 = _mm_andnot_ps(dummy_mask,r01);
844 /* EWALD ELECTROSTATICS */
846 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
847 ewrt = _mm_mul_ps(r01,ewtabscale);
848 ewitab = _mm_cvttps_epi32(ewrt);
849 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
850 ewitab = _mm_slli_epi32(ewitab,2);
851 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
852 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
853 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
854 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
855 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
856 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
857 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
858 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
859 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
861 /* Update potential sum for this i atom from the interaction with this j atom. */
862 velec = _mm_andnot_ps(dummy_mask,velec);
863 velecsum = _mm_add_ps(velecsum,velec);
867 fscal = _mm_andnot_ps(dummy_mask,fscal);
869 /* Calculate temporary vectorial force */
870 tx = _mm_mul_ps(fscal,dx01);
871 ty = _mm_mul_ps(fscal,dy01);
872 tz = _mm_mul_ps(fscal,dz01);
874 /* Update vectorial force */
875 fix0 = _mm_add_ps(fix0,tx);
876 fiy0 = _mm_add_ps(fiy0,ty);
877 fiz0 = _mm_add_ps(fiz0,tz);
879 fjx1 = _mm_add_ps(fjx1,tx);
880 fjy1 = _mm_add_ps(fjy1,ty);
881 fjz1 = _mm_add_ps(fjz1,tz);
883 /**************************
884 * CALCULATE INTERACTIONS *
885 **************************/
887 r02 = _mm_mul_ps(rsq02,rinv02);
888 r02 = _mm_andnot_ps(dummy_mask,r02);
890 /* EWALD ELECTROSTATICS */
892 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
893 ewrt = _mm_mul_ps(r02,ewtabscale);
894 ewitab = _mm_cvttps_epi32(ewrt);
895 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
896 ewitab = _mm_slli_epi32(ewitab,2);
897 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
898 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
899 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
900 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
901 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
902 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
903 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
904 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
905 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
907 /* Update potential sum for this i atom from the interaction with this j atom. */
908 velec = _mm_andnot_ps(dummy_mask,velec);
909 velecsum = _mm_add_ps(velecsum,velec);
913 fscal = _mm_andnot_ps(dummy_mask,fscal);
915 /* Calculate temporary vectorial force */
916 tx = _mm_mul_ps(fscal,dx02);
917 ty = _mm_mul_ps(fscal,dy02);
918 tz = _mm_mul_ps(fscal,dz02);
920 /* Update vectorial force */
921 fix0 = _mm_add_ps(fix0,tx);
922 fiy0 = _mm_add_ps(fiy0,ty);
923 fiz0 = _mm_add_ps(fiz0,tz);
925 fjx2 = _mm_add_ps(fjx2,tx);
926 fjy2 = _mm_add_ps(fjy2,ty);
927 fjz2 = _mm_add_ps(fjz2,tz);
929 /**************************
930 * CALCULATE INTERACTIONS *
931 **************************/
933 r10 = _mm_mul_ps(rsq10,rinv10);
934 r10 = _mm_andnot_ps(dummy_mask,r10);
936 /* EWALD ELECTROSTATICS */
938 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
939 ewrt = _mm_mul_ps(r10,ewtabscale);
940 ewitab = _mm_cvttps_epi32(ewrt);
941 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
942 ewitab = _mm_slli_epi32(ewitab,2);
943 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
944 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
945 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
946 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
947 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
948 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
949 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
950 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
951 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
953 /* Update potential sum for this i atom from the interaction with this j atom. */
954 velec = _mm_andnot_ps(dummy_mask,velec);
955 velecsum = _mm_add_ps(velecsum,velec);
959 fscal = _mm_andnot_ps(dummy_mask,fscal);
961 /* Calculate temporary vectorial force */
962 tx = _mm_mul_ps(fscal,dx10);
963 ty = _mm_mul_ps(fscal,dy10);
964 tz = _mm_mul_ps(fscal,dz10);
966 /* Update vectorial force */
967 fix1 = _mm_add_ps(fix1,tx);
968 fiy1 = _mm_add_ps(fiy1,ty);
969 fiz1 = _mm_add_ps(fiz1,tz);
971 fjx0 = _mm_add_ps(fjx0,tx);
972 fjy0 = _mm_add_ps(fjy0,ty);
973 fjz0 = _mm_add_ps(fjz0,tz);
975 /**************************
976 * CALCULATE INTERACTIONS *
977 **************************/
979 r11 = _mm_mul_ps(rsq11,rinv11);
980 r11 = _mm_andnot_ps(dummy_mask,r11);
982 /* EWALD ELECTROSTATICS */
984 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
985 ewrt = _mm_mul_ps(r11,ewtabscale);
986 ewitab = _mm_cvttps_epi32(ewrt);
987 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
988 ewitab = _mm_slli_epi32(ewitab,2);
989 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
990 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
991 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
992 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
993 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
994 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
995 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
996 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
997 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
999 /* Update potential sum for this i atom from the interaction with this j atom. */
1000 velec = _mm_andnot_ps(dummy_mask,velec);
1001 velecsum = _mm_add_ps(velecsum,velec);
1005 fscal = _mm_andnot_ps(dummy_mask,fscal);
1007 /* Calculate temporary vectorial force */
1008 tx = _mm_mul_ps(fscal,dx11);
1009 ty = _mm_mul_ps(fscal,dy11);
1010 tz = _mm_mul_ps(fscal,dz11);
1012 /* Update vectorial force */
1013 fix1 = _mm_add_ps(fix1,tx);
1014 fiy1 = _mm_add_ps(fiy1,ty);
1015 fiz1 = _mm_add_ps(fiz1,tz);
1017 fjx1 = _mm_add_ps(fjx1,tx);
1018 fjy1 = _mm_add_ps(fjy1,ty);
1019 fjz1 = _mm_add_ps(fjz1,tz);
1021 /**************************
1022 * CALCULATE INTERACTIONS *
1023 **************************/
1025 r12 = _mm_mul_ps(rsq12,rinv12);
1026 r12 = _mm_andnot_ps(dummy_mask,r12);
1028 /* EWALD ELECTROSTATICS */
1030 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1031 ewrt = _mm_mul_ps(r12,ewtabscale);
1032 ewitab = _mm_cvttps_epi32(ewrt);
1033 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1034 ewitab = _mm_slli_epi32(ewitab,2);
1035 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1036 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1037 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1038 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1039 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1040 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1041 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1042 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1043 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1045 /* Update potential sum for this i atom from the interaction with this j atom. */
1046 velec = _mm_andnot_ps(dummy_mask,velec);
1047 velecsum = _mm_add_ps(velecsum,velec);
1051 fscal = _mm_andnot_ps(dummy_mask,fscal);
1053 /* Calculate temporary vectorial force */
1054 tx = _mm_mul_ps(fscal,dx12);
1055 ty = _mm_mul_ps(fscal,dy12);
1056 tz = _mm_mul_ps(fscal,dz12);
1058 /* Update vectorial force */
1059 fix1 = _mm_add_ps(fix1,tx);
1060 fiy1 = _mm_add_ps(fiy1,ty);
1061 fiz1 = _mm_add_ps(fiz1,tz);
1063 fjx2 = _mm_add_ps(fjx2,tx);
1064 fjy2 = _mm_add_ps(fjy2,ty);
1065 fjz2 = _mm_add_ps(fjz2,tz);
1067 /**************************
1068 * CALCULATE INTERACTIONS *
1069 **************************/
1071 r20 = _mm_mul_ps(rsq20,rinv20);
1072 r20 = _mm_andnot_ps(dummy_mask,r20);
1074 /* EWALD ELECTROSTATICS */
1076 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1077 ewrt = _mm_mul_ps(r20,ewtabscale);
1078 ewitab = _mm_cvttps_epi32(ewrt);
1079 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1080 ewitab = _mm_slli_epi32(ewitab,2);
1081 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1082 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1083 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1084 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1085 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1086 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1087 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1088 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1089 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1091 /* Update potential sum for this i atom from the interaction with this j atom. */
1092 velec = _mm_andnot_ps(dummy_mask,velec);
1093 velecsum = _mm_add_ps(velecsum,velec);
1097 fscal = _mm_andnot_ps(dummy_mask,fscal);
1099 /* Calculate temporary vectorial force */
1100 tx = _mm_mul_ps(fscal,dx20);
1101 ty = _mm_mul_ps(fscal,dy20);
1102 tz = _mm_mul_ps(fscal,dz20);
1104 /* Update vectorial force */
1105 fix2 = _mm_add_ps(fix2,tx);
1106 fiy2 = _mm_add_ps(fiy2,ty);
1107 fiz2 = _mm_add_ps(fiz2,tz);
1109 fjx0 = _mm_add_ps(fjx0,tx);
1110 fjy0 = _mm_add_ps(fjy0,ty);
1111 fjz0 = _mm_add_ps(fjz0,tz);
1113 /**************************
1114 * CALCULATE INTERACTIONS *
1115 **************************/
1117 r21 = _mm_mul_ps(rsq21,rinv21);
1118 r21 = _mm_andnot_ps(dummy_mask,r21);
1120 /* EWALD ELECTROSTATICS */
1122 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1123 ewrt = _mm_mul_ps(r21,ewtabscale);
1124 ewitab = _mm_cvttps_epi32(ewrt);
1125 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1126 ewitab = _mm_slli_epi32(ewitab,2);
1127 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1128 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1129 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1130 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1131 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1132 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1133 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1134 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1135 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1137 /* Update potential sum for this i atom from the interaction with this j atom. */
1138 velec = _mm_andnot_ps(dummy_mask,velec);
1139 velecsum = _mm_add_ps(velecsum,velec);
1143 fscal = _mm_andnot_ps(dummy_mask,fscal);
1145 /* Calculate temporary vectorial force */
1146 tx = _mm_mul_ps(fscal,dx21);
1147 ty = _mm_mul_ps(fscal,dy21);
1148 tz = _mm_mul_ps(fscal,dz21);
1150 /* Update vectorial force */
1151 fix2 = _mm_add_ps(fix2,tx);
1152 fiy2 = _mm_add_ps(fiy2,ty);
1153 fiz2 = _mm_add_ps(fiz2,tz);
1155 fjx1 = _mm_add_ps(fjx1,tx);
1156 fjy1 = _mm_add_ps(fjy1,ty);
1157 fjz1 = _mm_add_ps(fjz1,tz);
1159 /**************************
1160 * CALCULATE INTERACTIONS *
1161 **************************/
1163 r22 = _mm_mul_ps(rsq22,rinv22);
1164 r22 = _mm_andnot_ps(dummy_mask,r22);
1166 /* EWALD ELECTROSTATICS */
1168 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1169 ewrt = _mm_mul_ps(r22,ewtabscale);
1170 ewitab = _mm_cvttps_epi32(ewrt);
1171 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1172 ewitab = _mm_slli_epi32(ewitab,2);
1173 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1174 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1175 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1176 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1177 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1178 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1179 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1180 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1181 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1183 /* Update potential sum for this i atom from the interaction with this j atom. */
1184 velec = _mm_andnot_ps(dummy_mask,velec);
1185 velecsum = _mm_add_ps(velecsum,velec);
1189 fscal = _mm_andnot_ps(dummy_mask,fscal);
1191 /* Calculate temporary vectorial force */
1192 tx = _mm_mul_ps(fscal,dx22);
1193 ty = _mm_mul_ps(fscal,dy22);
1194 tz = _mm_mul_ps(fscal,dz22);
1196 /* Update vectorial force */
1197 fix2 = _mm_add_ps(fix2,tx);
1198 fiy2 = _mm_add_ps(fiy2,ty);
1199 fiz2 = _mm_add_ps(fiz2,tz);
1201 fjx2 = _mm_add_ps(fjx2,tx);
1202 fjy2 = _mm_add_ps(fjy2,ty);
1203 fjz2 = _mm_add_ps(fjz2,tz);
1205 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1206 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1207 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1208 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1210 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1211 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1213 /* Inner loop uses 390 flops */
1216 /* End of innermost loop */
1218 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1219 f+i_coord_offset,fshift+i_shift_offset);
1222 /* Update potential energies */
1223 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1224 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1226 /* Increment number of inner iterations */
1227 inneriter += j_index_end - j_index_start;
1229 /* Outer loop uses 20 flops */
1232 /* Increment number of outer iterations */
1235 /* Update outer/inner flops */
1237 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*390);
1240 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_single
1241 * Electrostatics interaction: Ewald
1242 * VdW interaction: LennardJones
1243 * Geometry: Water3-Water3
1244 * Calculate force/pot: Force
1247 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_sse4_1_single
1248 (t_nblist * gmx_restrict nlist,
1249 rvec * gmx_restrict xx,
1250 rvec * gmx_restrict ff,
1251 t_forcerec * gmx_restrict fr,
1252 t_mdatoms * gmx_restrict mdatoms,
1253 nb_kernel_data_t * gmx_restrict kernel_data,
1254 t_nrnb * gmx_restrict nrnb)
1256 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1257 * just 0 for non-waters.
1258 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1259 * jnr indices corresponding to data put in the four positions in the SIMD register.
1261 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1262 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1263 int jnrA,jnrB,jnrC,jnrD;
1264 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1265 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1266 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1267 real rcutoff_scalar;
1268 real *shiftvec,*fshift,*x,*f;
1269 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1270 real scratch[4*DIM];
1271 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1273 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1275 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1277 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1278 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1279 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1280 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1281 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1282 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1283 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1284 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1285 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1286 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1287 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1288 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1289 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1290 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1291 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1292 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1293 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1296 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1299 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1300 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1302 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1304 __m128 dummy_mask,cutoff_mask;
1305 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1306 __m128 one = _mm_set1_ps(1.0);
1307 __m128 two = _mm_set1_ps(2.0);
1313 jindex = nlist->jindex;
1315 shiftidx = nlist->shift;
1317 shiftvec = fr->shift_vec[0];
1318 fshift = fr->fshift[0];
1319 facel = _mm_set1_ps(fr->epsfac);
1320 charge = mdatoms->chargeA;
1321 nvdwtype = fr->ntype;
1322 vdwparam = fr->nbfp;
1323 vdwtype = mdatoms->typeA;
1325 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1326 ewtab = fr->ic->tabq_coul_F;
1327 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1328 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1330 /* Setup water-specific parameters */
1331 inr = nlist->iinr[0];
1332 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1333 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1334 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1335 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1337 jq0 = _mm_set1_ps(charge[inr+0]);
1338 jq1 = _mm_set1_ps(charge[inr+1]);
1339 jq2 = _mm_set1_ps(charge[inr+2]);
1340 vdwjidx0A = 2*vdwtype[inr+0];
1341 qq00 = _mm_mul_ps(iq0,jq0);
1342 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1343 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1344 qq01 = _mm_mul_ps(iq0,jq1);
1345 qq02 = _mm_mul_ps(iq0,jq2);
1346 qq10 = _mm_mul_ps(iq1,jq0);
1347 qq11 = _mm_mul_ps(iq1,jq1);
1348 qq12 = _mm_mul_ps(iq1,jq2);
1349 qq20 = _mm_mul_ps(iq2,jq0);
1350 qq21 = _mm_mul_ps(iq2,jq1);
1351 qq22 = _mm_mul_ps(iq2,jq2);
1353 /* Avoid stupid compiler warnings */
1354 jnrA = jnrB = jnrC = jnrD = 0;
1355 j_coord_offsetA = 0;
1356 j_coord_offsetB = 0;
1357 j_coord_offsetC = 0;
1358 j_coord_offsetD = 0;
1363 for(iidx=0;iidx<4*DIM;iidx++)
1365 scratch[iidx] = 0.0;
1368 /* Start outer loop over neighborlists */
1369 for(iidx=0; iidx<nri; iidx++)
1371 /* Load shift vector for this list */
1372 i_shift_offset = DIM*shiftidx[iidx];
1374 /* Load limits for loop over neighbors */
1375 j_index_start = jindex[iidx];
1376 j_index_end = jindex[iidx+1];
1378 /* Get outer coordinate index */
1380 i_coord_offset = DIM*inr;
1382 /* Load i particle coords and add shift vector */
1383 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1384 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1386 fix0 = _mm_setzero_ps();
1387 fiy0 = _mm_setzero_ps();
1388 fiz0 = _mm_setzero_ps();
1389 fix1 = _mm_setzero_ps();
1390 fiy1 = _mm_setzero_ps();
1391 fiz1 = _mm_setzero_ps();
1392 fix2 = _mm_setzero_ps();
1393 fiy2 = _mm_setzero_ps();
1394 fiz2 = _mm_setzero_ps();
1396 /* Start inner kernel loop */
1397 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1400 /* Get j neighbor index, and coordinate index */
1402 jnrB = jjnr[jidx+1];
1403 jnrC = jjnr[jidx+2];
1404 jnrD = jjnr[jidx+3];
1405 j_coord_offsetA = DIM*jnrA;
1406 j_coord_offsetB = DIM*jnrB;
1407 j_coord_offsetC = DIM*jnrC;
1408 j_coord_offsetD = DIM*jnrD;
1410 /* load j atom coordinates */
1411 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1412 x+j_coord_offsetC,x+j_coord_offsetD,
1413 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1415 /* Calculate displacement vector */
1416 dx00 = _mm_sub_ps(ix0,jx0);
1417 dy00 = _mm_sub_ps(iy0,jy0);
1418 dz00 = _mm_sub_ps(iz0,jz0);
1419 dx01 = _mm_sub_ps(ix0,jx1);
1420 dy01 = _mm_sub_ps(iy0,jy1);
1421 dz01 = _mm_sub_ps(iz0,jz1);
1422 dx02 = _mm_sub_ps(ix0,jx2);
1423 dy02 = _mm_sub_ps(iy0,jy2);
1424 dz02 = _mm_sub_ps(iz0,jz2);
1425 dx10 = _mm_sub_ps(ix1,jx0);
1426 dy10 = _mm_sub_ps(iy1,jy0);
1427 dz10 = _mm_sub_ps(iz1,jz0);
1428 dx11 = _mm_sub_ps(ix1,jx1);
1429 dy11 = _mm_sub_ps(iy1,jy1);
1430 dz11 = _mm_sub_ps(iz1,jz1);
1431 dx12 = _mm_sub_ps(ix1,jx2);
1432 dy12 = _mm_sub_ps(iy1,jy2);
1433 dz12 = _mm_sub_ps(iz1,jz2);
1434 dx20 = _mm_sub_ps(ix2,jx0);
1435 dy20 = _mm_sub_ps(iy2,jy0);
1436 dz20 = _mm_sub_ps(iz2,jz0);
1437 dx21 = _mm_sub_ps(ix2,jx1);
1438 dy21 = _mm_sub_ps(iy2,jy1);
1439 dz21 = _mm_sub_ps(iz2,jz1);
1440 dx22 = _mm_sub_ps(ix2,jx2);
1441 dy22 = _mm_sub_ps(iy2,jy2);
1442 dz22 = _mm_sub_ps(iz2,jz2);
1444 /* Calculate squared distance and things based on it */
1445 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1446 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1447 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1448 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1449 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1450 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1451 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1452 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1453 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1455 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1456 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1457 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1458 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1459 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1460 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1461 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1462 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1463 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1465 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1466 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1467 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1468 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1469 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1470 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1471 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1472 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1473 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1475 fjx0 = _mm_setzero_ps();
1476 fjy0 = _mm_setzero_ps();
1477 fjz0 = _mm_setzero_ps();
1478 fjx1 = _mm_setzero_ps();
1479 fjy1 = _mm_setzero_ps();
1480 fjz1 = _mm_setzero_ps();
1481 fjx2 = _mm_setzero_ps();
1482 fjy2 = _mm_setzero_ps();
1483 fjz2 = _mm_setzero_ps();
1485 /**************************
1486 * CALCULATE INTERACTIONS *
1487 **************************/
1489 r00 = _mm_mul_ps(rsq00,rinv00);
1491 /* EWALD ELECTROSTATICS */
1493 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1494 ewrt = _mm_mul_ps(r00,ewtabscale);
1495 ewitab = _mm_cvttps_epi32(ewrt);
1496 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1497 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1498 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1500 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1501 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1503 /* LENNARD-JONES DISPERSION/REPULSION */
1505 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1506 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1508 fscal = _mm_add_ps(felec,fvdw);
1510 /* Calculate temporary vectorial force */
1511 tx = _mm_mul_ps(fscal,dx00);
1512 ty = _mm_mul_ps(fscal,dy00);
1513 tz = _mm_mul_ps(fscal,dz00);
1515 /* Update vectorial force */
1516 fix0 = _mm_add_ps(fix0,tx);
1517 fiy0 = _mm_add_ps(fiy0,ty);
1518 fiz0 = _mm_add_ps(fiz0,tz);
1520 fjx0 = _mm_add_ps(fjx0,tx);
1521 fjy0 = _mm_add_ps(fjy0,ty);
1522 fjz0 = _mm_add_ps(fjz0,tz);
1524 /**************************
1525 * CALCULATE INTERACTIONS *
1526 **************************/
1528 r01 = _mm_mul_ps(rsq01,rinv01);
1530 /* EWALD ELECTROSTATICS */
1532 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1533 ewrt = _mm_mul_ps(r01,ewtabscale);
1534 ewitab = _mm_cvttps_epi32(ewrt);
1535 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1536 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1537 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1539 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1540 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1544 /* Calculate temporary vectorial force */
1545 tx = _mm_mul_ps(fscal,dx01);
1546 ty = _mm_mul_ps(fscal,dy01);
1547 tz = _mm_mul_ps(fscal,dz01);
1549 /* Update vectorial force */
1550 fix0 = _mm_add_ps(fix0,tx);
1551 fiy0 = _mm_add_ps(fiy0,ty);
1552 fiz0 = _mm_add_ps(fiz0,tz);
1554 fjx1 = _mm_add_ps(fjx1,tx);
1555 fjy1 = _mm_add_ps(fjy1,ty);
1556 fjz1 = _mm_add_ps(fjz1,tz);
1558 /**************************
1559 * CALCULATE INTERACTIONS *
1560 **************************/
1562 r02 = _mm_mul_ps(rsq02,rinv02);
1564 /* EWALD ELECTROSTATICS */
1566 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1567 ewrt = _mm_mul_ps(r02,ewtabscale);
1568 ewitab = _mm_cvttps_epi32(ewrt);
1569 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1570 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1571 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1573 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1574 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1578 /* Calculate temporary vectorial force */
1579 tx = _mm_mul_ps(fscal,dx02);
1580 ty = _mm_mul_ps(fscal,dy02);
1581 tz = _mm_mul_ps(fscal,dz02);
1583 /* Update vectorial force */
1584 fix0 = _mm_add_ps(fix0,tx);
1585 fiy0 = _mm_add_ps(fiy0,ty);
1586 fiz0 = _mm_add_ps(fiz0,tz);
1588 fjx2 = _mm_add_ps(fjx2,tx);
1589 fjy2 = _mm_add_ps(fjy2,ty);
1590 fjz2 = _mm_add_ps(fjz2,tz);
1592 /**************************
1593 * CALCULATE INTERACTIONS *
1594 **************************/
1596 r10 = _mm_mul_ps(rsq10,rinv10);
1598 /* EWALD ELECTROSTATICS */
1600 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1601 ewrt = _mm_mul_ps(r10,ewtabscale);
1602 ewitab = _mm_cvttps_epi32(ewrt);
1603 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1604 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1605 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1607 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1608 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1612 /* Calculate temporary vectorial force */
1613 tx = _mm_mul_ps(fscal,dx10);
1614 ty = _mm_mul_ps(fscal,dy10);
1615 tz = _mm_mul_ps(fscal,dz10);
1617 /* Update vectorial force */
1618 fix1 = _mm_add_ps(fix1,tx);
1619 fiy1 = _mm_add_ps(fiy1,ty);
1620 fiz1 = _mm_add_ps(fiz1,tz);
1622 fjx0 = _mm_add_ps(fjx0,tx);
1623 fjy0 = _mm_add_ps(fjy0,ty);
1624 fjz0 = _mm_add_ps(fjz0,tz);
1626 /**************************
1627 * CALCULATE INTERACTIONS *
1628 **************************/
1630 r11 = _mm_mul_ps(rsq11,rinv11);
1632 /* EWALD ELECTROSTATICS */
1634 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1635 ewrt = _mm_mul_ps(r11,ewtabscale);
1636 ewitab = _mm_cvttps_epi32(ewrt);
1637 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1638 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1639 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1641 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1642 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1646 /* Calculate temporary vectorial force */
1647 tx = _mm_mul_ps(fscal,dx11);
1648 ty = _mm_mul_ps(fscal,dy11);
1649 tz = _mm_mul_ps(fscal,dz11);
1651 /* Update vectorial force */
1652 fix1 = _mm_add_ps(fix1,tx);
1653 fiy1 = _mm_add_ps(fiy1,ty);
1654 fiz1 = _mm_add_ps(fiz1,tz);
1656 fjx1 = _mm_add_ps(fjx1,tx);
1657 fjy1 = _mm_add_ps(fjy1,ty);
1658 fjz1 = _mm_add_ps(fjz1,tz);
1660 /**************************
1661 * CALCULATE INTERACTIONS *
1662 **************************/
1664 r12 = _mm_mul_ps(rsq12,rinv12);
1666 /* EWALD ELECTROSTATICS */
1668 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1669 ewrt = _mm_mul_ps(r12,ewtabscale);
1670 ewitab = _mm_cvttps_epi32(ewrt);
1671 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1672 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1673 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1675 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1676 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1680 /* Calculate temporary vectorial force */
1681 tx = _mm_mul_ps(fscal,dx12);
1682 ty = _mm_mul_ps(fscal,dy12);
1683 tz = _mm_mul_ps(fscal,dz12);
1685 /* Update vectorial force */
1686 fix1 = _mm_add_ps(fix1,tx);
1687 fiy1 = _mm_add_ps(fiy1,ty);
1688 fiz1 = _mm_add_ps(fiz1,tz);
1690 fjx2 = _mm_add_ps(fjx2,tx);
1691 fjy2 = _mm_add_ps(fjy2,ty);
1692 fjz2 = _mm_add_ps(fjz2,tz);
1694 /**************************
1695 * CALCULATE INTERACTIONS *
1696 **************************/
1698 r20 = _mm_mul_ps(rsq20,rinv20);
1700 /* EWALD ELECTROSTATICS */
1702 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1703 ewrt = _mm_mul_ps(r20,ewtabscale);
1704 ewitab = _mm_cvttps_epi32(ewrt);
1705 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1706 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1707 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1709 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1710 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1714 /* Calculate temporary vectorial force */
1715 tx = _mm_mul_ps(fscal,dx20);
1716 ty = _mm_mul_ps(fscal,dy20);
1717 tz = _mm_mul_ps(fscal,dz20);
1719 /* Update vectorial force */
1720 fix2 = _mm_add_ps(fix2,tx);
1721 fiy2 = _mm_add_ps(fiy2,ty);
1722 fiz2 = _mm_add_ps(fiz2,tz);
1724 fjx0 = _mm_add_ps(fjx0,tx);
1725 fjy0 = _mm_add_ps(fjy0,ty);
1726 fjz0 = _mm_add_ps(fjz0,tz);
1728 /**************************
1729 * CALCULATE INTERACTIONS *
1730 **************************/
1732 r21 = _mm_mul_ps(rsq21,rinv21);
1734 /* EWALD ELECTROSTATICS */
1736 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1737 ewrt = _mm_mul_ps(r21,ewtabscale);
1738 ewitab = _mm_cvttps_epi32(ewrt);
1739 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1740 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1741 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1743 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1744 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1748 /* Calculate temporary vectorial force */
1749 tx = _mm_mul_ps(fscal,dx21);
1750 ty = _mm_mul_ps(fscal,dy21);
1751 tz = _mm_mul_ps(fscal,dz21);
1753 /* Update vectorial force */
1754 fix2 = _mm_add_ps(fix2,tx);
1755 fiy2 = _mm_add_ps(fiy2,ty);
1756 fiz2 = _mm_add_ps(fiz2,tz);
1758 fjx1 = _mm_add_ps(fjx1,tx);
1759 fjy1 = _mm_add_ps(fjy1,ty);
1760 fjz1 = _mm_add_ps(fjz1,tz);
1762 /**************************
1763 * CALCULATE INTERACTIONS *
1764 **************************/
1766 r22 = _mm_mul_ps(rsq22,rinv22);
1768 /* EWALD ELECTROSTATICS */
1770 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1771 ewrt = _mm_mul_ps(r22,ewtabscale);
1772 ewitab = _mm_cvttps_epi32(ewrt);
1773 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1774 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1775 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1777 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1778 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1782 /* Calculate temporary vectorial force */
1783 tx = _mm_mul_ps(fscal,dx22);
1784 ty = _mm_mul_ps(fscal,dy22);
1785 tz = _mm_mul_ps(fscal,dz22);
1787 /* Update vectorial force */
1788 fix2 = _mm_add_ps(fix2,tx);
1789 fiy2 = _mm_add_ps(fiy2,ty);
1790 fiz2 = _mm_add_ps(fiz2,tz);
1792 fjx2 = _mm_add_ps(fjx2,tx);
1793 fjy2 = _mm_add_ps(fjy2,ty);
1794 fjz2 = _mm_add_ps(fjz2,tz);
1796 fjptrA = f+j_coord_offsetA;
1797 fjptrB = f+j_coord_offsetB;
1798 fjptrC = f+j_coord_offsetC;
1799 fjptrD = f+j_coord_offsetD;
1801 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1802 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1804 /* Inner loop uses 331 flops */
1807 if(jidx<j_index_end)
1810 /* Get j neighbor index, and coordinate index */
1811 jnrlistA = jjnr[jidx];
1812 jnrlistB = jjnr[jidx+1];
1813 jnrlistC = jjnr[jidx+2];
1814 jnrlistD = jjnr[jidx+3];
1815 /* Sign of each element will be negative for non-real atoms.
1816 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1817 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1819 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1820 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1821 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1822 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1823 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1824 j_coord_offsetA = DIM*jnrA;
1825 j_coord_offsetB = DIM*jnrB;
1826 j_coord_offsetC = DIM*jnrC;
1827 j_coord_offsetD = DIM*jnrD;
1829 /* load j atom coordinates */
1830 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1831 x+j_coord_offsetC,x+j_coord_offsetD,
1832 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1834 /* Calculate displacement vector */
1835 dx00 = _mm_sub_ps(ix0,jx0);
1836 dy00 = _mm_sub_ps(iy0,jy0);
1837 dz00 = _mm_sub_ps(iz0,jz0);
1838 dx01 = _mm_sub_ps(ix0,jx1);
1839 dy01 = _mm_sub_ps(iy0,jy1);
1840 dz01 = _mm_sub_ps(iz0,jz1);
1841 dx02 = _mm_sub_ps(ix0,jx2);
1842 dy02 = _mm_sub_ps(iy0,jy2);
1843 dz02 = _mm_sub_ps(iz0,jz2);
1844 dx10 = _mm_sub_ps(ix1,jx0);
1845 dy10 = _mm_sub_ps(iy1,jy0);
1846 dz10 = _mm_sub_ps(iz1,jz0);
1847 dx11 = _mm_sub_ps(ix1,jx1);
1848 dy11 = _mm_sub_ps(iy1,jy1);
1849 dz11 = _mm_sub_ps(iz1,jz1);
1850 dx12 = _mm_sub_ps(ix1,jx2);
1851 dy12 = _mm_sub_ps(iy1,jy2);
1852 dz12 = _mm_sub_ps(iz1,jz2);
1853 dx20 = _mm_sub_ps(ix2,jx0);
1854 dy20 = _mm_sub_ps(iy2,jy0);
1855 dz20 = _mm_sub_ps(iz2,jz0);
1856 dx21 = _mm_sub_ps(ix2,jx1);
1857 dy21 = _mm_sub_ps(iy2,jy1);
1858 dz21 = _mm_sub_ps(iz2,jz1);
1859 dx22 = _mm_sub_ps(ix2,jx2);
1860 dy22 = _mm_sub_ps(iy2,jy2);
1861 dz22 = _mm_sub_ps(iz2,jz2);
1863 /* Calculate squared distance and things based on it */
1864 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1865 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1866 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1867 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1868 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1869 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1870 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1871 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1872 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1874 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1875 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1876 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1877 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1878 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1879 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1880 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1881 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1882 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1884 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1885 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1886 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1887 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1888 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1889 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1890 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1891 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1892 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1894 fjx0 = _mm_setzero_ps();
1895 fjy0 = _mm_setzero_ps();
1896 fjz0 = _mm_setzero_ps();
1897 fjx1 = _mm_setzero_ps();
1898 fjy1 = _mm_setzero_ps();
1899 fjz1 = _mm_setzero_ps();
1900 fjx2 = _mm_setzero_ps();
1901 fjy2 = _mm_setzero_ps();
1902 fjz2 = _mm_setzero_ps();
1904 /**************************
1905 * CALCULATE INTERACTIONS *
1906 **************************/
1908 r00 = _mm_mul_ps(rsq00,rinv00);
1909 r00 = _mm_andnot_ps(dummy_mask,r00);
1911 /* EWALD ELECTROSTATICS */
1913 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1914 ewrt = _mm_mul_ps(r00,ewtabscale);
1915 ewitab = _mm_cvttps_epi32(ewrt);
1916 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1917 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1918 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1920 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1921 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1923 /* LENNARD-JONES DISPERSION/REPULSION */
1925 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1926 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1928 fscal = _mm_add_ps(felec,fvdw);
1930 fscal = _mm_andnot_ps(dummy_mask,fscal);
1932 /* Calculate temporary vectorial force */
1933 tx = _mm_mul_ps(fscal,dx00);
1934 ty = _mm_mul_ps(fscal,dy00);
1935 tz = _mm_mul_ps(fscal,dz00);
1937 /* Update vectorial force */
1938 fix0 = _mm_add_ps(fix0,tx);
1939 fiy0 = _mm_add_ps(fiy0,ty);
1940 fiz0 = _mm_add_ps(fiz0,tz);
1942 fjx0 = _mm_add_ps(fjx0,tx);
1943 fjy0 = _mm_add_ps(fjy0,ty);
1944 fjz0 = _mm_add_ps(fjz0,tz);
1946 /**************************
1947 * CALCULATE INTERACTIONS *
1948 **************************/
1950 r01 = _mm_mul_ps(rsq01,rinv01);
1951 r01 = _mm_andnot_ps(dummy_mask,r01);
1953 /* EWALD ELECTROSTATICS */
1955 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1956 ewrt = _mm_mul_ps(r01,ewtabscale);
1957 ewitab = _mm_cvttps_epi32(ewrt);
1958 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1959 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1960 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1962 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1963 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1967 fscal = _mm_andnot_ps(dummy_mask,fscal);
1969 /* Calculate temporary vectorial force */
1970 tx = _mm_mul_ps(fscal,dx01);
1971 ty = _mm_mul_ps(fscal,dy01);
1972 tz = _mm_mul_ps(fscal,dz01);
1974 /* Update vectorial force */
1975 fix0 = _mm_add_ps(fix0,tx);
1976 fiy0 = _mm_add_ps(fiy0,ty);
1977 fiz0 = _mm_add_ps(fiz0,tz);
1979 fjx1 = _mm_add_ps(fjx1,tx);
1980 fjy1 = _mm_add_ps(fjy1,ty);
1981 fjz1 = _mm_add_ps(fjz1,tz);
1983 /**************************
1984 * CALCULATE INTERACTIONS *
1985 **************************/
1987 r02 = _mm_mul_ps(rsq02,rinv02);
1988 r02 = _mm_andnot_ps(dummy_mask,r02);
1990 /* EWALD ELECTROSTATICS */
1992 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1993 ewrt = _mm_mul_ps(r02,ewtabscale);
1994 ewitab = _mm_cvttps_epi32(ewrt);
1995 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1996 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1997 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1999 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2000 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2004 fscal = _mm_andnot_ps(dummy_mask,fscal);
2006 /* Calculate temporary vectorial force */
2007 tx = _mm_mul_ps(fscal,dx02);
2008 ty = _mm_mul_ps(fscal,dy02);
2009 tz = _mm_mul_ps(fscal,dz02);
2011 /* Update vectorial force */
2012 fix0 = _mm_add_ps(fix0,tx);
2013 fiy0 = _mm_add_ps(fiy0,ty);
2014 fiz0 = _mm_add_ps(fiz0,tz);
2016 fjx2 = _mm_add_ps(fjx2,tx);
2017 fjy2 = _mm_add_ps(fjy2,ty);
2018 fjz2 = _mm_add_ps(fjz2,tz);
2020 /**************************
2021 * CALCULATE INTERACTIONS *
2022 **************************/
2024 r10 = _mm_mul_ps(rsq10,rinv10);
2025 r10 = _mm_andnot_ps(dummy_mask,r10);
2027 /* EWALD ELECTROSTATICS */
2029 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2030 ewrt = _mm_mul_ps(r10,ewtabscale);
2031 ewitab = _mm_cvttps_epi32(ewrt);
2032 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2033 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2034 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2036 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2037 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2041 fscal = _mm_andnot_ps(dummy_mask,fscal);
2043 /* Calculate temporary vectorial force */
2044 tx = _mm_mul_ps(fscal,dx10);
2045 ty = _mm_mul_ps(fscal,dy10);
2046 tz = _mm_mul_ps(fscal,dz10);
2048 /* Update vectorial force */
2049 fix1 = _mm_add_ps(fix1,tx);
2050 fiy1 = _mm_add_ps(fiy1,ty);
2051 fiz1 = _mm_add_ps(fiz1,tz);
2053 fjx0 = _mm_add_ps(fjx0,tx);
2054 fjy0 = _mm_add_ps(fjy0,ty);
2055 fjz0 = _mm_add_ps(fjz0,tz);
2057 /**************************
2058 * CALCULATE INTERACTIONS *
2059 **************************/
2061 r11 = _mm_mul_ps(rsq11,rinv11);
2062 r11 = _mm_andnot_ps(dummy_mask,r11);
2064 /* EWALD ELECTROSTATICS */
2066 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2067 ewrt = _mm_mul_ps(r11,ewtabscale);
2068 ewitab = _mm_cvttps_epi32(ewrt);
2069 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2070 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2071 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2073 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2074 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2078 fscal = _mm_andnot_ps(dummy_mask,fscal);
2080 /* Calculate temporary vectorial force */
2081 tx = _mm_mul_ps(fscal,dx11);
2082 ty = _mm_mul_ps(fscal,dy11);
2083 tz = _mm_mul_ps(fscal,dz11);
2085 /* Update vectorial force */
2086 fix1 = _mm_add_ps(fix1,tx);
2087 fiy1 = _mm_add_ps(fiy1,ty);
2088 fiz1 = _mm_add_ps(fiz1,tz);
2090 fjx1 = _mm_add_ps(fjx1,tx);
2091 fjy1 = _mm_add_ps(fjy1,ty);
2092 fjz1 = _mm_add_ps(fjz1,tz);
2094 /**************************
2095 * CALCULATE INTERACTIONS *
2096 **************************/
2098 r12 = _mm_mul_ps(rsq12,rinv12);
2099 r12 = _mm_andnot_ps(dummy_mask,r12);
2101 /* EWALD ELECTROSTATICS */
2103 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2104 ewrt = _mm_mul_ps(r12,ewtabscale);
2105 ewitab = _mm_cvttps_epi32(ewrt);
2106 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2107 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2108 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2110 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2111 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2115 fscal = _mm_andnot_ps(dummy_mask,fscal);
2117 /* Calculate temporary vectorial force */
2118 tx = _mm_mul_ps(fscal,dx12);
2119 ty = _mm_mul_ps(fscal,dy12);
2120 tz = _mm_mul_ps(fscal,dz12);
2122 /* Update vectorial force */
2123 fix1 = _mm_add_ps(fix1,tx);
2124 fiy1 = _mm_add_ps(fiy1,ty);
2125 fiz1 = _mm_add_ps(fiz1,tz);
2127 fjx2 = _mm_add_ps(fjx2,tx);
2128 fjy2 = _mm_add_ps(fjy2,ty);
2129 fjz2 = _mm_add_ps(fjz2,tz);
2131 /**************************
2132 * CALCULATE INTERACTIONS *
2133 **************************/
2135 r20 = _mm_mul_ps(rsq20,rinv20);
2136 r20 = _mm_andnot_ps(dummy_mask,r20);
2138 /* EWALD ELECTROSTATICS */
2140 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2141 ewrt = _mm_mul_ps(r20,ewtabscale);
2142 ewitab = _mm_cvttps_epi32(ewrt);
2143 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2144 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2145 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2147 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2148 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2152 fscal = _mm_andnot_ps(dummy_mask,fscal);
2154 /* Calculate temporary vectorial force */
2155 tx = _mm_mul_ps(fscal,dx20);
2156 ty = _mm_mul_ps(fscal,dy20);
2157 tz = _mm_mul_ps(fscal,dz20);
2159 /* Update vectorial force */
2160 fix2 = _mm_add_ps(fix2,tx);
2161 fiy2 = _mm_add_ps(fiy2,ty);
2162 fiz2 = _mm_add_ps(fiz2,tz);
2164 fjx0 = _mm_add_ps(fjx0,tx);
2165 fjy0 = _mm_add_ps(fjy0,ty);
2166 fjz0 = _mm_add_ps(fjz0,tz);
2168 /**************************
2169 * CALCULATE INTERACTIONS *
2170 **************************/
2172 r21 = _mm_mul_ps(rsq21,rinv21);
2173 r21 = _mm_andnot_ps(dummy_mask,r21);
2175 /* EWALD ELECTROSTATICS */
2177 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2178 ewrt = _mm_mul_ps(r21,ewtabscale);
2179 ewitab = _mm_cvttps_epi32(ewrt);
2180 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2181 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2182 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2184 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2185 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2189 fscal = _mm_andnot_ps(dummy_mask,fscal);
2191 /* Calculate temporary vectorial force */
2192 tx = _mm_mul_ps(fscal,dx21);
2193 ty = _mm_mul_ps(fscal,dy21);
2194 tz = _mm_mul_ps(fscal,dz21);
2196 /* Update vectorial force */
2197 fix2 = _mm_add_ps(fix2,tx);
2198 fiy2 = _mm_add_ps(fiy2,ty);
2199 fiz2 = _mm_add_ps(fiz2,tz);
2201 fjx1 = _mm_add_ps(fjx1,tx);
2202 fjy1 = _mm_add_ps(fjy1,ty);
2203 fjz1 = _mm_add_ps(fjz1,tz);
2205 /**************************
2206 * CALCULATE INTERACTIONS *
2207 **************************/
2209 r22 = _mm_mul_ps(rsq22,rinv22);
2210 r22 = _mm_andnot_ps(dummy_mask,r22);
2212 /* EWALD ELECTROSTATICS */
2214 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2215 ewrt = _mm_mul_ps(r22,ewtabscale);
2216 ewitab = _mm_cvttps_epi32(ewrt);
2217 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2218 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2219 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2221 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2222 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2226 fscal = _mm_andnot_ps(dummy_mask,fscal);
2228 /* Calculate temporary vectorial force */
2229 tx = _mm_mul_ps(fscal,dx22);
2230 ty = _mm_mul_ps(fscal,dy22);
2231 tz = _mm_mul_ps(fscal,dz22);
2233 /* Update vectorial force */
2234 fix2 = _mm_add_ps(fix2,tx);
2235 fiy2 = _mm_add_ps(fiy2,ty);
2236 fiz2 = _mm_add_ps(fiz2,tz);
2238 fjx2 = _mm_add_ps(fjx2,tx);
2239 fjy2 = _mm_add_ps(fjy2,ty);
2240 fjz2 = _mm_add_ps(fjz2,tz);
2242 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2243 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2244 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2245 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2247 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2248 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2250 /* Inner loop uses 340 flops */
2253 /* End of innermost loop */
2255 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2256 f+i_coord_offset,fshift+i_shift_offset);
2258 /* Increment number of inner iterations */
2259 inneriter += j_index_end - j_index_start;
2261 /* Outer loop uses 18 flops */
2264 /* Increment number of outer iterations */
2267 /* Update outer/inner flops */
2269 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*340);