2 * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse4_1_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse4_1_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
80 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
81 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
82 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
83 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
84 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
85 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
86 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
89 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
92 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
93 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
94 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
95 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
98 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
101 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
102 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
104 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
106 __m128 dummy_mask,cutoff_mask;
107 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
108 __m128 one = _mm_set1_ps(1.0);
109 __m128 two = _mm_set1_ps(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm_set1_ps(fr->epsfac);
122 charge = mdatoms->chargeA;
123 nvdwtype = fr->ntype;
125 vdwtype = mdatoms->typeA;
127 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
128 ewtab = fr->ic->tabq_coul_FDV0;
129 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
130 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
132 /* Setup water-specific parameters */
133 inr = nlist->iinr[0];
134 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
135 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
136 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
137 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
139 jq1 = _mm_set1_ps(charge[inr+1]);
140 jq2 = _mm_set1_ps(charge[inr+2]);
141 jq3 = _mm_set1_ps(charge[inr+3]);
142 vdwjidx0A = 2*vdwtype[inr+0];
143 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
144 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
145 qq11 = _mm_mul_ps(iq1,jq1);
146 qq12 = _mm_mul_ps(iq1,jq2);
147 qq13 = _mm_mul_ps(iq1,jq3);
148 qq21 = _mm_mul_ps(iq2,jq1);
149 qq22 = _mm_mul_ps(iq2,jq2);
150 qq23 = _mm_mul_ps(iq2,jq3);
151 qq31 = _mm_mul_ps(iq3,jq1);
152 qq32 = _mm_mul_ps(iq3,jq2);
153 qq33 = _mm_mul_ps(iq3,jq3);
155 /* Avoid stupid compiler warnings */
156 jnrA = jnrB = jnrC = jnrD = 0;
165 for(iidx=0;iidx<4*DIM;iidx++)
170 /* Start outer loop over neighborlists */
171 for(iidx=0; iidx<nri; iidx++)
173 /* Load shift vector for this list */
174 i_shift_offset = DIM*shiftidx[iidx];
176 /* Load limits for loop over neighbors */
177 j_index_start = jindex[iidx];
178 j_index_end = jindex[iidx+1];
180 /* Get outer coordinate index */
182 i_coord_offset = DIM*inr;
184 /* Load i particle coords and add shift vector */
185 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
186 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
188 fix0 = _mm_setzero_ps();
189 fiy0 = _mm_setzero_ps();
190 fiz0 = _mm_setzero_ps();
191 fix1 = _mm_setzero_ps();
192 fiy1 = _mm_setzero_ps();
193 fiz1 = _mm_setzero_ps();
194 fix2 = _mm_setzero_ps();
195 fiy2 = _mm_setzero_ps();
196 fiz2 = _mm_setzero_ps();
197 fix3 = _mm_setzero_ps();
198 fiy3 = _mm_setzero_ps();
199 fiz3 = _mm_setzero_ps();
201 /* Reset potential sums */
202 velecsum = _mm_setzero_ps();
203 vvdwsum = _mm_setzero_ps();
205 /* Start inner kernel loop */
206 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
209 /* Get j neighbor index, and coordinate index */
214 j_coord_offsetA = DIM*jnrA;
215 j_coord_offsetB = DIM*jnrB;
216 j_coord_offsetC = DIM*jnrC;
217 j_coord_offsetD = DIM*jnrD;
219 /* load j atom coordinates */
220 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
221 x+j_coord_offsetC,x+j_coord_offsetD,
222 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
223 &jy2,&jz2,&jx3,&jy3,&jz3);
225 /* Calculate displacement vector */
226 dx00 = _mm_sub_ps(ix0,jx0);
227 dy00 = _mm_sub_ps(iy0,jy0);
228 dz00 = _mm_sub_ps(iz0,jz0);
229 dx11 = _mm_sub_ps(ix1,jx1);
230 dy11 = _mm_sub_ps(iy1,jy1);
231 dz11 = _mm_sub_ps(iz1,jz1);
232 dx12 = _mm_sub_ps(ix1,jx2);
233 dy12 = _mm_sub_ps(iy1,jy2);
234 dz12 = _mm_sub_ps(iz1,jz2);
235 dx13 = _mm_sub_ps(ix1,jx3);
236 dy13 = _mm_sub_ps(iy1,jy3);
237 dz13 = _mm_sub_ps(iz1,jz3);
238 dx21 = _mm_sub_ps(ix2,jx1);
239 dy21 = _mm_sub_ps(iy2,jy1);
240 dz21 = _mm_sub_ps(iz2,jz1);
241 dx22 = _mm_sub_ps(ix2,jx2);
242 dy22 = _mm_sub_ps(iy2,jy2);
243 dz22 = _mm_sub_ps(iz2,jz2);
244 dx23 = _mm_sub_ps(ix2,jx3);
245 dy23 = _mm_sub_ps(iy2,jy3);
246 dz23 = _mm_sub_ps(iz2,jz3);
247 dx31 = _mm_sub_ps(ix3,jx1);
248 dy31 = _mm_sub_ps(iy3,jy1);
249 dz31 = _mm_sub_ps(iz3,jz1);
250 dx32 = _mm_sub_ps(ix3,jx2);
251 dy32 = _mm_sub_ps(iy3,jy2);
252 dz32 = _mm_sub_ps(iz3,jz2);
253 dx33 = _mm_sub_ps(ix3,jx3);
254 dy33 = _mm_sub_ps(iy3,jy3);
255 dz33 = _mm_sub_ps(iz3,jz3);
257 /* Calculate squared distance and things based on it */
258 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
259 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
260 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
261 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
262 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
263 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
264 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
265 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
266 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
267 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
269 rinv11 = gmx_mm_invsqrt_ps(rsq11);
270 rinv12 = gmx_mm_invsqrt_ps(rsq12);
271 rinv13 = gmx_mm_invsqrt_ps(rsq13);
272 rinv21 = gmx_mm_invsqrt_ps(rsq21);
273 rinv22 = gmx_mm_invsqrt_ps(rsq22);
274 rinv23 = gmx_mm_invsqrt_ps(rsq23);
275 rinv31 = gmx_mm_invsqrt_ps(rsq31);
276 rinv32 = gmx_mm_invsqrt_ps(rsq32);
277 rinv33 = gmx_mm_invsqrt_ps(rsq33);
279 rinvsq00 = gmx_mm_inv_ps(rsq00);
280 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
281 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
282 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
283 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
284 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
285 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
286 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
287 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
288 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
290 fjx0 = _mm_setzero_ps();
291 fjy0 = _mm_setzero_ps();
292 fjz0 = _mm_setzero_ps();
293 fjx1 = _mm_setzero_ps();
294 fjy1 = _mm_setzero_ps();
295 fjz1 = _mm_setzero_ps();
296 fjx2 = _mm_setzero_ps();
297 fjy2 = _mm_setzero_ps();
298 fjz2 = _mm_setzero_ps();
299 fjx3 = _mm_setzero_ps();
300 fjy3 = _mm_setzero_ps();
301 fjz3 = _mm_setzero_ps();
303 /**************************
304 * CALCULATE INTERACTIONS *
305 **************************/
307 /* LENNARD-JONES DISPERSION/REPULSION */
309 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
310 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
311 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
312 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
313 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
315 /* Update potential sum for this i atom from the interaction with this j atom. */
316 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
320 /* Calculate temporary vectorial force */
321 tx = _mm_mul_ps(fscal,dx00);
322 ty = _mm_mul_ps(fscal,dy00);
323 tz = _mm_mul_ps(fscal,dz00);
325 /* Update vectorial force */
326 fix0 = _mm_add_ps(fix0,tx);
327 fiy0 = _mm_add_ps(fiy0,ty);
328 fiz0 = _mm_add_ps(fiz0,tz);
330 fjx0 = _mm_add_ps(fjx0,tx);
331 fjy0 = _mm_add_ps(fjy0,ty);
332 fjz0 = _mm_add_ps(fjz0,tz);
334 /**************************
335 * CALCULATE INTERACTIONS *
336 **************************/
338 r11 = _mm_mul_ps(rsq11,rinv11);
340 /* EWALD ELECTROSTATICS */
342 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
343 ewrt = _mm_mul_ps(r11,ewtabscale);
344 ewitab = _mm_cvttps_epi32(ewrt);
345 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
346 ewitab = _mm_slli_epi32(ewitab,2);
347 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
348 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
349 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
350 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
351 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
352 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
353 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
354 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
355 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velecsum = _mm_add_ps(velecsum,velec);
362 /* Calculate temporary vectorial force */
363 tx = _mm_mul_ps(fscal,dx11);
364 ty = _mm_mul_ps(fscal,dy11);
365 tz = _mm_mul_ps(fscal,dz11);
367 /* Update vectorial force */
368 fix1 = _mm_add_ps(fix1,tx);
369 fiy1 = _mm_add_ps(fiy1,ty);
370 fiz1 = _mm_add_ps(fiz1,tz);
372 fjx1 = _mm_add_ps(fjx1,tx);
373 fjy1 = _mm_add_ps(fjy1,ty);
374 fjz1 = _mm_add_ps(fjz1,tz);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r12 = _mm_mul_ps(rsq12,rinv12);
382 /* EWALD ELECTROSTATICS */
384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
385 ewrt = _mm_mul_ps(r12,ewtabscale);
386 ewitab = _mm_cvttps_epi32(ewrt);
387 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
388 ewitab = _mm_slli_epi32(ewitab,2);
389 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
390 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
391 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
392 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
393 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
394 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
395 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
396 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
397 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velecsum = _mm_add_ps(velecsum,velec);
404 /* Calculate temporary vectorial force */
405 tx = _mm_mul_ps(fscal,dx12);
406 ty = _mm_mul_ps(fscal,dy12);
407 tz = _mm_mul_ps(fscal,dz12);
409 /* Update vectorial force */
410 fix1 = _mm_add_ps(fix1,tx);
411 fiy1 = _mm_add_ps(fiy1,ty);
412 fiz1 = _mm_add_ps(fiz1,tz);
414 fjx2 = _mm_add_ps(fjx2,tx);
415 fjy2 = _mm_add_ps(fjy2,ty);
416 fjz2 = _mm_add_ps(fjz2,tz);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 r13 = _mm_mul_ps(rsq13,rinv13);
424 /* EWALD ELECTROSTATICS */
426 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
427 ewrt = _mm_mul_ps(r13,ewtabscale);
428 ewitab = _mm_cvttps_epi32(ewrt);
429 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
430 ewitab = _mm_slli_epi32(ewitab,2);
431 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
432 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
433 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
434 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
435 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
436 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
437 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
438 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
439 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
441 /* Update potential sum for this i atom from the interaction with this j atom. */
442 velecsum = _mm_add_ps(velecsum,velec);
446 /* Calculate temporary vectorial force */
447 tx = _mm_mul_ps(fscal,dx13);
448 ty = _mm_mul_ps(fscal,dy13);
449 tz = _mm_mul_ps(fscal,dz13);
451 /* Update vectorial force */
452 fix1 = _mm_add_ps(fix1,tx);
453 fiy1 = _mm_add_ps(fiy1,ty);
454 fiz1 = _mm_add_ps(fiz1,tz);
456 fjx3 = _mm_add_ps(fjx3,tx);
457 fjy3 = _mm_add_ps(fjy3,ty);
458 fjz3 = _mm_add_ps(fjz3,tz);
460 /**************************
461 * CALCULATE INTERACTIONS *
462 **************************/
464 r21 = _mm_mul_ps(rsq21,rinv21);
466 /* EWALD ELECTROSTATICS */
468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
469 ewrt = _mm_mul_ps(r21,ewtabscale);
470 ewitab = _mm_cvttps_epi32(ewrt);
471 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
472 ewitab = _mm_slli_epi32(ewitab,2);
473 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
474 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
475 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
476 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
477 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
478 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
479 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
480 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
481 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velecsum = _mm_add_ps(velecsum,velec);
488 /* Calculate temporary vectorial force */
489 tx = _mm_mul_ps(fscal,dx21);
490 ty = _mm_mul_ps(fscal,dy21);
491 tz = _mm_mul_ps(fscal,dz21);
493 /* Update vectorial force */
494 fix2 = _mm_add_ps(fix2,tx);
495 fiy2 = _mm_add_ps(fiy2,ty);
496 fiz2 = _mm_add_ps(fiz2,tz);
498 fjx1 = _mm_add_ps(fjx1,tx);
499 fjy1 = _mm_add_ps(fjy1,ty);
500 fjz1 = _mm_add_ps(fjz1,tz);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 r22 = _mm_mul_ps(rsq22,rinv22);
508 /* EWALD ELECTROSTATICS */
510 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511 ewrt = _mm_mul_ps(r22,ewtabscale);
512 ewitab = _mm_cvttps_epi32(ewrt);
513 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
514 ewitab = _mm_slli_epi32(ewitab,2);
515 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
516 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
517 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
518 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
519 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
520 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
521 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
522 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
523 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum = _mm_add_ps(velecsum,velec);
530 /* Calculate temporary vectorial force */
531 tx = _mm_mul_ps(fscal,dx22);
532 ty = _mm_mul_ps(fscal,dy22);
533 tz = _mm_mul_ps(fscal,dz22);
535 /* Update vectorial force */
536 fix2 = _mm_add_ps(fix2,tx);
537 fiy2 = _mm_add_ps(fiy2,ty);
538 fiz2 = _mm_add_ps(fiz2,tz);
540 fjx2 = _mm_add_ps(fjx2,tx);
541 fjy2 = _mm_add_ps(fjy2,ty);
542 fjz2 = _mm_add_ps(fjz2,tz);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 r23 = _mm_mul_ps(rsq23,rinv23);
550 /* EWALD ELECTROSTATICS */
552 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
553 ewrt = _mm_mul_ps(r23,ewtabscale);
554 ewitab = _mm_cvttps_epi32(ewrt);
555 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
556 ewitab = _mm_slli_epi32(ewitab,2);
557 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
558 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
559 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
560 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
561 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
562 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
563 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
564 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
565 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
567 /* Update potential sum for this i atom from the interaction with this j atom. */
568 velecsum = _mm_add_ps(velecsum,velec);
572 /* Calculate temporary vectorial force */
573 tx = _mm_mul_ps(fscal,dx23);
574 ty = _mm_mul_ps(fscal,dy23);
575 tz = _mm_mul_ps(fscal,dz23);
577 /* Update vectorial force */
578 fix2 = _mm_add_ps(fix2,tx);
579 fiy2 = _mm_add_ps(fiy2,ty);
580 fiz2 = _mm_add_ps(fiz2,tz);
582 fjx3 = _mm_add_ps(fjx3,tx);
583 fjy3 = _mm_add_ps(fjy3,ty);
584 fjz3 = _mm_add_ps(fjz3,tz);
586 /**************************
587 * CALCULATE INTERACTIONS *
588 **************************/
590 r31 = _mm_mul_ps(rsq31,rinv31);
592 /* EWALD ELECTROSTATICS */
594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
595 ewrt = _mm_mul_ps(r31,ewtabscale);
596 ewitab = _mm_cvttps_epi32(ewrt);
597 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
598 ewitab = _mm_slli_epi32(ewitab,2);
599 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
600 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
601 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
602 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
603 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
604 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
605 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
606 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
607 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velecsum = _mm_add_ps(velecsum,velec);
614 /* Calculate temporary vectorial force */
615 tx = _mm_mul_ps(fscal,dx31);
616 ty = _mm_mul_ps(fscal,dy31);
617 tz = _mm_mul_ps(fscal,dz31);
619 /* Update vectorial force */
620 fix3 = _mm_add_ps(fix3,tx);
621 fiy3 = _mm_add_ps(fiy3,ty);
622 fiz3 = _mm_add_ps(fiz3,tz);
624 fjx1 = _mm_add_ps(fjx1,tx);
625 fjy1 = _mm_add_ps(fjy1,ty);
626 fjz1 = _mm_add_ps(fjz1,tz);
628 /**************************
629 * CALCULATE INTERACTIONS *
630 **************************/
632 r32 = _mm_mul_ps(rsq32,rinv32);
634 /* EWALD ELECTROSTATICS */
636 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
637 ewrt = _mm_mul_ps(r32,ewtabscale);
638 ewitab = _mm_cvttps_epi32(ewrt);
639 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
640 ewitab = _mm_slli_epi32(ewitab,2);
641 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
642 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
643 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
644 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
645 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
646 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
647 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
648 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
649 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
651 /* Update potential sum for this i atom from the interaction with this j atom. */
652 velecsum = _mm_add_ps(velecsum,velec);
656 /* Calculate temporary vectorial force */
657 tx = _mm_mul_ps(fscal,dx32);
658 ty = _mm_mul_ps(fscal,dy32);
659 tz = _mm_mul_ps(fscal,dz32);
661 /* Update vectorial force */
662 fix3 = _mm_add_ps(fix3,tx);
663 fiy3 = _mm_add_ps(fiy3,ty);
664 fiz3 = _mm_add_ps(fiz3,tz);
666 fjx2 = _mm_add_ps(fjx2,tx);
667 fjy2 = _mm_add_ps(fjy2,ty);
668 fjz2 = _mm_add_ps(fjz2,tz);
670 /**************************
671 * CALCULATE INTERACTIONS *
672 **************************/
674 r33 = _mm_mul_ps(rsq33,rinv33);
676 /* EWALD ELECTROSTATICS */
678 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
679 ewrt = _mm_mul_ps(r33,ewtabscale);
680 ewitab = _mm_cvttps_epi32(ewrt);
681 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
682 ewitab = _mm_slli_epi32(ewitab,2);
683 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
684 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
685 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
686 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
687 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
688 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
689 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
690 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
691 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velecsum = _mm_add_ps(velecsum,velec);
698 /* Calculate temporary vectorial force */
699 tx = _mm_mul_ps(fscal,dx33);
700 ty = _mm_mul_ps(fscal,dy33);
701 tz = _mm_mul_ps(fscal,dz33);
703 /* Update vectorial force */
704 fix3 = _mm_add_ps(fix3,tx);
705 fiy3 = _mm_add_ps(fiy3,ty);
706 fiz3 = _mm_add_ps(fiz3,tz);
708 fjx3 = _mm_add_ps(fjx3,tx);
709 fjy3 = _mm_add_ps(fjy3,ty);
710 fjz3 = _mm_add_ps(fjz3,tz);
712 fjptrA = f+j_coord_offsetA;
713 fjptrB = f+j_coord_offsetB;
714 fjptrC = f+j_coord_offsetC;
715 fjptrD = f+j_coord_offsetD;
717 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
718 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
719 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
721 /* Inner loop uses 404 flops */
727 /* Get j neighbor index, and coordinate index */
728 jnrlistA = jjnr[jidx];
729 jnrlistB = jjnr[jidx+1];
730 jnrlistC = jjnr[jidx+2];
731 jnrlistD = jjnr[jidx+3];
732 /* Sign of each element will be negative for non-real atoms.
733 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
734 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
736 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
737 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
738 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
739 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
740 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
741 j_coord_offsetA = DIM*jnrA;
742 j_coord_offsetB = DIM*jnrB;
743 j_coord_offsetC = DIM*jnrC;
744 j_coord_offsetD = DIM*jnrD;
746 /* load j atom coordinates */
747 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
748 x+j_coord_offsetC,x+j_coord_offsetD,
749 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
750 &jy2,&jz2,&jx3,&jy3,&jz3);
752 /* Calculate displacement vector */
753 dx00 = _mm_sub_ps(ix0,jx0);
754 dy00 = _mm_sub_ps(iy0,jy0);
755 dz00 = _mm_sub_ps(iz0,jz0);
756 dx11 = _mm_sub_ps(ix1,jx1);
757 dy11 = _mm_sub_ps(iy1,jy1);
758 dz11 = _mm_sub_ps(iz1,jz1);
759 dx12 = _mm_sub_ps(ix1,jx2);
760 dy12 = _mm_sub_ps(iy1,jy2);
761 dz12 = _mm_sub_ps(iz1,jz2);
762 dx13 = _mm_sub_ps(ix1,jx3);
763 dy13 = _mm_sub_ps(iy1,jy3);
764 dz13 = _mm_sub_ps(iz1,jz3);
765 dx21 = _mm_sub_ps(ix2,jx1);
766 dy21 = _mm_sub_ps(iy2,jy1);
767 dz21 = _mm_sub_ps(iz2,jz1);
768 dx22 = _mm_sub_ps(ix2,jx2);
769 dy22 = _mm_sub_ps(iy2,jy2);
770 dz22 = _mm_sub_ps(iz2,jz2);
771 dx23 = _mm_sub_ps(ix2,jx3);
772 dy23 = _mm_sub_ps(iy2,jy3);
773 dz23 = _mm_sub_ps(iz2,jz3);
774 dx31 = _mm_sub_ps(ix3,jx1);
775 dy31 = _mm_sub_ps(iy3,jy1);
776 dz31 = _mm_sub_ps(iz3,jz1);
777 dx32 = _mm_sub_ps(ix3,jx2);
778 dy32 = _mm_sub_ps(iy3,jy2);
779 dz32 = _mm_sub_ps(iz3,jz2);
780 dx33 = _mm_sub_ps(ix3,jx3);
781 dy33 = _mm_sub_ps(iy3,jy3);
782 dz33 = _mm_sub_ps(iz3,jz3);
784 /* Calculate squared distance and things based on it */
785 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
786 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
787 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
788 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
789 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
790 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
791 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
792 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
793 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
794 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
796 rinv11 = gmx_mm_invsqrt_ps(rsq11);
797 rinv12 = gmx_mm_invsqrt_ps(rsq12);
798 rinv13 = gmx_mm_invsqrt_ps(rsq13);
799 rinv21 = gmx_mm_invsqrt_ps(rsq21);
800 rinv22 = gmx_mm_invsqrt_ps(rsq22);
801 rinv23 = gmx_mm_invsqrt_ps(rsq23);
802 rinv31 = gmx_mm_invsqrt_ps(rsq31);
803 rinv32 = gmx_mm_invsqrt_ps(rsq32);
804 rinv33 = gmx_mm_invsqrt_ps(rsq33);
806 rinvsq00 = gmx_mm_inv_ps(rsq00);
807 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
808 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
809 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
810 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
811 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
812 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
813 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
814 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
815 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
817 fjx0 = _mm_setzero_ps();
818 fjy0 = _mm_setzero_ps();
819 fjz0 = _mm_setzero_ps();
820 fjx1 = _mm_setzero_ps();
821 fjy1 = _mm_setzero_ps();
822 fjz1 = _mm_setzero_ps();
823 fjx2 = _mm_setzero_ps();
824 fjy2 = _mm_setzero_ps();
825 fjz2 = _mm_setzero_ps();
826 fjx3 = _mm_setzero_ps();
827 fjy3 = _mm_setzero_ps();
828 fjz3 = _mm_setzero_ps();
830 /**************************
831 * CALCULATE INTERACTIONS *
832 **************************/
834 /* LENNARD-JONES DISPERSION/REPULSION */
836 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
837 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
838 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
839 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
840 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
842 /* Update potential sum for this i atom from the interaction with this j atom. */
843 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
844 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
848 fscal = _mm_andnot_ps(dummy_mask,fscal);
850 /* Calculate temporary vectorial force */
851 tx = _mm_mul_ps(fscal,dx00);
852 ty = _mm_mul_ps(fscal,dy00);
853 tz = _mm_mul_ps(fscal,dz00);
855 /* Update vectorial force */
856 fix0 = _mm_add_ps(fix0,tx);
857 fiy0 = _mm_add_ps(fiy0,ty);
858 fiz0 = _mm_add_ps(fiz0,tz);
860 fjx0 = _mm_add_ps(fjx0,tx);
861 fjy0 = _mm_add_ps(fjy0,ty);
862 fjz0 = _mm_add_ps(fjz0,tz);
864 /**************************
865 * CALCULATE INTERACTIONS *
866 **************************/
868 r11 = _mm_mul_ps(rsq11,rinv11);
869 r11 = _mm_andnot_ps(dummy_mask,r11);
871 /* EWALD ELECTROSTATICS */
873 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
874 ewrt = _mm_mul_ps(r11,ewtabscale);
875 ewitab = _mm_cvttps_epi32(ewrt);
876 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
877 ewitab = _mm_slli_epi32(ewitab,2);
878 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
879 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
880 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
881 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
882 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
883 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
884 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
885 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
886 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
888 /* Update potential sum for this i atom from the interaction with this j atom. */
889 velec = _mm_andnot_ps(dummy_mask,velec);
890 velecsum = _mm_add_ps(velecsum,velec);
894 fscal = _mm_andnot_ps(dummy_mask,fscal);
896 /* Calculate temporary vectorial force */
897 tx = _mm_mul_ps(fscal,dx11);
898 ty = _mm_mul_ps(fscal,dy11);
899 tz = _mm_mul_ps(fscal,dz11);
901 /* Update vectorial force */
902 fix1 = _mm_add_ps(fix1,tx);
903 fiy1 = _mm_add_ps(fiy1,ty);
904 fiz1 = _mm_add_ps(fiz1,tz);
906 fjx1 = _mm_add_ps(fjx1,tx);
907 fjy1 = _mm_add_ps(fjy1,ty);
908 fjz1 = _mm_add_ps(fjz1,tz);
910 /**************************
911 * CALCULATE INTERACTIONS *
912 **************************/
914 r12 = _mm_mul_ps(rsq12,rinv12);
915 r12 = _mm_andnot_ps(dummy_mask,r12);
917 /* EWALD ELECTROSTATICS */
919 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
920 ewrt = _mm_mul_ps(r12,ewtabscale);
921 ewitab = _mm_cvttps_epi32(ewrt);
922 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
923 ewitab = _mm_slli_epi32(ewitab,2);
924 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
925 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
926 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
927 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
928 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
929 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
930 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
931 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
932 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
934 /* Update potential sum for this i atom from the interaction with this j atom. */
935 velec = _mm_andnot_ps(dummy_mask,velec);
936 velecsum = _mm_add_ps(velecsum,velec);
940 fscal = _mm_andnot_ps(dummy_mask,fscal);
942 /* Calculate temporary vectorial force */
943 tx = _mm_mul_ps(fscal,dx12);
944 ty = _mm_mul_ps(fscal,dy12);
945 tz = _mm_mul_ps(fscal,dz12);
947 /* Update vectorial force */
948 fix1 = _mm_add_ps(fix1,tx);
949 fiy1 = _mm_add_ps(fiy1,ty);
950 fiz1 = _mm_add_ps(fiz1,tz);
952 fjx2 = _mm_add_ps(fjx2,tx);
953 fjy2 = _mm_add_ps(fjy2,ty);
954 fjz2 = _mm_add_ps(fjz2,tz);
956 /**************************
957 * CALCULATE INTERACTIONS *
958 **************************/
960 r13 = _mm_mul_ps(rsq13,rinv13);
961 r13 = _mm_andnot_ps(dummy_mask,r13);
963 /* EWALD ELECTROSTATICS */
965 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
966 ewrt = _mm_mul_ps(r13,ewtabscale);
967 ewitab = _mm_cvttps_epi32(ewrt);
968 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
969 ewitab = _mm_slli_epi32(ewitab,2);
970 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
971 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
972 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
973 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
974 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
975 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
976 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
977 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
978 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
980 /* Update potential sum for this i atom from the interaction with this j atom. */
981 velec = _mm_andnot_ps(dummy_mask,velec);
982 velecsum = _mm_add_ps(velecsum,velec);
986 fscal = _mm_andnot_ps(dummy_mask,fscal);
988 /* Calculate temporary vectorial force */
989 tx = _mm_mul_ps(fscal,dx13);
990 ty = _mm_mul_ps(fscal,dy13);
991 tz = _mm_mul_ps(fscal,dz13);
993 /* Update vectorial force */
994 fix1 = _mm_add_ps(fix1,tx);
995 fiy1 = _mm_add_ps(fiy1,ty);
996 fiz1 = _mm_add_ps(fiz1,tz);
998 fjx3 = _mm_add_ps(fjx3,tx);
999 fjy3 = _mm_add_ps(fjy3,ty);
1000 fjz3 = _mm_add_ps(fjz3,tz);
1002 /**************************
1003 * CALCULATE INTERACTIONS *
1004 **************************/
1006 r21 = _mm_mul_ps(rsq21,rinv21);
1007 r21 = _mm_andnot_ps(dummy_mask,r21);
1009 /* EWALD ELECTROSTATICS */
1011 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1012 ewrt = _mm_mul_ps(r21,ewtabscale);
1013 ewitab = _mm_cvttps_epi32(ewrt);
1014 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1015 ewitab = _mm_slli_epi32(ewitab,2);
1016 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1017 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1018 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1019 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1020 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1021 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1022 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1023 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1024 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1026 /* Update potential sum for this i atom from the interaction with this j atom. */
1027 velec = _mm_andnot_ps(dummy_mask,velec);
1028 velecsum = _mm_add_ps(velecsum,velec);
1032 fscal = _mm_andnot_ps(dummy_mask,fscal);
1034 /* Calculate temporary vectorial force */
1035 tx = _mm_mul_ps(fscal,dx21);
1036 ty = _mm_mul_ps(fscal,dy21);
1037 tz = _mm_mul_ps(fscal,dz21);
1039 /* Update vectorial force */
1040 fix2 = _mm_add_ps(fix2,tx);
1041 fiy2 = _mm_add_ps(fiy2,ty);
1042 fiz2 = _mm_add_ps(fiz2,tz);
1044 fjx1 = _mm_add_ps(fjx1,tx);
1045 fjy1 = _mm_add_ps(fjy1,ty);
1046 fjz1 = _mm_add_ps(fjz1,tz);
1048 /**************************
1049 * CALCULATE INTERACTIONS *
1050 **************************/
1052 r22 = _mm_mul_ps(rsq22,rinv22);
1053 r22 = _mm_andnot_ps(dummy_mask,r22);
1055 /* EWALD ELECTROSTATICS */
1057 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1058 ewrt = _mm_mul_ps(r22,ewtabscale);
1059 ewitab = _mm_cvttps_epi32(ewrt);
1060 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1061 ewitab = _mm_slli_epi32(ewitab,2);
1062 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1063 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1064 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1065 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1066 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1067 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1068 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1069 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1070 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1072 /* Update potential sum for this i atom from the interaction with this j atom. */
1073 velec = _mm_andnot_ps(dummy_mask,velec);
1074 velecsum = _mm_add_ps(velecsum,velec);
1078 fscal = _mm_andnot_ps(dummy_mask,fscal);
1080 /* Calculate temporary vectorial force */
1081 tx = _mm_mul_ps(fscal,dx22);
1082 ty = _mm_mul_ps(fscal,dy22);
1083 tz = _mm_mul_ps(fscal,dz22);
1085 /* Update vectorial force */
1086 fix2 = _mm_add_ps(fix2,tx);
1087 fiy2 = _mm_add_ps(fiy2,ty);
1088 fiz2 = _mm_add_ps(fiz2,tz);
1090 fjx2 = _mm_add_ps(fjx2,tx);
1091 fjy2 = _mm_add_ps(fjy2,ty);
1092 fjz2 = _mm_add_ps(fjz2,tz);
1094 /**************************
1095 * CALCULATE INTERACTIONS *
1096 **************************/
1098 r23 = _mm_mul_ps(rsq23,rinv23);
1099 r23 = _mm_andnot_ps(dummy_mask,r23);
1101 /* EWALD ELECTROSTATICS */
1103 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1104 ewrt = _mm_mul_ps(r23,ewtabscale);
1105 ewitab = _mm_cvttps_epi32(ewrt);
1106 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1107 ewitab = _mm_slli_epi32(ewitab,2);
1108 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1109 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1110 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1111 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1112 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1113 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1114 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1115 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1116 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1118 /* Update potential sum for this i atom from the interaction with this j atom. */
1119 velec = _mm_andnot_ps(dummy_mask,velec);
1120 velecsum = _mm_add_ps(velecsum,velec);
1124 fscal = _mm_andnot_ps(dummy_mask,fscal);
1126 /* Calculate temporary vectorial force */
1127 tx = _mm_mul_ps(fscal,dx23);
1128 ty = _mm_mul_ps(fscal,dy23);
1129 tz = _mm_mul_ps(fscal,dz23);
1131 /* Update vectorial force */
1132 fix2 = _mm_add_ps(fix2,tx);
1133 fiy2 = _mm_add_ps(fiy2,ty);
1134 fiz2 = _mm_add_ps(fiz2,tz);
1136 fjx3 = _mm_add_ps(fjx3,tx);
1137 fjy3 = _mm_add_ps(fjy3,ty);
1138 fjz3 = _mm_add_ps(fjz3,tz);
1140 /**************************
1141 * CALCULATE INTERACTIONS *
1142 **************************/
1144 r31 = _mm_mul_ps(rsq31,rinv31);
1145 r31 = _mm_andnot_ps(dummy_mask,r31);
1147 /* EWALD ELECTROSTATICS */
1149 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1150 ewrt = _mm_mul_ps(r31,ewtabscale);
1151 ewitab = _mm_cvttps_epi32(ewrt);
1152 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1153 ewitab = _mm_slli_epi32(ewitab,2);
1154 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1155 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1156 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1157 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1158 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1159 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1160 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1161 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1162 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1164 /* Update potential sum for this i atom from the interaction with this j atom. */
1165 velec = _mm_andnot_ps(dummy_mask,velec);
1166 velecsum = _mm_add_ps(velecsum,velec);
1170 fscal = _mm_andnot_ps(dummy_mask,fscal);
1172 /* Calculate temporary vectorial force */
1173 tx = _mm_mul_ps(fscal,dx31);
1174 ty = _mm_mul_ps(fscal,dy31);
1175 tz = _mm_mul_ps(fscal,dz31);
1177 /* Update vectorial force */
1178 fix3 = _mm_add_ps(fix3,tx);
1179 fiy3 = _mm_add_ps(fiy3,ty);
1180 fiz3 = _mm_add_ps(fiz3,tz);
1182 fjx1 = _mm_add_ps(fjx1,tx);
1183 fjy1 = _mm_add_ps(fjy1,ty);
1184 fjz1 = _mm_add_ps(fjz1,tz);
1186 /**************************
1187 * CALCULATE INTERACTIONS *
1188 **************************/
1190 r32 = _mm_mul_ps(rsq32,rinv32);
1191 r32 = _mm_andnot_ps(dummy_mask,r32);
1193 /* EWALD ELECTROSTATICS */
1195 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1196 ewrt = _mm_mul_ps(r32,ewtabscale);
1197 ewitab = _mm_cvttps_epi32(ewrt);
1198 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1199 ewitab = _mm_slli_epi32(ewitab,2);
1200 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1201 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1202 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1203 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1204 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1205 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1206 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1207 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1208 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1210 /* Update potential sum for this i atom from the interaction with this j atom. */
1211 velec = _mm_andnot_ps(dummy_mask,velec);
1212 velecsum = _mm_add_ps(velecsum,velec);
1216 fscal = _mm_andnot_ps(dummy_mask,fscal);
1218 /* Calculate temporary vectorial force */
1219 tx = _mm_mul_ps(fscal,dx32);
1220 ty = _mm_mul_ps(fscal,dy32);
1221 tz = _mm_mul_ps(fscal,dz32);
1223 /* Update vectorial force */
1224 fix3 = _mm_add_ps(fix3,tx);
1225 fiy3 = _mm_add_ps(fiy3,ty);
1226 fiz3 = _mm_add_ps(fiz3,tz);
1228 fjx2 = _mm_add_ps(fjx2,tx);
1229 fjy2 = _mm_add_ps(fjy2,ty);
1230 fjz2 = _mm_add_ps(fjz2,tz);
1232 /**************************
1233 * CALCULATE INTERACTIONS *
1234 **************************/
1236 r33 = _mm_mul_ps(rsq33,rinv33);
1237 r33 = _mm_andnot_ps(dummy_mask,r33);
1239 /* EWALD ELECTROSTATICS */
1241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1242 ewrt = _mm_mul_ps(r33,ewtabscale);
1243 ewitab = _mm_cvttps_epi32(ewrt);
1244 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1245 ewitab = _mm_slli_epi32(ewitab,2);
1246 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1247 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1248 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1249 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1250 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1251 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1252 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1253 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1254 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1256 /* Update potential sum for this i atom from the interaction with this j atom. */
1257 velec = _mm_andnot_ps(dummy_mask,velec);
1258 velecsum = _mm_add_ps(velecsum,velec);
1262 fscal = _mm_andnot_ps(dummy_mask,fscal);
1264 /* Calculate temporary vectorial force */
1265 tx = _mm_mul_ps(fscal,dx33);
1266 ty = _mm_mul_ps(fscal,dy33);
1267 tz = _mm_mul_ps(fscal,dz33);
1269 /* Update vectorial force */
1270 fix3 = _mm_add_ps(fix3,tx);
1271 fiy3 = _mm_add_ps(fiy3,ty);
1272 fiz3 = _mm_add_ps(fiz3,tz);
1274 fjx3 = _mm_add_ps(fjx3,tx);
1275 fjy3 = _mm_add_ps(fjy3,ty);
1276 fjz3 = _mm_add_ps(fjz3,tz);
1278 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1279 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1280 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1281 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1283 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1284 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1285 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1287 /* Inner loop uses 413 flops */
1290 /* End of innermost loop */
1292 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1293 f+i_coord_offset,fshift+i_shift_offset);
1296 /* Update potential energies */
1297 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1298 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1300 /* Increment number of inner iterations */
1301 inneriter += j_index_end - j_index_start;
1303 /* Outer loop uses 26 flops */
1306 /* Increment number of outer iterations */
1309 /* Update outer/inner flops */
1311 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*413);
1314 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_single
1315 * Electrostatics interaction: Ewald
1316 * VdW interaction: LennardJones
1317 * Geometry: Water4-Water4
1318 * Calculate force/pot: Force
1321 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse4_1_single
1322 (t_nblist * gmx_restrict nlist,
1323 rvec * gmx_restrict xx,
1324 rvec * gmx_restrict ff,
1325 t_forcerec * gmx_restrict fr,
1326 t_mdatoms * gmx_restrict mdatoms,
1327 nb_kernel_data_t * gmx_restrict kernel_data,
1328 t_nrnb * gmx_restrict nrnb)
1330 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1331 * just 0 for non-waters.
1332 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1333 * jnr indices corresponding to data put in the four positions in the SIMD register.
1335 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1336 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1337 int jnrA,jnrB,jnrC,jnrD;
1338 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1339 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1340 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1341 real rcutoff_scalar;
1342 real *shiftvec,*fshift,*x,*f;
1343 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1344 real scratch[4*DIM];
1345 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1347 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1349 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1351 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1353 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1354 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1355 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1356 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1357 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1358 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1359 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1360 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1361 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1362 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1363 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1364 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1365 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1366 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1367 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1368 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1369 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1370 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1371 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1372 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1375 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1378 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1379 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1381 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1383 __m128 dummy_mask,cutoff_mask;
1384 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1385 __m128 one = _mm_set1_ps(1.0);
1386 __m128 two = _mm_set1_ps(2.0);
1392 jindex = nlist->jindex;
1394 shiftidx = nlist->shift;
1396 shiftvec = fr->shift_vec[0];
1397 fshift = fr->fshift[0];
1398 facel = _mm_set1_ps(fr->epsfac);
1399 charge = mdatoms->chargeA;
1400 nvdwtype = fr->ntype;
1401 vdwparam = fr->nbfp;
1402 vdwtype = mdatoms->typeA;
1404 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1405 ewtab = fr->ic->tabq_coul_F;
1406 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1407 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1409 /* Setup water-specific parameters */
1410 inr = nlist->iinr[0];
1411 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1412 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1413 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1414 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1416 jq1 = _mm_set1_ps(charge[inr+1]);
1417 jq2 = _mm_set1_ps(charge[inr+2]);
1418 jq3 = _mm_set1_ps(charge[inr+3]);
1419 vdwjidx0A = 2*vdwtype[inr+0];
1420 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1421 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1422 qq11 = _mm_mul_ps(iq1,jq1);
1423 qq12 = _mm_mul_ps(iq1,jq2);
1424 qq13 = _mm_mul_ps(iq1,jq3);
1425 qq21 = _mm_mul_ps(iq2,jq1);
1426 qq22 = _mm_mul_ps(iq2,jq2);
1427 qq23 = _mm_mul_ps(iq2,jq3);
1428 qq31 = _mm_mul_ps(iq3,jq1);
1429 qq32 = _mm_mul_ps(iq3,jq2);
1430 qq33 = _mm_mul_ps(iq3,jq3);
1432 /* Avoid stupid compiler warnings */
1433 jnrA = jnrB = jnrC = jnrD = 0;
1434 j_coord_offsetA = 0;
1435 j_coord_offsetB = 0;
1436 j_coord_offsetC = 0;
1437 j_coord_offsetD = 0;
1442 for(iidx=0;iidx<4*DIM;iidx++)
1444 scratch[iidx] = 0.0;
1447 /* Start outer loop over neighborlists */
1448 for(iidx=0; iidx<nri; iidx++)
1450 /* Load shift vector for this list */
1451 i_shift_offset = DIM*shiftidx[iidx];
1453 /* Load limits for loop over neighbors */
1454 j_index_start = jindex[iidx];
1455 j_index_end = jindex[iidx+1];
1457 /* Get outer coordinate index */
1459 i_coord_offset = DIM*inr;
1461 /* Load i particle coords and add shift vector */
1462 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1463 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1465 fix0 = _mm_setzero_ps();
1466 fiy0 = _mm_setzero_ps();
1467 fiz0 = _mm_setzero_ps();
1468 fix1 = _mm_setzero_ps();
1469 fiy1 = _mm_setzero_ps();
1470 fiz1 = _mm_setzero_ps();
1471 fix2 = _mm_setzero_ps();
1472 fiy2 = _mm_setzero_ps();
1473 fiz2 = _mm_setzero_ps();
1474 fix3 = _mm_setzero_ps();
1475 fiy3 = _mm_setzero_ps();
1476 fiz3 = _mm_setzero_ps();
1478 /* Start inner kernel loop */
1479 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1482 /* Get j neighbor index, and coordinate index */
1484 jnrB = jjnr[jidx+1];
1485 jnrC = jjnr[jidx+2];
1486 jnrD = jjnr[jidx+3];
1487 j_coord_offsetA = DIM*jnrA;
1488 j_coord_offsetB = DIM*jnrB;
1489 j_coord_offsetC = DIM*jnrC;
1490 j_coord_offsetD = DIM*jnrD;
1492 /* load j atom coordinates */
1493 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1494 x+j_coord_offsetC,x+j_coord_offsetD,
1495 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1496 &jy2,&jz2,&jx3,&jy3,&jz3);
1498 /* Calculate displacement vector */
1499 dx00 = _mm_sub_ps(ix0,jx0);
1500 dy00 = _mm_sub_ps(iy0,jy0);
1501 dz00 = _mm_sub_ps(iz0,jz0);
1502 dx11 = _mm_sub_ps(ix1,jx1);
1503 dy11 = _mm_sub_ps(iy1,jy1);
1504 dz11 = _mm_sub_ps(iz1,jz1);
1505 dx12 = _mm_sub_ps(ix1,jx2);
1506 dy12 = _mm_sub_ps(iy1,jy2);
1507 dz12 = _mm_sub_ps(iz1,jz2);
1508 dx13 = _mm_sub_ps(ix1,jx3);
1509 dy13 = _mm_sub_ps(iy1,jy3);
1510 dz13 = _mm_sub_ps(iz1,jz3);
1511 dx21 = _mm_sub_ps(ix2,jx1);
1512 dy21 = _mm_sub_ps(iy2,jy1);
1513 dz21 = _mm_sub_ps(iz2,jz1);
1514 dx22 = _mm_sub_ps(ix2,jx2);
1515 dy22 = _mm_sub_ps(iy2,jy2);
1516 dz22 = _mm_sub_ps(iz2,jz2);
1517 dx23 = _mm_sub_ps(ix2,jx3);
1518 dy23 = _mm_sub_ps(iy2,jy3);
1519 dz23 = _mm_sub_ps(iz2,jz3);
1520 dx31 = _mm_sub_ps(ix3,jx1);
1521 dy31 = _mm_sub_ps(iy3,jy1);
1522 dz31 = _mm_sub_ps(iz3,jz1);
1523 dx32 = _mm_sub_ps(ix3,jx2);
1524 dy32 = _mm_sub_ps(iy3,jy2);
1525 dz32 = _mm_sub_ps(iz3,jz2);
1526 dx33 = _mm_sub_ps(ix3,jx3);
1527 dy33 = _mm_sub_ps(iy3,jy3);
1528 dz33 = _mm_sub_ps(iz3,jz3);
1530 /* Calculate squared distance and things based on it */
1531 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1532 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1533 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1534 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1535 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1536 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1537 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1538 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1539 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1540 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1542 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1543 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1544 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1545 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1546 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1547 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1548 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1549 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1550 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1552 rinvsq00 = gmx_mm_inv_ps(rsq00);
1553 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1554 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1555 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1556 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1557 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1558 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1559 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1560 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1561 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1563 fjx0 = _mm_setzero_ps();
1564 fjy0 = _mm_setzero_ps();
1565 fjz0 = _mm_setzero_ps();
1566 fjx1 = _mm_setzero_ps();
1567 fjy1 = _mm_setzero_ps();
1568 fjz1 = _mm_setzero_ps();
1569 fjx2 = _mm_setzero_ps();
1570 fjy2 = _mm_setzero_ps();
1571 fjz2 = _mm_setzero_ps();
1572 fjx3 = _mm_setzero_ps();
1573 fjy3 = _mm_setzero_ps();
1574 fjz3 = _mm_setzero_ps();
1576 /**************************
1577 * CALCULATE INTERACTIONS *
1578 **************************/
1580 /* LENNARD-JONES DISPERSION/REPULSION */
1582 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1583 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1587 /* Calculate temporary vectorial force */
1588 tx = _mm_mul_ps(fscal,dx00);
1589 ty = _mm_mul_ps(fscal,dy00);
1590 tz = _mm_mul_ps(fscal,dz00);
1592 /* Update vectorial force */
1593 fix0 = _mm_add_ps(fix0,tx);
1594 fiy0 = _mm_add_ps(fiy0,ty);
1595 fiz0 = _mm_add_ps(fiz0,tz);
1597 fjx0 = _mm_add_ps(fjx0,tx);
1598 fjy0 = _mm_add_ps(fjy0,ty);
1599 fjz0 = _mm_add_ps(fjz0,tz);
1601 /**************************
1602 * CALCULATE INTERACTIONS *
1603 **************************/
1605 r11 = _mm_mul_ps(rsq11,rinv11);
1607 /* EWALD ELECTROSTATICS */
1609 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1610 ewrt = _mm_mul_ps(r11,ewtabscale);
1611 ewitab = _mm_cvttps_epi32(ewrt);
1612 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1613 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1614 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1616 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1617 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1621 /* Calculate temporary vectorial force */
1622 tx = _mm_mul_ps(fscal,dx11);
1623 ty = _mm_mul_ps(fscal,dy11);
1624 tz = _mm_mul_ps(fscal,dz11);
1626 /* Update vectorial force */
1627 fix1 = _mm_add_ps(fix1,tx);
1628 fiy1 = _mm_add_ps(fiy1,ty);
1629 fiz1 = _mm_add_ps(fiz1,tz);
1631 fjx1 = _mm_add_ps(fjx1,tx);
1632 fjy1 = _mm_add_ps(fjy1,ty);
1633 fjz1 = _mm_add_ps(fjz1,tz);
1635 /**************************
1636 * CALCULATE INTERACTIONS *
1637 **************************/
1639 r12 = _mm_mul_ps(rsq12,rinv12);
1641 /* EWALD ELECTROSTATICS */
1643 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1644 ewrt = _mm_mul_ps(r12,ewtabscale);
1645 ewitab = _mm_cvttps_epi32(ewrt);
1646 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1647 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1648 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1650 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1651 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1655 /* Calculate temporary vectorial force */
1656 tx = _mm_mul_ps(fscal,dx12);
1657 ty = _mm_mul_ps(fscal,dy12);
1658 tz = _mm_mul_ps(fscal,dz12);
1660 /* Update vectorial force */
1661 fix1 = _mm_add_ps(fix1,tx);
1662 fiy1 = _mm_add_ps(fiy1,ty);
1663 fiz1 = _mm_add_ps(fiz1,tz);
1665 fjx2 = _mm_add_ps(fjx2,tx);
1666 fjy2 = _mm_add_ps(fjy2,ty);
1667 fjz2 = _mm_add_ps(fjz2,tz);
1669 /**************************
1670 * CALCULATE INTERACTIONS *
1671 **************************/
1673 r13 = _mm_mul_ps(rsq13,rinv13);
1675 /* EWALD ELECTROSTATICS */
1677 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1678 ewrt = _mm_mul_ps(r13,ewtabscale);
1679 ewitab = _mm_cvttps_epi32(ewrt);
1680 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1681 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1682 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1684 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1685 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1689 /* Calculate temporary vectorial force */
1690 tx = _mm_mul_ps(fscal,dx13);
1691 ty = _mm_mul_ps(fscal,dy13);
1692 tz = _mm_mul_ps(fscal,dz13);
1694 /* Update vectorial force */
1695 fix1 = _mm_add_ps(fix1,tx);
1696 fiy1 = _mm_add_ps(fiy1,ty);
1697 fiz1 = _mm_add_ps(fiz1,tz);
1699 fjx3 = _mm_add_ps(fjx3,tx);
1700 fjy3 = _mm_add_ps(fjy3,ty);
1701 fjz3 = _mm_add_ps(fjz3,tz);
1703 /**************************
1704 * CALCULATE INTERACTIONS *
1705 **************************/
1707 r21 = _mm_mul_ps(rsq21,rinv21);
1709 /* EWALD ELECTROSTATICS */
1711 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1712 ewrt = _mm_mul_ps(r21,ewtabscale);
1713 ewitab = _mm_cvttps_epi32(ewrt);
1714 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1715 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1716 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1718 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1719 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1723 /* Calculate temporary vectorial force */
1724 tx = _mm_mul_ps(fscal,dx21);
1725 ty = _mm_mul_ps(fscal,dy21);
1726 tz = _mm_mul_ps(fscal,dz21);
1728 /* Update vectorial force */
1729 fix2 = _mm_add_ps(fix2,tx);
1730 fiy2 = _mm_add_ps(fiy2,ty);
1731 fiz2 = _mm_add_ps(fiz2,tz);
1733 fjx1 = _mm_add_ps(fjx1,tx);
1734 fjy1 = _mm_add_ps(fjy1,ty);
1735 fjz1 = _mm_add_ps(fjz1,tz);
1737 /**************************
1738 * CALCULATE INTERACTIONS *
1739 **************************/
1741 r22 = _mm_mul_ps(rsq22,rinv22);
1743 /* EWALD ELECTROSTATICS */
1745 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1746 ewrt = _mm_mul_ps(r22,ewtabscale);
1747 ewitab = _mm_cvttps_epi32(ewrt);
1748 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1749 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1750 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1752 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1753 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1757 /* Calculate temporary vectorial force */
1758 tx = _mm_mul_ps(fscal,dx22);
1759 ty = _mm_mul_ps(fscal,dy22);
1760 tz = _mm_mul_ps(fscal,dz22);
1762 /* Update vectorial force */
1763 fix2 = _mm_add_ps(fix2,tx);
1764 fiy2 = _mm_add_ps(fiy2,ty);
1765 fiz2 = _mm_add_ps(fiz2,tz);
1767 fjx2 = _mm_add_ps(fjx2,tx);
1768 fjy2 = _mm_add_ps(fjy2,ty);
1769 fjz2 = _mm_add_ps(fjz2,tz);
1771 /**************************
1772 * CALCULATE INTERACTIONS *
1773 **************************/
1775 r23 = _mm_mul_ps(rsq23,rinv23);
1777 /* EWALD ELECTROSTATICS */
1779 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1780 ewrt = _mm_mul_ps(r23,ewtabscale);
1781 ewitab = _mm_cvttps_epi32(ewrt);
1782 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1783 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1784 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1786 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1787 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1791 /* Calculate temporary vectorial force */
1792 tx = _mm_mul_ps(fscal,dx23);
1793 ty = _mm_mul_ps(fscal,dy23);
1794 tz = _mm_mul_ps(fscal,dz23);
1796 /* Update vectorial force */
1797 fix2 = _mm_add_ps(fix2,tx);
1798 fiy2 = _mm_add_ps(fiy2,ty);
1799 fiz2 = _mm_add_ps(fiz2,tz);
1801 fjx3 = _mm_add_ps(fjx3,tx);
1802 fjy3 = _mm_add_ps(fjy3,ty);
1803 fjz3 = _mm_add_ps(fjz3,tz);
1805 /**************************
1806 * CALCULATE INTERACTIONS *
1807 **************************/
1809 r31 = _mm_mul_ps(rsq31,rinv31);
1811 /* EWALD ELECTROSTATICS */
1813 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1814 ewrt = _mm_mul_ps(r31,ewtabscale);
1815 ewitab = _mm_cvttps_epi32(ewrt);
1816 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1817 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1818 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1820 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1821 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1825 /* Calculate temporary vectorial force */
1826 tx = _mm_mul_ps(fscal,dx31);
1827 ty = _mm_mul_ps(fscal,dy31);
1828 tz = _mm_mul_ps(fscal,dz31);
1830 /* Update vectorial force */
1831 fix3 = _mm_add_ps(fix3,tx);
1832 fiy3 = _mm_add_ps(fiy3,ty);
1833 fiz3 = _mm_add_ps(fiz3,tz);
1835 fjx1 = _mm_add_ps(fjx1,tx);
1836 fjy1 = _mm_add_ps(fjy1,ty);
1837 fjz1 = _mm_add_ps(fjz1,tz);
1839 /**************************
1840 * CALCULATE INTERACTIONS *
1841 **************************/
1843 r32 = _mm_mul_ps(rsq32,rinv32);
1845 /* EWALD ELECTROSTATICS */
1847 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1848 ewrt = _mm_mul_ps(r32,ewtabscale);
1849 ewitab = _mm_cvttps_epi32(ewrt);
1850 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1851 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1852 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1854 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1855 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1859 /* Calculate temporary vectorial force */
1860 tx = _mm_mul_ps(fscal,dx32);
1861 ty = _mm_mul_ps(fscal,dy32);
1862 tz = _mm_mul_ps(fscal,dz32);
1864 /* Update vectorial force */
1865 fix3 = _mm_add_ps(fix3,tx);
1866 fiy3 = _mm_add_ps(fiy3,ty);
1867 fiz3 = _mm_add_ps(fiz3,tz);
1869 fjx2 = _mm_add_ps(fjx2,tx);
1870 fjy2 = _mm_add_ps(fjy2,ty);
1871 fjz2 = _mm_add_ps(fjz2,tz);
1873 /**************************
1874 * CALCULATE INTERACTIONS *
1875 **************************/
1877 r33 = _mm_mul_ps(rsq33,rinv33);
1879 /* EWALD ELECTROSTATICS */
1881 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1882 ewrt = _mm_mul_ps(r33,ewtabscale);
1883 ewitab = _mm_cvttps_epi32(ewrt);
1884 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1885 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1886 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1888 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1889 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1893 /* Calculate temporary vectorial force */
1894 tx = _mm_mul_ps(fscal,dx33);
1895 ty = _mm_mul_ps(fscal,dy33);
1896 tz = _mm_mul_ps(fscal,dz33);
1898 /* Update vectorial force */
1899 fix3 = _mm_add_ps(fix3,tx);
1900 fiy3 = _mm_add_ps(fiy3,ty);
1901 fiz3 = _mm_add_ps(fiz3,tz);
1903 fjx3 = _mm_add_ps(fjx3,tx);
1904 fjy3 = _mm_add_ps(fjy3,ty);
1905 fjz3 = _mm_add_ps(fjz3,tz);
1907 fjptrA = f+j_coord_offsetA;
1908 fjptrB = f+j_coord_offsetB;
1909 fjptrC = f+j_coord_offsetC;
1910 fjptrD = f+j_coord_offsetD;
1912 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1913 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1914 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1916 /* Inner loop uses 354 flops */
1919 if(jidx<j_index_end)
1922 /* Get j neighbor index, and coordinate index */
1923 jnrlistA = jjnr[jidx];
1924 jnrlistB = jjnr[jidx+1];
1925 jnrlistC = jjnr[jidx+2];
1926 jnrlistD = jjnr[jidx+3];
1927 /* Sign of each element will be negative for non-real atoms.
1928 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1929 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1931 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1932 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1933 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1934 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1935 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1936 j_coord_offsetA = DIM*jnrA;
1937 j_coord_offsetB = DIM*jnrB;
1938 j_coord_offsetC = DIM*jnrC;
1939 j_coord_offsetD = DIM*jnrD;
1941 /* load j atom coordinates */
1942 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1943 x+j_coord_offsetC,x+j_coord_offsetD,
1944 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1945 &jy2,&jz2,&jx3,&jy3,&jz3);
1947 /* Calculate displacement vector */
1948 dx00 = _mm_sub_ps(ix0,jx0);
1949 dy00 = _mm_sub_ps(iy0,jy0);
1950 dz00 = _mm_sub_ps(iz0,jz0);
1951 dx11 = _mm_sub_ps(ix1,jx1);
1952 dy11 = _mm_sub_ps(iy1,jy1);
1953 dz11 = _mm_sub_ps(iz1,jz1);
1954 dx12 = _mm_sub_ps(ix1,jx2);
1955 dy12 = _mm_sub_ps(iy1,jy2);
1956 dz12 = _mm_sub_ps(iz1,jz2);
1957 dx13 = _mm_sub_ps(ix1,jx3);
1958 dy13 = _mm_sub_ps(iy1,jy3);
1959 dz13 = _mm_sub_ps(iz1,jz3);
1960 dx21 = _mm_sub_ps(ix2,jx1);
1961 dy21 = _mm_sub_ps(iy2,jy1);
1962 dz21 = _mm_sub_ps(iz2,jz1);
1963 dx22 = _mm_sub_ps(ix2,jx2);
1964 dy22 = _mm_sub_ps(iy2,jy2);
1965 dz22 = _mm_sub_ps(iz2,jz2);
1966 dx23 = _mm_sub_ps(ix2,jx3);
1967 dy23 = _mm_sub_ps(iy2,jy3);
1968 dz23 = _mm_sub_ps(iz2,jz3);
1969 dx31 = _mm_sub_ps(ix3,jx1);
1970 dy31 = _mm_sub_ps(iy3,jy1);
1971 dz31 = _mm_sub_ps(iz3,jz1);
1972 dx32 = _mm_sub_ps(ix3,jx2);
1973 dy32 = _mm_sub_ps(iy3,jy2);
1974 dz32 = _mm_sub_ps(iz3,jz2);
1975 dx33 = _mm_sub_ps(ix3,jx3);
1976 dy33 = _mm_sub_ps(iy3,jy3);
1977 dz33 = _mm_sub_ps(iz3,jz3);
1979 /* Calculate squared distance and things based on it */
1980 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1981 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1982 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1983 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1984 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1985 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1986 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1987 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1988 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1989 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1991 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1992 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1993 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1994 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1995 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1996 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1997 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1998 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1999 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2001 rinvsq00 = gmx_mm_inv_ps(rsq00);
2002 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2003 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2004 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2005 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2006 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2007 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2008 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2009 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2010 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2012 fjx0 = _mm_setzero_ps();
2013 fjy0 = _mm_setzero_ps();
2014 fjz0 = _mm_setzero_ps();
2015 fjx1 = _mm_setzero_ps();
2016 fjy1 = _mm_setzero_ps();
2017 fjz1 = _mm_setzero_ps();
2018 fjx2 = _mm_setzero_ps();
2019 fjy2 = _mm_setzero_ps();
2020 fjz2 = _mm_setzero_ps();
2021 fjx3 = _mm_setzero_ps();
2022 fjy3 = _mm_setzero_ps();
2023 fjz3 = _mm_setzero_ps();
2025 /**************************
2026 * CALCULATE INTERACTIONS *
2027 **************************/
2029 /* LENNARD-JONES DISPERSION/REPULSION */
2031 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2032 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2036 fscal = _mm_andnot_ps(dummy_mask,fscal);
2038 /* Calculate temporary vectorial force */
2039 tx = _mm_mul_ps(fscal,dx00);
2040 ty = _mm_mul_ps(fscal,dy00);
2041 tz = _mm_mul_ps(fscal,dz00);
2043 /* Update vectorial force */
2044 fix0 = _mm_add_ps(fix0,tx);
2045 fiy0 = _mm_add_ps(fiy0,ty);
2046 fiz0 = _mm_add_ps(fiz0,tz);
2048 fjx0 = _mm_add_ps(fjx0,tx);
2049 fjy0 = _mm_add_ps(fjy0,ty);
2050 fjz0 = _mm_add_ps(fjz0,tz);
2052 /**************************
2053 * CALCULATE INTERACTIONS *
2054 **************************/
2056 r11 = _mm_mul_ps(rsq11,rinv11);
2057 r11 = _mm_andnot_ps(dummy_mask,r11);
2059 /* EWALD ELECTROSTATICS */
2061 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2062 ewrt = _mm_mul_ps(r11,ewtabscale);
2063 ewitab = _mm_cvttps_epi32(ewrt);
2064 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2065 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2066 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2068 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2069 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2073 fscal = _mm_andnot_ps(dummy_mask,fscal);
2075 /* Calculate temporary vectorial force */
2076 tx = _mm_mul_ps(fscal,dx11);
2077 ty = _mm_mul_ps(fscal,dy11);
2078 tz = _mm_mul_ps(fscal,dz11);
2080 /* Update vectorial force */
2081 fix1 = _mm_add_ps(fix1,tx);
2082 fiy1 = _mm_add_ps(fiy1,ty);
2083 fiz1 = _mm_add_ps(fiz1,tz);
2085 fjx1 = _mm_add_ps(fjx1,tx);
2086 fjy1 = _mm_add_ps(fjy1,ty);
2087 fjz1 = _mm_add_ps(fjz1,tz);
2089 /**************************
2090 * CALCULATE INTERACTIONS *
2091 **************************/
2093 r12 = _mm_mul_ps(rsq12,rinv12);
2094 r12 = _mm_andnot_ps(dummy_mask,r12);
2096 /* EWALD ELECTROSTATICS */
2098 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2099 ewrt = _mm_mul_ps(r12,ewtabscale);
2100 ewitab = _mm_cvttps_epi32(ewrt);
2101 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2102 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2103 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2105 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2106 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2110 fscal = _mm_andnot_ps(dummy_mask,fscal);
2112 /* Calculate temporary vectorial force */
2113 tx = _mm_mul_ps(fscal,dx12);
2114 ty = _mm_mul_ps(fscal,dy12);
2115 tz = _mm_mul_ps(fscal,dz12);
2117 /* Update vectorial force */
2118 fix1 = _mm_add_ps(fix1,tx);
2119 fiy1 = _mm_add_ps(fiy1,ty);
2120 fiz1 = _mm_add_ps(fiz1,tz);
2122 fjx2 = _mm_add_ps(fjx2,tx);
2123 fjy2 = _mm_add_ps(fjy2,ty);
2124 fjz2 = _mm_add_ps(fjz2,tz);
2126 /**************************
2127 * CALCULATE INTERACTIONS *
2128 **************************/
2130 r13 = _mm_mul_ps(rsq13,rinv13);
2131 r13 = _mm_andnot_ps(dummy_mask,r13);
2133 /* EWALD ELECTROSTATICS */
2135 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2136 ewrt = _mm_mul_ps(r13,ewtabscale);
2137 ewitab = _mm_cvttps_epi32(ewrt);
2138 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2139 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2140 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2142 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2143 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2147 fscal = _mm_andnot_ps(dummy_mask,fscal);
2149 /* Calculate temporary vectorial force */
2150 tx = _mm_mul_ps(fscal,dx13);
2151 ty = _mm_mul_ps(fscal,dy13);
2152 tz = _mm_mul_ps(fscal,dz13);
2154 /* Update vectorial force */
2155 fix1 = _mm_add_ps(fix1,tx);
2156 fiy1 = _mm_add_ps(fiy1,ty);
2157 fiz1 = _mm_add_ps(fiz1,tz);
2159 fjx3 = _mm_add_ps(fjx3,tx);
2160 fjy3 = _mm_add_ps(fjy3,ty);
2161 fjz3 = _mm_add_ps(fjz3,tz);
2163 /**************************
2164 * CALCULATE INTERACTIONS *
2165 **************************/
2167 r21 = _mm_mul_ps(rsq21,rinv21);
2168 r21 = _mm_andnot_ps(dummy_mask,r21);
2170 /* EWALD ELECTROSTATICS */
2172 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2173 ewrt = _mm_mul_ps(r21,ewtabscale);
2174 ewitab = _mm_cvttps_epi32(ewrt);
2175 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2176 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2177 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2179 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2180 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2184 fscal = _mm_andnot_ps(dummy_mask,fscal);
2186 /* Calculate temporary vectorial force */
2187 tx = _mm_mul_ps(fscal,dx21);
2188 ty = _mm_mul_ps(fscal,dy21);
2189 tz = _mm_mul_ps(fscal,dz21);
2191 /* Update vectorial force */
2192 fix2 = _mm_add_ps(fix2,tx);
2193 fiy2 = _mm_add_ps(fiy2,ty);
2194 fiz2 = _mm_add_ps(fiz2,tz);
2196 fjx1 = _mm_add_ps(fjx1,tx);
2197 fjy1 = _mm_add_ps(fjy1,ty);
2198 fjz1 = _mm_add_ps(fjz1,tz);
2200 /**************************
2201 * CALCULATE INTERACTIONS *
2202 **************************/
2204 r22 = _mm_mul_ps(rsq22,rinv22);
2205 r22 = _mm_andnot_ps(dummy_mask,r22);
2207 /* EWALD ELECTROSTATICS */
2209 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2210 ewrt = _mm_mul_ps(r22,ewtabscale);
2211 ewitab = _mm_cvttps_epi32(ewrt);
2212 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2213 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2214 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2216 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2217 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2221 fscal = _mm_andnot_ps(dummy_mask,fscal);
2223 /* Calculate temporary vectorial force */
2224 tx = _mm_mul_ps(fscal,dx22);
2225 ty = _mm_mul_ps(fscal,dy22);
2226 tz = _mm_mul_ps(fscal,dz22);
2228 /* Update vectorial force */
2229 fix2 = _mm_add_ps(fix2,tx);
2230 fiy2 = _mm_add_ps(fiy2,ty);
2231 fiz2 = _mm_add_ps(fiz2,tz);
2233 fjx2 = _mm_add_ps(fjx2,tx);
2234 fjy2 = _mm_add_ps(fjy2,ty);
2235 fjz2 = _mm_add_ps(fjz2,tz);
2237 /**************************
2238 * CALCULATE INTERACTIONS *
2239 **************************/
2241 r23 = _mm_mul_ps(rsq23,rinv23);
2242 r23 = _mm_andnot_ps(dummy_mask,r23);
2244 /* EWALD ELECTROSTATICS */
2246 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2247 ewrt = _mm_mul_ps(r23,ewtabscale);
2248 ewitab = _mm_cvttps_epi32(ewrt);
2249 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2250 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2251 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2253 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2254 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2258 fscal = _mm_andnot_ps(dummy_mask,fscal);
2260 /* Calculate temporary vectorial force */
2261 tx = _mm_mul_ps(fscal,dx23);
2262 ty = _mm_mul_ps(fscal,dy23);
2263 tz = _mm_mul_ps(fscal,dz23);
2265 /* Update vectorial force */
2266 fix2 = _mm_add_ps(fix2,tx);
2267 fiy2 = _mm_add_ps(fiy2,ty);
2268 fiz2 = _mm_add_ps(fiz2,tz);
2270 fjx3 = _mm_add_ps(fjx3,tx);
2271 fjy3 = _mm_add_ps(fjy3,ty);
2272 fjz3 = _mm_add_ps(fjz3,tz);
2274 /**************************
2275 * CALCULATE INTERACTIONS *
2276 **************************/
2278 r31 = _mm_mul_ps(rsq31,rinv31);
2279 r31 = _mm_andnot_ps(dummy_mask,r31);
2281 /* EWALD ELECTROSTATICS */
2283 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2284 ewrt = _mm_mul_ps(r31,ewtabscale);
2285 ewitab = _mm_cvttps_epi32(ewrt);
2286 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2287 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2288 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2290 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2291 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2295 fscal = _mm_andnot_ps(dummy_mask,fscal);
2297 /* Calculate temporary vectorial force */
2298 tx = _mm_mul_ps(fscal,dx31);
2299 ty = _mm_mul_ps(fscal,dy31);
2300 tz = _mm_mul_ps(fscal,dz31);
2302 /* Update vectorial force */
2303 fix3 = _mm_add_ps(fix3,tx);
2304 fiy3 = _mm_add_ps(fiy3,ty);
2305 fiz3 = _mm_add_ps(fiz3,tz);
2307 fjx1 = _mm_add_ps(fjx1,tx);
2308 fjy1 = _mm_add_ps(fjy1,ty);
2309 fjz1 = _mm_add_ps(fjz1,tz);
2311 /**************************
2312 * CALCULATE INTERACTIONS *
2313 **************************/
2315 r32 = _mm_mul_ps(rsq32,rinv32);
2316 r32 = _mm_andnot_ps(dummy_mask,r32);
2318 /* EWALD ELECTROSTATICS */
2320 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2321 ewrt = _mm_mul_ps(r32,ewtabscale);
2322 ewitab = _mm_cvttps_epi32(ewrt);
2323 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2324 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2325 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2327 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2328 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2332 fscal = _mm_andnot_ps(dummy_mask,fscal);
2334 /* Calculate temporary vectorial force */
2335 tx = _mm_mul_ps(fscal,dx32);
2336 ty = _mm_mul_ps(fscal,dy32);
2337 tz = _mm_mul_ps(fscal,dz32);
2339 /* Update vectorial force */
2340 fix3 = _mm_add_ps(fix3,tx);
2341 fiy3 = _mm_add_ps(fiy3,ty);
2342 fiz3 = _mm_add_ps(fiz3,tz);
2344 fjx2 = _mm_add_ps(fjx2,tx);
2345 fjy2 = _mm_add_ps(fjy2,ty);
2346 fjz2 = _mm_add_ps(fjz2,tz);
2348 /**************************
2349 * CALCULATE INTERACTIONS *
2350 **************************/
2352 r33 = _mm_mul_ps(rsq33,rinv33);
2353 r33 = _mm_andnot_ps(dummy_mask,r33);
2355 /* EWALD ELECTROSTATICS */
2357 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2358 ewrt = _mm_mul_ps(r33,ewtabscale);
2359 ewitab = _mm_cvttps_epi32(ewrt);
2360 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2361 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2362 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2364 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2365 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2369 fscal = _mm_andnot_ps(dummy_mask,fscal);
2371 /* Calculate temporary vectorial force */
2372 tx = _mm_mul_ps(fscal,dx33);
2373 ty = _mm_mul_ps(fscal,dy33);
2374 tz = _mm_mul_ps(fscal,dz33);
2376 /* Update vectorial force */
2377 fix3 = _mm_add_ps(fix3,tx);
2378 fiy3 = _mm_add_ps(fiy3,ty);
2379 fiz3 = _mm_add_ps(fiz3,tz);
2381 fjx3 = _mm_add_ps(fjx3,tx);
2382 fjy3 = _mm_add_ps(fjy3,ty);
2383 fjz3 = _mm_add_ps(fjz3,tz);
2385 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2386 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2387 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2388 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2390 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2391 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2392 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2394 /* Inner loop uses 363 flops */
2397 /* End of innermost loop */
2399 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2400 f+i_coord_offset,fshift+i_shift_offset);
2402 /* Increment number of inner iterations */
2403 inneriter += j_index_end - j_index_start;
2405 /* Outer loop uses 24 flops */
2408 /* Increment number of outer iterations */
2411 /* Update outer/inner flops */
2413 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*363);