2 * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
99 __m128i ifour = _mm_set1_epi32(4);
100 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
102 __m128 dummy_mask,cutoff_mask;
103 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
104 __m128 one = _mm_set1_ps(1.0);
105 __m128 two = _mm_set1_ps(2.0);
111 jindex = nlist->jindex;
113 shiftidx = nlist->shift;
115 shiftvec = fr->shift_vec[0];
116 fshift = fr->fshift[0];
117 facel = _mm_set1_ps(fr->epsfac);
118 charge = mdatoms->chargeA;
119 nvdwtype = fr->ntype;
121 vdwtype = mdatoms->typeA;
123 vftab = kernel_data->table_elec->data;
124 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
126 /* Setup water-specific parameters */
127 inr = nlist->iinr[0];
128 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
129 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
130 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
131 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
133 jq0 = _mm_set1_ps(charge[inr+0]);
134 jq1 = _mm_set1_ps(charge[inr+1]);
135 jq2 = _mm_set1_ps(charge[inr+2]);
136 vdwjidx0A = 2*vdwtype[inr+0];
137 qq00 = _mm_mul_ps(iq0,jq0);
138 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
139 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
140 qq01 = _mm_mul_ps(iq0,jq1);
141 qq02 = _mm_mul_ps(iq0,jq2);
142 qq10 = _mm_mul_ps(iq1,jq0);
143 qq11 = _mm_mul_ps(iq1,jq1);
144 qq12 = _mm_mul_ps(iq1,jq2);
145 qq20 = _mm_mul_ps(iq2,jq0);
146 qq21 = _mm_mul_ps(iq2,jq1);
147 qq22 = _mm_mul_ps(iq2,jq2);
149 /* Avoid stupid compiler warnings */
150 jnrA = jnrB = jnrC = jnrD = 0;
159 for(iidx=0;iidx<4*DIM;iidx++)
164 /* Start outer loop over neighborlists */
165 for(iidx=0; iidx<nri; iidx++)
167 /* Load shift vector for this list */
168 i_shift_offset = DIM*shiftidx[iidx];
170 /* Load limits for loop over neighbors */
171 j_index_start = jindex[iidx];
172 j_index_end = jindex[iidx+1];
174 /* Get outer coordinate index */
176 i_coord_offset = DIM*inr;
178 /* Load i particle coords and add shift vector */
179 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
180 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
182 fix0 = _mm_setzero_ps();
183 fiy0 = _mm_setzero_ps();
184 fiz0 = _mm_setzero_ps();
185 fix1 = _mm_setzero_ps();
186 fiy1 = _mm_setzero_ps();
187 fiz1 = _mm_setzero_ps();
188 fix2 = _mm_setzero_ps();
189 fiy2 = _mm_setzero_ps();
190 fiz2 = _mm_setzero_ps();
192 /* Reset potential sums */
193 velecsum = _mm_setzero_ps();
194 vvdwsum = _mm_setzero_ps();
196 /* Start inner kernel loop */
197 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200 /* Get j neighbor index, and coordinate index */
205 j_coord_offsetA = DIM*jnrA;
206 j_coord_offsetB = DIM*jnrB;
207 j_coord_offsetC = DIM*jnrC;
208 j_coord_offsetD = DIM*jnrD;
210 /* load j atom coordinates */
211 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
212 x+j_coord_offsetC,x+j_coord_offsetD,
213 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
215 /* Calculate displacement vector */
216 dx00 = _mm_sub_ps(ix0,jx0);
217 dy00 = _mm_sub_ps(iy0,jy0);
218 dz00 = _mm_sub_ps(iz0,jz0);
219 dx01 = _mm_sub_ps(ix0,jx1);
220 dy01 = _mm_sub_ps(iy0,jy1);
221 dz01 = _mm_sub_ps(iz0,jz1);
222 dx02 = _mm_sub_ps(ix0,jx2);
223 dy02 = _mm_sub_ps(iy0,jy2);
224 dz02 = _mm_sub_ps(iz0,jz2);
225 dx10 = _mm_sub_ps(ix1,jx0);
226 dy10 = _mm_sub_ps(iy1,jy0);
227 dz10 = _mm_sub_ps(iz1,jz0);
228 dx11 = _mm_sub_ps(ix1,jx1);
229 dy11 = _mm_sub_ps(iy1,jy1);
230 dz11 = _mm_sub_ps(iz1,jz1);
231 dx12 = _mm_sub_ps(ix1,jx2);
232 dy12 = _mm_sub_ps(iy1,jy2);
233 dz12 = _mm_sub_ps(iz1,jz2);
234 dx20 = _mm_sub_ps(ix2,jx0);
235 dy20 = _mm_sub_ps(iy2,jy0);
236 dz20 = _mm_sub_ps(iz2,jz0);
237 dx21 = _mm_sub_ps(ix2,jx1);
238 dy21 = _mm_sub_ps(iy2,jy1);
239 dz21 = _mm_sub_ps(iz2,jz1);
240 dx22 = _mm_sub_ps(ix2,jx2);
241 dy22 = _mm_sub_ps(iy2,jy2);
242 dz22 = _mm_sub_ps(iz2,jz2);
244 /* Calculate squared distance and things based on it */
245 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
246 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
247 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
248 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
249 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
250 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
251 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
252 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
253 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
255 rinv00 = gmx_mm_invsqrt_ps(rsq00);
256 rinv01 = gmx_mm_invsqrt_ps(rsq01);
257 rinv02 = gmx_mm_invsqrt_ps(rsq02);
258 rinv10 = gmx_mm_invsqrt_ps(rsq10);
259 rinv11 = gmx_mm_invsqrt_ps(rsq11);
260 rinv12 = gmx_mm_invsqrt_ps(rsq12);
261 rinv20 = gmx_mm_invsqrt_ps(rsq20);
262 rinv21 = gmx_mm_invsqrt_ps(rsq21);
263 rinv22 = gmx_mm_invsqrt_ps(rsq22);
265 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
267 fjx0 = _mm_setzero_ps();
268 fjy0 = _mm_setzero_ps();
269 fjz0 = _mm_setzero_ps();
270 fjx1 = _mm_setzero_ps();
271 fjy1 = _mm_setzero_ps();
272 fjz1 = _mm_setzero_ps();
273 fjx2 = _mm_setzero_ps();
274 fjy2 = _mm_setzero_ps();
275 fjz2 = _mm_setzero_ps();
277 /**************************
278 * CALCULATE INTERACTIONS *
279 **************************/
281 r00 = _mm_mul_ps(rsq00,rinv00);
283 /* Calculate table index by multiplying r with table scale and truncate to integer */
284 rt = _mm_mul_ps(r00,vftabscale);
285 vfitab = _mm_cvttps_epi32(rt);
287 vfeps = _mm_frcz_ps(rt);
289 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
291 twovfeps = _mm_add_ps(vfeps,vfeps);
292 vfitab = _mm_slli_epi32(vfitab,2);
294 /* CUBIC SPLINE TABLE ELECTROSTATICS */
295 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
296 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
297 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
298 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
299 _MM_TRANSPOSE4_PS(Y,F,G,H);
300 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
301 VV = _mm_macc_ps(vfeps,Fp,Y);
302 velec = _mm_mul_ps(qq00,VV);
303 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
304 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
306 /* LENNARD-JONES DISPERSION/REPULSION */
308 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
309 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
310 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
311 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
312 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
314 /* Update potential sum for this i atom from the interaction with this j atom. */
315 velecsum = _mm_add_ps(velecsum,velec);
316 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
318 fscal = _mm_add_ps(felec,fvdw);
320 /* Update vectorial force */
321 fix0 = _mm_macc_ps(dx00,fscal,fix0);
322 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
323 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
325 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
326 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
327 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
329 /**************************
330 * CALCULATE INTERACTIONS *
331 **************************/
333 r01 = _mm_mul_ps(rsq01,rinv01);
335 /* Calculate table index by multiplying r with table scale and truncate to integer */
336 rt = _mm_mul_ps(r01,vftabscale);
337 vfitab = _mm_cvttps_epi32(rt);
339 vfeps = _mm_frcz_ps(rt);
341 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
343 twovfeps = _mm_add_ps(vfeps,vfeps);
344 vfitab = _mm_slli_epi32(vfitab,2);
346 /* CUBIC SPLINE TABLE ELECTROSTATICS */
347 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
348 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
349 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
350 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
351 _MM_TRANSPOSE4_PS(Y,F,G,H);
352 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
353 VV = _mm_macc_ps(vfeps,Fp,Y);
354 velec = _mm_mul_ps(qq01,VV);
355 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
356 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
358 /* Update potential sum for this i atom from the interaction with this j atom. */
359 velecsum = _mm_add_ps(velecsum,velec);
363 /* Update vectorial force */
364 fix0 = _mm_macc_ps(dx01,fscal,fix0);
365 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
366 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
368 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
369 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
370 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
372 /**************************
373 * CALCULATE INTERACTIONS *
374 **************************/
376 r02 = _mm_mul_ps(rsq02,rinv02);
378 /* Calculate table index by multiplying r with table scale and truncate to integer */
379 rt = _mm_mul_ps(r02,vftabscale);
380 vfitab = _mm_cvttps_epi32(rt);
382 vfeps = _mm_frcz_ps(rt);
384 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
386 twovfeps = _mm_add_ps(vfeps,vfeps);
387 vfitab = _mm_slli_epi32(vfitab,2);
389 /* CUBIC SPLINE TABLE ELECTROSTATICS */
390 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
391 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
392 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
393 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
394 _MM_TRANSPOSE4_PS(Y,F,G,H);
395 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
396 VV = _mm_macc_ps(vfeps,Fp,Y);
397 velec = _mm_mul_ps(qq02,VV);
398 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
399 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
401 /* Update potential sum for this i atom from the interaction with this j atom. */
402 velecsum = _mm_add_ps(velecsum,velec);
406 /* Update vectorial force */
407 fix0 = _mm_macc_ps(dx02,fscal,fix0);
408 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
409 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
411 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
412 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
413 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
415 /**************************
416 * CALCULATE INTERACTIONS *
417 **************************/
419 r10 = _mm_mul_ps(rsq10,rinv10);
421 /* Calculate table index by multiplying r with table scale and truncate to integer */
422 rt = _mm_mul_ps(r10,vftabscale);
423 vfitab = _mm_cvttps_epi32(rt);
425 vfeps = _mm_frcz_ps(rt);
427 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
429 twovfeps = _mm_add_ps(vfeps,vfeps);
430 vfitab = _mm_slli_epi32(vfitab,2);
432 /* CUBIC SPLINE TABLE ELECTROSTATICS */
433 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
434 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
435 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
436 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
437 _MM_TRANSPOSE4_PS(Y,F,G,H);
438 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
439 VV = _mm_macc_ps(vfeps,Fp,Y);
440 velec = _mm_mul_ps(qq10,VV);
441 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
442 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
444 /* Update potential sum for this i atom from the interaction with this j atom. */
445 velecsum = _mm_add_ps(velecsum,velec);
449 /* Update vectorial force */
450 fix1 = _mm_macc_ps(dx10,fscal,fix1);
451 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
452 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
454 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
455 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
456 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
458 /**************************
459 * CALCULATE INTERACTIONS *
460 **************************/
462 r11 = _mm_mul_ps(rsq11,rinv11);
464 /* Calculate table index by multiplying r with table scale and truncate to integer */
465 rt = _mm_mul_ps(r11,vftabscale);
466 vfitab = _mm_cvttps_epi32(rt);
468 vfeps = _mm_frcz_ps(rt);
470 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
472 twovfeps = _mm_add_ps(vfeps,vfeps);
473 vfitab = _mm_slli_epi32(vfitab,2);
475 /* CUBIC SPLINE TABLE ELECTROSTATICS */
476 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
477 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
478 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
479 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
480 _MM_TRANSPOSE4_PS(Y,F,G,H);
481 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
482 VV = _mm_macc_ps(vfeps,Fp,Y);
483 velec = _mm_mul_ps(qq11,VV);
484 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
485 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
487 /* Update potential sum for this i atom from the interaction with this j atom. */
488 velecsum = _mm_add_ps(velecsum,velec);
492 /* Update vectorial force */
493 fix1 = _mm_macc_ps(dx11,fscal,fix1);
494 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
495 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
497 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
498 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
499 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
501 /**************************
502 * CALCULATE INTERACTIONS *
503 **************************/
505 r12 = _mm_mul_ps(rsq12,rinv12);
507 /* Calculate table index by multiplying r with table scale and truncate to integer */
508 rt = _mm_mul_ps(r12,vftabscale);
509 vfitab = _mm_cvttps_epi32(rt);
511 vfeps = _mm_frcz_ps(rt);
513 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
515 twovfeps = _mm_add_ps(vfeps,vfeps);
516 vfitab = _mm_slli_epi32(vfitab,2);
518 /* CUBIC SPLINE TABLE ELECTROSTATICS */
519 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
520 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
521 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
522 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
523 _MM_TRANSPOSE4_PS(Y,F,G,H);
524 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
525 VV = _mm_macc_ps(vfeps,Fp,Y);
526 velec = _mm_mul_ps(qq12,VV);
527 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
528 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
530 /* Update potential sum for this i atom from the interaction with this j atom. */
531 velecsum = _mm_add_ps(velecsum,velec);
535 /* Update vectorial force */
536 fix1 = _mm_macc_ps(dx12,fscal,fix1);
537 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
538 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
540 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
541 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
542 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 r20 = _mm_mul_ps(rsq20,rinv20);
550 /* Calculate table index by multiplying r with table scale and truncate to integer */
551 rt = _mm_mul_ps(r20,vftabscale);
552 vfitab = _mm_cvttps_epi32(rt);
554 vfeps = _mm_frcz_ps(rt);
556 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
558 twovfeps = _mm_add_ps(vfeps,vfeps);
559 vfitab = _mm_slli_epi32(vfitab,2);
561 /* CUBIC SPLINE TABLE ELECTROSTATICS */
562 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
563 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
564 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
565 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
566 _MM_TRANSPOSE4_PS(Y,F,G,H);
567 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
568 VV = _mm_macc_ps(vfeps,Fp,Y);
569 velec = _mm_mul_ps(qq20,VV);
570 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
571 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
573 /* Update potential sum for this i atom from the interaction with this j atom. */
574 velecsum = _mm_add_ps(velecsum,velec);
578 /* Update vectorial force */
579 fix2 = _mm_macc_ps(dx20,fscal,fix2);
580 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
581 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
583 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
584 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
585 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
587 /**************************
588 * CALCULATE INTERACTIONS *
589 **************************/
591 r21 = _mm_mul_ps(rsq21,rinv21);
593 /* Calculate table index by multiplying r with table scale and truncate to integer */
594 rt = _mm_mul_ps(r21,vftabscale);
595 vfitab = _mm_cvttps_epi32(rt);
597 vfeps = _mm_frcz_ps(rt);
599 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
601 twovfeps = _mm_add_ps(vfeps,vfeps);
602 vfitab = _mm_slli_epi32(vfitab,2);
604 /* CUBIC SPLINE TABLE ELECTROSTATICS */
605 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
606 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
607 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
608 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
609 _MM_TRANSPOSE4_PS(Y,F,G,H);
610 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
611 VV = _mm_macc_ps(vfeps,Fp,Y);
612 velec = _mm_mul_ps(qq21,VV);
613 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
614 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
616 /* Update potential sum for this i atom from the interaction with this j atom. */
617 velecsum = _mm_add_ps(velecsum,velec);
621 /* Update vectorial force */
622 fix2 = _mm_macc_ps(dx21,fscal,fix2);
623 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
624 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
626 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
627 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
628 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
630 /**************************
631 * CALCULATE INTERACTIONS *
632 **************************/
634 r22 = _mm_mul_ps(rsq22,rinv22);
636 /* Calculate table index by multiplying r with table scale and truncate to integer */
637 rt = _mm_mul_ps(r22,vftabscale);
638 vfitab = _mm_cvttps_epi32(rt);
640 vfeps = _mm_frcz_ps(rt);
642 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
644 twovfeps = _mm_add_ps(vfeps,vfeps);
645 vfitab = _mm_slli_epi32(vfitab,2);
647 /* CUBIC SPLINE TABLE ELECTROSTATICS */
648 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
649 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
650 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
651 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
652 _MM_TRANSPOSE4_PS(Y,F,G,H);
653 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
654 VV = _mm_macc_ps(vfeps,Fp,Y);
655 velec = _mm_mul_ps(qq22,VV);
656 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
657 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
659 /* Update potential sum for this i atom from the interaction with this j atom. */
660 velecsum = _mm_add_ps(velecsum,velec);
664 /* Update vectorial force */
665 fix2 = _mm_macc_ps(dx22,fscal,fix2);
666 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
667 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
669 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
670 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
671 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
673 fjptrA = f+j_coord_offsetA;
674 fjptrB = f+j_coord_offsetB;
675 fjptrC = f+j_coord_offsetC;
676 fjptrD = f+j_coord_offsetD;
678 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
679 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
681 /* Inner loop uses 427 flops */
687 /* Get j neighbor index, and coordinate index */
688 jnrlistA = jjnr[jidx];
689 jnrlistB = jjnr[jidx+1];
690 jnrlistC = jjnr[jidx+2];
691 jnrlistD = jjnr[jidx+3];
692 /* Sign of each element will be negative for non-real atoms.
693 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
694 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
696 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
697 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
698 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
699 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
700 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
701 j_coord_offsetA = DIM*jnrA;
702 j_coord_offsetB = DIM*jnrB;
703 j_coord_offsetC = DIM*jnrC;
704 j_coord_offsetD = DIM*jnrD;
706 /* load j atom coordinates */
707 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
708 x+j_coord_offsetC,x+j_coord_offsetD,
709 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
711 /* Calculate displacement vector */
712 dx00 = _mm_sub_ps(ix0,jx0);
713 dy00 = _mm_sub_ps(iy0,jy0);
714 dz00 = _mm_sub_ps(iz0,jz0);
715 dx01 = _mm_sub_ps(ix0,jx1);
716 dy01 = _mm_sub_ps(iy0,jy1);
717 dz01 = _mm_sub_ps(iz0,jz1);
718 dx02 = _mm_sub_ps(ix0,jx2);
719 dy02 = _mm_sub_ps(iy0,jy2);
720 dz02 = _mm_sub_ps(iz0,jz2);
721 dx10 = _mm_sub_ps(ix1,jx0);
722 dy10 = _mm_sub_ps(iy1,jy0);
723 dz10 = _mm_sub_ps(iz1,jz0);
724 dx11 = _mm_sub_ps(ix1,jx1);
725 dy11 = _mm_sub_ps(iy1,jy1);
726 dz11 = _mm_sub_ps(iz1,jz1);
727 dx12 = _mm_sub_ps(ix1,jx2);
728 dy12 = _mm_sub_ps(iy1,jy2);
729 dz12 = _mm_sub_ps(iz1,jz2);
730 dx20 = _mm_sub_ps(ix2,jx0);
731 dy20 = _mm_sub_ps(iy2,jy0);
732 dz20 = _mm_sub_ps(iz2,jz0);
733 dx21 = _mm_sub_ps(ix2,jx1);
734 dy21 = _mm_sub_ps(iy2,jy1);
735 dz21 = _mm_sub_ps(iz2,jz1);
736 dx22 = _mm_sub_ps(ix2,jx2);
737 dy22 = _mm_sub_ps(iy2,jy2);
738 dz22 = _mm_sub_ps(iz2,jz2);
740 /* Calculate squared distance and things based on it */
741 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
742 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
743 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
744 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
745 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
746 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
747 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
748 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
749 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
751 rinv00 = gmx_mm_invsqrt_ps(rsq00);
752 rinv01 = gmx_mm_invsqrt_ps(rsq01);
753 rinv02 = gmx_mm_invsqrt_ps(rsq02);
754 rinv10 = gmx_mm_invsqrt_ps(rsq10);
755 rinv11 = gmx_mm_invsqrt_ps(rsq11);
756 rinv12 = gmx_mm_invsqrt_ps(rsq12);
757 rinv20 = gmx_mm_invsqrt_ps(rsq20);
758 rinv21 = gmx_mm_invsqrt_ps(rsq21);
759 rinv22 = gmx_mm_invsqrt_ps(rsq22);
761 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
763 fjx0 = _mm_setzero_ps();
764 fjy0 = _mm_setzero_ps();
765 fjz0 = _mm_setzero_ps();
766 fjx1 = _mm_setzero_ps();
767 fjy1 = _mm_setzero_ps();
768 fjz1 = _mm_setzero_ps();
769 fjx2 = _mm_setzero_ps();
770 fjy2 = _mm_setzero_ps();
771 fjz2 = _mm_setzero_ps();
773 /**************************
774 * CALCULATE INTERACTIONS *
775 **************************/
777 r00 = _mm_mul_ps(rsq00,rinv00);
778 r00 = _mm_andnot_ps(dummy_mask,r00);
780 /* Calculate table index by multiplying r with table scale and truncate to integer */
781 rt = _mm_mul_ps(r00,vftabscale);
782 vfitab = _mm_cvttps_epi32(rt);
784 vfeps = _mm_frcz_ps(rt);
786 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
788 twovfeps = _mm_add_ps(vfeps,vfeps);
789 vfitab = _mm_slli_epi32(vfitab,2);
791 /* CUBIC SPLINE TABLE ELECTROSTATICS */
792 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
793 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
794 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
795 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
796 _MM_TRANSPOSE4_PS(Y,F,G,H);
797 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
798 VV = _mm_macc_ps(vfeps,Fp,Y);
799 velec = _mm_mul_ps(qq00,VV);
800 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
801 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
803 /* LENNARD-JONES DISPERSION/REPULSION */
805 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
806 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
807 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
808 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
809 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
811 /* Update potential sum for this i atom from the interaction with this j atom. */
812 velec = _mm_andnot_ps(dummy_mask,velec);
813 velecsum = _mm_add_ps(velecsum,velec);
814 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
815 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
817 fscal = _mm_add_ps(felec,fvdw);
819 fscal = _mm_andnot_ps(dummy_mask,fscal);
821 /* Update vectorial force */
822 fix0 = _mm_macc_ps(dx00,fscal,fix0);
823 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
824 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
826 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
827 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
828 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
830 /**************************
831 * CALCULATE INTERACTIONS *
832 **************************/
834 r01 = _mm_mul_ps(rsq01,rinv01);
835 r01 = _mm_andnot_ps(dummy_mask,r01);
837 /* Calculate table index by multiplying r with table scale and truncate to integer */
838 rt = _mm_mul_ps(r01,vftabscale);
839 vfitab = _mm_cvttps_epi32(rt);
841 vfeps = _mm_frcz_ps(rt);
843 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
845 twovfeps = _mm_add_ps(vfeps,vfeps);
846 vfitab = _mm_slli_epi32(vfitab,2);
848 /* CUBIC SPLINE TABLE ELECTROSTATICS */
849 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
850 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
851 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
852 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
853 _MM_TRANSPOSE4_PS(Y,F,G,H);
854 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
855 VV = _mm_macc_ps(vfeps,Fp,Y);
856 velec = _mm_mul_ps(qq01,VV);
857 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
858 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
860 /* Update potential sum for this i atom from the interaction with this j atom. */
861 velec = _mm_andnot_ps(dummy_mask,velec);
862 velecsum = _mm_add_ps(velecsum,velec);
866 fscal = _mm_andnot_ps(dummy_mask,fscal);
868 /* Update vectorial force */
869 fix0 = _mm_macc_ps(dx01,fscal,fix0);
870 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
871 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
873 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
874 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
875 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
877 /**************************
878 * CALCULATE INTERACTIONS *
879 **************************/
881 r02 = _mm_mul_ps(rsq02,rinv02);
882 r02 = _mm_andnot_ps(dummy_mask,r02);
884 /* Calculate table index by multiplying r with table scale and truncate to integer */
885 rt = _mm_mul_ps(r02,vftabscale);
886 vfitab = _mm_cvttps_epi32(rt);
888 vfeps = _mm_frcz_ps(rt);
890 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
892 twovfeps = _mm_add_ps(vfeps,vfeps);
893 vfitab = _mm_slli_epi32(vfitab,2);
895 /* CUBIC SPLINE TABLE ELECTROSTATICS */
896 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
897 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
898 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
899 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
900 _MM_TRANSPOSE4_PS(Y,F,G,H);
901 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
902 VV = _mm_macc_ps(vfeps,Fp,Y);
903 velec = _mm_mul_ps(qq02,VV);
904 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
905 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
907 /* Update potential sum for this i atom from the interaction with this j atom. */
908 velec = _mm_andnot_ps(dummy_mask,velec);
909 velecsum = _mm_add_ps(velecsum,velec);
913 fscal = _mm_andnot_ps(dummy_mask,fscal);
915 /* Update vectorial force */
916 fix0 = _mm_macc_ps(dx02,fscal,fix0);
917 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
918 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
920 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
921 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
922 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
924 /**************************
925 * CALCULATE INTERACTIONS *
926 **************************/
928 r10 = _mm_mul_ps(rsq10,rinv10);
929 r10 = _mm_andnot_ps(dummy_mask,r10);
931 /* Calculate table index by multiplying r with table scale and truncate to integer */
932 rt = _mm_mul_ps(r10,vftabscale);
933 vfitab = _mm_cvttps_epi32(rt);
935 vfeps = _mm_frcz_ps(rt);
937 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
939 twovfeps = _mm_add_ps(vfeps,vfeps);
940 vfitab = _mm_slli_epi32(vfitab,2);
942 /* CUBIC SPLINE TABLE ELECTROSTATICS */
943 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
944 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
945 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
946 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
947 _MM_TRANSPOSE4_PS(Y,F,G,H);
948 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
949 VV = _mm_macc_ps(vfeps,Fp,Y);
950 velec = _mm_mul_ps(qq10,VV);
951 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
952 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
954 /* Update potential sum for this i atom from the interaction with this j atom. */
955 velec = _mm_andnot_ps(dummy_mask,velec);
956 velecsum = _mm_add_ps(velecsum,velec);
960 fscal = _mm_andnot_ps(dummy_mask,fscal);
962 /* Update vectorial force */
963 fix1 = _mm_macc_ps(dx10,fscal,fix1);
964 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
965 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
967 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
968 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
969 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
971 /**************************
972 * CALCULATE INTERACTIONS *
973 **************************/
975 r11 = _mm_mul_ps(rsq11,rinv11);
976 r11 = _mm_andnot_ps(dummy_mask,r11);
978 /* Calculate table index by multiplying r with table scale and truncate to integer */
979 rt = _mm_mul_ps(r11,vftabscale);
980 vfitab = _mm_cvttps_epi32(rt);
982 vfeps = _mm_frcz_ps(rt);
984 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
986 twovfeps = _mm_add_ps(vfeps,vfeps);
987 vfitab = _mm_slli_epi32(vfitab,2);
989 /* CUBIC SPLINE TABLE ELECTROSTATICS */
990 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
991 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
992 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
993 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
994 _MM_TRANSPOSE4_PS(Y,F,G,H);
995 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
996 VV = _mm_macc_ps(vfeps,Fp,Y);
997 velec = _mm_mul_ps(qq11,VV);
998 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
999 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1001 /* Update potential sum for this i atom from the interaction with this j atom. */
1002 velec = _mm_andnot_ps(dummy_mask,velec);
1003 velecsum = _mm_add_ps(velecsum,velec);
1007 fscal = _mm_andnot_ps(dummy_mask,fscal);
1009 /* Update vectorial force */
1010 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1011 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1012 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1014 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1015 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1016 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1018 /**************************
1019 * CALCULATE INTERACTIONS *
1020 **************************/
1022 r12 = _mm_mul_ps(rsq12,rinv12);
1023 r12 = _mm_andnot_ps(dummy_mask,r12);
1025 /* Calculate table index by multiplying r with table scale and truncate to integer */
1026 rt = _mm_mul_ps(r12,vftabscale);
1027 vfitab = _mm_cvttps_epi32(rt);
1029 vfeps = _mm_frcz_ps(rt);
1031 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1033 twovfeps = _mm_add_ps(vfeps,vfeps);
1034 vfitab = _mm_slli_epi32(vfitab,2);
1036 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1037 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1038 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1039 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1040 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1041 _MM_TRANSPOSE4_PS(Y,F,G,H);
1042 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1043 VV = _mm_macc_ps(vfeps,Fp,Y);
1044 velec = _mm_mul_ps(qq12,VV);
1045 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1046 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1048 /* Update potential sum for this i atom from the interaction with this j atom. */
1049 velec = _mm_andnot_ps(dummy_mask,velec);
1050 velecsum = _mm_add_ps(velecsum,velec);
1054 fscal = _mm_andnot_ps(dummy_mask,fscal);
1056 /* Update vectorial force */
1057 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1058 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1059 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1061 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1062 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1063 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1065 /**************************
1066 * CALCULATE INTERACTIONS *
1067 **************************/
1069 r20 = _mm_mul_ps(rsq20,rinv20);
1070 r20 = _mm_andnot_ps(dummy_mask,r20);
1072 /* Calculate table index by multiplying r with table scale and truncate to integer */
1073 rt = _mm_mul_ps(r20,vftabscale);
1074 vfitab = _mm_cvttps_epi32(rt);
1076 vfeps = _mm_frcz_ps(rt);
1078 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1080 twovfeps = _mm_add_ps(vfeps,vfeps);
1081 vfitab = _mm_slli_epi32(vfitab,2);
1083 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1084 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1085 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1086 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1087 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1088 _MM_TRANSPOSE4_PS(Y,F,G,H);
1089 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1090 VV = _mm_macc_ps(vfeps,Fp,Y);
1091 velec = _mm_mul_ps(qq20,VV);
1092 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1093 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1095 /* Update potential sum for this i atom from the interaction with this j atom. */
1096 velec = _mm_andnot_ps(dummy_mask,velec);
1097 velecsum = _mm_add_ps(velecsum,velec);
1101 fscal = _mm_andnot_ps(dummy_mask,fscal);
1103 /* Update vectorial force */
1104 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1105 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1106 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1108 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1109 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1110 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1112 /**************************
1113 * CALCULATE INTERACTIONS *
1114 **************************/
1116 r21 = _mm_mul_ps(rsq21,rinv21);
1117 r21 = _mm_andnot_ps(dummy_mask,r21);
1119 /* Calculate table index by multiplying r with table scale and truncate to integer */
1120 rt = _mm_mul_ps(r21,vftabscale);
1121 vfitab = _mm_cvttps_epi32(rt);
1123 vfeps = _mm_frcz_ps(rt);
1125 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1127 twovfeps = _mm_add_ps(vfeps,vfeps);
1128 vfitab = _mm_slli_epi32(vfitab,2);
1130 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1131 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1132 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1133 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1134 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1135 _MM_TRANSPOSE4_PS(Y,F,G,H);
1136 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1137 VV = _mm_macc_ps(vfeps,Fp,Y);
1138 velec = _mm_mul_ps(qq21,VV);
1139 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1140 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1142 /* Update potential sum for this i atom from the interaction with this j atom. */
1143 velec = _mm_andnot_ps(dummy_mask,velec);
1144 velecsum = _mm_add_ps(velecsum,velec);
1148 fscal = _mm_andnot_ps(dummy_mask,fscal);
1150 /* Update vectorial force */
1151 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1152 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1153 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1155 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1156 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1157 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1159 /**************************
1160 * CALCULATE INTERACTIONS *
1161 **************************/
1163 r22 = _mm_mul_ps(rsq22,rinv22);
1164 r22 = _mm_andnot_ps(dummy_mask,r22);
1166 /* Calculate table index by multiplying r with table scale and truncate to integer */
1167 rt = _mm_mul_ps(r22,vftabscale);
1168 vfitab = _mm_cvttps_epi32(rt);
1170 vfeps = _mm_frcz_ps(rt);
1172 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1174 twovfeps = _mm_add_ps(vfeps,vfeps);
1175 vfitab = _mm_slli_epi32(vfitab,2);
1177 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1178 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1179 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1180 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1181 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1182 _MM_TRANSPOSE4_PS(Y,F,G,H);
1183 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1184 VV = _mm_macc_ps(vfeps,Fp,Y);
1185 velec = _mm_mul_ps(qq22,VV);
1186 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1187 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1189 /* Update potential sum for this i atom from the interaction with this j atom. */
1190 velec = _mm_andnot_ps(dummy_mask,velec);
1191 velecsum = _mm_add_ps(velecsum,velec);
1195 fscal = _mm_andnot_ps(dummy_mask,fscal);
1197 /* Update vectorial force */
1198 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1199 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1200 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1202 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1203 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1204 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1206 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1207 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1208 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1209 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1211 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1212 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1214 /* Inner loop uses 436 flops */
1217 /* End of innermost loop */
1219 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1220 f+i_coord_offset,fshift+i_shift_offset);
1223 /* Update potential energies */
1224 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1225 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1227 /* Increment number of inner iterations */
1228 inneriter += j_index_end - j_index_start;
1230 /* Outer loop uses 20 flops */
1233 /* Increment number of outer iterations */
1236 /* Update outer/inner flops */
1238 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*436);
1241 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_single
1242 * Electrostatics interaction: CubicSplineTable
1243 * VdW interaction: LennardJones
1244 * Geometry: Water3-Water3
1245 * Calculate force/pot: Force
1248 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_single
1249 (t_nblist * gmx_restrict nlist,
1250 rvec * gmx_restrict xx,
1251 rvec * gmx_restrict ff,
1252 t_forcerec * gmx_restrict fr,
1253 t_mdatoms * gmx_restrict mdatoms,
1254 nb_kernel_data_t * gmx_restrict kernel_data,
1255 t_nrnb * gmx_restrict nrnb)
1257 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1258 * just 0 for non-waters.
1259 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1260 * jnr indices corresponding to data put in the four positions in the SIMD register.
1262 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1263 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1264 int jnrA,jnrB,jnrC,jnrD;
1265 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1266 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1267 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1268 real rcutoff_scalar;
1269 real *shiftvec,*fshift,*x,*f;
1270 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1271 real scratch[4*DIM];
1272 __m128 fscal,rcutoff,rcutoff2,jidxall;
1274 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1276 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1278 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1279 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1280 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1281 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1282 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1283 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1284 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1285 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1286 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1287 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1288 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1289 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1290 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1291 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1292 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1293 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1294 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1297 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1300 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1301 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1303 __m128i ifour = _mm_set1_epi32(4);
1304 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1306 __m128 dummy_mask,cutoff_mask;
1307 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1308 __m128 one = _mm_set1_ps(1.0);
1309 __m128 two = _mm_set1_ps(2.0);
1315 jindex = nlist->jindex;
1317 shiftidx = nlist->shift;
1319 shiftvec = fr->shift_vec[0];
1320 fshift = fr->fshift[0];
1321 facel = _mm_set1_ps(fr->epsfac);
1322 charge = mdatoms->chargeA;
1323 nvdwtype = fr->ntype;
1324 vdwparam = fr->nbfp;
1325 vdwtype = mdatoms->typeA;
1327 vftab = kernel_data->table_elec->data;
1328 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
1330 /* Setup water-specific parameters */
1331 inr = nlist->iinr[0];
1332 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1333 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1334 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1335 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1337 jq0 = _mm_set1_ps(charge[inr+0]);
1338 jq1 = _mm_set1_ps(charge[inr+1]);
1339 jq2 = _mm_set1_ps(charge[inr+2]);
1340 vdwjidx0A = 2*vdwtype[inr+0];
1341 qq00 = _mm_mul_ps(iq0,jq0);
1342 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1343 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1344 qq01 = _mm_mul_ps(iq0,jq1);
1345 qq02 = _mm_mul_ps(iq0,jq2);
1346 qq10 = _mm_mul_ps(iq1,jq0);
1347 qq11 = _mm_mul_ps(iq1,jq1);
1348 qq12 = _mm_mul_ps(iq1,jq2);
1349 qq20 = _mm_mul_ps(iq2,jq0);
1350 qq21 = _mm_mul_ps(iq2,jq1);
1351 qq22 = _mm_mul_ps(iq2,jq2);
1353 /* Avoid stupid compiler warnings */
1354 jnrA = jnrB = jnrC = jnrD = 0;
1355 j_coord_offsetA = 0;
1356 j_coord_offsetB = 0;
1357 j_coord_offsetC = 0;
1358 j_coord_offsetD = 0;
1363 for(iidx=0;iidx<4*DIM;iidx++)
1365 scratch[iidx] = 0.0;
1368 /* Start outer loop over neighborlists */
1369 for(iidx=0; iidx<nri; iidx++)
1371 /* Load shift vector for this list */
1372 i_shift_offset = DIM*shiftidx[iidx];
1374 /* Load limits for loop over neighbors */
1375 j_index_start = jindex[iidx];
1376 j_index_end = jindex[iidx+1];
1378 /* Get outer coordinate index */
1380 i_coord_offset = DIM*inr;
1382 /* Load i particle coords and add shift vector */
1383 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1384 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1386 fix0 = _mm_setzero_ps();
1387 fiy0 = _mm_setzero_ps();
1388 fiz0 = _mm_setzero_ps();
1389 fix1 = _mm_setzero_ps();
1390 fiy1 = _mm_setzero_ps();
1391 fiz1 = _mm_setzero_ps();
1392 fix2 = _mm_setzero_ps();
1393 fiy2 = _mm_setzero_ps();
1394 fiz2 = _mm_setzero_ps();
1396 /* Start inner kernel loop */
1397 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1400 /* Get j neighbor index, and coordinate index */
1402 jnrB = jjnr[jidx+1];
1403 jnrC = jjnr[jidx+2];
1404 jnrD = jjnr[jidx+3];
1405 j_coord_offsetA = DIM*jnrA;
1406 j_coord_offsetB = DIM*jnrB;
1407 j_coord_offsetC = DIM*jnrC;
1408 j_coord_offsetD = DIM*jnrD;
1410 /* load j atom coordinates */
1411 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1412 x+j_coord_offsetC,x+j_coord_offsetD,
1413 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1415 /* Calculate displacement vector */
1416 dx00 = _mm_sub_ps(ix0,jx0);
1417 dy00 = _mm_sub_ps(iy0,jy0);
1418 dz00 = _mm_sub_ps(iz0,jz0);
1419 dx01 = _mm_sub_ps(ix0,jx1);
1420 dy01 = _mm_sub_ps(iy0,jy1);
1421 dz01 = _mm_sub_ps(iz0,jz1);
1422 dx02 = _mm_sub_ps(ix0,jx2);
1423 dy02 = _mm_sub_ps(iy0,jy2);
1424 dz02 = _mm_sub_ps(iz0,jz2);
1425 dx10 = _mm_sub_ps(ix1,jx0);
1426 dy10 = _mm_sub_ps(iy1,jy0);
1427 dz10 = _mm_sub_ps(iz1,jz0);
1428 dx11 = _mm_sub_ps(ix1,jx1);
1429 dy11 = _mm_sub_ps(iy1,jy1);
1430 dz11 = _mm_sub_ps(iz1,jz1);
1431 dx12 = _mm_sub_ps(ix1,jx2);
1432 dy12 = _mm_sub_ps(iy1,jy2);
1433 dz12 = _mm_sub_ps(iz1,jz2);
1434 dx20 = _mm_sub_ps(ix2,jx0);
1435 dy20 = _mm_sub_ps(iy2,jy0);
1436 dz20 = _mm_sub_ps(iz2,jz0);
1437 dx21 = _mm_sub_ps(ix2,jx1);
1438 dy21 = _mm_sub_ps(iy2,jy1);
1439 dz21 = _mm_sub_ps(iz2,jz1);
1440 dx22 = _mm_sub_ps(ix2,jx2);
1441 dy22 = _mm_sub_ps(iy2,jy2);
1442 dz22 = _mm_sub_ps(iz2,jz2);
1444 /* Calculate squared distance and things based on it */
1445 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1446 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1447 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1448 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1449 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1450 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1451 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1452 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1453 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1455 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1456 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1457 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1458 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1459 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1460 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1461 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1462 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1463 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1465 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1467 fjx0 = _mm_setzero_ps();
1468 fjy0 = _mm_setzero_ps();
1469 fjz0 = _mm_setzero_ps();
1470 fjx1 = _mm_setzero_ps();
1471 fjy1 = _mm_setzero_ps();
1472 fjz1 = _mm_setzero_ps();
1473 fjx2 = _mm_setzero_ps();
1474 fjy2 = _mm_setzero_ps();
1475 fjz2 = _mm_setzero_ps();
1477 /**************************
1478 * CALCULATE INTERACTIONS *
1479 **************************/
1481 r00 = _mm_mul_ps(rsq00,rinv00);
1483 /* Calculate table index by multiplying r with table scale and truncate to integer */
1484 rt = _mm_mul_ps(r00,vftabscale);
1485 vfitab = _mm_cvttps_epi32(rt);
1487 vfeps = _mm_frcz_ps(rt);
1489 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1491 twovfeps = _mm_add_ps(vfeps,vfeps);
1492 vfitab = _mm_slli_epi32(vfitab,2);
1494 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1495 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1496 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1497 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1498 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1499 _MM_TRANSPOSE4_PS(Y,F,G,H);
1500 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1501 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1502 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1504 /* LENNARD-JONES DISPERSION/REPULSION */
1506 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1507 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1509 fscal = _mm_add_ps(felec,fvdw);
1511 /* Update vectorial force */
1512 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1513 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1514 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1516 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1517 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1518 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1520 /**************************
1521 * CALCULATE INTERACTIONS *
1522 **************************/
1524 r01 = _mm_mul_ps(rsq01,rinv01);
1526 /* Calculate table index by multiplying r with table scale and truncate to integer */
1527 rt = _mm_mul_ps(r01,vftabscale);
1528 vfitab = _mm_cvttps_epi32(rt);
1530 vfeps = _mm_frcz_ps(rt);
1532 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1534 twovfeps = _mm_add_ps(vfeps,vfeps);
1535 vfitab = _mm_slli_epi32(vfitab,2);
1537 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1538 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1539 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1540 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1541 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1542 _MM_TRANSPOSE4_PS(Y,F,G,H);
1543 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1544 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1545 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1549 /* Update vectorial force */
1550 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1551 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1552 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1554 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1555 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1556 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1558 /**************************
1559 * CALCULATE INTERACTIONS *
1560 **************************/
1562 r02 = _mm_mul_ps(rsq02,rinv02);
1564 /* Calculate table index by multiplying r with table scale and truncate to integer */
1565 rt = _mm_mul_ps(r02,vftabscale);
1566 vfitab = _mm_cvttps_epi32(rt);
1568 vfeps = _mm_frcz_ps(rt);
1570 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1572 twovfeps = _mm_add_ps(vfeps,vfeps);
1573 vfitab = _mm_slli_epi32(vfitab,2);
1575 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1576 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1577 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1578 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1579 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1580 _MM_TRANSPOSE4_PS(Y,F,G,H);
1581 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1582 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1583 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1587 /* Update vectorial force */
1588 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1589 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1590 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1592 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1593 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1594 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1596 /**************************
1597 * CALCULATE INTERACTIONS *
1598 **************************/
1600 r10 = _mm_mul_ps(rsq10,rinv10);
1602 /* Calculate table index by multiplying r with table scale and truncate to integer */
1603 rt = _mm_mul_ps(r10,vftabscale);
1604 vfitab = _mm_cvttps_epi32(rt);
1606 vfeps = _mm_frcz_ps(rt);
1608 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1610 twovfeps = _mm_add_ps(vfeps,vfeps);
1611 vfitab = _mm_slli_epi32(vfitab,2);
1613 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1614 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1615 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1616 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1617 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1618 _MM_TRANSPOSE4_PS(Y,F,G,H);
1619 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1620 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1621 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1625 /* Update vectorial force */
1626 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1627 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1628 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1630 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1631 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1632 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1634 /**************************
1635 * CALCULATE INTERACTIONS *
1636 **************************/
1638 r11 = _mm_mul_ps(rsq11,rinv11);
1640 /* Calculate table index by multiplying r with table scale and truncate to integer */
1641 rt = _mm_mul_ps(r11,vftabscale);
1642 vfitab = _mm_cvttps_epi32(rt);
1644 vfeps = _mm_frcz_ps(rt);
1646 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1648 twovfeps = _mm_add_ps(vfeps,vfeps);
1649 vfitab = _mm_slli_epi32(vfitab,2);
1651 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1652 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1653 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1654 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1655 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1656 _MM_TRANSPOSE4_PS(Y,F,G,H);
1657 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1658 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1659 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1663 /* Update vectorial force */
1664 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1665 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1666 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1668 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1669 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1670 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1672 /**************************
1673 * CALCULATE INTERACTIONS *
1674 **************************/
1676 r12 = _mm_mul_ps(rsq12,rinv12);
1678 /* Calculate table index by multiplying r with table scale and truncate to integer */
1679 rt = _mm_mul_ps(r12,vftabscale);
1680 vfitab = _mm_cvttps_epi32(rt);
1682 vfeps = _mm_frcz_ps(rt);
1684 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1686 twovfeps = _mm_add_ps(vfeps,vfeps);
1687 vfitab = _mm_slli_epi32(vfitab,2);
1689 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1690 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1691 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1692 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1693 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1694 _MM_TRANSPOSE4_PS(Y,F,G,H);
1695 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1696 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1697 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1701 /* Update vectorial force */
1702 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1703 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1704 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1706 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1707 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1708 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1710 /**************************
1711 * CALCULATE INTERACTIONS *
1712 **************************/
1714 r20 = _mm_mul_ps(rsq20,rinv20);
1716 /* Calculate table index by multiplying r with table scale and truncate to integer */
1717 rt = _mm_mul_ps(r20,vftabscale);
1718 vfitab = _mm_cvttps_epi32(rt);
1720 vfeps = _mm_frcz_ps(rt);
1722 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1724 twovfeps = _mm_add_ps(vfeps,vfeps);
1725 vfitab = _mm_slli_epi32(vfitab,2);
1727 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1728 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1729 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1730 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1731 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1732 _MM_TRANSPOSE4_PS(Y,F,G,H);
1733 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1734 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1735 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1739 /* Update vectorial force */
1740 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1741 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1742 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1744 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1745 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1746 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1748 /**************************
1749 * CALCULATE INTERACTIONS *
1750 **************************/
1752 r21 = _mm_mul_ps(rsq21,rinv21);
1754 /* Calculate table index by multiplying r with table scale and truncate to integer */
1755 rt = _mm_mul_ps(r21,vftabscale);
1756 vfitab = _mm_cvttps_epi32(rt);
1758 vfeps = _mm_frcz_ps(rt);
1760 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1762 twovfeps = _mm_add_ps(vfeps,vfeps);
1763 vfitab = _mm_slli_epi32(vfitab,2);
1765 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1766 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1767 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1768 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1769 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1770 _MM_TRANSPOSE4_PS(Y,F,G,H);
1771 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1772 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1773 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1777 /* Update vectorial force */
1778 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1779 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1780 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1782 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1783 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1784 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1786 /**************************
1787 * CALCULATE INTERACTIONS *
1788 **************************/
1790 r22 = _mm_mul_ps(rsq22,rinv22);
1792 /* Calculate table index by multiplying r with table scale and truncate to integer */
1793 rt = _mm_mul_ps(r22,vftabscale);
1794 vfitab = _mm_cvttps_epi32(rt);
1796 vfeps = _mm_frcz_ps(rt);
1798 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1800 twovfeps = _mm_add_ps(vfeps,vfeps);
1801 vfitab = _mm_slli_epi32(vfitab,2);
1803 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1804 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1805 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1806 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1807 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1808 _MM_TRANSPOSE4_PS(Y,F,G,H);
1809 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1810 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1811 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1815 /* Update vectorial force */
1816 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1817 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1818 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1820 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1821 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1822 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1824 fjptrA = f+j_coord_offsetA;
1825 fjptrB = f+j_coord_offsetB;
1826 fjptrC = f+j_coord_offsetC;
1827 fjptrD = f+j_coord_offsetD;
1829 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1830 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1832 /* Inner loop uses 386 flops */
1835 if(jidx<j_index_end)
1838 /* Get j neighbor index, and coordinate index */
1839 jnrlistA = jjnr[jidx];
1840 jnrlistB = jjnr[jidx+1];
1841 jnrlistC = jjnr[jidx+2];
1842 jnrlistD = jjnr[jidx+3];
1843 /* Sign of each element will be negative for non-real atoms.
1844 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1845 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1847 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1848 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1849 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1850 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1851 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1852 j_coord_offsetA = DIM*jnrA;
1853 j_coord_offsetB = DIM*jnrB;
1854 j_coord_offsetC = DIM*jnrC;
1855 j_coord_offsetD = DIM*jnrD;
1857 /* load j atom coordinates */
1858 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1859 x+j_coord_offsetC,x+j_coord_offsetD,
1860 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1862 /* Calculate displacement vector */
1863 dx00 = _mm_sub_ps(ix0,jx0);
1864 dy00 = _mm_sub_ps(iy0,jy0);
1865 dz00 = _mm_sub_ps(iz0,jz0);
1866 dx01 = _mm_sub_ps(ix0,jx1);
1867 dy01 = _mm_sub_ps(iy0,jy1);
1868 dz01 = _mm_sub_ps(iz0,jz1);
1869 dx02 = _mm_sub_ps(ix0,jx2);
1870 dy02 = _mm_sub_ps(iy0,jy2);
1871 dz02 = _mm_sub_ps(iz0,jz2);
1872 dx10 = _mm_sub_ps(ix1,jx0);
1873 dy10 = _mm_sub_ps(iy1,jy0);
1874 dz10 = _mm_sub_ps(iz1,jz0);
1875 dx11 = _mm_sub_ps(ix1,jx1);
1876 dy11 = _mm_sub_ps(iy1,jy1);
1877 dz11 = _mm_sub_ps(iz1,jz1);
1878 dx12 = _mm_sub_ps(ix1,jx2);
1879 dy12 = _mm_sub_ps(iy1,jy2);
1880 dz12 = _mm_sub_ps(iz1,jz2);
1881 dx20 = _mm_sub_ps(ix2,jx0);
1882 dy20 = _mm_sub_ps(iy2,jy0);
1883 dz20 = _mm_sub_ps(iz2,jz0);
1884 dx21 = _mm_sub_ps(ix2,jx1);
1885 dy21 = _mm_sub_ps(iy2,jy1);
1886 dz21 = _mm_sub_ps(iz2,jz1);
1887 dx22 = _mm_sub_ps(ix2,jx2);
1888 dy22 = _mm_sub_ps(iy2,jy2);
1889 dz22 = _mm_sub_ps(iz2,jz2);
1891 /* Calculate squared distance and things based on it */
1892 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1893 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1894 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1895 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1896 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1897 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1898 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1899 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1900 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1902 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1903 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1904 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1905 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1906 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1907 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1908 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1909 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1910 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1912 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1914 fjx0 = _mm_setzero_ps();
1915 fjy0 = _mm_setzero_ps();
1916 fjz0 = _mm_setzero_ps();
1917 fjx1 = _mm_setzero_ps();
1918 fjy1 = _mm_setzero_ps();
1919 fjz1 = _mm_setzero_ps();
1920 fjx2 = _mm_setzero_ps();
1921 fjy2 = _mm_setzero_ps();
1922 fjz2 = _mm_setzero_ps();
1924 /**************************
1925 * CALCULATE INTERACTIONS *
1926 **************************/
1928 r00 = _mm_mul_ps(rsq00,rinv00);
1929 r00 = _mm_andnot_ps(dummy_mask,r00);
1931 /* Calculate table index by multiplying r with table scale and truncate to integer */
1932 rt = _mm_mul_ps(r00,vftabscale);
1933 vfitab = _mm_cvttps_epi32(rt);
1935 vfeps = _mm_frcz_ps(rt);
1937 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1939 twovfeps = _mm_add_ps(vfeps,vfeps);
1940 vfitab = _mm_slli_epi32(vfitab,2);
1942 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1943 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1944 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1945 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1946 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1947 _MM_TRANSPOSE4_PS(Y,F,G,H);
1948 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1949 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1950 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1952 /* LENNARD-JONES DISPERSION/REPULSION */
1954 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1955 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1957 fscal = _mm_add_ps(felec,fvdw);
1959 fscal = _mm_andnot_ps(dummy_mask,fscal);
1961 /* Update vectorial force */
1962 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1963 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1964 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1966 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1967 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1968 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1970 /**************************
1971 * CALCULATE INTERACTIONS *
1972 **************************/
1974 r01 = _mm_mul_ps(rsq01,rinv01);
1975 r01 = _mm_andnot_ps(dummy_mask,r01);
1977 /* Calculate table index by multiplying r with table scale and truncate to integer */
1978 rt = _mm_mul_ps(r01,vftabscale);
1979 vfitab = _mm_cvttps_epi32(rt);
1981 vfeps = _mm_frcz_ps(rt);
1983 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1985 twovfeps = _mm_add_ps(vfeps,vfeps);
1986 vfitab = _mm_slli_epi32(vfitab,2);
1988 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1989 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1990 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1991 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1992 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1993 _MM_TRANSPOSE4_PS(Y,F,G,H);
1994 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1995 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1996 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2000 fscal = _mm_andnot_ps(dummy_mask,fscal);
2002 /* Update vectorial force */
2003 fix0 = _mm_macc_ps(dx01,fscal,fix0);
2004 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
2005 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
2007 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
2008 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
2009 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
2011 /**************************
2012 * CALCULATE INTERACTIONS *
2013 **************************/
2015 r02 = _mm_mul_ps(rsq02,rinv02);
2016 r02 = _mm_andnot_ps(dummy_mask,r02);
2018 /* Calculate table index by multiplying r with table scale and truncate to integer */
2019 rt = _mm_mul_ps(r02,vftabscale);
2020 vfitab = _mm_cvttps_epi32(rt);
2022 vfeps = _mm_frcz_ps(rt);
2024 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2026 twovfeps = _mm_add_ps(vfeps,vfeps);
2027 vfitab = _mm_slli_epi32(vfitab,2);
2029 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2030 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2031 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2032 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2033 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2034 _MM_TRANSPOSE4_PS(Y,F,G,H);
2035 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2036 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2037 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2041 fscal = _mm_andnot_ps(dummy_mask,fscal);
2043 /* Update vectorial force */
2044 fix0 = _mm_macc_ps(dx02,fscal,fix0);
2045 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
2046 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
2048 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
2049 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
2050 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
2052 /**************************
2053 * CALCULATE INTERACTIONS *
2054 **************************/
2056 r10 = _mm_mul_ps(rsq10,rinv10);
2057 r10 = _mm_andnot_ps(dummy_mask,r10);
2059 /* Calculate table index by multiplying r with table scale and truncate to integer */
2060 rt = _mm_mul_ps(r10,vftabscale);
2061 vfitab = _mm_cvttps_epi32(rt);
2063 vfeps = _mm_frcz_ps(rt);
2065 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2067 twovfeps = _mm_add_ps(vfeps,vfeps);
2068 vfitab = _mm_slli_epi32(vfitab,2);
2070 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2071 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2072 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2073 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2074 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2075 _MM_TRANSPOSE4_PS(Y,F,G,H);
2076 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2077 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2078 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2082 fscal = _mm_andnot_ps(dummy_mask,fscal);
2084 /* Update vectorial force */
2085 fix1 = _mm_macc_ps(dx10,fscal,fix1);
2086 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
2087 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
2089 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
2090 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
2091 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
2093 /**************************
2094 * CALCULATE INTERACTIONS *
2095 **************************/
2097 r11 = _mm_mul_ps(rsq11,rinv11);
2098 r11 = _mm_andnot_ps(dummy_mask,r11);
2100 /* Calculate table index by multiplying r with table scale and truncate to integer */
2101 rt = _mm_mul_ps(r11,vftabscale);
2102 vfitab = _mm_cvttps_epi32(rt);
2104 vfeps = _mm_frcz_ps(rt);
2106 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2108 twovfeps = _mm_add_ps(vfeps,vfeps);
2109 vfitab = _mm_slli_epi32(vfitab,2);
2111 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2112 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2113 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2114 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2115 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2116 _MM_TRANSPOSE4_PS(Y,F,G,H);
2117 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2118 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2119 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2123 fscal = _mm_andnot_ps(dummy_mask,fscal);
2125 /* Update vectorial force */
2126 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2127 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2128 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2130 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2131 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2132 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2134 /**************************
2135 * CALCULATE INTERACTIONS *
2136 **************************/
2138 r12 = _mm_mul_ps(rsq12,rinv12);
2139 r12 = _mm_andnot_ps(dummy_mask,r12);
2141 /* Calculate table index by multiplying r with table scale and truncate to integer */
2142 rt = _mm_mul_ps(r12,vftabscale);
2143 vfitab = _mm_cvttps_epi32(rt);
2145 vfeps = _mm_frcz_ps(rt);
2147 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2149 twovfeps = _mm_add_ps(vfeps,vfeps);
2150 vfitab = _mm_slli_epi32(vfitab,2);
2152 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2153 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2154 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2155 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2156 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2157 _MM_TRANSPOSE4_PS(Y,F,G,H);
2158 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2159 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2160 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2164 fscal = _mm_andnot_ps(dummy_mask,fscal);
2166 /* Update vectorial force */
2167 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2168 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2169 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2171 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2172 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2173 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2175 /**************************
2176 * CALCULATE INTERACTIONS *
2177 **************************/
2179 r20 = _mm_mul_ps(rsq20,rinv20);
2180 r20 = _mm_andnot_ps(dummy_mask,r20);
2182 /* Calculate table index by multiplying r with table scale and truncate to integer */
2183 rt = _mm_mul_ps(r20,vftabscale);
2184 vfitab = _mm_cvttps_epi32(rt);
2186 vfeps = _mm_frcz_ps(rt);
2188 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2190 twovfeps = _mm_add_ps(vfeps,vfeps);
2191 vfitab = _mm_slli_epi32(vfitab,2);
2193 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2194 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2195 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2196 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2197 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2198 _MM_TRANSPOSE4_PS(Y,F,G,H);
2199 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2200 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2201 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2205 fscal = _mm_andnot_ps(dummy_mask,fscal);
2207 /* Update vectorial force */
2208 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2209 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2210 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2212 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2213 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2214 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2216 /**************************
2217 * CALCULATE INTERACTIONS *
2218 **************************/
2220 r21 = _mm_mul_ps(rsq21,rinv21);
2221 r21 = _mm_andnot_ps(dummy_mask,r21);
2223 /* Calculate table index by multiplying r with table scale and truncate to integer */
2224 rt = _mm_mul_ps(r21,vftabscale);
2225 vfitab = _mm_cvttps_epi32(rt);
2227 vfeps = _mm_frcz_ps(rt);
2229 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2231 twovfeps = _mm_add_ps(vfeps,vfeps);
2232 vfitab = _mm_slli_epi32(vfitab,2);
2234 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2235 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2236 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2237 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2238 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2239 _MM_TRANSPOSE4_PS(Y,F,G,H);
2240 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2241 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2242 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2246 fscal = _mm_andnot_ps(dummy_mask,fscal);
2248 /* Update vectorial force */
2249 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2250 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2251 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2253 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2254 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2255 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2257 /**************************
2258 * CALCULATE INTERACTIONS *
2259 **************************/
2261 r22 = _mm_mul_ps(rsq22,rinv22);
2262 r22 = _mm_andnot_ps(dummy_mask,r22);
2264 /* Calculate table index by multiplying r with table scale and truncate to integer */
2265 rt = _mm_mul_ps(r22,vftabscale);
2266 vfitab = _mm_cvttps_epi32(rt);
2268 vfeps = _mm_frcz_ps(rt);
2270 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2272 twovfeps = _mm_add_ps(vfeps,vfeps);
2273 vfitab = _mm_slli_epi32(vfitab,2);
2275 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2276 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2277 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2278 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2279 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2280 _MM_TRANSPOSE4_PS(Y,F,G,H);
2281 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2282 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2283 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2287 fscal = _mm_andnot_ps(dummy_mask,fscal);
2289 /* Update vectorial force */
2290 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2291 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2292 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2294 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2295 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2296 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2298 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2299 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2300 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2301 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2303 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2304 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2306 /* Inner loop uses 395 flops */
2309 /* End of innermost loop */
2311 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2312 f+i_coord_offset,fshift+i_shift_offset);
2314 /* Increment number of inner iterations */
2315 inneriter += j_index_end - j_index_start;
2317 /* Outer loop uses 18 flops */
2320 /* Increment number of outer iterations */
2323 /* Update outer/inner flops */
2325 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*395);