2 * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
99 __m128i ifour = _mm_set1_epi32(4);
100 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
102 __m128 dummy_mask,cutoff_mask;
103 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
104 __m128 one = _mm_set1_ps(1.0);
105 __m128 two = _mm_set1_ps(2.0);
111 jindex = nlist->jindex;
113 shiftidx = nlist->shift;
115 shiftvec = fr->shift_vec[0];
116 fshift = fr->fshift[0];
117 facel = _mm_set1_ps(fr->epsfac);
118 charge = mdatoms->chargeA;
119 nvdwtype = fr->ntype;
121 vdwtype = mdatoms->typeA;
123 vftab = kernel_data->table_elec_vdw->data;
124 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
126 /* Setup water-specific parameters */
127 inr = nlist->iinr[0];
128 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
129 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
130 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
131 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
133 jq0 = _mm_set1_ps(charge[inr+0]);
134 jq1 = _mm_set1_ps(charge[inr+1]);
135 jq2 = _mm_set1_ps(charge[inr+2]);
136 vdwjidx0A = 2*vdwtype[inr+0];
137 qq00 = _mm_mul_ps(iq0,jq0);
138 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
139 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
140 qq01 = _mm_mul_ps(iq0,jq1);
141 qq02 = _mm_mul_ps(iq0,jq2);
142 qq10 = _mm_mul_ps(iq1,jq0);
143 qq11 = _mm_mul_ps(iq1,jq1);
144 qq12 = _mm_mul_ps(iq1,jq2);
145 qq20 = _mm_mul_ps(iq2,jq0);
146 qq21 = _mm_mul_ps(iq2,jq1);
147 qq22 = _mm_mul_ps(iq2,jq2);
149 /* Avoid stupid compiler warnings */
150 jnrA = jnrB = jnrC = jnrD = 0;
159 for(iidx=0;iidx<4*DIM;iidx++)
164 /* Start outer loop over neighborlists */
165 for(iidx=0; iidx<nri; iidx++)
167 /* Load shift vector for this list */
168 i_shift_offset = DIM*shiftidx[iidx];
170 /* Load limits for loop over neighbors */
171 j_index_start = jindex[iidx];
172 j_index_end = jindex[iidx+1];
174 /* Get outer coordinate index */
176 i_coord_offset = DIM*inr;
178 /* Load i particle coords and add shift vector */
179 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
180 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
182 fix0 = _mm_setzero_ps();
183 fiy0 = _mm_setzero_ps();
184 fiz0 = _mm_setzero_ps();
185 fix1 = _mm_setzero_ps();
186 fiy1 = _mm_setzero_ps();
187 fiz1 = _mm_setzero_ps();
188 fix2 = _mm_setzero_ps();
189 fiy2 = _mm_setzero_ps();
190 fiz2 = _mm_setzero_ps();
192 /* Reset potential sums */
193 velecsum = _mm_setzero_ps();
194 vvdwsum = _mm_setzero_ps();
196 /* Start inner kernel loop */
197 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200 /* Get j neighbor index, and coordinate index */
205 j_coord_offsetA = DIM*jnrA;
206 j_coord_offsetB = DIM*jnrB;
207 j_coord_offsetC = DIM*jnrC;
208 j_coord_offsetD = DIM*jnrD;
210 /* load j atom coordinates */
211 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
212 x+j_coord_offsetC,x+j_coord_offsetD,
213 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
215 /* Calculate displacement vector */
216 dx00 = _mm_sub_ps(ix0,jx0);
217 dy00 = _mm_sub_ps(iy0,jy0);
218 dz00 = _mm_sub_ps(iz0,jz0);
219 dx01 = _mm_sub_ps(ix0,jx1);
220 dy01 = _mm_sub_ps(iy0,jy1);
221 dz01 = _mm_sub_ps(iz0,jz1);
222 dx02 = _mm_sub_ps(ix0,jx2);
223 dy02 = _mm_sub_ps(iy0,jy2);
224 dz02 = _mm_sub_ps(iz0,jz2);
225 dx10 = _mm_sub_ps(ix1,jx0);
226 dy10 = _mm_sub_ps(iy1,jy0);
227 dz10 = _mm_sub_ps(iz1,jz0);
228 dx11 = _mm_sub_ps(ix1,jx1);
229 dy11 = _mm_sub_ps(iy1,jy1);
230 dz11 = _mm_sub_ps(iz1,jz1);
231 dx12 = _mm_sub_ps(ix1,jx2);
232 dy12 = _mm_sub_ps(iy1,jy2);
233 dz12 = _mm_sub_ps(iz1,jz2);
234 dx20 = _mm_sub_ps(ix2,jx0);
235 dy20 = _mm_sub_ps(iy2,jy0);
236 dz20 = _mm_sub_ps(iz2,jz0);
237 dx21 = _mm_sub_ps(ix2,jx1);
238 dy21 = _mm_sub_ps(iy2,jy1);
239 dz21 = _mm_sub_ps(iz2,jz1);
240 dx22 = _mm_sub_ps(ix2,jx2);
241 dy22 = _mm_sub_ps(iy2,jy2);
242 dz22 = _mm_sub_ps(iz2,jz2);
244 /* Calculate squared distance and things based on it */
245 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
246 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
247 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
248 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
249 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
250 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
251 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
252 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
253 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
255 rinv00 = gmx_mm_invsqrt_ps(rsq00);
256 rinv01 = gmx_mm_invsqrt_ps(rsq01);
257 rinv02 = gmx_mm_invsqrt_ps(rsq02);
258 rinv10 = gmx_mm_invsqrt_ps(rsq10);
259 rinv11 = gmx_mm_invsqrt_ps(rsq11);
260 rinv12 = gmx_mm_invsqrt_ps(rsq12);
261 rinv20 = gmx_mm_invsqrt_ps(rsq20);
262 rinv21 = gmx_mm_invsqrt_ps(rsq21);
263 rinv22 = gmx_mm_invsqrt_ps(rsq22);
265 fjx0 = _mm_setzero_ps();
266 fjy0 = _mm_setzero_ps();
267 fjz0 = _mm_setzero_ps();
268 fjx1 = _mm_setzero_ps();
269 fjy1 = _mm_setzero_ps();
270 fjz1 = _mm_setzero_ps();
271 fjx2 = _mm_setzero_ps();
272 fjy2 = _mm_setzero_ps();
273 fjz2 = _mm_setzero_ps();
275 /**************************
276 * CALCULATE INTERACTIONS *
277 **************************/
279 r00 = _mm_mul_ps(rsq00,rinv00);
281 /* Calculate table index by multiplying r with table scale and truncate to integer */
282 rt = _mm_mul_ps(r00,vftabscale);
283 vfitab = _mm_cvttps_epi32(rt);
285 vfeps = _mm_frcz_ps(rt);
287 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
289 twovfeps = _mm_add_ps(vfeps,vfeps);
290 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
292 /* CUBIC SPLINE TABLE ELECTROSTATICS */
293 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
294 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
295 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
296 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
297 _MM_TRANSPOSE4_PS(Y,F,G,H);
298 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
299 VV = _mm_macc_ps(vfeps,Fp,Y);
300 velec = _mm_mul_ps(qq00,VV);
301 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
302 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
304 /* CUBIC SPLINE TABLE DISPERSION */
305 vfitab = _mm_add_epi32(vfitab,ifour);
306 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
307 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
308 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
309 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
310 _MM_TRANSPOSE4_PS(Y,F,G,H);
311 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
312 VV = _mm_macc_ps(vfeps,Fp,Y);
313 vvdw6 = _mm_mul_ps(c6_00,VV);
314 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
315 fvdw6 = _mm_mul_ps(c6_00,FF);
317 /* CUBIC SPLINE TABLE REPULSION */
318 vfitab = _mm_add_epi32(vfitab,ifour);
319 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
320 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
321 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
322 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
323 _MM_TRANSPOSE4_PS(Y,F,G,H);
324 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
325 VV = _mm_macc_ps(vfeps,Fp,Y);
326 vvdw12 = _mm_mul_ps(c12_00,VV);
327 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
328 fvdw12 = _mm_mul_ps(c12_00,FF);
329 vvdw = _mm_add_ps(vvdw12,vvdw6);
330 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
332 /* Update potential sum for this i atom from the interaction with this j atom. */
333 velecsum = _mm_add_ps(velecsum,velec);
334 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
336 fscal = _mm_add_ps(felec,fvdw);
338 /* Update vectorial force */
339 fix0 = _mm_macc_ps(dx00,fscal,fix0);
340 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
341 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
343 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
344 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
345 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
347 /**************************
348 * CALCULATE INTERACTIONS *
349 **************************/
351 r01 = _mm_mul_ps(rsq01,rinv01);
353 /* Calculate table index by multiplying r with table scale and truncate to integer */
354 rt = _mm_mul_ps(r01,vftabscale);
355 vfitab = _mm_cvttps_epi32(rt);
357 vfeps = _mm_frcz_ps(rt);
359 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
361 twovfeps = _mm_add_ps(vfeps,vfeps);
362 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
364 /* CUBIC SPLINE TABLE ELECTROSTATICS */
365 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
366 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
367 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
368 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
369 _MM_TRANSPOSE4_PS(Y,F,G,H);
370 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
371 VV = _mm_macc_ps(vfeps,Fp,Y);
372 velec = _mm_mul_ps(qq01,VV);
373 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
374 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
376 /* Update potential sum for this i atom from the interaction with this j atom. */
377 velecsum = _mm_add_ps(velecsum,velec);
381 /* Update vectorial force */
382 fix0 = _mm_macc_ps(dx01,fscal,fix0);
383 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
384 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
386 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
387 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
388 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
390 /**************************
391 * CALCULATE INTERACTIONS *
392 **************************/
394 r02 = _mm_mul_ps(rsq02,rinv02);
396 /* Calculate table index by multiplying r with table scale and truncate to integer */
397 rt = _mm_mul_ps(r02,vftabscale);
398 vfitab = _mm_cvttps_epi32(rt);
400 vfeps = _mm_frcz_ps(rt);
402 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
404 twovfeps = _mm_add_ps(vfeps,vfeps);
405 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
407 /* CUBIC SPLINE TABLE ELECTROSTATICS */
408 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
409 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
410 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
411 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
412 _MM_TRANSPOSE4_PS(Y,F,G,H);
413 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
414 VV = _mm_macc_ps(vfeps,Fp,Y);
415 velec = _mm_mul_ps(qq02,VV);
416 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
417 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
419 /* Update potential sum for this i atom from the interaction with this j atom. */
420 velecsum = _mm_add_ps(velecsum,velec);
424 /* Update vectorial force */
425 fix0 = _mm_macc_ps(dx02,fscal,fix0);
426 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
427 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
429 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
430 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
431 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
433 /**************************
434 * CALCULATE INTERACTIONS *
435 **************************/
437 r10 = _mm_mul_ps(rsq10,rinv10);
439 /* Calculate table index by multiplying r with table scale and truncate to integer */
440 rt = _mm_mul_ps(r10,vftabscale);
441 vfitab = _mm_cvttps_epi32(rt);
443 vfeps = _mm_frcz_ps(rt);
445 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
447 twovfeps = _mm_add_ps(vfeps,vfeps);
448 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
450 /* CUBIC SPLINE TABLE ELECTROSTATICS */
451 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
452 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
453 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
454 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
455 _MM_TRANSPOSE4_PS(Y,F,G,H);
456 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
457 VV = _mm_macc_ps(vfeps,Fp,Y);
458 velec = _mm_mul_ps(qq10,VV);
459 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
460 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
462 /* Update potential sum for this i atom from the interaction with this j atom. */
463 velecsum = _mm_add_ps(velecsum,velec);
467 /* Update vectorial force */
468 fix1 = _mm_macc_ps(dx10,fscal,fix1);
469 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
470 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
472 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
473 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
474 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
476 /**************************
477 * CALCULATE INTERACTIONS *
478 **************************/
480 r11 = _mm_mul_ps(rsq11,rinv11);
482 /* Calculate table index by multiplying r with table scale and truncate to integer */
483 rt = _mm_mul_ps(r11,vftabscale);
484 vfitab = _mm_cvttps_epi32(rt);
486 vfeps = _mm_frcz_ps(rt);
488 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
490 twovfeps = _mm_add_ps(vfeps,vfeps);
491 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
493 /* CUBIC SPLINE TABLE ELECTROSTATICS */
494 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
495 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
496 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
497 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
498 _MM_TRANSPOSE4_PS(Y,F,G,H);
499 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
500 VV = _mm_macc_ps(vfeps,Fp,Y);
501 velec = _mm_mul_ps(qq11,VV);
502 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
503 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
505 /* Update potential sum for this i atom from the interaction with this j atom. */
506 velecsum = _mm_add_ps(velecsum,velec);
510 /* Update vectorial force */
511 fix1 = _mm_macc_ps(dx11,fscal,fix1);
512 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
513 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
515 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
516 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
517 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 r12 = _mm_mul_ps(rsq12,rinv12);
525 /* Calculate table index by multiplying r with table scale and truncate to integer */
526 rt = _mm_mul_ps(r12,vftabscale);
527 vfitab = _mm_cvttps_epi32(rt);
529 vfeps = _mm_frcz_ps(rt);
531 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
533 twovfeps = _mm_add_ps(vfeps,vfeps);
534 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
536 /* CUBIC SPLINE TABLE ELECTROSTATICS */
537 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
538 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
539 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
540 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
541 _MM_TRANSPOSE4_PS(Y,F,G,H);
542 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
543 VV = _mm_macc_ps(vfeps,Fp,Y);
544 velec = _mm_mul_ps(qq12,VV);
545 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
546 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
548 /* Update potential sum for this i atom from the interaction with this j atom. */
549 velecsum = _mm_add_ps(velecsum,velec);
553 /* Update vectorial force */
554 fix1 = _mm_macc_ps(dx12,fscal,fix1);
555 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
556 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
558 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
559 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
560 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
562 /**************************
563 * CALCULATE INTERACTIONS *
564 **************************/
566 r20 = _mm_mul_ps(rsq20,rinv20);
568 /* Calculate table index by multiplying r with table scale and truncate to integer */
569 rt = _mm_mul_ps(r20,vftabscale);
570 vfitab = _mm_cvttps_epi32(rt);
572 vfeps = _mm_frcz_ps(rt);
574 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
576 twovfeps = _mm_add_ps(vfeps,vfeps);
577 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
579 /* CUBIC SPLINE TABLE ELECTROSTATICS */
580 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
581 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
582 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
583 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
584 _MM_TRANSPOSE4_PS(Y,F,G,H);
585 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
586 VV = _mm_macc_ps(vfeps,Fp,Y);
587 velec = _mm_mul_ps(qq20,VV);
588 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
589 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
591 /* Update potential sum for this i atom from the interaction with this j atom. */
592 velecsum = _mm_add_ps(velecsum,velec);
596 /* Update vectorial force */
597 fix2 = _mm_macc_ps(dx20,fscal,fix2);
598 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
599 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
601 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
602 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
603 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
605 /**************************
606 * CALCULATE INTERACTIONS *
607 **************************/
609 r21 = _mm_mul_ps(rsq21,rinv21);
611 /* Calculate table index by multiplying r with table scale and truncate to integer */
612 rt = _mm_mul_ps(r21,vftabscale);
613 vfitab = _mm_cvttps_epi32(rt);
615 vfeps = _mm_frcz_ps(rt);
617 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
619 twovfeps = _mm_add_ps(vfeps,vfeps);
620 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
622 /* CUBIC SPLINE TABLE ELECTROSTATICS */
623 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
624 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
625 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
626 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
627 _MM_TRANSPOSE4_PS(Y,F,G,H);
628 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
629 VV = _mm_macc_ps(vfeps,Fp,Y);
630 velec = _mm_mul_ps(qq21,VV);
631 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
632 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
634 /* Update potential sum for this i atom from the interaction with this j atom. */
635 velecsum = _mm_add_ps(velecsum,velec);
639 /* Update vectorial force */
640 fix2 = _mm_macc_ps(dx21,fscal,fix2);
641 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
642 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
644 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
645 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
646 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
648 /**************************
649 * CALCULATE INTERACTIONS *
650 **************************/
652 r22 = _mm_mul_ps(rsq22,rinv22);
654 /* Calculate table index by multiplying r with table scale and truncate to integer */
655 rt = _mm_mul_ps(r22,vftabscale);
656 vfitab = _mm_cvttps_epi32(rt);
658 vfeps = _mm_frcz_ps(rt);
660 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
662 twovfeps = _mm_add_ps(vfeps,vfeps);
663 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
665 /* CUBIC SPLINE TABLE ELECTROSTATICS */
666 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
667 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
668 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
669 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
670 _MM_TRANSPOSE4_PS(Y,F,G,H);
671 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
672 VV = _mm_macc_ps(vfeps,Fp,Y);
673 velec = _mm_mul_ps(qq22,VV);
674 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
675 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
677 /* Update potential sum for this i atom from the interaction with this j atom. */
678 velecsum = _mm_add_ps(velecsum,velec);
682 /* Update vectorial force */
683 fix2 = _mm_macc_ps(dx22,fscal,fix2);
684 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
685 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
687 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
688 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
689 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
691 fjptrA = f+j_coord_offsetA;
692 fjptrB = f+j_coord_offsetB;
693 fjptrC = f+j_coord_offsetC;
694 fjptrD = f+j_coord_offsetD;
696 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
697 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
699 /* Inner loop uses 444 flops */
705 /* Get j neighbor index, and coordinate index */
706 jnrlistA = jjnr[jidx];
707 jnrlistB = jjnr[jidx+1];
708 jnrlistC = jjnr[jidx+2];
709 jnrlistD = jjnr[jidx+3];
710 /* Sign of each element will be negative for non-real atoms.
711 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
712 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
714 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
715 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
716 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
717 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
718 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
719 j_coord_offsetA = DIM*jnrA;
720 j_coord_offsetB = DIM*jnrB;
721 j_coord_offsetC = DIM*jnrC;
722 j_coord_offsetD = DIM*jnrD;
724 /* load j atom coordinates */
725 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
726 x+j_coord_offsetC,x+j_coord_offsetD,
727 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
729 /* Calculate displacement vector */
730 dx00 = _mm_sub_ps(ix0,jx0);
731 dy00 = _mm_sub_ps(iy0,jy0);
732 dz00 = _mm_sub_ps(iz0,jz0);
733 dx01 = _mm_sub_ps(ix0,jx1);
734 dy01 = _mm_sub_ps(iy0,jy1);
735 dz01 = _mm_sub_ps(iz0,jz1);
736 dx02 = _mm_sub_ps(ix0,jx2);
737 dy02 = _mm_sub_ps(iy0,jy2);
738 dz02 = _mm_sub_ps(iz0,jz2);
739 dx10 = _mm_sub_ps(ix1,jx0);
740 dy10 = _mm_sub_ps(iy1,jy0);
741 dz10 = _mm_sub_ps(iz1,jz0);
742 dx11 = _mm_sub_ps(ix1,jx1);
743 dy11 = _mm_sub_ps(iy1,jy1);
744 dz11 = _mm_sub_ps(iz1,jz1);
745 dx12 = _mm_sub_ps(ix1,jx2);
746 dy12 = _mm_sub_ps(iy1,jy2);
747 dz12 = _mm_sub_ps(iz1,jz2);
748 dx20 = _mm_sub_ps(ix2,jx0);
749 dy20 = _mm_sub_ps(iy2,jy0);
750 dz20 = _mm_sub_ps(iz2,jz0);
751 dx21 = _mm_sub_ps(ix2,jx1);
752 dy21 = _mm_sub_ps(iy2,jy1);
753 dz21 = _mm_sub_ps(iz2,jz1);
754 dx22 = _mm_sub_ps(ix2,jx2);
755 dy22 = _mm_sub_ps(iy2,jy2);
756 dz22 = _mm_sub_ps(iz2,jz2);
758 /* Calculate squared distance and things based on it */
759 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
760 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
761 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
762 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
763 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
764 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
765 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
766 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
767 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
769 rinv00 = gmx_mm_invsqrt_ps(rsq00);
770 rinv01 = gmx_mm_invsqrt_ps(rsq01);
771 rinv02 = gmx_mm_invsqrt_ps(rsq02);
772 rinv10 = gmx_mm_invsqrt_ps(rsq10);
773 rinv11 = gmx_mm_invsqrt_ps(rsq11);
774 rinv12 = gmx_mm_invsqrt_ps(rsq12);
775 rinv20 = gmx_mm_invsqrt_ps(rsq20);
776 rinv21 = gmx_mm_invsqrt_ps(rsq21);
777 rinv22 = gmx_mm_invsqrt_ps(rsq22);
779 fjx0 = _mm_setzero_ps();
780 fjy0 = _mm_setzero_ps();
781 fjz0 = _mm_setzero_ps();
782 fjx1 = _mm_setzero_ps();
783 fjy1 = _mm_setzero_ps();
784 fjz1 = _mm_setzero_ps();
785 fjx2 = _mm_setzero_ps();
786 fjy2 = _mm_setzero_ps();
787 fjz2 = _mm_setzero_ps();
789 /**************************
790 * CALCULATE INTERACTIONS *
791 **************************/
793 r00 = _mm_mul_ps(rsq00,rinv00);
794 r00 = _mm_andnot_ps(dummy_mask,r00);
796 /* Calculate table index by multiplying r with table scale and truncate to integer */
797 rt = _mm_mul_ps(r00,vftabscale);
798 vfitab = _mm_cvttps_epi32(rt);
800 vfeps = _mm_frcz_ps(rt);
802 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
804 twovfeps = _mm_add_ps(vfeps,vfeps);
805 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
807 /* CUBIC SPLINE TABLE ELECTROSTATICS */
808 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
809 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
810 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
811 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
812 _MM_TRANSPOSE4_PS(Y,F,G,H);
813 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
814 VV = _mm_macc_ps(vfeps,Fp,Y);
815 velec = _mm_mul_ps(qq00,VV);
816 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
817 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
819 /* CUBIC SPLINE TABLE DISPERSION */
820 vfitab = _mm_add_epi32(vfitab,ifour);
821 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
822 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
823 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
824 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
825 _MM_TRANSPOSE4_PS(Y,F,G,H);
826 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
827 VV = _mm_macc_ps(vfeps,Fp,Y);
828 vvdw6 = _mm_mul_ps(c6_00,VV);
829 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
830 fvdw6 = _mm_mul_ps(c6_00,FF);
832 /* CUBIC SPLINE TABLE REPULSION */
833 vfitab = _mm_add_epi32(vfitab,ifour);
834 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
835 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
836 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
837 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
838 _MM_TRANSPOSE4_PS(Y,F,G,H);
839 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
840 VV = _mm_macc_ps(vfeps,Fp,Y);
841 vvdw12 = _mm_mul_ps(c12_00,VV);
842 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
843 fvdw12 = _mm_mul_ps(c12_00,FF);
844 vvdw = _mm_add_ps(vvdw12,vvdw6);
845 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
847 /* Update potential sum for this i atom from the interaction with this j atom. */
848 velec = _mm_andnot_ps(dummy_mask,velec);
849 velecsum = _mm_add_ps(velecsum,velec);
850 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
851 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
853 fscal = _mm_add_ps(felec,fvdw);
855 fscal = _mm_andnot_ps(dummy_mask,fscal);
857 /* Update vectorial force */
858 fix0 = _mm_macc_ps(dx00,fscal,fix0);
859 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
860 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
862 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
863 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
864 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
866 /**************************
867 * CALCULATE INTERACTIONS *
868 **************************/
870 r01 = _mm_mul_ps(rsq01,rinv01);
871 r01 = _mm_andnot_ps(dummy_mask,r01);
873 /* Calculate table index by multiplying r with table scale and truncate to integer */
874 rt = _mm_mul_ps(r01,vftabscale);
875 vfitab = _mm_cvttps_epi32(rt);
877 vfeps = _mm_frcz_ps(rt);
879 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
881 twovfeps = _mm_add_ps(vfeps,vfeps);
882 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
884 /* CUBIC SPLINE TABLE ELECTROSTATICS */
885 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
886 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
887 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
888 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
889 _MM_TRANSPOSE4_PS(Y,F,G,H);
890 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
891 VV = _mm_macc_ps(vfeps,Fp,Y);
892 velec = _mm_mul_ps(qq01,VV);
893 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
894 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
896 /* Update potential sum for this i atom from the interaction with this j atom. */
897 velec = _mm_andnot_ps(dummy_mask,velec);
898 velecsum = _mm_add_ps(velecsum,velec);
902 fscal = _mm_andnot_ps(dummy_mask,fscal);
904 /* Update vectorial force */
905 fix0 = _mm_macc_ps(dx01,fscal,fix0);
906 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
907 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
909 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
910 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
911 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
913 /**************************
914 * CALCULATE INTERACTIONS *
915 **************************/
917 r02 = _mm_mul_ps(rsq02,rinv02);
918 r02 = _mm_andnot_ps(dummy_mask,r02);
920 /* Calculate table index by multiplying r with table scale and truncate to integer */
921 rt = _mm_mul_ps(r02,vftabscale);
922 vfitab = _mm_cvttps_epi32(rt);
924 vfeps = _mm_frcz_ps(rt);
926 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
928 twovfeps = _mm_add_ps(vfeps,vfeps);
929 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
931 /* CUBIC SPLINE TABLE ELECTROSTATICS */
932 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
933 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
934 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
935 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
936 _MM_TRANSPOSE4_PS(Y,F,G,H);
937 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
938 VV = _mm_macc_ps(vfeps,Fp,Y);
939 velec = _mm_mul_ps(qq02,VV);
940 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
941 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
943 /* Update potential sum for this i atom from the interaction with this j atom. */
944 velec = _mm_andnot_ps(dummy_mask,velec);
945 velecsum = _mm_add_ps(velecsum,velec);
949 fscal = _mm_andnot_ps(dummy_mask,fscal);
951 /* Update vectorial force */
952 fix0 = _mm_macc_ps(dx02,fscal,fix0);
953 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
954 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
956 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
957 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
958 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
960 /**************************
961 * CALCULATE INTERACTIONS *
962 **************************/
964 r10 = _mm_mul_ps(rsq10,rinv10);
965 r10 = _mm_andnot_ps(dummy_mask,r10);
967 /* Calculate table index by multiplying r with table scale and truncate to integer */
968 rt = _mm_mul_ps(r10,vftabscale);
969 vfitab = _mm_cvttps_epi32(rt);
971 vfeps = _mm_frcz_ps(rt);
973 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
975 twovfeps = _mm_add_ps(vfeps,vfeps);
976 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
978 /* CUBIC SPLINE TABLE ELECTROSTATICS */
979 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
980 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
981 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
982 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
983 _MM_TRANSPOSE4_PS(Y,F,G,H);
984 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
985 VV = _mm_macc_ps(vfeps,Fp,Y);
986 velec = _mm_mul_ps(qq10,VV);
987 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
988 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
990 /* Update potential sum for this i atom from the interaction with this j atom. */
991 velec = _mm_andnot_ps(dummy_mask,velec);
992 velecsum = _mm_add_ps(velecsum,velec);
996 fscal = _mm_andnot_ps(dummy_mask,fscal);
998 /* Update vectorial force */
999 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1000 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1001 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1003 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1004 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1005 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1007 /**************************
1008 * CALCULATE INTERACTIONS *
1009 **************************/
1011 r11 = _mm_mul_ps(rsq11,rinv11);
1012 r11 = _mm_andnot_ps(dummy_mask,r11);
1014 /* Calculate table index by multiplying r with table scale and truncate to integer */
1015 rt = _mm_mul_ps(r11,vftabscale);
1016 vfitab = _mm_cvttps_epi32(rt);
1018 vfeps = _mm_frcz_ps(rt);
1020 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1022 twovfeps = _mm_add_ps(vfeps,vfeps);
1023 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1025 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1026 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1027 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1028 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1029 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1030 _MM_TRANSPOSE4_PS(Y,F,G,H);
1031 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1032 VV = _mm_macc_ps(vfeps,Fp,Y);
1033 velec = _mm_mul_ps(qq11,VV);
1034 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1035 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1037 /* Update potential sum for this i atom from the interaction with this j atom. */
1038 velec = _mm_andnot_ps(dummy_mask,velec);
1039 velecsum = _mm_add_ps(velecsum,velec);
1043 fscal = _mm_andnot_ps(dummy_mask,fscal);
1045 /* Update vectorial force */
1046 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1047 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1048 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1050 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1051 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1052 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1054 /**************************
1055 * CALCULATE INTERACTIONS *
1056 **************************/
1058 r12 = _mm_mul_ps(rsq12,rinv12);
1059 r12 = _mm_andnot_ps(dummy_mask,r12);
1061 /* Calculate table index by multiplying r with table scale and truncate to integer */
1062 rt = _mm_mul_ps(r12,vftabscale);
1063 vfitab = _mm_cvttps_epi32(rt);
1065 vfeps = _mm_frcz_ps(rt);
1067 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1069 twovfeps = _mm_add_ps(vfeps,vfeps);
1070 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1072 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1073 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1074 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1075 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1076 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1077 _MM_TRANSPOSE4_PS(Y,F,G,H);
1078 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1079 VV = _mm_macc_ps(vfeps,Fp,Y);
1080 velec = _mm_mul_ps(qq12,VV);
1081 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1082 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1084 /* Update potential sum for this i atom from the interaction with this j atom. */
1085 velec = _mm_andnot_ps(dummy_mask,velec);
1086 velecsum = _mm_add_ps(velecsum,velec);
1090 fscal = _mm_andnot_ps(dummy_mask,fscal);
1092 /* Update vectorial force */
1093 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1094 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1095 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1097 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1098 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1099 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1101 /**************************
1102 * CALCULATE INTERACTIONS *
1103 **************************/
1105 r20 = _mm_mul_ps(rsq20,rinv20);
1106 r20 = _mm_andnot_ps(dummy_mask,r20);
1108 /* Calculate table index by multiplying r with table scale and truncate to integer */
1109 rt = _mm_mul_ps(r20,vftabscale);
1110 vfitab = _mm_cvttps_epi32(rt);
1112 vfeps = _mm_frcz_ps(rt);
1114 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1116 twovfeps = _mm_add_ps(vfeps,vfeps);
1117 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1119 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1120 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1121 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1122 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1123 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1124 _MM_TRANSPOSE4_PS(Y,F,G,H);
1125 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1126 VV = _mm_macc_ps(vfeps,Fp,Y);
1127 velec = _mm_mul_ps(qq20,VV);
1128 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1129 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1131 /* Update potential sum for this i atom from the interaction with this j atom. */
1132 velec = _mm_andnot_ps(dummy_mask,velec);
1133 velecsum = _mm_add_ps(velecsum,velec);
1137 fscal = _mm_andnot_ps(dummy_mask,fscal);
1139 /* Update vectorial force */
1140 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1141 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1142 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1144 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1145 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1146 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1148 /**************************
1149 * CALCULATE INTERACTIONS *
1150 **************************/
1152 r21 = _mm_mul_ps(rsq21,rinv21);
1153 r21 = _mm_andnot_ps(dummy_mask,r21);
1155 /* Calculate table index by multiplying r with table scale and truncate to integer */
1156 rt = _mm_mul_ps(r21,vftabscale);
1157 vfitab = _mm_cvttps_epi32(rt);
1159 vfeps = _mm_frcz_ps(rt);
1161 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1163 twovfeps = _mm_add_ps(vfeps,vfeps);
1164 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1166 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1167 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1168 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1169 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1170 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1171 _MM_TRANSPOSE4_PS(Y,F,G,H);
1172 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1173 VV = _mm_macc_ps(vfeps,Fp,Y);
1174 velec = _mm_mul_ps(qq21,VV);
1175 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1176 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1178 /* Update potential sum for this i atom from the interaction with this j atom. */
1179 velec = _mm_andnot_ps(dummy_mask,velec);
1180 velecsum = _mm_add_ps(velecsum,velec);
1184 fscal = _mm_andnot_ps(dummy_mask,fscal);
1186 /* Update vectorial force */
1187 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1188 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1189 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1191 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1192 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1193 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1195 /**************************
1196 * CALCULATE INTERACTIONS *
1197 **************************/
1199 r22 = _mm_mul_ps(rsq22,rinv22);
1200 r22 = _mm_andnot_ps(dummy_mask,r22);
1202 /* Calculate table index by multiplying r with table scale and truncate to integer */
1203 rt = _mm_mul_ps(r22,vftabscale);
1204 vfitab = _mm_cvttps_epi32(rt);
1206 vfeps = _mm_frcz_ps(rt);
1208 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1210 twovfeps = _mm_add_ps(vfeps,vfeps);
1211 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1213 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1214 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1215 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1216 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1217 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1218 _MM_TRANSPOSE4_PS(Y,F,G,H);
1219 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1220 VV = _mm_macc_ps(vfeps,Fp,Y);
1221 velec = _mm_mul_ps(qq22,VV);
1222 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1223 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1225 /* Update potential sum for this i atom from the interaction with this j atom. */
1226 velec = _mm_andnot_ps(dummy_mask,velec);
1227 velecsum = _mm_add_ps(velecsum,velec);
1231 fscal = _mm_andnot_ps(dummy_mask,fscal);
1233 /* Update vectorial force */
1234 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1235 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1236 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1238 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1239 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1240 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1242 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1243 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1244 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1245 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1247 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1248 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1250 /* Inner loop uses 453 flops */
1253 /* End of innermost loop */
1255 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1256 f+i_coord_offset,fshift+i_shift_offset);
1259 /* Update potential energies */
1260 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1261 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1263 /* Increment number of inner iterations */
1264 inneriter += j_index_end - j_index_start;
1266 /* Outer loop uses 20 flops */
1269 /* Increment number of outer iterations */
1272 /* Update outer/inner flops */
1274 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*453);
1277 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_single
1278 * Electrostatics interaction: CubicSplineTable
1279 * VdW interaction: CubicSplineTable
1280 * Geometry: Water3-Water3
1281 * Calculate force/pot: Force
1284 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_single
1285 (t_nblist * gmx_restrict nlist,
1286 rvec * gmx_restrict xx,
1287 rvec * gmx_restrict ff,
1288 t_forcerec * gmx_restrict fr,
1289 t_mdatoms * gmx_restrict mdatoms,
1290 nb_kernel_data_t * gmx_restrict kernel_data,
1291 t_nrnb * gmx_restrict nrnb)
1293 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1294 * just 0 for non-waters.
1295 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, i.e. for the four different
1296 * jnr indices corresponding to data put in the four positions in the SIMD register.
1298 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1299 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1300 int jnrA,jnrB,jnrC,jnrD;
1301 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1302 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1303 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1304 real rcutoff_scalar;
1305 real *shiftvec,*fshift,*x,*f;
1306 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1307 real scratch[4*DIM];
1308 __m128 fscal,rcutoff,rcutoff2,jidxall;
1310 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1312 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1314 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1315 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1316 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1317 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1318 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1319 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1320 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1321 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1322 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1323 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1324 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1325 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1326 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1327 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1328 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1329 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1330 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1333 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1336 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1337 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1339 __m128i ifour = _mm_set1_epi32(4);
1340 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1342 __m128 dummy_mask,cutoff_mask;
1343 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1344 __m128 one = _mm_set1_ps(1.0);
1345 __m128 two = _mm_set1_ps(2.0);
1351 jindex = nlist->jindex;
1353 shiftidx = nlist->shift;
1355 shiftvec = fr->shift_vec[0];
1356 fshift = fr->fshift[0];
1357 facel = _mm_set1_ps(fr->epsfac);
1358 charge = mdatoms->chargeA;
1359 nvdwtype = fr->ntype;
1360 vdwparam = fr->nbfp;
1361 vdwtype = mdatoms->typeA;
1363 vftab = kernel_data->table_elec_vdw->data;
1364 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1366 /* Setup water-specific parameters */
1367 inr = nlist->iinr[0];
1368 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1369 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1370 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1371 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1373 jq0 = _mm_set1_ps(charge[inr+0]);
1374 jq1 = _mm_set1_ps(charge[inr+1]);
1375 jq2 = _mm_set1_ps(charge[inr+2]);
1376 vdwjidx0A = 2*vdwtype[inr+0];
1377 qq00 = _mm_mul_ps(iq0,jq0);
1378 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1379 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1380 qq01 = _mm_mul_ps(iq0,jq1);
1381 qq02 = _mm_mul_ps(iq0,jq2);
1382 qq10 = _mm_mul_ps(iq1,jq0);
1383 qq11 = _mm_mul_ps(iq1,jq1);
1384 qq12 = _mm_mul_ps(iq1,jq2);
1385 qq20 = _mm_mul_ps(iq2,jq0);
1386 qq21 = _mm_mul_ps(iq2,jq1);
1387 qq22 = _mm_mul_ps(iq2,jq2);
1389 /* Zero-initialize the j indices and coordinate offsets to silence compiler warnings */
1390 jnrA = jnrB = jnrC = jnrD = 0;
1391 j_coord_offsetA = 0;
1392 j_coord_offsetB = 0;
1393 j_coord_offsetC = 0;
1394 j_coord_offsetD = 0;
1399 for(iidx=0;iidx<4*DIM;iidx++)
1401 scratch[iidx] = 0.0;
1404 /* Start outer loop over neighborlists */
1405 for(iidx=0; iidx<nri; iidx++)
1407 /* Load shift vector for this list */
1408 i_shift_offset = DIM*shiftidx[iidx];
1410 /* Load limits for loop over neighbors */
1411 j_index_start = jindex[iidx];
1412 j_index_end = jindex[iidx+1];
1414 /* Get outer coordinate index */
1416 i_coord_offset = DIM*inr;
1418 /* Load i particle coords and add shift vector */
1419 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1420 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1422 fix0 = _mm_setzero_ps();
1423 fiy0 = _mm_setzero_ps();
1424 fiz0 = _mm_setzero_ps();
1425 fix1 = _mm_setzero_ps();
1426 fiy1 = _mm_setzero_ps();
1427 fiz1 = _mm_setzero_ps();
1428 fix2 = _mm_setzero_ps();
1429 fiy2 = _mm_setzero_ps();
1430 fiz2 = _mm_setzero_ps();
1432 /* Start inner kernel loop */
1433 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1436 /* Get j neighbor index, and coordinate index */
1438 jnrB = jjnr[jidx+1];
1439 jnrC = jjnr[jidx+2];
1440 jnrD = jjnr[jidx+3];
1441 j_coord_offsetA = DIM*jnrA;
1442 j_coord_offsetB = DIM*jnrB;
1443 j_coord_offsetC = DIM*jnrC;
1444 j_coord_offsetD = DIM*jnrD;
1446 /* load j atom coordinates */
1447 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1448 x+j_coord_offsetC,x+j_coord_offsetD,
1449 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1451 /* Calculate displacement vector */
1452 dx00 = _mm_sub_ps(ix0,jx0);
1453 dy00 = _mm_sub_ps(iy0,jy0);
1454 dz00 = _mm_sub_ps(iz0,jz0);
1455 dx01 = _mm_sub_ps(ix0,jx1);
1456 dy01 = _mm_sub_ps(iy0,jy1);
1457 dz01 = _mm_sub_ps(iz0,jz1);
1458 dx02 = _mm_sub_ps(ix0,jx2);
1459 dy02 = _mm_sub_ps(iy0,jy2);
1460 dz02 = _mm_sub_ps(iz0,jz2);
1461 dx10 = _mm_sub_ps(ix1,jx0);
1462 dy10 = _mm_sub_ps(iy1,jy0);
1463 dz10 = _mm_sub_ps(iz1,jz0);
1464 dx11 = _mm_sub_ps(ix1,jx1);
1465 dy11 = _mm_sub_ps(iy1,jy1);
1466 dz11 = _mm_sub_ps(iz1,jz1);
1467 dx12 = _mm_sub_ps(ix1,jx2);
1468 dy12 = _mm_sub_ps(iy1,jy2);
1469 dz12 = _mm_sub_ps(iz1,jz2);
1470 dx20 = _mm_sub_ps(ix2,jx0);
1471 dy20 = _mm_sub_ps(iy2,jy0);
1472 dz20 = _mm_sub_ps(iz2,jz0);
1473 dx21 = _mm_sub_ps(ix2,jx1);
1474 dy21 = _mm_sub_ps(iy2,jy1);
1475 dz21 = _mm_sub_ps(iz2,jz1);
1476 dx22 = _mm_sub_ps(ix2,jx2);
1477 dy22 = _mm_sub_ps(iy2,jy2);
1478 dz22 = _mm_sub_ps(iz2,jz2);
1480 /* Calculate squared distance and things based on it */
1481 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1482 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1483 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1484 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1485 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1486 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1487 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1488 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1489 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1491 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1492 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1493 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1494 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1495 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1496 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1497 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1498 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1499 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1501 fjx0 = _mm_setzero_ps();
1502 fjy0 = _mm_setzero_ps();
1503 fjz0 = _mm_setzero_ps();
1504 fjx1 = _mm_setzero_ps();
1505 fjy1 = _mm_setzero_ps();
1506 fjz1 = _mm_setzero_ps();
1507 fjx2 = _mm_setzero_ps();
1508 fjy2 = _mm_setzero_ps();
1509 fjz2 = _mm_setzero_ps();
1511 /**************************
1512 * CALCULATE INTERACTIONS *
1513 **************************/
1515 r00 = _mm_mul_ps(rsq00,rinv00);
1517 /* Calculate table index by multiplying r with table scale and truncate to integer */
1518 rt = _mm_mul_ps(r00,vftabscale);
1519 vfitab = _mm_cvttps_epi32(rt);
1521 vfeps = _mm_frcz_ps(rt);
1523 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1525 twovfeps = _mm_add_ps(vfeps,vfeps);
1526 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1528 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1529 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1530 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1531 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1532 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1533 _MM_TRANSPOSE4_PS(Y,F,G,H);
1534 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1535 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1536 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1538 /* CUBIC SPLINE TABLE DISPERSION */
1539 vfitab = _mm_add_epi32(vfitab,ifour);
1540 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1541 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1542 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1543 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1544 _MM_TRANSPOSE4_PS(Y,F,G,H);
1545 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1546 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1547 fvdw6 = _mm_mul_ps(c6_00,FF);
1549 /* CUBIC SPLINE TABLE REPULSION */
1550 vfitab = _mm_add_epi32(vfitab,ifour);
1551 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1552 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1553 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1554 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1555 _MM_TRANSPOSE4_PS(Y,F,G,H);
1556 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1557 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1558 fvdw12 = _mm_mul_ps(c12_00,FF);
1559 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1561 fscal = _mm_add_ps(felec,fvdw);
1563 /* Update vectorial force */
1564 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1565 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1566 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1568 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1569 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1570 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1572 /**************************
1573 * CALCULATE INTERACTIONS *
1574 **************************/
1576 r01 = _mm_mul_ps(rsq01,rinv01);
1578 /* Calculate table index by multiplying r with table scale and truncate to integer */
1579 rt = _mm_mul_ps(r01,vftabscale);
1580 vfitab = _mm_cvttps_epi32(rt);
1582 vfeps = _mm_frcz_ps(rt);
1584 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1586 twovfeps = _mm_add_ps(vfeps,vfeps);
1587 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1589 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1590 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1591 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1592 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1593 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1594 _MM_TRANSPOSE4_PS(Y,F,G,H);
1595 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1596 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1597 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1601 /* Update vectorial force */
1602 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1603 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1604 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1606 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1607 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1608 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1610 /**************************
1611 * CALCULATE INTERACTIONS *
1612 **************************/
1614 r02 = _mm_mul_ps(rsq02,rinv02);
1616 /* Calculate table index by multiplying r with table scale and truncate to integer */
1617 rt = _mm_mul_ps(r02,vftabscale);
1618 vfitab = _mm_cvttps_epi32(rt);
1620 vfeps = _mm_frcz_ps(rt);
1622 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1624 twovfeps = _mm_add_ps(vfeps,vfeps);
1625 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1627 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1628 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1629 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1630 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1631 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1632 _MM_TRANSPOSE4_PS(Y,F,G,H);
1633 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1634 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1635 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1639 /* Update vectorial force */
1640 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1641 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1642 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1644 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1645 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1646 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 r10 = _mm_mul_ps(rsq10,rinv10);
1654 /* Calculate table index by multiplying r with table scale and truncate to integer */
1655 rt = _mm_mul_ps(r10,vftabscale);
1656 vfitab = _mm_cvttps_epi32(rt);
1658 vfeps = _mm_frcz_ps(rt);
1660 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1662 twovfeps = _mm_add_ps(vfeps,vfeps);
1663 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1665 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1666 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1667 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1668 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1669 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1670 _MM_TRANSPOSE4_PS(Y,F,G,H);
1671 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1672 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1673 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1677 /* Update vectorial force */
1678 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1679 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1680 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1682 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1683 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1684 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1686 /**************************
1687 * CALCULATE INTERACTIONS *
1688 **************************/
1690 r11 = _mm_mul_ps(rsq11,rinv11);
1692 /* Calculate table index by multiplying r with table scale and truncate to integer */
1693 rt = _mm_mul_ps(r11,vftabscale);
1694 vfitab = _mm_cvttps_epi32(rt);
1696 vfeps = _mm_frcz_ps(rt);
1698 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1700 twovfeps = _mm_add_ps(vfeps,vfeps);
1701 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1703 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1704 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1705 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1706 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1707 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1708 _MM_TRANSPOSE4_PS(Y,F,G,H);
1709 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1710 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1711 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1715 /* Update vectorial force */
1716 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1717 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1718 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1720 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1721 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1722 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1724 /**************************
1725 * CALCULATE INTERACTIONS *
1726 **************************/
1728 r12 = _mm_mul_ps(rsq12,rinv12);
1730 /* Calculate table index by multiplying r with table scale and truncate to integer */
1731 rt = _mm_mul_ps(r12,vftabscale);
1732 vfitab = _mm_cvttps_epi32(rt);
1734 vfeps = _mm_frcz_ps(rt);
1736 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1738 twovfeps = _mm_add_ps(vfeps,vfeps);
1739 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1741 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1742 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1743 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1744 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1745 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1746 _MM_TRANSPOSE4_PS(Y,F,G,H);
1747 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1748 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1749 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1753 /* Update vectorial force */
1754 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1755 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1756 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1758 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1759 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1760 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1762 /**************************
1763 * CALCULATE INTERACTIONS *
1764 **************************/
1766 r20 = _mm_mul_ps(rsq20,rinv20);
1768 /* Calculate table index by multiplying r with table scale and truncate to integer */
1769 rt = _mm_mul_ps(r20,vftabscale);
1770 vfitab = _mm_cvttps_epi32(rt);
1772 vfeps = _mm_frcz_ps(rt);
1774 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1776 twovfeps = _mm_add_ps(vfeps,vfeps);
1777 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1779 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1780 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1781 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1782 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1783 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1784 _MM_TRANSPOSE4_PS(Y,F,G,H);
1785 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1786 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1787 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1791 /* Update vectorial force */
1792 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1793 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1794 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1796 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1797 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1798 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1800 /**************************
1801 * CALCULATE INTERACTIONS *
1802 **************************/
1804 r21 = _mm_mul_ps(rsq21,rinv21);
1806 /* Calculate table index by multiplying r with table scale and truncate to integer */
1807 rt = _mm_mul_ps(r21,vftabscale);
1808 vfitab = _mm_cvttps_epi32(rt);
1810 vfeps = _mm_frcz_ps(rt);
1812 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1814 twovfeps = _mm_add_ps(vfeps,vfeps);
1815 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1817 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1818 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1819 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1820 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1821 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1822 _MM_TRANSPOSE4_PS(Y,F,G,H);
1823 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1824 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1825 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1829 /* Update vectorial force */
1830 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1831 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1832 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1834 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1835 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1836 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1838 /**************************
1839 * CALCULATE INTERACTIONS *
1840 **************************/
1842 r22 = _mm_mul_ps(rsq22,rinv22);
1844 /* Calculate table index by multiplying r with table scale and truncate to integer */
1845 rt = _mm_mul_ps(r22,vftabscale);
1846 vfitab = _mm_cvttps_epi32(rt);
1848 vfeps = _mm_frcz_ps(rt);
1850 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1852 twovfeps = _mm_add_ps(vfeps,vfeps);
1853 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1855 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1856 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1857 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1858 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1859 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1860 _MM_TRANSPOSE4_PS(Y,F,G,H);
1861 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1862 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1863 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1867 /* Update vectorial force */
1868 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1869 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1870 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1872 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1873 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1874 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1876 fjptrA = f+j_coord_offsetA;
1877 fjptrB = f+j_coord_offsetB;
1878 fjptrC = f+j_coord_offsetC;
1879 fjptrD = f+j_coord_offsetD;
1881 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1882 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1884 /* Inner loop uses 400 flops */
1887 if(jidx<j_index_end)
1890 /* Get j neighbor index, and coordinate index */
1891 jnrlistA = jjnr[jidx];
1892 jnrlistB = jjnr[jidx+1];
1893 jnrlistC = jjnr[jidx+2];
1894 jnrlistD = jjnr[jidx+3];
1895 /* Sign of each element will be negative for non-real atoms.
1896 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1897 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1899 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1900 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1901 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1902 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1903 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1904 j_coord_offsetA = DIM*jnrA;
1905 j_coord_offsetB = DIM*jnrB;
1906 j_coord_offsetC = DIM*jnrC;
1907 j_coord_offsetD = DIM*jnrD;
1909 /* load j atom coordinates */
1910 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1911 x+j_coord_offsetC,x+j_coord_offsetD,
1912 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1914 /* Calculate displacement vector */
1915 dx00 = _mm_sub_ps(ix0,jx0);
1916 dy00 = _mm_sub_ps(iy0,jy0);
1917 dz00 = _mm_sub_ps(iz0,jz0);
1918 dx01 = _mm_sub_ps(ix0,jx1);
1919 dy01 = _mm_sub_ps(iy0,jy1);
1920 dz01 = _mm_sub_ps(iz0,jz1);
1921 dx02 = _mm_sub_ps(ix0,jx2);
1922 dy02 = _mm_sub_ps(iy0,jy2);
1923 dz02 = _mm_sub_ps(iz0,jz2);
1924 dx10 = _mm_sub_ps(ix1,jx0);
1925 dy10 = _mm_sub_ps(iy1,jy0);
1926 dz10 = _mm_sub_ps(iz1,jz0);
1927 dx11 = _mm_sub_ps(ix1,jx1);
1928 dy11 = _mm_sub_ps(iy1,jy1);
1929 dz11 = _mm_sub_ps(iz1,jz1);
1930 dx12 = _mm_sub_ps(ix1,jx2);
1931 dy12 = _mm_sub_ps(iy1,jy2);
1932 dz12 = _mm_sub_ps(iz1,jz2);
1933 dx20 = _mm_sub_ps(ix2,jx0);
1934 dy20 = _mm_sub_ps(iy2,jy0);
1935 dz20 = _mm_sub_ps(iz2,jz0);
1936 dx21 = _mm_sub_ps(ix2,jx1);
1937 dy21 = _mm_sub_ps(iy2,jy1);
1938 dz21 = _mm_sub_ps(iz2,jz1);
1939 dx22 = _mm_sub_ps(ix2,jx2);
1940 dy22 = _mm_sub_ps(iy2,jy2);
1941 dz22 = _mm_sub_ps(iz2,jz2);
1943 /* Calculate squared distance and things based on it */
1944 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1945 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1946 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1947 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1948 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1949 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1950 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1951 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1952 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1954 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1955 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1956 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1957 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1958 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1959 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1960 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1961 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1962 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1964 fjx0 = _mm_setzero_ps();
1965 fjy0 = _mm_setzero_ps();
1966 fjz0 = _mm_setzero_ps();
1967 fjx1 = _mm_setzero_ps();
1968 fjy1 = _mm_setzero_ps();
1969 fjz1 = _mm_setzero_ps();
1970 fjx2 = _mm_setzero_ps();
1971 fjy2 = _mm_setzero_ps();
1972 fjz2 = _mm_setzero_ps();
1974 /**************************
1975 * CALCULATE INTERACTIONS *
1976 **************************/
1978 r00 = _mm_mul_ps(rsq00,rinv00);
1979 r00 = _mm_andnot_ps(dummy_mask,r00);
1981 /* Calculate table index by multiplying r with table scale and truncate to integer */
1982 rt = _mm_mul_ps(r00,vftabscale);
1983 vfitab = _mm_cvttps_epi32(rt);
1985 vfeps = _mm_frcz_ps(rt);
1987 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1989 twovfeps = _mm_add_ps(vfeps,vfeps);
1990 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1992 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1993 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1994 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1995 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1996 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1997 _MM_TRANSPOSE4_PS(Y,F,G,H);
1998 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1999 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2000 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
2002 /* CUBIC SPLINE TABLE DISPERSION */
2003 vfitab = _mm_add_epi32(vfitab,ifour);
2004 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2005 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2006 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2007 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2008 _MM_TRANSPOSE4_PS(Y,F,G,H);
2009 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2010 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2011 fvdw6 = _mm_mul_ps(c6_00,FF);
2013 /* CUBIC SPLINE TABLE REPULSION */
2014 vfitab = _mm_add_epi32(vfitab,ifour);
2015 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2016 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2017 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2018 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2019 _MM_TRANSPOSE4_PS(Y,F,G,H);
2020 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2021 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2022 fvdw12 = _mm_mul_ps(c12_00,FF);
2023 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2025 fscal = _mm_add_ps(felec,fvdw);
2027 fscal = _mm_andnot_ps(dummy_mask,fscal);
2029 /* Update vectorial force */
2030 fix0 = _mm_macc_ps(dx00,fscal,fix0);
2031 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
2032 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
2034 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
2035 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
2036 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
2038 /**************************
2039 * CALCULATE INTERACTIONS *
2040 **************************/
2042 r01 = _mm_mul_ps(rsq01,rinv01);
2043 r01 = _mm_andnot_ps(dummy_mask,r01);
2045 /* Calculate table index by multiplying r with table scale and truncate to integer */
2046 rt = _mm_mul_ps(r01,vftabscale);
2047 vfitab = _mm_cvttps_epi32(rt);
2049 vfeps = _mm_frcz_ps(rt);
2051 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2053 twovfeps = _mm_add_ps(vfeps,vfeps);
2054 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2056 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2057 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2058 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2059 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2060 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2061 _MM_TRANSPOSE4_PS(Y,F,G,H);
2062 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2063 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2064 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2068 fscal = _mm_andnot_ps(dummy_mask,fscal);
2070 /* Update vectorial force */
2071 fix0 = _mm_macc_ps(dx01,fscal,fix0);
2072 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
2073 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
2075 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
2076 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
2077 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
2079 /**************************
2080 * CALCULATE INTERACTIONS *
2081 **************************/
2083 r02 = _mm_mul_ps(rsq02,rinv02);
2084 r02 = _mm_andnot_ps(dummy_mask,r02);
2086 /* Calculate table index by multiplying r with table scale and truncate to integer */
2087 rt = _mm_mul_ps(r02,vftabscale);
2088 vfitab = _mm_cvttps_epi32(rt);
2090 vfeps = _mm_frcz_ps(rt);
2092 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2094 twovfeps = _mm_add_ps(vfeps,vfeps);
2095 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2097 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2098 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2099 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2100 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2101 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2102 _MM_TRANSPOSE4_PS(Y,F,G,H);
2103 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2104 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2105 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2109 fscal = _mm_andnot_ps(dummy_mask,fscal);
2111 /* Update vectorial force */
2112 fix0 = _mm_macc_ps(dx02,fscal,fix0);
2113 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
2114 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
2116 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
2117 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
2118 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
2120 /**************************
2121 * CALCULATE INTERACTIONS *
2122 **************************/
2124 r10 = _mm_mul_ps(rsq10,rinv10);
2125 r10 = _mm_andnot_ps(dummy_mask,r10);
2127 /* Calculate table index by multiplying r with table scale and truncate to integer */
2128 rt = _mm_mul_ps(r10,vftabscale);
2129 vfitab = _mm_cvttps_epi32(rt);
2131 vfeps = _mm_frcz_ps(rt);
2133 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2135 twovfeps = _mm_add_ps(vfeps,vfeps);
2136 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2138 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2139 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2140 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2141 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2142 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2143 _MM_TRANSPOSE4_PS(Y,F,G,H);
2144 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2145 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2146 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2150 fscal = _mm_andnot_ps(dummy_mask,fscal);
2152 /* Update vectorial force */
2153 fix1 = _mm_macc_ps(dx10,fscal,fix1);
2154 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
2155 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
2157 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
2158 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
2159 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
2161 /**************************
2162 * CALCULATE INTERACTIONS *
2163 **************************/
2165 r11 = _mm_mul_ps(rsq11,rinv11);
2166 r11 = _mm_andnot_ps(dummy_mask,r11);
2168 /* Calculate table index by multiplying r with table scale and truncate to integer */
2169 rt = _mm_mul_ps(r11,vftabscale);
2170 vfitab = _mm_cvttps_epi32(rt);
2172 vfeps = _mm_frcz_ps(rt);
2174 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2176 twovfeps = _mm_add_ps(vfeps,vfeps);
2177 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2179 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2180 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2181 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2182 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2183 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2184 _MM_TRANSPOSE4_PS(Y,F,G,H);
2185 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2186 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2187 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2191 fscal = _mm_andnot_ps(dummy_mask,fscal);
2193 /* Update vectorial force */
2194 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2195 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2196 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2198 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2199 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2200 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2202 /**************************
2203 * CALCULATE INTERACTIONS *
2204 **************************/
2206 r12 = _mm_mul_ps(rsq12,rinv12);
2207 r12 = _mm_andnot_ps(dummy_mask,r12);
2209 /* Calculate table index by multiplying r with table scale and truncate to integer */
2210 rt = _mm_mul_ps(r12,vftabscale);
2211 vfitab = _mm_cvttps_epi32(rt);
2213 vfeps = _mm_frcz_ps(rt);
2215 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2217 twovfeps = _mm_add_ps(vfeps,vfeps);
2218 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2220 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2221 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2222 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2223 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2224 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2225 _MM_TRANSPOSE4_PS(Y,F,G,H);
2226 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2227 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2228 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2232 fscal = _mm_andnot_ps(dummy_mask,fscal);
2234 /* Update vectorial force */
2235 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2236 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2237 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2239 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2240 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2241 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2243 /**************************
2244 * CALCULATE INTERACTIONS *
2245 **************************/
2247 r20 = _mm_mul_ps(rsq20,rinv20);
2248 r20 = _mm_andnot_ps(dummy_mask,r20);
2250 /* Calculate table index by multiplying r with table scale and truncate to integer */
2251 rt = _mm_mul_ps(r20,vftabscale);
2252 vfitab = _mm_cvttps_epi32(rt);
2254 vfeps = _mm_frcz_ps(rt);
2256 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2258 twovfeps = _mm_add_ps(vfeps,vfeps);
2259 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2261 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2262 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2263 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2264 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2265 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2266 _MM_TRANSPOSE4_PS(Y,F,G,H);
2267 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2268 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2269 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2273 fscal = _mm_andnot_ps(dummy_mask,fscal);
2275 /* Update vectorial force */
2276 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2277 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2278 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2280 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2281 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2282 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2284 /**************************
2285 * CALCULATE INTERACTIONS *
2286 **************************/
2288 r21 = _mm_mul_ps(rsq21,rinv21);
2289 r21 = _mm_andnot_ps(dummy_mask,r21);
2291 /* Calculate table index by multiplying r with table scale and truncate to integer */
2292 rt = _mm_mul_ps(r21,vftabscale);
2293 vfitab = _mm_cvttps_epi32(rt);
2295 vfeps = _mm_frcz_ps(rt);
2297 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2299 twovfeps = _mm_add_ps(vfeps,vfeps);
2300 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2302 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2303 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2304 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2305 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2306 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2307 _MM_TRANSPOSE4_PS(Y,F,G,H);
2308 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2309 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2310 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2314 fscal = _mm_andnot_ps(dummy_mask,fscal);
2316 /* Update vectorial force */
2317 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2318 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2319 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2321 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2322 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2323 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2325 /**************************
2326 * CALCULATE INTERACTIONS *
2327 **************************/
2329 r22 = _mm_mul_ps(rsq22,rinv22);
2330 r22 = _mm_andnot_ps(dummy_mask,r22);
2332 /* Calculate table index by multiplying r with table scale and truncate to integer */
2333 rt = _mm_mul_ps(r22,vftabscale);
2334 vfitab = _mm_cvttps_epi32(rt);
2336 vfeps = _mm_frcz_ps(rt);
2338 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2340 twovfeps = _mm_add_ps(vfeps,vfeps);
2341 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2343 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2344 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2345 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2346 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2347 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2348 _MM_TRANSPOSE4_PS(Y,F,G,H);
2349 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2350 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2351 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2355 fscal = _mm_andnot_ps(dummy_mask,fscal);
2357 /* Update vectorial force */
2358 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2359 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2360 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2362 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2363 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2364 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2366 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2367 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2368 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2369 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2371 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2372 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2374 /* Inner loop uses 409 flops */
2377 /* End of innermost loop */
2379 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2380 f+i_coord_offset,fshift+i_shift_offset);
2382 /* Increment number of inner iterations */
2383 inneriter += j_index_end - j_index_start;
2385 /* Outer loop uses 18 flops */
2388 /* Increment number of outer iterations */
2391 /* Update outer/inner flops */
2393 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*409);