2 * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_128_fma_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: None
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_128_fma_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
72 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
74 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
75 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
76 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
77 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
78 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
79 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
80 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
81 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
82 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
83 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
84 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
85 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
86 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
87 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
88 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
89 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128i ifour = _mm_set1_epi32(4);
94 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
96 __m128 dummy_mask,cutoff_mask;
97 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
98 __m128 one = _mm_set1_ps(1.0);
99 __m128 two = _mm_set1_ps(2.0);
105 jindex = nlist->jindex;
107 shiftidx = nlist->shift;
109 shiftvec = fr->shift_vec[0];
110 fshift = fr->fshift[0];
111 facel = _mm_set1_ps(fr->epsfac);
112 charge = mdatoms->chargeA;
114 vftab = kernel_data->table_elec->data;
115 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
117 /* Setup water-specific parameters */
118 inr = nlist->iinr[0];
119 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
120 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
121 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
123 jq1 = _mm_set1_ps(charge[inr+1]);
124 jq2 = _mm_set1_ps(charge[inr+2]);
125 jq3 = _mm_set1_ps(charge[inr+3]);
126 qq11 = _mm_mul_ps(iq1,jq1);
127 qq12 = _mm_mul_ps(iq1,jq2);
128 qq13 = _mm_mul_ps(iq1,jq3);
129 qq21 = _mm_mul_ps(iq2,jq1);
130 qq22 = _mm_mul_ps(iq2,jq2);
131 qq23 = _mm_mul_ps(iq2,jq3);
132 qq31 = _mm_mul_ps(iq3,jq1);
133 qq32 = _mm_mul_ps(iq3,jq2);
134 qq33 = _mm_mul_ps(iq3,jq3);
136 /* Avoid stupid compiler warnings */
137 jnrA = jnrB = jnrC = jnrD = 0;
146 for(iidx=0;iidx<4*DIM;iidx++)
151 /* Start outer loop over neighborlists */
152 for(iidx=0; iidx<nri; iidx++)
154 /* Load shift vector for this list */
155 i_shift_offset = DIM*shiftidx[iidx];
157 /* Load limits for loop over neighbors */
158 j_index_start = jindex[iidx];
159 j_index_end = jindex[iidx+1];
161 /* Get outer coordinate index */
163 i_coord_offset = DIM*inr;
165 /* Load i particle coords and add shift vector */
166 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
167 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
169 fix1 = _mm_setzero_ps();
170 fiy1 = _mm_setzero_ps();
171 fiz1 = _mm_setzero_ps();
172 fix2 = _mm_setzero_ps();
173 fiy2 = _mm_setzero_ps();
174 fiz2 = _mm_setzero_ps();
175 fix3 = _mm_setzero_ps();
176 fiy3 = _mm_setzero_ps();
177 fiz3 = _mm_setzero_ps();
179 /* Reset potential sums */
180 velecsum = _mm_setzero_ps();
182 /* Start inner kernel loop */
183 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
186 /* Get j neighbor index, and coordinate index */
191 j_coord_offsetA = DIM*jnrA;
192 j_coord_offsetB = DIM*jnrB;
193 j_coord_offsetC = DIM*jnrC;
194 j_coord_offsetD = DIM*jnrD;
196 /* load j atom coordinates */
197 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
198 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
199 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
201 /* Calculate displacement vector */
202 dx11 = _mm_sub_ps(ix1,jx1);
203 dy11 = _mm_sub_ps(iy1,jy1);
204 dz11 = _mm_sub_ps(iz1,jz1);
205 dx12 = _mm_sub_ps(ix1,jx2);
206 dy12 = _mm_sub_ps(iy1,jy2);
207 dz12 = _mm_sub_ps(iz1,jz2);
208 dx13 = _mm_sub_ps(ix1,jx3);
209 dy13 = _mm_sub_ps(iy1,jy3);
210 dz13 = _mm_sub_ps(iz1,jz3);
211 dx21 = _mm_sub_ps(ix2,jx1);
212 dy21 = _mm_sub_ps(iy2,jy1);
213 dz21 = _mm_sub_ps(iz2,jz1);
214 dx22 = _mm_sub_ps(ix2,jx2);
215 dy22 = _mm_sub_ps(iy2,jy2);
216 dz22 = _mm_sub_ps(iz2,jz2);
217 dx23 = _mm_sub_ps(ix2,jx3);
218 dy23 = _mm_sub_ps(iy2,jy3);
219 dz23 = _mm_sub_ps(iz2,jz3);
220 dx31 = _mm_sub_ps(ix3,jx1);
221 dy31 = _mm_sub_ps(iy3,jy1);
222 dz31 = _mm_sub_ps(iz3,jz1);
223 dx32 = _mm_sub_ps(ix3,jx2);
224 dy32 = _mm_sub_ps(iy3,jy2);
225 dz32 = _mm_sub_ps(iz3,jz2);
226 dx33 = _mm_sub_ps(ix3,jx3);
227 dy33 = _mm_sub_ps(iy3,jy3);
228 dz33 = _mm_sub_ps(iz3,jz3);
230 /* Calculate squared distance and things based on it */
231 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
232 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
233 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
234 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
235 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
236 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
237 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
238 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
239 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
241 rinv11 = gmx_mm_invsqrt_ps(rsq11);
242 rinv12 = gmx_mm_invsqrt_ps(rsq12);
243 rinv13 = gmx_mm_invsqrt_ps(rsq13);
244 rinv21 = gmx_mm_invsqrt_ps(rsq21);
245 rinv22 = gmx_mm_invsqrt_ps(rsq22);
246 rinv23 = gmx_mm_invsqrt_ps(rsq23);
247 rinv31 = gmx_mm_invsqrt_ps(rsq31);
248 rinv32 = gmx_mm_invsqrt_ps(rsq32);
249 rinv33 = gmx_mm_invsqrt_ps(rsq33);
251 fjx1 = _mm_setzero_ps();
252 fjy1 = _mm_setzero_ps();
253 fjz1 = _mm_setzero_ps();
254 fjx2 = _mm_setzero_ps();
255 fjy2 = _mm_setzero_ps();
256 fjz2 = _mm_setzero_ps();
257 fjx3 = _mm_setzero_ps();
258 fjy3 = _mm_setzero_ps();
259 fjz3 = _mm_setzero_ps();
261 /**************************
262 * CALCULATE INTERACTIONS *
263 **************************/
265 r11 = _mm_mul_ps(rsq11,rinv11);
267 /* Calculate table index by multiplying r with table scale and truncate to integer */
268 rt = _mm_mul_ps(r11,vftabscale);
269 vfitab = _mm_cvttps_epi32(rt);
271 vfeps = _mm_frcz_ps(rt);
273 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
275 twovfeps = _mm_add_ps(vfeps,vfeps);
276 vfitab = _mm_slli_epi32(vfitab,2);
278 /* CUBIC SPLINE TABLE ELECTROSTATICS */
279 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
280 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
281 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
282 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
283 _MM_TRANSPOSE4_PS(Y,F,G,H);
284 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
285 VV = _mm_macc_ps(vfeps,Fp,Y);
286 velec = _mm_mul_ps(qq11,VV);
287 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
288 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
290 /* Update potential sum for this i atom from the interaction with this j atom. */
291 velecsum = _mm_add_ps(velecsum,velec);
295 /* Update vectorial force */
296 fix1 = _mm_macc_ps(dx11,fscal,fix1);
297 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
298 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
300 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
301 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
302 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
304 /**************************
305 * CALCULATE INTERACTIONS *
306 **************************/
308 r12 = _mm_mul_ps(rsq12,rinv12);
310 /* Calculate table index by multiplying r with table scale and truncate to integer */
311 rt = _mm_mul_ps(r12,vftabscale);
312 vfitab = _mm_cvttps_epi32(rt);
314 vfeps = _mm_frcz_ps(rt);
316 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
318 twovfeps = _mm_add_ps(vfeps,vfeps);
319 vfitab = _mm_slli_epi32(vfitab,2);
321 /* CUBIC SPLINE TABLE ELECTROSTATICS */
322 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
323 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
324 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
325 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
326 _MM_TRANSPOSE4_PS(Y,F,G,H);
327 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
328 VV = _mm_macc_ps(vfeps,Fp,Y);
329 velec = _mm_mul_ps(qq12,VV);
330 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
331 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
333 /* Update potential sum for this i atom from the interaction with this j atom. */
334 velecsum = _mm_add_ps(velecsum,velec);
338 /* Update vectorial force */
339 fix1 = _mm_macc_ps(dx12,fscal,fix1);
340 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
341 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
343 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
344 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
345 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
347 /**************************
348 * CALCULATE INTERACTIONS *
349 **************************/
351 r13 = _mm_mul_ps(rsq13,rinv13);
353 /* Calculate table index by multiplying r with table scale and truncate to integer */
354 rt = _mm_mul_ps(r13,vftabscale);
355 vfitab = _mm_cvttps_epi32(rt);
357 vfeps = _mm_frcz_ps(rt);
359 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
361 twovfeps = _mm_add_ps(vfeps,vfeps);
362 vfitab = _mm_slli_epi32(vfitab,2);
364 /* CUBIC SPLINE TABLE ELECTROSTATICS */
365 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
366 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
367 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
368 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
369 _MM_TRANSPOSE4_PS(Y,F,G,H);
370 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
371 VV = _mm_macc_ps(vfeps,Fp,Y);
372 velec = _mm_mul_ps(qq13,VV);
373 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
374 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
376 /* Update potential sum for this i atom from the interaction with this j atom. */
377 velecsum = _mm_add_ps(velecsum,velec);
381 /* Update vectorial force */
382 fix1 = _mm_macc_ps(dx13,fscal,fix1);
383 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
384 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
386 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
387 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
388 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
390 /**************************
391 * CALCULATE INTERACTIONS *
392 **************************/
394 r21 = _mm_mul_ps(rsq21,rinv21);
396 /* Calculate table index by multiplying r with table scale and truncate to integer */
397 rt = _mm_mul_ps(r21,vftabscale);
398 vfitab = _mm_cvttps_epi32(rt);
400 vfeps = _mm_frcz_ps(rt);
402 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
404 twovfeps = _mm_add_ps(vfeps,vfeps);
405 vfitab = _mm_slli_epi32(vfitab,2);
407 /* CUBIC SPLINE TABLE ELECTROSTATICS */
408 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
409 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
410 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
411 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
412 _MM_TRANSPOSE4_PS(Y,F,G,H);
413 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
414 VV = _mm_macc_ps(vfeps,Fp,Y);
415 velec = _mm_mul_ps(qq21,VV);
416 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
417 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
419 /* Update potential sum for this i atom from the interaction with this j atom. */
420 velecsum = _mm_add_ps(velecsum,velec);
424 /* Update vectorial force */
425 fix2 = _mm_macc_ps(dx21,fscal,fix2);
426 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
427 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
429 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
430 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
431 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
433 /**************************
434 * CALCULATE INTERACTIONS *
435 **************************/
437 r22 = _mm_mul_ps(rsq22,rinv22);
439 /* Calculate table index by multiplying r with table scale and truncate to integer */
440 rt = _mm_mul_ps(r22,vftabscale);
441 vfitab = _mm_cvttps_epi32(rt);
443 vfeps = _mm_frcz_ps(rt);
445 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
447 twovfeps = _mm_add_ps(vfeps,vfeps);
448 vfitab = _mm_slli_epi32(vfitab,2);
450 /* CUBIC SPLINE TABLE ELECTROSTATICS */
451 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
452 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
453 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
454 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
455 _MM_TRANSPOSE4_PS(Y,F,G,H);
456 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
457 VV = _mm_macc_ps(vfeps,Fp,Y);
458 velec = _mm_mul_ps(qq22,VV);
459 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
460 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
462 /* Update potential sum for this i atom from the interaction with this j atom. */
463 velecsum = _mm_add_ps(velecsum,velec);
467 /* Update vectorial force */
468 fix2 = _mm_macc_ps(dx22,fscal,fix2);
469 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
470 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
472 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
473 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
474 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
476 /**************************
477 * CALCULATE INTERACTIONS *
478 **************************/
480 r23 = _mm_mul_ps(rsq23,rinv23);
482 /* Calculate table index by multiplying r with table scale and truncate to integer */
483 rt = _mm_mul_ps(r23,vftabscale);
484 vfitab = _mm_cvttps_epi32(rt);
486 vfeps = _mm_frcz_ps(rt);
488 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
490 twovfeps = _mm_add_ps(vfeps,vfeps);
491 vfitab = _mm_slli_epi32(vfitab,2);
493 /* CUBIC SPLINE TABLE ELECTROSTATICS */
494 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
495 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
496 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
497 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
498 _MM_TRANSPOSE4_PS(Y,F,G,H);
499 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
500 VV = _mm_macc_ps(vfeps,Fp,Y);
501 velec = _mm_mul_ps(qq23,VV);
502 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
503 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
505 /* Update potential sum for this i atom from the interaction with this j atom. */
506 velecsum = _mm_add_ps(velecsum,velec);
510 /* Update vectorial force */
511 fix2 = _mm_macc_ps(dx23,fscal,fix2);
512 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
513 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
515 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
516 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
517 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 r31 = _mm_mul_ps(rsq31,rinv31);
525 /* Calculate table index by multiplying r with table scale and truncate to integer */
526 rt = _mm_mul_ps(r31,vftabscale);
527 vfitab = _mm_cvttps_epi32(rt);
529 vfeps = _mm_frcz_ps(rt);
531 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
533 twovfeps = _mm_add_ps(vfeps,vfeps);
534 vfitab = _mm_slli_epi32(vfitab,2);
536 /* CUBIC SPLINE TABLE ELECTROSTATICS */
537 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
538 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
539 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
540 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
541 _MM_TRANSPOSE4_PS(Y,F,G,H);
542 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
543 VV = _mm_macc_ps(vfeps,Fp,Y);
544 velec = _mm_mul_ps(qq31,VV);
545 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
546 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
548 /* Update potential sum for this i atom from the interaction with this j atom. */
549 velecsum = _mm_add_ps(velecsum,velec);
553 /* Update vectorial force */
554 fix3 = _mm_macc_ps(dx31,fscal,fix3);
555 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
556 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
558 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
559 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
560 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
562 /**************************
563 * CALCULATE INTERACTIONS *
564 **************************/
566 r32 = _mm_mul_ps(rsq32,rinv32);
568 /* Calculate table index by multiplying r with table scale and truncate to integer */
569 rt = _mm_mul_ps(r32,vftabscale);
570 vfitab = _mm_cvttps_epi32(rt);
572 vfeps = _mm_frcz_ps(rt);
574 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
576 twovfeps = _mm_add_ps(vfeps,vfeps);
577 vfitab = _mm_slli_epi32(vfitab,2);
579 /* CUBIC SPLINE TABLE ELECTROSTATICS */
580 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
581 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
582 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
583 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
584 _MM_TRANSPOSE4_PS(Y,F,G,H);
585 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
586 VV = _mm_macc_ps(vfeps,Fp,Y);
587 velec = _mm_mul_ps(qq32,VV);
588 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
589 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
591 /* Update potential sum for this i atom from the interaction with this j atom. */
592 velecsum = _mm_add_ps(velecsum,velec);
596 /* Update vectorial force */
597 fix3 = _mm_macc_ps(dx32,fscal,fix3);
598 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
599 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
601 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
602 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
603 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
605 /**************************
606 * CALCULATE INTERACTIONS *
607 **************************/
609 r33 = _mm_mul_ps(rsq33,rinv33);
611 /* Calculate table index by multiplying r with table scale and truncate to integer */
612 rt = _mm_mul_ps(r33,vftabscale);
613 vfitab = _mm_cvttps_epi32(rt);
615 vfeps = _mm_frcz_ps(rt);
617 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
619 twovfeps = _mm_add_ps(vfeps,vfeps);
620 vfitab = _mm_slli_epi32(vfitab,2);
622 /* CUBIC SPLINE TABLE ELECTROSTATICS */
623 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
624 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
625 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
626 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
627 _MM_TRANSPOSE4_PS(Y,F,G,H);
628 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
629 VV = _mm_macc_ps(vfeps,Fp,Y);
630 velec = _mm_mul_ps(qq33,VV);
631 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
632 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
634 /* Update potential sum for this i atom from the interaction with this j atom. */
635 velecsum = _mm_add_ps(velecsum,velec);
639 /* Update vectorial force */
640 fix3 = _mm_macc_ps(dx33,fscal,fix3);
641 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
642 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
644 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
645 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
646 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
648 fjptrA = f+j_coord_offsetA;
649 fjptrB = f+j_coord_offsetB;
650 fjptrC = f+j_coord_offsetC;
651 fjptrD = f+j_coord_offsetD;
653 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
654 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
656 /* Inner loop uses 414 flops */
662 /* Get j neighbor index, and coordinate index */
663 jnrlistA = jjnr[jidx];
664 jnrlistB = jjnr[jidx+1];
665 jnrlistC = jjnr[jidx+2];
666 jnrlistD = jjnr[jidx+3];
667 /* Sign of each element will be negative for non-real atoms.
668 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
669 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
671 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
672 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
673 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
674 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
675 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
676 j_coord_offsetA = DIM*jnrA;
677 j_coord_offsetB = DIM*jnrB;
678 j_coord_offsetC = DIM*jnrC;
679 j_coord_offsetD = DIM*jnrD;
681 /* load j atom coordinates */
682 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
683 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
684 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
686 /* Calculate displacement vector */
687 dx11 = _mm_sub_ps(ix1,jx1);
688 dy11 = _mm_sub_ps(iy1,jy1);
689 dz11 = _mm_sub_ps(iz1,jz1);
690 dx12 = _mm_sub_ps(ix1,jx2);
691 dy12 = _mm_sub_ps(iy1,jy2);
692 dz12 = _mm_sub_ps(iz1,jz2);
693 dx13 = _mm_sub_ps(ix1,jx3);
694 dy13 = _mm_sub_ps(iy1,jy3);
695 dz13 = _mm_sub_ps(iz1,jz3);
696 dx21 = _mm_sub_ps(ix2,jx1);
697 dy21 = _mm_sub_ps(iy2,jy1);
698 dz21 = _mm_sub_ps(iz2,jz1);
699 dx22 = _mm_sub_ps(ix2,jx2);
700 dy22 = _mm_sub_ps(iy2,jy2);
701 dz22 = _mm_sub_ps(iz2,jz2);
702 dx23 = _mm_sub_ps(ix2,jx3);
703 dy23 = _mm_sub_ps(iy2,jy3);
704 dz23 = _mm_sub_ps(iz2,jz3);
705 dx31 = _mm_sub_ps(ix3,jx1);
706 dy31 = _mm_sub_ps(iy3,jy1);
707 dz31 = _mm_sub_ps(iz3,jz1);
708 dx32 = _mm_sub_ps(ix3,jx2);
709 dy32 = _mm_sub_ps(iy3,jy2);
710 dz32 = _mm_sub_ps(iz3,jz2);
711 dx33 = _mm_sub_ps(ix3,jx3);
712 dy33 = _mm_sub_ps(iy3,jy3);
713 dz33 = _mm_sub_ps(iz3,jz3);
715 /* Calculate squared distance and things based on it */
716 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
717 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
718 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
719 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
720 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
721 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
722 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
723 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
724 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
726 rinv11 = gmx_mm_invsqrt_ps(rsq11);
727 rinv12 = gmx_mm_invsqrt_ps(rsq12);
728 rinv13 = gmx_mm_invsqrt_ps(rsq13);
729 rinv21 = gmx_mm_invsqrt_ps(rsq21);
730 rinv22 = gmx_mm_invsqrt_ps(rsq22);
731 rinv23 = gmx_mm_invsqrt_ps(rsq23);
732 rinv31 = gmx_mm_invsqrt_ps(rsq31);
733 rinv32 = gmx_mm_invsqrt_ps(rsq32);
734 rinv33 = gmx_mm_invsqrt_ps(rsq33);
736 fjx1 = _mm_setzero_ps();
737 fjy1 = _mm_setzero_ps();
738 fjz1 = _mm_setzero_ps();
739 fjx2 = _mm_setzero_ps();
740 fjy2 = _mm_setzero_ps();
741 fjz2 = _mm_setzero_ps();
742 fjx3 = _mm_setzero_ps();
743 fjy3 = _mm_setzero_ps();
744 fjz3 = _mm_setzero_ps();
746 /**************************
747 * CALCULATE INTERACTIONS *
748 **************************/
750 r11 = _mm_mul_ps(rsq11,rinv11);
751 r11 = _mm_andnot_ps(dummy_mask,r11);
753 /* Calculate table index by multiplying r with table scale and truncate to integer */
754 rt = _mm_mul_ps(r11,vftabscale);
755 vfitab = _mm_cvttps_epi32(rt);
757 vfeps = _mm_frcz_ps(rt);
759 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
761 twovfeps = _mm_add_ps(vfeps,vfeps);
762 vfitab = _mm_slli_epi32(vfitab,2);
764 /* CUBIC SPLINE TABLE ELECTROSTATICS */
765 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
766 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
767 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
768 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
769 _MM_TRANSPOSE4_PS(Y,F,G,H);
770 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
771 VV = _mm_macc_ps(vfeps,Fp,Y);
772 velec = _mm_mul_ps(qq11,VV);
773 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
774 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
776 /* Update potential sum for this i atom from the interaction with this j atom. */
777 velec = _mm_andnot_ps(dummy_mask,velec);
778 velecsum = _mm_add_ps(velecsum,velec);
782 fscal = _mm_andnot_ps(dummy_mask,fscal);
784 /* Update vectorial force */
785 fix1 = _mm_macc_ps(dx11,fscal,fix1);
786 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
787 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
789 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
790 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
791 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
793 /**************************
794 * CALCULATE INTERACTIONS *
795 **************************/
797 r12 = _mm_mul_ps(rsq12,rinv12);
798 r12 = _mm_andnot_ps(dummy_mask,r12);
800 /* Calculate table index by multiplying r with table scale and truncate to integer */
801 rt = _mm_mul_ps(r12,vftabscale);
802 vfitab = _mm_cvttps_epi32(rt);
804 vfeps = _mm_frcz_ps(rt);
806 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
808 twovfeps = _mm_add_ps(vfeps,vfeps);
809 vfitab = _mm_slli_epi32(vfitab,2);
811 /* CUBIC SPLINE TABLE ELECTROSTATICS */
812 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
813 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
814 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
815 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
816 _MM_TRANSPOSE4_PS(Y,F,G,H);
817 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
818 VV = _mm_macc_ps(vfeps,Fp,Y);
819 velec = _mm_mul_ps(qq12,VV);
820 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
821 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
823 /* Update potential sum for this i atom from the interaction with this j atom. */
824 velec = _mm_andnot_ps(dummy_mask,velec);
825 velecsum = _mm_add_ps(velecsum,velec);
829 fscal = _mm_andnot_ps(dummy_mask,fscal);
831 /* Update vectorial force */
832 fix1 = _mm_macc_ps(dx12,fscal,fix1);
833 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
834 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
836 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
837 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
838 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
840 /**************************
841 * CALCULATE INTERACTIONS *
842 **************************/
844 r13 = _mm_mul_ps(rsq13,rinv13);
845 r13 = _mm_andnot_ps(dummy_mask,r13);
847 /* Calculate table index by multiplying r with table scale and truncate to integer */
848 rt = _mm_mul_ps(r13,vftabscale);
849 vfitab = _mm_cvttps_epi32(rt);
851 vfeps = _mm_frcz_ps(rt);
853 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
855 twovfeps = _mm_add_ps(vfeps,vfeps);
856 vfitab = _mm_slli_epi32(vfitab,2);
858 /* CUBIC SPLINE TABLE ELECTROSTATICS */
859 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
860 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
861 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
862 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
863 _MM_TRANSPOSE4_PS(Y,F,G,H);
864 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
865 VV = _mm_macc_ps(vfeps,Fp,Y);
866 velec = _mm_mul_ps(qq13,VV);
867 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
868 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec = _mm_andnot_ps(dummy_mask,velec);
872 velecsum = _mm_add_ps(velecsum,velec);
876 fscal = _mm_andnot_ps(dummy_mask,fscal);
878 /* Update vectorial force */
879 fix1 = _mm_macc_ps(dx13,fscal,fix1);
880 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
881 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
883 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
884 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
885 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
887 /**************************
888 * CALCULATE INTERACTIONS *
889 **************************/
891 r21 = _mm_mul_ps(rsq21,rinv21);
892 r21 = _mm_andnot_ps(dummy_mask,r21);
894 /* Calculate table index by multiplying r with table scale and truncate to integer */
895 rt = _mm_mul_ps(r21,vftabscale);
896 vfitab = _mm_cvttps_epi32(rt);
898 vfeps = _mm_frcz_ps(rt);
900 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
902 twovfeps = _mm_add_ps(vfeps,vfeps);
903 vfitab = _mm_slli_epi32(vfitab,2);
905 /* CUBIC SPLINE TABLE ELECTROSTATICS */
906 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
907 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
908 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
909 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
910 _MM_TRANSPOSE4_PS(Y,F,G,H);
911 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
912 VV = _mm_macc_ps(vfeps,Fp,Y);
913 velec = _mm_mul_ps(qq21,VV);
914 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
915 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
917 /* Update potential sum for this i atom from the interaction with this j atom. */
918 velec = _mm_andnot_ps(dummy_mask,velec);
919 velecsum = _mm_add_ps(velecsum,velec);
923 fscal = _mm_andnot_ps(dummy_mask,fscal);
925 /* Update vectorial force */
926 fix2 = _mm_macc_ps(dx21,fscal,fix2);
927 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
928 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
930 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
931 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
932 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
934 /**************************
935 * CALCULATE INTERACTIONS *
936 **************************/
938 r22 = _mm_mul_ps(rsq22,rinv22);
939 r22 = _mm_andnot_ps(dummy_mask,r22);
941 /* Calculate table index by multiplying r with table scale and truncate to integer */
942 rt = _mm_mul_ps(r22,vftabscale);
943 vfitab = _mm_cvttps_epi32(rt);
945 vfeps = _mm_frcz_ps(rt);
947 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
949 twovfeps = _mm_add_ps(vfeps,vfeps);
950 vfitab = _mm_slli_epi32(vfitab,2);
952 /* CUBIC SPLINE TABLE ELECTROSTATICS */
953 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
954 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
955 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
956 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
957 _MM_TRANSPOSE4_PS(Y,F,G,H);
958 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
959 VV = _mm_macc_ps(vfeps,Fp,Y);
960 velec = _mm_mul_ps(qq22,VV);
961 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
962 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
964 /* Update potential sum for this i atom from the interaction with this j atom. */
965 velec = _mm_andnot_ps(dummy_mask,velec);
966 velecsum = _mm_add_ps(velecsum,velec);
970 fscal = _mm_andnot_ps(dummy_mask,fscal);
972 /* Update vectorial force */
973 fix2 = _mm_macc_ps(dx22,fscal,fix2);
974 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
975 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
977 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
978 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
979 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
981 /**************************
982 * CALCULATE INTERACTIONS *
983 **************************/
985 r23 = _mm_mul_ps(rsq23,rinv23);
986 r23 = _mm_andnot_ps(dummy_mask,r23);
988 /* Calculate table index by multiplying r with table scale and truncate to integer */
989 rt = _mm_mul_ps(r23,vftabscale);
990 vfitab = _mm_cvttps_epi32(rt);
992 vfeps = _mm_frcz_ps(rt);
994 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
996 twovfeps = _mm_add_ps(vfeps,vfeps);
997 vfitab = _mm_slli_epi32(vfitab,2);
999 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1000 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1001 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1002 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1003 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1004 _MM_TRANSPOSE4_PS(Y,F,G,H);
1005 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1006 VV = _mm_macc_ps(vfeps,Fp,Y);
1007 velec = _mm_mul_ps(qq23,VV);
1008 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1009 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1011 /* Update potential sum for this i atom from the interaction with this j atom. */
1012 velec = _mm_andnot_ps(dummy_mask,velec);
1013 velecsum = _mm_add_ps(velecsum,velec);
1017 fscal = _mm_andnot_ps(dummy_mask,fscal);
1019 /* Update vectorial force */
1020 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1021 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1022 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1024 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1025 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1026 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1028 /**************************
1029 * CALCULATE INTERACTIONS *
1030 **************************/
1032 r31 = _mm_mul_ps(rsq31,rinv31);
1033 r31 = _mm_andnot_ps(dummy_mask,r31);
1035 /* Calculate table index by multiplying r with table scale and truncate to integer */
1036 rt = _mm_mul_ps(r31,vftabscale);
1037 vfitab = _mm_cvttps_epi32(rt);
1039 vfeps = _mm_frcz_ps(rt);
1041 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1043 twovfeps = _mm_add_ps(vfeps,vfeps);
1044 vfitab = _mm_slli_epi32(vfitab,2);
1046 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1047 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1048 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1049 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1050 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1051 _MM_TRANSPOSE4_PS(Y,F,G,H);
1052 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1053 VV = _mm_macc_ps(vfeps,Fp,Y);
1054 velec = _mm_mul_ps(qq31,VV);
1055 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1056 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1058 /* Update potential sum for this i atom from the interaction with this j atom. */
1059 velec = _mm_andnot_ps(dummy_mask,velec);
1060 velecsum = _mm_add_ps(velecsum,velec);
1064 fscal = _mm_andnot_ps(dummy_mask,fscal);
1066 /* Update vectorial force */
1067 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1068 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1069 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1071 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1072 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1073 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1075 /**************************
1076 * CALCULATE INTERACTIONS *
1077 **************************/
1079 r32 = _mm_mul_ps(rsq32,rinv32);
1080 r32 = _mm_andnot_ps(dummy_mask,r32);
1082 /* Calculate table index by multiplying r with table scale and truncate to integer */
1083 rt = _mm_mul_ps(r32,vftabscale);
1084 vfitab = _mm_cvttps_epi32(rt);
1086 vfeps = _mm_frcz_ps(rt);
1088 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1090 twovfeps = _mm_add_ps(vfeps,vfeps);
1091 vfitab = _mm_slli_epi32(vfitab,2);
1093 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1094 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1095 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1096 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1097 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1098 _MM_TRANSPOSE4_PS(Y,F,G,H);
1099 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1100 VV = _mm_macc_ps(vfeps,Fp,Y);
1101 velec = _mm_mul_ps(qq32,VV);
1102 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1103 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1105 /* Update potential sum for this i atom from the interaction with this j atom. */
1106 velec = _mm_andnot_ps(dummy_mask,velec);
1107 velecsum = _mm_add_ps(velecsum,velec);
1111 fscal = _mm_andnot_ps(dummy_mask,fscal);
1113 /* Update vectorial force */
1114 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1115 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1116 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1118 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1119 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1120 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1122 /**************************
1123 * CALCULATE INTERACTIONS *
1124 **************************/
1126 r33 = _mm_mul_ps(rsq33,rinv33);
1127 r33 = _mm_andnot_ps(dummy_mask,r33);
1129 /* Calculate table index by multiplying r with table scale and truncate to integer */
1130 rt = _mm_mul_ps(r33,vftabscale);
1131 vfitab = _mm_cvttps_epi32(rt);
1133 vfeps = _mm_frcz_ps(rt);
1135 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1137 twovfeps = _mm_add_ps(vfeps,vfeps);
1138 vfitab = _mm_slli_epi32(vfitab,2);
1140 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1141 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1142 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1143 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1144 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1145 _MM_TRANSPOSE4_PS(Y,F,G,H);
1146 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1147 VV = _mm_macc_ps(vfeps,Fp,Y);
1148 velec = _mm_mul_ps(qq33,VV);
1149 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1150 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1152 /* Update potential sum for this i atom from the interaction with this j atom. */
1153 velec = _mm_andnot_ps(dummy_mask,velec);
1154 velecsum = _mm_add_ps(velecsum,velec);
1158 fscal = _mm_andnot_ps(dummy_mask,fscal);
1160 /* Update vectorial force */
1161 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1162 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1163 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1165 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1166 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1167 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1169 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1170 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1171 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1172 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1174 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1175 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1177 /* Inner loop uses 423 flops */
1180 /* End of innermost loop */
1182 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1183 f+i_coord_offset+DIM,fshift+i_shift_offset);
1186 /* Update potential energies */
1187 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1189 /* Increment number of inner iterations */
1190 inneriter += j_index_end - j_index_start;
1192 /* Outer loop uses 19 flops */
1195 /* Increment number of outer iterations */
1198 /* Update outer/inner flops */
1200 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*423);
1203 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_128_fma_single
1204 * Electrostatics interaction: CubicSplineTable
1205 * VdW interaction: None
1206 * Geometry: Water4-Water4
1207 * Calculate force/pot: Force
1210 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_128_fma_single
1211 (t_nblist * gmx_restrict nlist,
1212 rvec * gmx_restrict xx,
1213 rvec * gmx_restrict ff,
1214 t_forcerec * gmx_restrict fr,
1215 t_mdatoms * gmx_restrict mdatoms,
1216 nb_kernel_data_t * gmx_restrict kernel_data,
1217 t_nrnb * gmx_restrict nrnb)
1219 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1220 * just 0 for non-waters.
1221 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1222 * jnr indices corresponding to data put in the four positions in the SIMD register.
1224 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1225 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1226 int jnrA,jnrB,jnrC,jnrD;
1227 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1228 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1229 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1230 real rcutoff_scalar;
1231 real *shiftvec,*fshift,*x,*f;
1232 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1233 real scratch[4*DIM];
1234 __m128 fscal,rcutoff,rcutoff2,jidxall;
1236 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1238 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1240 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1241 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1242 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1243 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1244 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1245 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1246 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1247 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1248 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1249 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1250 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1251 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1252 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1253 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1254 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1255 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1256 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1259 __m128i ifour = _mm_set1_epi32(4);
1260 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1262 __m128 dummy_mask,cutoff_mask;
1263 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1264 __m128 one = _mm_set1_ps(1.0);
1265 __m128 two = _mm_set1_ps(2.0);
1271 jindex = nlist->jindex;
1273 shiftidx = nlist->shift;
1275 shiftvec = fr->shift_vec[0];
1276 fshift = fr->fshift[0];
1277 facel = _mm_set1_ps(fr->epsfac);
1278 charge = mdatoms->chargeA;
1280 vftab = kernel_data->table_elec->data;
1281 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
1283 /* Setup water-specific parameters */
1284 inr = nlist->iinr[0];
1285 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1286 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1287 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1289 jq1 = _mm_set1_ps(charge[inr+1]);
1290 jq2 = _mm_set1_ps(charge[inr+2]);
1291 jq3 = _mm_set1_ps(charge[inr+3]);
1292 qq11 = _mm_mul_ps(iq1,jq1);
1293 qq12 = _mm_mul_ps(iq1,jq2);
1294 qq13 = _mm_mul_ps(iq1,jq3);
1295 qq21 = _mm_mul_ps(iq2,jq1);
1296 qq22 = _mm_mul_ps(iq2,jq2);
1297 qq23 = _mm_mul_ps(iq2,jq3);
1298 qq31 = _mm_mul_ps(iq3,jq1);
1299 qq32 = _mm_mul_ps(iq3,jq2);
1300 qq33 = _mm_mul_ps(iq3,jq3);
1302 /* Avoid stupid compiler warnings */
1303 jnrA = jnrB = jnrC = jnrD = 0;
1304 j_coord_offsetA = 0;
1305 j_coord_offsetB = 0;
1306 j_coord_offsetC = 0;
1307 j_coord_offsetD = 0;
1312 for(iidx=0;iidx<4*DIM;iidx++)
1314 scratch[iidx] = 0.0;
1317 /* Start outer loop over neighborlists */
1318 for(iidx=0; iidx<nri; iidx++)
1320 /* Load shift vector for this list */
1321 i_shift_offset = DIM*shiftidx[iidx];
1323 /* Load limits for loop over neighbors */
1324 j_index_start = jindex[iidx];
1325 j_index_end = jindex[iidx+1];
1327 /* Get outer coordinate index */
1329 i_coord_offset = DIM*inr;
1331 /* Load i particle coords and add shift vector */
1332 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1333 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1335 fix1 = _mm_setzero_ps();
1336 fiy1 = _mm_setzero_ps();
1337 fiz1 = _mm_setzero_ps();
1338 fix2 = _mm_setzero_ps();
1339 fiy2 = _mm_setzero_ps();
1340 fiz2 = _mm_setzero_ps();
1341 fix3 = _mm_setzero_ps();
1342 fiy3 = _mm_setzero_ps();
1343 fiz3 = _mm_setzero_ps();
1345 /* Start inner kernel loop */
1346 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1349 /* Get j neighbor index, and coordinate index */
1351 jnrB = jjnr[jidx+1];
1352 jnrC = jjnr[jidx+2];
1353 jnrD = jjnr[jidx+3];
1354 j_coord_offsetA = DIM*jnrA;
1355 j_coord_offsetB = DIM*jnrB;
1356 j_coord_offsetC = DIM*jnrC;
1357 j_coord_offsetD = DIM*jnrD;
1359 /* load j atom coordinates */
1360 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1361 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1362 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1364 /* Calculate displacement vector */
1365 dx11 = _mm_sub_ps(ix1,jx1);
1366 dy11 = _mm_sub_ps(iy1,jy1);
1367 dz11 = _mm_sub_ps(iz1,jz1);
1368 dx12 = _mm_sub_ps(ix1,jx2);
1369 dy12 = _mm_sub_ps(iy1,jy2);
1370 dz12 = _mm_sub_ps(iz1,jz2);
1371 dx13 = _mm_sub_ps(ix1,jx3);
1372 dy13 = _mm_sub_ps(iy1,jy3);
1373 dz13 = _mm_sub_ps(iz1,jz3);
1374 dx21 = _mm_sub_ps(ix2,jx1);
1375 dy21 = _mm_sub_ps(iy2,jy1);
1376 dz21 = _mm_sub_ps(iz2,jz1);
1377 dx22 = _mm_sub_ps(ix2,jx2);
1378 dy22 = _mm_sub_ps(iy2,jy2);
1379 dz22 = _mm_sub_ps(iz2,jz2);
1380 dx23 = _mm_sub_ps(ix2,jx3);
1381 dy23 = _mm_sub_ps(iy2,jy3);
1382 dz23 = _mm_sub_ps(iz2,jz3);
1383 dx31 = _mm_sub_ps(ix3,jx1);
1384 dy31 = _mm_sub_ps(iy3,jy1);
1385 dz31 = _mm_sub_ps(iz3,jz1);
1386 dx32 = _mm_sub_ps(ix3,jx2);
1387 dy32 = _mm_sub_ps(iy3,jy2);
1388 dz32 = _mm_sub_ps(iz3,jz2);
1389 dx33 = _mm_sub_ps(ix3,jx3);
1390 dy33 = _mm_sub_ps(iy3,jy3);
1391 dz33 = _mm_sub_ps(iz3,jz3);
1393 /* Calculate squared distance and things based on it */
1394 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1395 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1396 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1397 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1398 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1399 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1400 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1401 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1402 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1404 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1405 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1406 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1407 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1408 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1409 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1410 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1411 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1412 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1414 fjx1 = _mm_setzero_ps();
1415 fjy1 = _mm_setzero_ps();
1416 fjz1 = _mm_setzero_ps();
1417 fjx2 = _mm_setzero_ps();
1418 fjy2 = _mm_setzero_ps();
1419 fjz2 = _mm_setzero_ps();
1420 fjx3 = _mm_setzero_ps();
1421 fjy3 = _mm_setzero_ps();
1422 fjz3 = _mm_setzero_ps();
1424 /**************************
1425 * CALCULATE INTERACTIONS *
1426 **************************/
1428 r11 = _mm_mul_ps(rsq11,rinv11);
1430 /* Calculate table index by multiplying r with table scale and truncate to integer */
1431 rt = _mm_mul_ps(r11,vftabscale);
1432 vfitab = _mm_cvttps_epi32(rt);
1434 vfeps = _mm_frcz_ps(rt);
1436 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1438 twovfeps = _mm_add_ps(vfeps,vfeps);
1439 vfitab = _mm_slli_epi32(vfitab,2);
1441 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1442 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1443 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1444 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1445 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1446 _MM_TRANSPOSE4_PS(Y,F,G,H);
1447 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1448 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1449 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1453 /* Update vectorial force */
1454 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1455 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1456 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1458 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1459 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1460 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1462 /**************************
1463 * CALCULATE INTERACTIONS *
1464 **************************/
1466 r12 = _mm_mul_ps(rsq12,rinv12);
1468 /* Calculate table index by multiplying r with table scale and truncate to integer */
1469 rt = _mm_mul_ps(r12,vftabscale);
1470 vfitab = _mm_cvttps_epi32(rt);
1472 vfeps = _mm_frcz_ps(rt);
1474 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1476 twovfeps = _mm_add_ps(vfeps,vfeps);
1477 vfitab = _mm_slli_epi32(vfitab,2);
1479 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1480 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1481 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1482 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1483 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1484 _MM_TRANSPOSE4_PS(Y,F,G,H);
1485 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1486 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1487 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1491 /* Update vectorial force */
1492 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1493 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1494 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1496 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1497 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1498 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1500 /**************************
1501 * CALCULATE INTERACTIONS *
1502 **************************/
1504 r13 = _mm_mul_ps(rsq13,rinv13);
1506 /* Calculate table index by multiplying r with table scale and truncate to integer */
1507 rt = _mm_mul_ps(r13,vftabscale);
1508 vfitab = _mm_cvttps_epi32(rt);
1510 vfeps = _mm_frcz_ps(rt);
1512 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1514 twovfeps = _mm_add_ps(vfeps,vfeps);
1515 vfitab = _mm_slli_epi32(vfitab,2);
1517 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1518 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1519 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1520 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1521 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1522 _MM_TRANSPOSE4_PS(Y,F,G,H);
1523 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1524 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1525 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1529 /* Update vectorial force */
1530 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1531 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1532 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1534 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1535 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1536 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1538 /**************************
1539 * CALCULATE INTERACTIONS *
1540 **************************/
1542 r21 = _mm_mul_ps(rsq21,rinv21);
1544 /* Calculate table index by multiplying r with table scale and truncate to integer */
1545 rt = _mm_mul_ps(r21,vftabscale);
1546 vfitab = _mm_cvttps_epi32(rt);
1548 vfeps = _mm_frcz_ps(rt);
1550 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1552 twovfeps = _mm_add_ps(vfeps,vfeps);
1553 vfitab = _mm_slli_epi32(vfitab,2);
1555 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1556 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1557 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1558 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1559 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1560 _MM_TRANSPOSE4_PS(Y,F,G,H);
1561 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1562 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1563 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1567 /* Update vectorial force */
1568 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1569 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1570 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1572 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1573 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1574 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1576 /**************************
1577 * CALCULATE INTERACTIONS *
1578 **************************/
1580 r22 = _mm_mul_ps(rsq22,rinv22);
1582 /* Calculate table index by multiplying r with table scale and truncate to integer */
1583 rt = _mm_mul_ps(r22,vftabscale);
1584 vfitab = _mm_cvttps_epi32(rt);
1586 vfeps = _mm_frcz_ps(rt);
1588 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1590 twovfeps = _mm_add_ps(vfeps,vfeps);
1591 vfitab = _mm_slli_epi32(vfitab,2);
1593 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1594 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1595 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1596 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1597 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1598 _MM_TRANSPOSE4_PS(Y,F,G,H);
1599 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1600 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1601 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1605 /* Update vectorial force */
1606 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1607 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1608 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1610 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1611 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1612 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1614 /**************************
1615 * CALCULATE INTERACTIONS *
1616 **************************/
1618 r23 = _mm_mul_ps(rsq23,rinv23);
1620 /* Calculate table index by multiplying r with table scale and truncate to integer */
1621 rt = _mm_mul_ps(r23,vftabscale);
1622 vfitab = _mm_cvttps_epi32(rt);
1624 vfeps = _mm_frcz_ps(rt);
1626 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1628 twovfeps = _mm_add_ps(vfeps,vfeps);
1629 vfitab = _mm_slli_epi32(vfitab,2);
1631 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1632 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1633 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1634 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1635 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1636 _MM_TRANSPOSE4_PS(Y,F,G,H);
1637 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1638 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1639 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1643 /* Update vectorial force */
1644 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1645 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1646 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1648 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1649 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1650 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1652 /**************************
1653 * CALCULATE INTERACTIONS *
1654 **************************/
1656 r31 = _mm_mul_ps(rsq31,rinv31);
1658 /* Calculate table index by multiplying r with table scale and truncate to integer */
1659 rt = _mm_mul_ps(r31,vftabscale);
1660 vfitab = _mm_cvttps_epi32(rt);
1662 vfeps = _mm_frcz_ps(rt);
1664 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1666 twovfeps = _mm_add_ps(vfeps,vfeps);
1667 vfitab = _mm_slli_epi32(vfitab,2);
1669 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1670 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1671 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1672 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1673 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1674 _MM_TRANSPOSE4_PS(Y,F,G,H);
1675 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1676 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1677 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1681 /* Update vectorial force */
1682 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1683 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1684 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1686 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1687 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1688 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1690 /**************************
1691 * CALCULATE INTERACTIONS *
1692 **************************/
1694 r32 = _mm_mul_ps(rsq32,rinv32);
1696 /* Calculate table index by multiplying r with table scale and truncate to integer */
1697 rt = _mm_mul_ps(r32,vftabscale);
1698 vfitab = _mm_cvttps_epi32(rt);
1700 vfeps = _mm_frcz_ps(rt);
1702 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1704 twovfeps = _mm_add_ps(vfeps,vfeps);
1705 vfitab = _mm_slli_epi32(vfitab,2);
1707 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1708 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1709 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1710 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1711 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1712 _MM_TRANSPOSE4_PS(Y,F,G,H);
1713 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1714 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1715 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1719 /* Update vectorial force */
1720 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1721 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1722 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1724 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1725 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1726 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1728 /**************************
1729 * CALCULATE INTERACTIONS *
1730 **************************/
1732 r33 = _mm_mul_ps(rsq33,rinv33);
1734 /* Calculate table index by multiplying r with table scale and truncate to integer */
1735 rt = _mm_mul_ps(r33,vftabscale);
1736 vfitab = _mm_cvttps_epi32(rt);
1738 vfeps = _mm_frcz_ps(rt);
1740 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1742 twovfeps = _mm_add_ps(vfeps,vfeps);
1743 vfitab = _mm_slli_epi32(vfitab,2);
1745 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1746 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1747 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1748 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1749 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1750 _MM_TRANSPOSE4_PS(Y,F,G,H);
1751 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1752 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1753 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1757 /* Update vectorial force */
1758 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1759 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1760 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1762 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1763 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1764 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1766 fjptrA = f+j_coord_offsetA;
1767 fjptrB = f+j_coord_offsetB;
1768 fjptrC = f+j_coord_offsetC;
1769 fjptrD = f+j_coord_offsetD;
1771 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1772 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1774 /* Inner loop uses 378 flops */
1777 if(jidx<j_index_end)
1780 /* Get j neighbor index, and coordinate index */
1781 jnrlistA = jjnr[jidx];
1782 jnrlistB = jjnr[jidx+1];
1783 jnrlistC = jjnr[jidx+2];
1784 jnrlistD = jjnr[jidx+3];
1785 /* Sign of each element will be negative for non-real atoms.
1786 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1787 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1789 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1790 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1791 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1792 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1793 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1794 j_coord_offsetA = DIM*jnrA;
1795 j_coord_offsetB = DIM*jnrB;
1796 j_coord_offsetC = DIM*jnrC;
1797 j_coord_offsetD = DIM*jnrD;
1799 /* load j atom coordinates */
1800 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1801 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1802 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1804 /* Calculate displacement vector */
1805 dx11 = _mm_sub_ps(ix1,jx1);
1806 dy11 = _mm_sub_ps(iy1,jy1);
1807 dz11 = _mm_sub_ps(iz1,jz1);
1808 dx12 = _mm_sub_ps(ix1,jx2);
1809 dy12 = _mm_sub_ps(iy1,jy2);
1810 dz12 = _mm_sub_ps(iz1,jz2);
1811 dx13 = _mm_sub_ps(ix1,jx3);
1812 dy13 = _mm_sub_ps(iy1,jy3);
1813 dz13 = _mm_sub_ps(iz1,jz3);
1814 dx21 = _mm_sub_ps(ix2,jx1);
1815 dy21 = _mm_sub_ps(iy2,jy1);
1816 dz21 = _mm_sub_ps(iz2,jz1);
1817 dx22 = _mm_sub_ps(ix2,jx2);
1818 dy22 = _mm_sub_ps(iy2,jy2);
1819 dz22 = _mm_sub_ps(iz2,jz2);
1820 dx23 = _mm_sub_ps(ix2,jx3);
1821 dy23 = _mm_sub_ps(iy2,jy3);
1822 dz23 = _mm_sub_ps(iz2,jz3);
1823 dx31 = _mm_sub_ps(ix3,jx1);
1824 dy31 = _mm_sub_ps(iy3,jy1);
1825 dz31 = _mm_sub_ps(iz3,jz1);
1826 dx32 = _mm_sub_ps(ix3,jx2);
1827 dy32 = _mm_sub_ps(iy3,jy2);
1828 dz32 = _mm_sub_ps(iz3,jz2);
1829 dx33 = _mm_sub_ps(ix3,jx3);
1830 dy33 = _mm_sub_ps(iy3,jy3);
1831 dz33 = _mm_sub_ps(iz3,jz3);
1833 /* Calculate squared distance and things based on it */
1834 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1835 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1836 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1837 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1838 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1839 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1840 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1841 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1842 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1844 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1845 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1846 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1847 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1848 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1849 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1850 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1851 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1852 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1854 fjx1 = _mm_setzero_ps();
1855 fjy1 = _mm_setzero_ps();
1856 fjz1 = _mm_setzero_ps();
1857 fjx2 = _mm_setzero_ps();
1858 fjy2 = _mm_setzero_ps();
1859 fjz2 = _mm_setzero_ps();
1860 fjx3 = _mm_setzero_ps();
1861 fjy3 = _mm_setzero_ps();
1862 fjz3 = _mm_setzero_ps();
1864 /**************************
1865 * CALCULATE INTERACTIONS *
1866 **************************/
1868 r11 = _mm_mul_ps(rsq11,rinv11);
1869 r11 = _mm_andnot_ps(dummy_mask,r11);
1871 /* Calculate table index by multiplying r with table scale and truncate to integer */
1872 rt = _mm_mul_ps(r11,vftabscale);
1873 vfitab = _mm_cvttps_epi32(rt);
1875 vfeps = _mm_frcz_ps(rt);
1877 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1879 twovfeps = _mm_add_ps(vfeps,vfeps);
1880 vfitab = _mm_slli_epi32(vfitab,2);
1882 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1883 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1884 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1885 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1886 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1887 _MM_TRANSPOSE4_PS(Y,F,G,H);
1888 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1889 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1890 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1894 fscal = _mm_andnot_ps(dummy_mask,fscal);
1896 /* Update vectorial force */
1897 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1898 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1899 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1901 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1902 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1903 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1905 /**************************
1906 * CALCULATE INTERACTIONS *
1907 **************************/
1909 r12 = _mm_mul_ps(rsq12,rinv12);
1910 r12 = _mm_andnot_ps(dummy_mask,r12);
1912 /* Calculate table index by multiplying r with table scale and truncate to integer */
1913 rt = _mm_mul_ps(r12,vftabscale);
1914 vfitab = _mm_cvttps_epi32(rt);
1916 vfeps = _mm_frcz_ps(rt);
1918 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1920 twovfeps = _mm_add_ps(vfeps,vfeps);
1921 vfitab = _mm_slli_epi32(vfitab,2);
1923 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1924 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1925 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1926 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1927 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1928 _MM_TRANSPOSE4_PS(Y,F,G,H);
1929 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1930 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1931 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1935 fscal = _mm_andnot_ps(dummy_mask,fscal);
1937 /* Update vectorial force */
1938 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1939 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1940 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1942 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1943 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1944 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1946 /**************************
1947 * CALCULATE INTERACTIONS *
1948 **************************/
1950 r13 = _mm_mul_ps(rsq13,rinv13);
1951 r13 = _mm_andnot_ps(dummy_mask,r13);
1953 /* Calculate table index by multiplying r with table scale and truncate to integer */
1954 rt = _mm_mul_ps(r13,vftabscale);
1955 vfitab = _mm_cvttps_epi32(rt);
1957 vfeps = _mm_frcz_ps(rt);
1959 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1961 twovfeps = _mm_add_ps(vfeps,vfeps);
1962 vfitab = _mm_slli_epi32(vfitab,2);
1964 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1965 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1966 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1967 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1968 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1969 _MM_TRANSPOSE4_PS(Y,F,G,H);
1970 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1971 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1972 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1976 fscal = _mm_andnot_ps(dummy_mask,fscal);
1978 /* Update vectorial force */
1979 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1980 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1981 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1983 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1984 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1985 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1987 /**************************
1988 * CALCULATE INTERACTIONS *
1989 **************************/
1991 r21 = _mm_mul_ps(rsq21,rinv21);
1992 r21 = _mm_andnot_ps(dummy_mask,r21);
1994 /* Calculate table index by multiplying r with table scale and truncate to integer */
1995 rt = _mm_mul_ps(r21,vftabscale);
1996 vfitab = _mm_cvttps_epi32(rt);
1998 vfeps = _mm_frcz_ps(rt);
2000 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2002 twovfeps = _mm_add_ps(vfeps,vfeps);
2003 vfitab = _mm_slli_epi32(vfitab,2);
2005 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2006 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2007 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2008 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2009 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2010 _MM_TRANSPOSE4_PS(Y,F,G,H);
2011 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2012 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2013 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2017 fscal = _mm_andnot_ps(dummy_mask,fscal);
2019 /* Update vectorial force */
2020 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2021 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2022 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2024 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2025 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2026 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2028 /**************************
2029 * CALCULATE INTERACTIONS *
2030 **************************/
2032 r22 = _mm_mul_ps(rsq22,rinv22);
2033 r22 = _mm_andnot_ps(dummy_mask,r22);
2035 /* Calculate table index by multiplying r with table scale and truncate to integer */
2036 rt = _mm_mul_ps(r22,vftabscale);
2037 vfitab = _mm_cvttps_epi32(rt);
2039 vfeps = _mm_frcz_ps(rt);
2041 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2043 twovfeps = _mm_add_ps(vfeps,vfeps);
2044 vfitab = _mm_slli_epi32(vfitab,2);
2046 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2047 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2048 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2049 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2050 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2051 _MM_TRANSPOSE4_PS(Y,F,G,H);
2052 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2053 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2054 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2058 fscal = _mm_andnot_ps(dummy_mask,fscal);
2060 /* Update vectorial force */
2061 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2062 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2063 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2065 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2066 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2067 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2069 /**************************
2070 * CALCULATE INTERACTIONS *
2071 **************************/
2073 r23 = _mm_mul_ps(rsq23,rinv23);
2074 r23 = _mm_andnot_ps(dummy_mask,r23);
2076 /* Calculate table index by multiplying r with table scale and truncate to integer */
2077 rt = _mm_mul_ps(r23,vftabscale);
2078 vfitab = _mm_cvttps_epi32(rt);
2080 vfeps = _mm_frcz_ps(rt);
2082 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2084 twovfeps = _mm_add_ps(vfeps,vfeps);
2085 vfitab = _mm_slli_epi32(vfitab,2);
2087 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2088 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2089 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2090 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2091 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2092 _MM_TRANSPOSE4_PS(Y,F,G,H);
2093 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2094 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2095 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2099 fscal = _mm_andnot_ps(dummy_mask,fscal);
2101 /* Update vectorial force */
2102 fix2 = _mm_macc_ps(dx23,fscal,fix2);
2103 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
2104 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
2106 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
2107 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
2108 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
2110 /**************************
2111 * CALCULATE INTERACTIONS *
2112 **************************/
2114 r31 = _mm_mul_ps(rsq31,rinv31);
2115 r31 = _mm_andnot_ps(dummy_mask,r31);
2117 /* Calculate table index by multiplying r with table scale and truncate to integer */
2118 rt = _mm_mul_ps(r31,vftabscale);
2119 vfitab = _mm_cvttps_epi32(rt);
2121 vfeps = _mm_frcz_ps(rt);
2123 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2125 twovfeps = _mm_add_ps(vfeps,vfeps);
2126 vfitab = _mm_slli_epi32(vfitab,2);
2128 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2129 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2130 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2131 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2132 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2133 _MM_TRANSPOSE4_PS(Y,F,G,H);
2134 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2135 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2136 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2140 fscal = _mm_andnot_ps(dummy_mask,fscal);
2142 /* Update vectorial force */
2143 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2144 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2145 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2147 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2148 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2149 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2151 /**************************
2152 * CALCULATE INTERACTIONS *
2153 **************************/
2155 r32 = _mm_mul_ps(rsq32,rinv32);
2156 r32 = _mm_andnot_ps(dummy_mask,r32);
2158 /* Calculate table index by multiplying r with table scale and truncate to integer */
2159 rt = _mm_mul_ps(r32,vftabscale);
2160 vfitab = _mm_cvttps_epi32(rt);
2162 vfeps = _mm_frcz_ps(rt);
2164 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2166 twovfeps = _mm_add_ps(vfeps,vfeps);
2167 vfitab = _mm_slli_epi32(vfitab,2);
2169 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2170 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2171 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2172 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2173 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2174 _MM_TRANSPOSE4_PS(Y,F,G,H);
2175 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2176 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2177 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2181 fscal = _mm_andnot_ps(dummy_mask,fscal);
2183 /* Update vectorial force */
2184 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2185 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2186 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2188 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2189 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2190 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2192 /**************************
2193 * CALCULATE INTERACTIONS *
2194 **************************/
2196 r33 = _mm_mul_ps(rsq33,rinv33);
2197 r33 = _mm_andnot_ps(dummy_mask,r33);
2199 /* Calculate table index by multiplying r with table scale and truncate to integer */
2200 rt = _mm_mul_ps(r33,vftabscale);
2201 vfitab = _mm_cvttps_epi32(rt);
2203 vfeps = _mm_frcz_ps(rt);
2205 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2207 twovfeps = _mm_add_ps(vfeps,vfeps);
2208 vfitab = _mm_slli_epi32(vfitab,2);
2210 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2211 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2212 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2213 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2214 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2215 _MM_TRANSPOSE4_PS(Y,F,G,H);
2216 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2217 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2218 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2222 fscal = _mm_andnot_ps(dummy_mask,fscal);
2224 /* Update vectorial force */
2225 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2226 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2227 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2229 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2230 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2231 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2233 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2234 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2235 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2236 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2238 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2239 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2241 /* Inner loop uses 387 flops */
2244 /* End of innermost loop */
2246 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2247 f+i_coord_offset+DIM,fshift+i_shift_offset);
2249 /* Increment number of inner iterations */
2250 inneriter += j_index_end - j_index_start;
2252 /* Outer loop uses 18 flops */
2255 /* Increment number of outer iterations */
2258 /* Update outer/inner flops */
2260 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*387);