2 * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_128_fma_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: None
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_128_fma_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to the j-loop unrolling done with AVX_128, i.e. the four different
56 * jnr indices whose data is placed in the four positions of the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128i ifour = _mm_set1_epi32(4);
94 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
96 __m128 dummy_mask,cutoff_mask;
97 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
98 __m128 one = _mm_set1_ps(1.0);
99 __m128 two = _mm_set1_ps(2.0);
105 jindex = nlist->jindex;
107 shiftidx = nlist->shift;
109 shiftvec = fr->shift_vec[0];
110 fshift = fr->fshift[0];
111 facel = _mm_set1_ps(fr->epsfac);
112 charge = mdatoms->chargeA;
114 vftab = kernel_data->table_elec->data;
115 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
117 /* Setup water-specific parameters */
118 inr = nlist->iinr[0];
119 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
120 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
121 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
123 jq0 = _mm_set1_ps(charge[inr+0]);
124 jq1 = _mm_set1_ps(charge[inr+1]);
125 jq2 = _mm_set1_ps(charge[inr+2]);
126 qq00 = _mm_mul_ps(iq0,jq0);
127 qq01 = _mm_mul_ps(iq0,jq1);
128 qq02 = _mm_mul_ps(iq0,jq2);
129 qq10 = _mm_mul_ps(iq1,jq0);
130 qq11 = _mm_mul_ps(iq1,jq1);
131 qq12 = _mm_mul_ps(iq1,jq2);
132 qq20 = _mm_mul_ps(iq2,jq0);
133 qq21 = _mm_mul_ps(iq2,jq1);
134 qq22 = _mm_mul_ps(iq2,jq2);
136 /* Zero the j indices up front to silence compiler warnings */
137 jnrA = jnrB = jnrC = jnrD = 0;
146 for(iidx=0;iidx<4*DIM;iidx++)
151 /* Start outer loop over neighborlists */
152 for(iidx=0; iidx<nri; iidx++)
154 /* Load shift vector for this list */
155 i_shift_offset = DIM*shiftidx[iidx];
157 /* Load limits for loop over neighbors */
158 j_index_start = jindex[iidx];
159 j_index_end = jindex[iidx+1];
161 /* Get outer coordinate index */
163 i_coord_offset = DIM*inr;
165 /* Load i particle coords and add shift vector */
166 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
167 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
169 fix0 = _mm_setzero_ps();
170 fiy0 = _mm_setzero_ps();
171 fiz0 = _mm_setzero_ps();
172 fix1 = _mm_setzero_ps();
173 fiy1 = _mm_setzero_ps();
174 fiz1 = _mm_setzero_ps();
175 fix2 = _mm_setzero_ps();
176 fiy2 = _mm_setzero_ps();
177 fiz2 = _mm_setzero_ps();
179 /* Reset potential sums */
180 velecsum = _mm_setzero_ps();
182 /* Start inner kernel loop */
183 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
186 /* Get j neighbor index, and coordinate index */
191 j_coord_offsetA = DIM*jnrA;
192 j_coord_offsetB = DIM*jnrB;
193 j_coord_offsetC = DIM*jnrC;
194 j_coord_offsetD = DIM*jnrD;
196 /* load j atom coordinates */
197 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
198 x+j_coord_offsetC,x+j_coord_offsetD,
199 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
201 /* Calculate displacement vector */
202 dx00 = _mm_sub_ps(ix0,jx0);
203 dy00 = _mm_sub_ps(iy0,jy0);
204 dz00 = _mm_sub_ps(iz0,jz0);
205 dx01 = _mm_sub_ps(ix0,jx1);
206 dy01 = _mm_sub_ps(iy0,jy1);
207 dz01 = _mm_sub_ps(iz0,jz1);
208 dx02 = _mm_sub_ps(ix0,jx2);
209 dy02 = _mm_sub_ps(iy0,jy2);
210 dz02 = _mm_sub_ps(iz0,jz2);
211 dx10 = _mm_sub_ps(ix1,jx0);
212 dy10 = _mm_sub_ps(iy1,jy0);
213 dz10 = _mm_sub_ps(iz1,jz0);
214 dx11 = _mm_sub_ps(ix1,jx1);
215 dy11 = _mm_sub_ps(iy1,jy1);
216 dz11 = _mm_sub_ps(iz1,jz1);
217 dx12 = _mm_sub_ps(ix1,jx2);
218 dy12 = _mm_sub_ps(iy1,jy2);
219 dz12 = _mm_sub_ps(iz1,jz2);
220 dx20 = _mm_sub_ps(ix2,jx0);
221 dy20 = _mm_sub_ps(iy2,jy0);
222 dz20 = _mm_sub_ps(iz2,jz0);
223 dx21 = _mm_sub_ps(ix2,jx1);
224 dy21 = _mm_sub_ps(iy2,jy1);
225 dz21 = _mm_sub_ps(iz2,jz1);
226 dx22 = _mm_sub_ps(ix2,jx2);
227 dy22 = _mm_sub_ps(iy2,jy2);
228 dz22 = _mm_sub_ps(iz2,jz2);
230 /* Calculate squared distance and things based on it */
231 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
232 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
233 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
234 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
235 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
236 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
237 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
238 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
239 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
241 rinv00 = gmx_mm_invsqrt_ps(rsq00);
242 rinv01 = gmx_mm_invsqrt_ps(rsq01);
243 rinv02 = gmx_mm_invsqrt_ps(rsq02);
244 rinv10 = gmx_mm_invsqrt_ps(rsq10);
245 rinv11 = gmx_mm_invsqrt_ps(rsq11);
246 rinv12 = gmx_mm_invsqrt_ps(rsq12);
247 rinv20 = gmx_mm_invsqrt_ps(rsq20);
248 rinv21 = gmx_mm_invsqrt_ps(rsq21);
249 rinv22 = gmx_mm_invsqrt_ps(rsq22);
251 fjx0 = _mm_setzero_ps();
252 fjy0 = _mm_setzero_ps();
253 fjz0 = _mm_setzero_ps();
254 fjx1 = _mm_setzero_ps();
255 fjy1 = _mm_setzero_ps();
256 fjz1 = _mm_setzero_ps();
257 fjx2 = _mm_setzero_ps();
258 fjy2 = _mm_setzero_ps();
259 fjz2 = _mm_setzero_ps();
261 /**************************
262 * CALCULATE INTERACTIONS *
263 **************************/
265 r00 = _mm_mul_ps(rsq00,rinv00);
267 /* Calculate table index by multiplying r with table scale and truncate to integer */
268 rt = _mm_mul_ps(r00,vftabscale);
269 vfitab = _mm_cvttps_epi32(rt);
271 vfeps = _mm_frcz_ps(rt);
273 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
275 twovfeps = _mm_add_ps(vfeps,vfeps);
276 vfitab = _mm_slli_epi32(vfitab,2);
278 /* CUBIC SPLINE TABLE ELECTROSTATICS */
279 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
280 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
281 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
282 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
283 _MM_TRANSPOSE4_PS(Y,F,G,H);
284 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
285 VV = _mm_macc_ps(vfeps,Fp,Y);
286 velec = _mm_mul_ps(qq00,VV);
287 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
288 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
290 /* Update potential sum for this i atom from the interaction with this j atom. */
291 velecsum = _mm_add_ps(velecsum,velec);
295 /* Update vectorial force */
296 fix0 = _mm_macc_ps(dx00,fscal,fix0);
297 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
298 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
300 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
301 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
302 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
304 /**************************
305 * CALCULATE INTERACTIONS *
306 **************************/
308 r01 = _mm_mul_ps(rsq01,rinv01);
310 /* Calculate table index by multiplying r with table scale and truncate to integer */
311 rt = _mm_mul_ps(r01,vftabscale);
312 vfitab = _mm_cvttps_epi32(rt);
314 vfeps = _mm_frcz_ps(rt);
316 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
318 twovfeps = _mm_add_ps(vfeps,vfeps);
319 vfitab = _mm_slli_epi32(vfitab,2);
321 /* CUBIC SPLINE TABLE ELECTROSTATICS */
322 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
323 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
324 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
325 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
326 _MM_TRANSPOSE4_PS(Y,F,G,H);
327 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
328 VV = _mm_macc_ps(vfeps,Fp,Y);
329 velec = _mm_mul_ps(qq01,VV);
330 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
331 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
333 /* Update potential sum for this i atom from the interaction with this j atom. */
334 velecsum = _mm_add_ps(velecsum,velec);
338 /* Update vectorial force */
339 fix0 = _mm_macc_ps(dx01,fscal,fix0);
340 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
341 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
343 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
344 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
345 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
347 /**************************
348 * CALCULATE INTERACTIONS *
349 **************************/
351 r02 = _mm_mul_ps(rsq02,rinv02);
353 /* Calculate table index by multiplying r with table scale and truncate to integer */
354 rt = _mm_mul_ps(r02,vftabscale);
355 vfitab = _mm_cvttps_epi32(rt);
357 vfeps = _mm_frcz_ps(rt);
359 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
361 twovfeps = _mm_add_ps(vfeps,vfeps);
362 vfitab = _mm_slli_epi32(vfitab,2);
364 /* CUBIC SPLINE TABLE ELECTROSTATICS */
365 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
366 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
367 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
368 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
369 _MM_TRANSPOSE4_PS(Y,F,G,H);
370 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
371 VV = _mm_macc_ps(vfeps,Fp,Y);
372 velec = _mm_mul_ps(qq02,VV);
373 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
374 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
376 /* Update potential sum for this i atom from the interaction with this j atom. */
377 velecsum = _mm_add_ps(velecsum,velec);
381 /* Update vectorial force */
382 fix0 = _mm_macc_ps(dx02,fscal,fix0);
383 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
384 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
386 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
387 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
388 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
390 /**************************
391 * CALCULATE INTERACTIONS *
392 **************************/
394 r10 = _mm_mul_ps(rsq10,rinv10);
396 /* Calculate table index by multiplying r with table scale and truncate to integer */
397 rt = _mm_mul_ps(r10,vftabscale);
398 vfitab = _mm_cvttps_epi32(rt);
400 vfeps = _mm_frcz_ps(rt);
402 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
404 twovfeps = _mm_add_ps(vfeps,vfeps);
405 vfitab = _mm_slli_epi32(vfitab,2);
407 /* CUBIC SPLINE TABLE ELECTROSTATICS */
408 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
409 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
410 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
411 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
412 _MM_TRANSPOSE4_PS(Y,F,G,H);
413 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
414 VV = _mm_macc_ps(vfeps,Fp,Y);
415 velec = _mm_mul_ps(qq10,VV);
416 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
417 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
419 /* Update potential sum for this i atom from the interaction with this j atom. */
420 velecsum = _mm_add_ps(velecsum,velec);
424 /* Update vectorial force */
425 fix1 = _mm_macc_ps(dx10,fscal,fix1);
426 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
427 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
429 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
430 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
431 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
433 /**************************
434 * CALCULATE INTERACTIONS *
435 **************************/
437 r11 = _mm_mul_ps(rsq11,rinv11);
439 /* Calculate table index by multiplying r with table scale and truncate to integer */
440 rt = _mm_mul_ps(r11,vftabscale);
441 vfitab = _mm_cvttps_epi32(rt);
443 vfeps = _mm_frcz_ps(rt);
445 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
447 twovfeps = _mm_add_ps(vfeps,vfeps);
448 vfitab = _mm_slli_epi32(vfitab,2);
450 /* CUBIC SPLINE TABLE ELECTROSTATICS */
451 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
452 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
453 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
454 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
455 _MM_TRANSPOSE4_PS(Y,F,G,H);
456 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
457 VV = _mm_macc_ps(vfeps,Fp,Y);
458 velec = _mm_mul_ps(qq11,VV);
459 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
460 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
462 /* Update potential sum for this i atom from the interaction with this j atom. */
463 velecsum = _mm_add_ps(velecsum,velec);
467 /* Update vectorial force */
468 fix1 = _mm_macc_ps(dx11,fscal,fix1);
469 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
470 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
472 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
473 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
474 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
476 /**************************
477 * CALCULATE INTERACTIONS *
478 **************************/
480 r12 = _mm_mul_ps(rsq12,rinv12);
482 /* Calculate table index by multiplying r with table scale and truncate to integer */
483 rt = _mm_mul_ps(r12,vftabscale);
484 vfitab = _mm_cvttps_epi32(rt);
486 vfeps = _mm_frcz_ps(rt);
488 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
490 twovfeps = _mm_add_ps(vfeps,vfeps);
491 vfitab = _mm_slli_epi32(vfitab,2);
493 /* CUBIC SPLINE TABLE ELECTROSTATICS */
494 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
495 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
496 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
497 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
498 _MM_TRANSPOSE4_PS(Y,F,G,H);
499 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
500 VV = _mm_macc_ps(vfeps,Fp,Y);
501 velec = _mm_mul_ps(qq12,VV);
502 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
503 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
505 /* Update potential sum for this i atom from the interaction with this j atom. */
506 velecsum = _mm_add_ps(velecsum,velec);
510 /* Update vectorial force */
511 fix1 = _mm_macc_ps(dx12,fscal,fix1);
512 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
513 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
515 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
516 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
517 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 r20 = _mm_mul_ps(rsq20,rinv20);
525 /* Calculate table index by multiplying r with table scale and truncate to integer */
526 rt = _mm_mul_ps(r20,vftabscale);
527 vfitab = _mm_cvttps_epi32(rt);
529 vfeps = _mm_frcz_ps(rt);
531 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
533 twovfeps = _mm_add_ps(vfeps,vfeps);
534 vfitab = _mm_slli_epi32(vfitab,2);
536 /* CUBIC SPLINE TABLE ELECTROSTATICS */
537 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
538 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
539 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
540 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
541 _MM_TRANSPOSE4_PS(Y,F,G,H);
542 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
543 VV = _mm_macc_ps(vfeps,Fp,Y);
544 velec = _mm_mul_ps(qq20,VV);
545 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
546 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
548 /* Update potential sum for this i atom from the interaction with this j atom. */
549 velecsum = _mm_add_ps(velecsum,velec);
553 /* Update vectorial force */
554 fix2 = _mm_macc_ps(dx20,fscal,fix2);
555 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
556 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
558 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
559 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
560 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
562 /**************************
563 * CALCULATE INTERACTIONS *
564 **************************/
566 r21 = _mm_mul_ps(rsq21,rinv21);
568 /* Calculate table index by multiplying r with table scale and truncate to integer */
569 rt = _mm_mul_ps(r21,vftabscale);
570 vfitab = _mm_cvttps_epi32(rt);
572 vfeps = _mm_frcz_ps(rt);
574 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
576 twovfeps = _mm_add_ps(vfeps,vfeps);
577 vfitab = _mm_slli_epi32(vfitab,2);
579 /* CUBIC SPLINE TABLE ELECTROSTATICS */
580 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
581 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
582 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
583 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
584 _MM_TRANSPOSE4_PS(Y,F,G,H);
585 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
586 VV = _mm_macc_ps(vfeps,Fp,Y);
587 velec = _mm_mul_ps(qq21,VV);
588 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
589 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
591 /* Update potential sum for this i atom from the interaction with this j atom. */
592 velecsum = _mm_add_ps(velecsum,velec);
596 /* Update vectorial force */
597 fix2 = _mm_macc_ps(dx21,fscal,fix2);
598 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
599 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
601 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
602 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
603 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
605 /**************************
606 * CALCULATE INTERACTIONS *
607 **************************/
609 r22 = _mm_mul_ps(rsq22,rinv22);
611 /* Calculate table index by multiplying r with table scale and truncate to integer */
612 rt = _mm_mul_ps(r22,vftabscale);
613 vfitab = _mm_cvttps_epi32(rt);
615 vfeps = _mm_frcz_ps(rt);
617 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
619 twovfeps = _mm_add_ps(vfeps,vfeps);
620 vfitab = _mm_slli_epi32(vfitab,2);
622 /* CUBIC SPLINE TABLE ELECTROSTATICS */
623 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
624 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
625 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
626 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
627 _MM_TRANSPOSE4_PS(Y,F,G,H);
628 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
629 VV = _mm_macc_ps(vfeps,Fp,Y);
630 velec = _mm_mul_ps(qq22,VV);
631 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
632 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
634 /* Update potential sum for this i atom from the interaction with this j atom. */
635 velecsum = _mm_add_ps(velecsum,velec);
639 /* Update vectorial force */
640 fix2 = _mm_macc_ps(dx22,fscal,fix2);
641 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
642 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
644 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
645 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
646 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
648 fjptrA = f+j_coord_offsetA;
649 fjptrB = f+j_coord_offsetB;
650 fjptrC = f+j_coord_offsetC;
651 fjptrD = f+j_coord_offsetD;
653 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
654 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
656 /* Inner loop uses 414 flops */
662 /* Get j neighbor index, and coordinate index */
663 jnrlistA = jjnr[jidx];
664 jnrlistB = jjnr[jidx+1];
665 jnrlistC = jjnr[jidx+2];
666 jnrlistD = jjnr[jidx+3];
667 /* Sign of each element will be negative for non-real atoms.
668 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
669 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
671 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
672 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
673 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
674 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
675 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
676 j_coord_offsetA = DIM*jnrA;
677 j_coord_offsetB = DIM*jnrB;
678 j_coord_offsetC = DIM*jnrC;
679 j_coord_offsetD = DIM*jnrD;
681 /* load j atom coordinates */
682 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
683 x+j_coord_offsetC,x+j_coord_offsetD,
684 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
686 /* Calculate displacement vector */
687 dx00 = _mm_sub_ps(ix0,jx0);
688 dy00 = _mm_sub_ps(iy0,jy0);
689 dz00 = _mm_sub_ps(iz0,jz0);
690 dx01 = _mm_sub_ps(ix0,jx1);
691 dy01 = _mm_sub_ps(iy0,jy1);
692 dz01 = _mm_sub_ps(iz0,jz1);
693 dx02 = _mm_sub_ps(ix0,jx2);
694 dy02 = _mm_sub_ps(iy0,jy2);
695 dz02 = _mm_sub_ps(iz0,jz2);
696 dx10 = _mm_sub_ps(ix1,jx0);
697 dy10 = _mm_sub_ps(iy1,jy0);
698 dz10 = _mm_sub_ps(iz1,jz0);
699 dx11 = _mm_sub_ps(ix1,jx1);
700 dy11 = _mm_sub_ps(iy1,jy1);
701 dz11 = _mm_sub_ps(iz1,jz1);
702 dx12 = _mm_sub_ps(ix1,jx2);
703 dy12 = _mm_sub_ps(iy1,jy2);
704 dz12 = _mm_sub_ps(iz1,jz2);
705 dx20 = _mm_sub_ps(ix2,jx0);
706 dy20 = _mm_sub_ps(iy2,jy0);
707 dz20 = _mm_sub_ps(iz2,jz0);
708 dx21 = _mm_sub_ps(ix2,jx1);
709 dy21 = _mm_sub_ps(iy2,jy1);
710 dz21 = _mm_sub_ps(iz2,jz1);
711 dx22 = _mm_sub_ps(ix2,jx2);
712 dy22 = _mm_sub_ps(iy2,jy2);
713 dz22 = _mm_sub_ps(iz2,jz2);
715 /* Calculate squared distance and things based on it */
716 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
717 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
718 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
719 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
720 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
721 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
722 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
723 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
724 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
726 rinv00 = gmx_mm_invsqrt_ps(rsq00);
727 rinv01 = gmx_mm_invsqrt_ps(rsq01);
728 rinv02 = gmx_mm_invsqrt_ps(rsq02);
729 rinv10 = gmx_mm_invsqrt_ps(rsq10);
730 rinv11 = gmx_mm_invsqrt_ps(rsq11);
731 rinv12 = gmx_mm_invsqrt_ps(rsq12);
732 rinv20 = gmx_mm_invsqrt_ps(rsq20);
733 rinv21 = gmx_mm_invsqrt_ps(rsq21);
734 rinv22 = gmx_mm_invsqrt_ps(rsq22);
736 fjx0 = _mm_setzero_ps();
737 fjy0 = _mm_setzero_ps();
738 fjz0 = _mm_setzero_ps();
739 fjx1 = _mm_setzero_ps();
740 fjy1 = _mm_setzero_ps();
741 fjz1 = _mm_setzero_ps();
742 fjx2 = _mm_setzero_ps();
743 fjy2 = _mm_setzero_ps();
744 fjz2 = _mm_setzero_ps();
746 /**************************
747 * CALCULATE INTERACTIONS *
748 **************************/
750 r00 = _mm_mul_ps(rsq00,rinv00);
751 r00 = _mm_andnot_ps(dummy_mask,r00);
753 /* Calculate table index by multiplying r with table scale and truncate to integer */
754 rt = _mm_mul_ps(r00,vftabscale);
755 vfitab = _mm_cvttps_epi32(rt);
757 vfeps = _mm_frcz_ps(rt);
759 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
761 twovfeps = _mm_add_ps(vfeps,vfeps);
762 vfitab = _mm_slli_epi32(vfitab,2);
764 /* CUBIC SPLINE TABLE ELECTROSTATICS */
765 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
766 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
767 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
768 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
769 _MM_TRANSPOSE4_PS(Y,F,G,H);
770 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
771 VV = _mm_macc_ps(vfeps,Fp,Y);
772 velec = _mm_mul_ps(qq00,VV);
773 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
774 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
776 /* Update potential sum for this i atom from the interaction with this j atom. */
777 velec = _mm_andnot_ps(dummy_mask,velec);
778 velecsum = _mm_add_ps(velecsum,velec);
782 fscal = _mm_andnot_ps(dummy_mask,fscal);
784 /* Update vectorial force */
785 fix0 = _mm_macc_ps(dx00,fscal,fix0);
786 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
787 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
789 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
790 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
791 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
793 /**************************
794 * CALCULATE INTERACTIONS *
795 **************************/
797 r01 = _mm_mul_ps(rsq01,rinv01);
798 r01 = _mm_andnot_ps(dummy_mask,r01);
800 /* Calculate table index by multiplying r with table scale and truncate to integer */
801 rt = _mm_mul_ps(r01,vftabscale);
802 vfitab = _mm_cvttps_epi32(rt);
804 vfeps = _mm_frcz_ps(rt);
806 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
808 twovfeps = _mm_add_ps(vfeps,vfeps);
809 vfitab = _mm_slli_epi32(vfitab,2);
811 /* CUBIC SPLINE TABLE ELECTROSTATICS */
812 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
813 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
814 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
815 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
816 _MM_TRANSPOSE4_PS(Y,F,G,H);
817 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
818 VV = _mm_macc_ps(vfeps,Fp,Y);
819 velec = _mm_mul_ps(qq01,VV);
820 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
821 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
823 /* Update potential sum for this i atom from the interaction with this j atom. */
824 velec = _mm_andnot_ps(dummy_mask,velec);
825 velecsum = _mm_add_ps(velecsum,velec);
829 fscal = _mm_andnot_ps(dummy_mask,fscal);
831 /* Update vectorial force */
832 fix0 = _mm_macc_ps(dx01,fscal,fix0);
833 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
834 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
836 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
837 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
838 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
840 /**************************
841 * CALCULATE INTERACTIONS *
842 **************************/
844 r02 = _mm_mul_ps(rsq02,rinv02);
845 r02 = _mm_andnot_ps(dummy_mask,r02);
847 /* Calculate table index by multiplying r with table scale and truncate to integer */
848 rt = _mm_mul_ps(r02,vftabscale);
849 vfitab = _mm_cvttps_epi32(rt);
851 vfeps = _mm_frcz_ps(rt);
853 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
855 twovfeps = _mm_add_ps(vfeps,vfeps);
856 vfitab = _mm_slli_epi32(vfitab,2);
858 /* CUBIC SPLINE TABLE ELECTROSTATICS */
859 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
860 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
861 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
862 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
863 _MM_TRANSPOSE4_PS(Y,F,G,H);
864 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
865 VV = _mm_macc_ps(vfeps,Fp,Y);
866 velec = _mm_mul_ps(qq02,VV);
867 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
868 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec = _mm_andnot_ps(dummy_mask,velec);
872 velecsum = _mm_add_ps(velecsum,velec);
876 fscal = _mm_andnot_ps(dummy_mask,fscal);
878 /* Update vectorial force */
879 fix0 = _mm_macc_ps(dx02,fscal,fix0);
880 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
881 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
883 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
884 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
885 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
887 /**************************
888 * CALCULATE INTERACTIONS *
889 **************************/
891 r10 = _mm_mul_ps(rsq10,rinv10);
892 r10 = _mm_andnot_ps(dummy_mask,r10);
894 /* Calculate table index by multiplying r with table scale and truncate to integer */
895 rt = _mm_mul_ps(r10,vftabscale);
896 vfitab = _mm_cvttps_epi32(rt);
898 vfeps = _mm_frcz_ps(rt);
900 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
902 twovfeps = _mm_add_ps(vfeps,vfeps);
903 vfitab = _mm_slli_epi32(vfitab,2);
905 /* CUBIC SPLINE TABLE ELECTROSTATICS */
906 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
907 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
908 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
909 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
910 _MM_TRANSPOSE4_PS(Y,F,G,H);
911 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
912 VV = _mm_macc_ps(vfeps,Fp,Y);
913 velec = _mm_mul_ps(qq10,VV);
914 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
915 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
917 /* Update potential sum for this i atom from the interaction with this j atom. */
918 velec = _mm_andnot_ps(dummy_mask,velec);
919 velecsum = _mm_add_ps(velecsum,velec);
923 fscal = _mm_andnot_ps(dummy_mask,fscal);
925 /* Update vectorial force */
926 fix1 = _mm_macc_ps(dx10,fscal,fix1);
927 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
928 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
930 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
931 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
932 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
934 /**************************
935 * CALCULATE INTERACTIONS *
936 **************************/
938 r11 = _mm_mul_ps(rsq11,rinv11);
939 r11 = _mm_andnot_ps(dummy_mask,r11);
941 /* Calculate table index by multiplying r with table scale and truncate to integer */
942 rt = _mm_mul_ps(r11,vftabscale);
943 vfitab = _mm_cvttps_epi32(rt);
945 vfeps = _mm_frcz_ps(rt);
947 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
949 twovfeps = _mm_add_ps(vfeps,vfeps);
950 vfitab = _mm_slli_epi32(vfitab,2);
952 /* CUBIC SPLINE TABLE ELECTROSTATICS */
953 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
954 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
955 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
956 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
957 _MM_TRANSPOSE4_PS(Y,F,G,H);
958 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
959 VV = _mm_macc_ps(vfeps,Fp,Y);
960 velec = _mm_mul_ps(qq11,VV);
961 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
962 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
964 /* Update potential sum for this i atom from the interaction with this j atom. */
965 velec = _mm_andnot_ps(dummy_mask,velec);
966 velecsum = _mm_add_ps(velecsum,velec);
970 fscal = _mm_andnot_ps(dummy_mask,fscal);
972 /* Update vectorial force */
973 fix1 = _mm_macc_ps(dx11,fscal,fix1);
974 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
975 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
977 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
978 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
979 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
981 /**************************
982 * CALCULATE INTERACTIONS *
983 **************************/
985 r12 = _mm_mul_ps(rsq12,rinv12);
986 r12 = _mm_andnot_ps(dummy_mask,r12);
988 /* Calculate table index by multiplying r with table scale and truncate to integer */
989 rt = _mm_mul_ps(r12,vftabscale);
990 vfitab = _mm_cvttps_epi32(rt);
992 vfeps = _mm_frcz_ps(rt);
994 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
996 twovfeps = _mm_add_ps(vfeps,vfeps);
997 vfitab = _mm_slli_epi32(vfitab,2);
999 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1000 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1001 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1002 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1003 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1004 _MM_TRANSPOSE4_PS(Y,F,G,H);
1005 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1006 VV = _mm_macc_ps(vfeps,Fp,Y);
1007 velec = _mm_mul_ps(qq12,VV);
1008 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1009 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1011 /* Update potential sum for this i atom from the interaction with this j atom. */
1012 velec = _mm_andnot_ps(dummy_mask,velec);
1013 velecsum = _mm_add_ps(velecsum,velec);
1017 fscal = _mm_andnot_ps(dummy_mask,fscal);
1019 /* Update vectorial force */
1020 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1021 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1022 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1024 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1025 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1026 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1028 /**************************
1029 * CALCULATE INTERACTIONS *
1030 **************************/
1032 r20 = _mm_mul_ps(rsq20,rinv20);
1033 r20 = _mm_andnot_ps(dummy_mask,r20);
1035 /* Calculate table index by multiplying r with table scale and truncate to integer */
1036 rt = _mm_mul_ps(r20,vftabscale);
1037 vfitab = _mm_cvttps_epi32(rt);
1039 vfeps = _mm_frcz_ps(rt);
1041 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1043 twovfeps = _mm_add_ps(vfeps,vfeps);
1044 vfitab = _mm_slli_epi32(vfitab,2);
1046 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1047 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1048 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1049 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1050 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1051 _MM_TRANSPOSE4_PS(Y,F,G,H);
1052 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1053 VV = _mm_macc_ps(vfeps,Fp,Y);
1054 velec = _mm_mul_ps(qq20,VV);
1055 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1056 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1058 /* Update potential sum for this i atom from the interaction with this j atom. */
1059 velec = _mm_andnot_ps(dummy_mask,velec);
1060 velecsum = _mm_add_ps(velecsum,velec);
1064 fscal = _mm_andnot_ps(dummy_mask,fscal);
1066 /* Update vectorial force */
1067 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1068 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1069 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1071 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1072 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1073 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1075 /**************************
1076 * CALCULATE INTERACTIONS *
1077 **************************/
1079 r21 = _mm_mul_ps(rsq21,rinv21);
1080 r21 = _mm_andnot_ps(dummy_mask,r21);
1082 /* Calculate table index by multiplying r with table scale and truncate to integer */
1083 rt = _mm_mul_ps(r21,vftabscale);
1084 vfitab = _mm_cvttps_epi32(rt);
1086 vfeps = _mm_frcz_ps(rt);
1088 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1090 twovfeps = _mm_add_ps(vfeps,vfeps);
1091 vfitab = _mm_slli_epi32(vfitab,2);
1093 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1094 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1095 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1096 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1097 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1098 _MM_TRANSPOSE4_PS(Y,F,G,H);
1099 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1100 VV = _mm_macc_ps(vfeps,Fp,Y);
1101 velec = _mm_mul_ps(qq21,VV);
1102 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1103 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1105 /* Update potential sum for this i atom from the interaction with this j atom. */
1106 velec = _mm_andnot_ps(dummy_mask,velec);
1107 velecsum = _mm_add_ps(velecsum,velec);
1111 fscal = _mm_andnot_ps(dummy_mask,fscal);
1113 /* Update vectorial force */
1114 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1115 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1116 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1118 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1119 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1120 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1122 /**************************
1123 * CALCULATE INTERACTIONS *
1124 **************************/
1126 r22 = _mm_mul_ps(rsq22,rinv22);
1127 r22 = _mm_andnot_ps(dummy_mask,r22);
1129 /* Calculate table index by multiplying r with table scale and truncate to integer */
1130 rt = _mm_mul_ps(r22,vftabscale);
1131 vfitab = _mm_cvttps_epi32(rt);
1133 vfeps = _mm_frcz_ps(rt);
1135 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1137 twovfeps = _mm_add_ps(vfeps,vfeps);
1138 vfitab = _mm_slli_epi32(vfitab,2);
1140 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1141 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1142 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1143 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1144 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1145 _MM_TRANSPOSE4_PS(Y,F,G,H);
1146 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1147 VV = _mm_macc_ps(vfeps,Fp,Y);
1148 velec = _mm_mul_ps(qq22,VV);
1149 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1150 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1152 /* Update potential sum for this i atom from the interaction with this j atom. */
1153 velec = _mm_andnot_ps(dummy_mask,velec);
1154 velecsum = _mm_add_ps(velecsum,velec);
1158 fscal = _mm_andnot_ps(dummy_mask,fscal);
1160 /* Update vectorial force */
1161 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1162 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1163 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1165 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1166 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1167 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1169 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1170 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1171 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1172 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1174 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1175 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1177 /* Inner loop uses 423 flops */
1180 /* End of innermost loop */
1182 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1183 f+i_coord_offset,fshift+i_shift_offset);
1186 /* Update potential energies */
1187 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1189 /* Increment number of inner iterations */
1190 inneriter += j_index_end - j_index_start;
1192 /* Outer loop uses 19 flops */
1195 /* Increment number of outer iterations */
1198 /* Update outer/inner flops */
1200 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*423);
1203 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_128_fma_single
1204 * Electrostatics interaction: CubicSplineTable
1205 * VdW interaction: None
1206 * Geometry: Water3-Water3
1207 * Calculate force/pot: Force
1210 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_128_fma_single
1211 (t_nblist * gmx_restrict nlist,
1212 rvec * gmx_restrict xx,
1213 rvec * gmx_restrict ff,
1214 t_forcerec * gmx_restrict fr,
1215 t_mdatoms * gmx_restrict mdatoms,
1216 nb_kernel_data_t * gmx_restrict kernel_data,
1217 t_nrnb * gmx_restrict nrnb)
1219 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1220 * just 0 for non-waters.
1221 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1222 * jnr indices corresponding to data put in the four positions in the SIMD register.
1224 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1225 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1226 int jnrA,jnrB,jnrC,jnrD;
1227 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1228 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1229 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1230 real rcutoff_scalar;
1231 real *shiftvec,*fshift,*x,*f;
1232 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1233 real scratch[4*DIM];
1234 __m128 fscal,rcutoff,rcutoff2,jidxall;
1236 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1238 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1240 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1241 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1242 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1243 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1244 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1245 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1246 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1247 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1248 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1249 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1250 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1251 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1252 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1253 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1254 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1255 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1256 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1259 __m128i ifour = _mm_set1_epi32(4);
1260 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1262 __m128 dummy_mask,cutoff_mask;
1263 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1264 __m128 one = _mm_set1_ps(1.0);
1265 __m128 two = _mm_set1_ps(2.0);
1271 jindex = nlist->jindex;
1273 shiftidx = nlist->shift;
1275 shiftvec = fr->shift_vec[0];
1276 fshift = fr->fshift[0];
1277 facel = _mm_set1_ps(fr->epsfac);
1278 charge = mdatoms->chargeA;
1280 vftab = kernel_data->table_elec->data;
1281 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
1283 /* Setup water-specific parameters */
1284 inr = nlist->iinr[0];
1285 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1286 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1287 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1289 jq0 = _mm_set1_ps(charge[inr+0]);
1290 jq1 = _mm_set1_ps(charge[inr+1]);
1291 jq2 = _mm_set1_ps(charge[inr+2]);
1292 qq00 = _mm_mul_ps(iq0,jq0);
1293 qq01 = _mm_mul_ps(iq0,jq1);
1294 qq02 = _mm_mul_ps(iq0,jq2);
1295 qq10 = _mm_mul_ps(iq1,jq0);
1296 qq11 = _mm_mul_ps(iq1,jq1);
1297 qq12 = _mm_mul_ps(iq1,jq2);
1298 qq20 = _mm_mul_ps(iq2,jq0);
1299 qq21 = _mm_mul_ps(iq2,jq1);
1300 qq22 = _mm_mul_ps(iq2,jq2);
1302 /* Zero-initialize the j indices and coordinate offsets so the compiler does not warn about possibly uninitialized variables */
1303 jnrA = jnrB = jnrC = jnrD = 0;
1304 j_coord_offsetA = 0;
1305 j_coord_offsetB = 0;
1306 j_coord_offsetC = 0;
1307 j_coord_offsetD = 0;
1312 for(iidx=0;iidx<4*DIM;iidx++)
1314 scratch[iidx] = 0.0;
1317 /* Start outer loop over neighborlists */
1318 for(iidx=0; iidx<nri; iidx++)
1320 /* Load shift vector for this list */
1321 i_shift_offset = DIM*shiftidx[iidx];
1323 /* Load limits for loop over neighbors */
1324 j_index_start = jindex[iidx];
1325 j_index_end = jindex[iidx+1];
1327 /* Get outer coordinate index */
1329 i_coord_offset = DIM*inr;
1331 /* Load i particle coords and add shift vector */
1332 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1333 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1335 fix0 = _mm_setzero_ps();
1336 fiy0 = _mm_setzero_ps();
1337 fiz0 = _mm_setzero_ps();
1338 fix1 = _mm_setzero_ps();
1339 fiy1 = _mm_setzero_ps();
1340 fiz1 = _mm_setzero_ps();
1341 fix2 = _mm_setzero_ps();
1342 fiy2 = _mm_setzero_ps();
1343 fiz2 = _mm_setzero_ps();
1345 /* Start inner kernel loop */
1346 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1349 /* Get j neighbor index, and coordinate index */
1351 jnrB = jjnr[jidx+1];
1352 jnrC = jjnr[jidx+2];
1353 jnrD = jjnr[jidx+3];
1354 j_coord_offsetA = DIM*jnrA;
1355 j_coord_offsetB = DIM*jnrB;
1356 j_coord_offsetC = DIM*jnrC;
1357 j_coord_offsetD = DIM*jnrD;
1359 /* load j atom coordinates */
1360 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1361 x+j_coord_offsetC,x+j_coord_offsetD,
1362 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1364 /* Calculate displacement vector */
1365 dx00 = _mm_sub_ps(ix0,jx0);
1366 dy00 = _mm_sub_ps(iy0,jy0);
1367 dz00 = _mm_sub_ps(iz0,jz0);
1368 dx01 = _mm_sub_ps(ix0,jx1);
1369 dy01 = _mm_sub_ps(iy0,jy1);
1370 dz01 = _mm_sub_ps(iz0,jz1);
1371 dx02 = _mm_sub_ps(ix0,jx2);
1372 dy02 = _mm_sub_ps(iy0,jy2);
1373 dz02 = _mm_sub_ps(iz0,jz2);
1374 dx10 = _mm_sub_ps(ix1,jx0);
1375 dy10 = _mm_sub_ps(iy1,jy0);
1376 dz10 = _mm_sub_ps(iz1,jz0);
1377 dx11 = _mm_sub_ps(ix1,jx1);
1378 dy11 = _mm_sub_ps(iy1,jy1);
1379 dz11 = _mm_sub_ps(iz1,jz1);
1380 dx12 = _mm_sub_ps(ix1,jx2);
1381 dy12 = _mm_sub_ps(iy1,jy2);
1382 dz12 = _mm_sub_ps(iz1,jz2);
1383 dx20 = _mm_sub_ps(ix2,jx0);
1384 dy20 = _mm_sub_ps(iy2,jy0);
1385 dz20 = _mm_sub_ps(iz2,jz0);
1386 dx21 = _mm_sub_ps(ix2,jx1);
1387 dy21 = _mm_sub_ps(iy2,jy1);
1388 dz21 = _mm_sub_ps(iz2,jz1);
1389 dx22 = _mm_sub_ps(ix2,jx2);
1390 dy22 = _mm_sub_ps(iy2,jy2);
1391 dz22 = _mm_sub_ps(iz2,jz2);
1393 /* Calculate squared distance and things based on it */
1394 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1395 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1396 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1397 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1398 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1399 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1400 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1401 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1402 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1404 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1405 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1406 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1407 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1408 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1409 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1410 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1411 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1412 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1414 fjx0 = _mm_setzero_ps();
1415 fjy0 = _mm_setzero_ps();
1416 fjz0 = _mm_setzero_ps();
1417 fjx1 = _mm_setzero_ps();
1418 fjy1 = _mm_setzero_ps();
1419 fjz1 = _mm_setzero_ps();
1420 fjx2 = _mm_setzero_ps();
1421 fjy2 = _mm_setzero_ps();
1422 fjz2 = _mm_setzero_ps();
1424 /**************************
1425 * CALCULATE INTERACTIONS *
1426 **************************/
1428 r00 = _mm_mul_ps(rsq00,rinv00);
1430 /* Calculate table index by multiplying r with table scale and truncate to integer */
1431 rt = _mm_mul_ps(r00,vftabscale);
1432 vfitab = _mm_cvttps_epi32(rt);
1434 vfeps = _mm_frcz_ps(rt);
1436 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1438 twovfeps = _mm_add_ps(vfeps,vfeps);
1439 vfitab = _mm_slli_epi32(vfitab,2);
1441 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1442 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1443 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1444 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1445 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1446 _MM_TRANSPOSE4_PS(Y,F,G,H);
1447 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1448 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1449 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1453 /* Update vectorial force */
1454 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1455 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1456 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1458 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1459 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1460 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1462 /**************************
1463 * CALCULATE INTERACTIONS *
1464 **************************/
1466 r01 = _mm_mul_ps(rsq01,rinv01);
1468 /* Calculate table index by multiplying r with table scale and truncate to integer */
1469 rt = _mm_mul_ps(r01,vftabscale);
1470 vfitab = _mm_cvttps_epi32(rt);
1472 vfeps = _mm_frcz_ps(rt);
1474 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1476 twovfeps = _mm_add_ps(vfeps,vfeps);
1477 vfitab = _mm_slli_epi32(vfitab,2);
1479 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1480 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1481 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1482 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1483 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1484 _MM_TRANSPOSE4_PS(Y,F,G,H);
1485 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1486 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1487 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1491 /* Update vectorial force */
1492 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1493 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1494 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1496 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1497 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1498 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1500 /**************************
1501 * CALCULATE INTERACTIONS *
1502 **************************/
1504 r02 = _mm_mul_ps(rsq02,rinv02);
1506 /* Calculate table index by multiplying r with table scale and truncate to integer */
1507 rt = _mm_mul_ps(r02,vftabscale);
1508 vfitab = _mm_cvttps_epi32(rt);
1510 vfeps = _mm_frcz_ps(rt);
1512 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1514 twovfeps = _mm_add_ps(vfeps,vfeps);
1515 vfitab = _mm_slli_epi32(vfitab,2);
1517 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1518 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1519 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1520 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1521 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1522 _MM_TRANSPOSE4_PS(Y,F,G,H);
1523 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1524 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1525 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1529 /* Update vectorial force */
1530 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1531 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1532 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1534 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1535 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1536 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1538 /**************************
1539 * CALCULATE INTERACTIONS *
1540 **************************/
1542 r10 = _mm_mul_ps(rsq10,rinv10);
1544 /* Calculate table index by multiplying r with table scale and truncate to integer */
1545 rt = _mm_mul_ps(r10,vftabscale);
1546 vfitab = _mm_cvttps_epi32(rt);
1548 vfeps = _mm_frcz_ps(rt);
1550 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1552 twovfeps = _mm_add_ps(vfeps,vfeps);
1553 vfitab = _mm_slli_epi32(vfitab,2);
1555 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1556 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1557 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1558 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1559 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1560 _MM_TRANSPOSE4_PS(Y,F,G,H);
1561 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1562 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1563 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1567 /* Update vectorial force */
1568 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1569 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1570 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1572 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1573 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1574 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1576 /**************************
1577 * CALCULATE INTERACTIONS *
1578 **************************/
1580 r11 = _mm_mul_ps(rsq11,rinv11);
1582 /* Calculate table index by multiplying r with table scale and truncate to integer */
1583 rt = _mm_mul_ps(r11,vftabscale);
1584 vfitab = _mm_cvttps_epi32(rt);
1586 vfeps = _mm_frcz_ps(rt);
1588 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1590 twovfeps = _mm_add_ps(vfeps,vfeps);
1591 vfitab = _mm_slli_epi32(vfitab,2);
1593 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1594 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1595 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1596 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1597 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1598 _MM_TRANSPOSE4_PS(Y,F,G,H);
1599 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1600 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1601 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1605 /* Update vectorial force */
1606 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1607 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1608 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1610 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1611 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1612 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1614 /**************************
1615 * CALCULATE INTERACTIONS *
1616 **************************/
1618 r12 = _mm_mul_ps(rsq12,rinv12);
1620 /* Calculate table index by multiplying r with table scale and truncate to integer */
1621 rt = _mm_mul_ps(r12,vftabscale);
1622 vfitab = _mm_cvttps_epi32(rt);
1624 vfeps = _mm_frcz_ps(rt);
1626 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1628 twovfeps = _mm_add_ps(vfeps,vfeps);
1629 vfitab = _mm_slli_epi32(vfitab,2);
1631 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1632 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1633 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1634 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1635 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1636 _MM_TRANSPOSE4_PS(Y,F,G,H);
1637 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1638 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1639 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1643 /* Update vectorial force */
1644 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1645 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1646 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1648 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1649 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1650 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1652 /**************************
1653 * CALCULATE INTERACTIONS *
1654 **************************/
1656 r20 = _mm_mul_ps(rsq20,rinv20);
1658 /* Calculate table index by multiplying r with table scale and truncate to integer */
1659 rt = _mm_mul_ps(r20,vftabscale);
1660 vfitab = _mm_cvttps_epi32(rt);
1662 vfeps = _mm_frcz_ps(rt);
1664 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1666 twovfeps = _mm_add_ps(vfeps,vfeps);
1667 vfitab = _mm_slli_epi32(vfitab,2);
1669 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1670 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1671 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1672 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1673 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1674 _MM_TRANSPOSE4_PS(Y,F,G,H);
1675 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1676 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1677 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1681 /* Update vectorial force */
1682 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1683 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1684 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1686 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1687 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1688 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1690 /**************************
1691 * CALCULATE INTERACTIONS *
1692 **************************/
1694 r21 = _mm_mul_ps(rsq21,rinv21);
1696 /* Calculate table index by multiplying r with table scale and truncate to integer */
1697 rt = _mm_mul_ps(r21,vftabscale);
1698 vfitab = _mm_cvttps_epi32(rt);
1700 vfeps = _mm_frcz_ps(rt);
1702 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1704 twovfeps = _mm_add_ps(vfeps,vfeps);
1705 vfitab = _mm_slli_epi32(vfitab,2);
1707 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1708 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1709 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1710 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1711 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1712 _MM_TRANSPOSE4_PS(Y,F,G,H);
1713 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1714 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1715 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1719 /* Update vectorial force */
1720 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1721 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1722 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1724 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1725 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1726 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1728 /**************************
1729 * CALCULATE INTERACTIONS *
1730 **************************/
1732 r22 = _mm_mul_ps(rsq22,rinv22);
1734 /* Calculate table index by multiplying r with table scale and truncate to integer */
1735 rt = _mm_mul_ps(r22,vftabscale);
1736 vfitab = _mm_cvttps_epi32(rt);
1738 vfeps = _mm_frcz_ps(rt);
1740 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1742 twovfeps = _mm_add_ps(vfeps,vfeps);
1743 vfitab = _mm_slli_epi32(vfitab,2);
1745 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1746 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1747 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1748 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1749 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1750 _MM_TRANSPOSE4_PS(Y,F,G,H);
1751 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1752 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1753 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1757 /* Update vectorial force */
1758 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1759 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1760 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1762 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1763 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1764 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1766 fjptrA = f+j_coord_offsetA;
1767 fjptrB = f+j_coord_offsetB;
1768 fjptrC = f+j_coord_offsetC;
1769 fjptrD = f+j_coord_offsetD;
1771 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1772 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1774 /* Inner loop uses 378 flops */
1777 if(jidx<j_index_end)
1780 /* Get j neighbor index, and coordinate index */
1781 jnrlistA = jjnr[jidx];
1782 jnrlistB = jjnr[jidx+1];
1783 jnrlistC = jjnr[jidx+2];
1784 jnrlistD = jjnr[jidx+3];
1785 /* Sign of each element will be negative for non-real atoms.
1786 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1787 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1789 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1790 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1791 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1792 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1793 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1794 j_coord_offsetA = DIM*jnrA;
1795 j_coord_offsetB = DIM*jnrB;
1796 j_coord_offsetC = DIM*jnrC;
1797 j_coord_offsetD = DIM*jnrD;
1799 /* load j atom coordinates */
1800 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1801 x+j_coord_offsetC,x+j_coord_offsetD,
1802 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1804 /* Calculate displacement vector */
1805 dx00 = _mm_sub_ps(ix0,jx0);
1806 dy00 = _mm_sub_ps(iy0,jy0);
1807 dz00 = _mm_sub_ps(iz0,jz0);
1808 dx01 = _mm_sub_ps(ix0,jx1);
1809 dy01 = _mm_sub_ps(iy0,jy1);
1810 dz01 = _mm_sub_ps(iz0,jz1);
1811 dx02 = _mm_sub_ps(ix0,jx2);
1812 dy02 = _mm_sub_ps(iy0,jy2);
1813 dz02 = _mm_sub_ps(iz0,jz2);
1814 dx10 = _mm_sub_ps(ix1,jx0);
1815 dy10 = _mm_sub_ps(iy1,jy0);
1816 dz10 = _mm_sub_ps(iz1,jz0);
1817 dx11 = _mm_sub_ps(ix1,jx1);
1818 dy11 = _mm_sub_ps(iy1,jy1);
1819 dz11 = _mm_sub_ps(iz1,jz1);
1820 dx12 = _mm_sub_ps(ix1,jx2);
1821 dy12 = _mm_sub_ps(iy1,jy2);
1822 dz12 = _mm_sub_ps(iz1,jz2);
1823 dx20 = _mm_sub_ps(ix2,jx0);
1824 dy20 = _mm_sub_ps(iy2,jy0);
1825 dz20 = _mm_sub_ps(iz2,jz0);
1826 dx21 = _mm_sub_ps(ix2,jx1);
1827 dy21 = _mm_sub_ps(iy2,jy1);
1828 dz21 = _mm_sub_ps(iz2,jz1);
1829 dx22 = _mm_sub_ps(ix2,jx2);
1830 dy22 = _mm_sub_ps(iy2,jy2);
1831 dz22 = _mm_sub_ps(iz2,jz2);
1833 /* Calculate squared distance and things based on it */
1834 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1835 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1836 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1837 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1838 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1839 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1840 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1841 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1842 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1844 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1845 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1846 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1847 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1848 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1849 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1850 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1851 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1852 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1854 fjx0 = _mm_setzero_ps();
1855 fjy0 = _mm_setzero_ps();
1856 fjz0 = _mm_setzero_ps();
1857 fjx1 = _mm_setzero_ps();
1858 fjy1 = _mm_setzero_ps();
1859 fjz1 = _mm_setzero_ps();
1860 fjx2 = _mm_setzero_ps();
1861 fjy2 = _mm_setzero_ps();
1862 fjz2 = _mm_setzero_ps();
1864 /**************************
1865 * CALCULATE INTERACTIONS *
1866 **************************/
1868 r00 = _mm_mul_ps(rsq00,rinv00);
1869 r00 = _mm_andnot_ps(dummy_mask,r00);
1871 /* Calculate table index by multiplying r with table scale and truncate to integer */
1872 rt = _mm_mul_ps(r00,vftabscale);
1873 vfitab = _mm_cvttps_epi32(rt);
1875 vfeps = _mm_frcz_ps(rt);
1877 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1879 twovfeps = _mm_add_ps(vfeps,vfeps);
1880 vfitab = _mm_slli_epi32(vfitab,2);
1882 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1883 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1884 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1885 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1886 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1887 _MM_TRANSPOSE4_PS(Y,F,G,H);
1888 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1889 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1890 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1894 fscal = _mm_andnot_ps(dummy_mask,fscal);
1896 /* Update vectorial force */
1897 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1898 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1899 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1901 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1902 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1903 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1905 /**************************
1906 * CALCULATE INTERACTIONS *
1907 **************************/
1909 r01 = _mm_mul_ps(rsq01,rinv01);
1910 r01 = _mm_andnot_ps(dummy_mask,r01);
1912 /* Calculate table index by multiplying r with table scale and truncate to integer */
1913 rt = _mm_mul_ps(r01,vftabscale);
1914 vfitab = _mm_cvttps_epi32(rt);
1916 vfeps = _mm_frcz_ps(rt);
1918 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1920 twovfeps = _mm_add_ps(vfeps,vfeps);
1921 vfitab = _mm_slli_epi32(vfitab,2);
1923 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1924 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1925 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1926 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1927 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1928 _MM_TRANSPOSE4_PS(Y,F,G,H);
1929 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1930 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1931 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1935 fscal = _mm_andnot_ps(dummy_mask,fscal);
1937 /* Update vectorial force */
1938 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1939 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1940 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1942 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1943 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1944 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1946 /**************************
1947 * CALCULATE INTERACTIONS *
1948 **************************/
1950 r02 = _mm_mul_ps(rsq02,rinv02);
1951 r02 = _mm_andnot_ps(dummy_mask,r02);
1953 /* Calculate table index by multiplying r with table scale and truncate to integer */
1954 rt = _mm_mul_ps(r02,vftabscale);
1955 vfitab = _mm_cvttps_epi32(rt);
1957 vfeps = _mm_frcz_ps(rt);
1959 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1961 twovfeps = _mm_add_ps(vfeps,vfeps);
1962 vfitab = _mm_slli_epi32(vfitab,2);
1964 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1965 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1966 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1967 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1968 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1969 _MM_TRANSPOSE4_PS(Y,F,G,H);
1970 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1971 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1972 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1976 fscal = _mm_andnot_ps(dummy_mask,fscal);
1978 /* Update vectorial force */
1979 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1980 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1981 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1983 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1984 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1985 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1987 /**************************
1988 * CALCULATE INTERACTIONS *
1989 **************************/
1991 r10 = _mm_mul_ps(rsq10,rinv10);
1992 r10 = _mm_andnot_ps(dummy_mask,r10);
1994 /* Calculate table index by multiplying r with table scale and truncate to integer */
1995 rt = _mm_mul_ps(r10,vftabscale);
1996 vfitab = _mm_cvttps_epi32(rt);
1998 vfeps = _mm_frcz_ps(rt);
2000 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2002 twovfeps = _mm_add_ps(vfeps,vfeps);
2003 vfitab = _mm_slli_epi32(vfitab,2);
2005 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2006 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2007 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2008 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2009 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2010 _MM_TRANSPOSE4_PS(Y,F,G,H);
2011 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2012 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2013 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2017 fscal = _mm_andnot_ps(dummy_mask,fscal);
2019 /* Update vectorial force */
2020 fix1 = _mm_macc_ps(dx10,fscal,fix1);
2021 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
2022 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
2024 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
2025 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
2026 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
2028 /**************************
2029 * CALCULATE INTERACTIONS *
2030 **************************/
2032 r11 = _mm_mul_ps(rsq11,rinv11);
2033 r11 = _mm_andnot_ps(dummy_mask,r11);
2035 /* Calculate table index by multiplying r with table scale and truncate to integer */
2036 rt = _mm_mul_ps(r11,vftabscale);
2037 vfitab = _mm_cvttps_epi32(rt);
2039 vfeps = _mm_frcz_ps(rt);
2041 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2043 twovfeps = _mm_add_ps(vfeps,vfeps);
2044 vfitab = _mm_slli_epi32(vfitab,2);
2046 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2047 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2048 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2049 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2050 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2051 _MM_TRANSPOSE4_PS(Y,F,G,H);
2052 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2053 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2054 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2058 fscal = _mm_andnot_ps(dummy_mask,fscal);
2060 /* Update vectorial force */
2061 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2062 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2063 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2065 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2066 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2067 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2069 /**************************
2070 * CALCULATE INTERACTIONS *
2071 **************************/
2073 r12 = _mm_mul_ps(rsq12,rinv12);
2074 r12 = _mm_andnot_ps(dummy_mask,r12);
2076 /* Calculate table index by multiplying r with table scale and truncate to integer */
2077 rt = _mm_mul_ps(r12,vftabscale);
2078 vfitab = _mm_cvttps_epi32(rt);
2080 vfeps = _mm_frcz_ps(rt);
2082 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2084 twovfeps = _mm_add_ps(vfeps,vfeps);
2085 vfitab = _mm_slli_epi32(vfitab,2);
2087 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2088 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2089 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2090 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2091 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2092 _MM_TRANSPOSE4_PS(Y,F,G,H);
2093 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2094 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2095 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2099 fscal = _mm_andnot_ps(dummy_mask,fscal);
2101 /* Update vectorial force */
2102 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2103 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2104 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2106 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2107 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2108 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2110 /**************************
2111 * CALCULATE INTERACTIONS *
2112 **************************/
2114 r20 = _mm_mul_ps(rsq20,rinv20);
2115 r20 = _mm_andnot_ps(dummy_mask,r20);
2117 /* Calculate table index by multiplying r with table scale and truncate to integer */
2118 rt = _mm_mul_ps(r20,vftabscale);
2119 vfitab = _mm_cvttps_epi32(rt);
2121 vfeps = _mm_frcz_ps(rt);
2123 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2125 twovfeps = _mm_add_ps(vfeps,vfeps);
2126 vfitab = _mm_slli_epi32(vfitab,2);
2128 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2129 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2130 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2131 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2132 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2133 _MM_TRANSPOSE4_PS(Y,F,G,H);
2134 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2135 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2136 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2140 fscal = _mm_andnot_ps(dummy_mask,fscal);
2142 /* Update vectorial force */
2143 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2144 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2145 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2147 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2148 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2149 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2151 /**************************
2152 * CALCULATE INTERACTIONS *
2153 **************************/
2155 r21 = _mm_mul_ps(rsq21,rinv21);
2156 r21 = _mm_andnot_ps(dummy_mask,r21);
2158 /* Calculate table index by multiplying r with table scale and truncate to integer */
2159 rt = _mm_mul_ps(r21,vftabscale);
2160 vfitab = _mm_cvttps_epi32(rt);
2162 vfeps = _mm_frcz_ps(rt);
2164 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2166 twovfeps = _mm_add_ps(vfeps,vfeps);
2167 vfitab = _mm_slli_epi32(vfitab,2);
2169 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2170 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2171 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2172 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2173 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2174 _MM_TRANSPOSE4_PS(Y,F,G,H);
2175 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2176 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2177 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2181 fscal = _mm_andnot_ps(dummy_mask,fscal);
2183 /* Update vectorial force */
2184 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2185 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2186 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2188 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2189 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2190 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2192 /**************************
2193 * CALCULATE INTERACTIONS *
2194 **************************/
2196 r22 = _mm_mul_ps(rsq22,rinv22);
2197 r22 = _mm_andnot_ps(dummy_mask,r22);
2199 /* Calculate table index by multiplying r with table scale and truncate to integer */
2200 rt = _mm_mul_ps(r22,vftabscale);
2201 vfitab = _mm_cvttps_epi32(rt);
2203 vfeps = _mm_frcz_ps(rt);
2205 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2207 twovfeps = _mm_add_ps(vfeps,vfeps);
2208 vfitab = _mm_slli_epi32(vfitab,2);
2210 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2211 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2212 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2213 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2214 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2215 _MM_TRANSPOSE4_PS(Y,F,G,H);
2216 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2217 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2218 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2222 fscal = _mm_andnot_ps(dummy_mask,fscal);
2224 /* Update vectorial force */
2225 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2226 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2227 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2229 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2230 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2231 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2233 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2234 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2235 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2236 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2238 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2239 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2241 /* Inner loop uses 387 flops */
2244 /* End of innermost loop */
2246 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2247 f+i_coord_offset,fshift+i_shift_offset);
2249 /* Increment number of inner iterations */
2250 inneriter += j_index_end - j_index_start;
2252 /* Outer loop uses 18 flops */
2255 /* Increment number of outer iterations */
2258 /* Update outer/inner flops */
2260 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*387);