2 * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: None
40 * Geometry: Water3-Particle
41 * Calculate force/pot: PotentialAndForce
// Nonbonded kernel: cubic-spline tabulated electrostatics, no VdW, a three-site
// water as the i group against single j particles; accumulates both the
// potential and the forces.
//
// NOTE(review): this listing is missing lines (the return type, braces, several
// declarations and short assignments are not visible), so the notes below
// describe only what can be seen here.
//
// Parameters, as far as their use is visible:
//   nlist       - neighbour list; jindex, shift and iinr[0] are read.
//   xx, ff      - coordinate and force arrays; presumably the source of the
//                 local x and f pointers (that assignment is not visible).
//   fr          - supplies shift_vec, fshift and epsfac.
//   mdatoms     - supplies chargeA.
//   kernel_data - supplies table_elec (data, scale); energygrp_elec is updated.
//   nrnb        - flop counters, updated through inc_nrnb() at the end.
44 nb_kernel_ElecCSTab_VdwNone_GeomW3P1_VF_avx_128_fma_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
78 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
79 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
80 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
83 __m128i ifour = _mm_set1_epi32(4);
84 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
86 __m128 dummy_mask,cutoff_mask;
87 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
88 __m128 one = _mm_set1_ps(1.0);
89 __m128 two = _mm_set1_ps(2.0);
// Cache list pointers and scalar parameters from the argument structs.
// NOTE(review): charge, vftab, vfitab and scratch are used below but their
// declarations are not visible in this listing.
95 jindex = nlist->jindex;
97 shiftidx = nlist->shift;
99 shiftvec = fr->shift_vec[0];
100 fshift = fr->fshift[0];
101 facel = _mm_set1_ps(fr->epsfac);
102 charge = mdatoms->chargeA;
104 vftab = kernel_data->table_elec->data;
105 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
107 /* Setup water-specific parameters */
// The three i-site charges are read once, from the first i entry of the list,
// and pre-multiplied by facel; presumably every i water in the list carries the
// same charges - confirm against the caller.
108 inr = nlist->iinr[0];
109 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
110 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
111 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
113 /* Avoid stupid compiler warnings */
114 jnrA = jnrB = jnrC = jnrD = 0;
// NOTE(review): the body of this loop is not visible; presumably it clears the
// scratch buffer that receives forces for padding entries - confirm.
123 for(iidx=0;iidx<4*DIM;iidx++)
128 /* Start outer loop over neighborlists */
129 for(iidx=0; iidx<nri; iidx++)
131 /* Load shift vector for this list */
132 i_shift_offset = DIM*shiftidx[iidx];
134 /* Load limits for loop over neighbors */
135 j_index_start = jindex[iidx];
136 j_index_end = jindex[iidx+1];
138 /* Get outer coordinate index */
// NOTE(review): the per-iteration assignment of inr is not visible here.
140 i_coord_offset = DIM*inr;
142 /* Load i particle coords and add shift vector */
143 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
144 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
// Per-i-atom force accumulators, reset for every outer iteration.
146 fix0 = _mm_setzero_ps();
147 fiy0 = _mm_setzero_ps();
148 fiz0 = _mm_setzero_ps();
149 fix1 = _mm_setzero_ps();
150 fiy1 = _mm_setzero_ps();
151 fiz1 = _mm_setzero_ps();
152 fix2 = _mm_setzero_ps();
153 fiy2 = _mm_setzero_ps();
154 fiz2 = _mm_setzero_ps();
156 /* Reset potential sums */
157 velecsum = _mm_setzero_ps();
159 /* Start inner kernel loop */
// Handles four j entries per pass and runs only while the fourth entry of the
// group is non-negative; groups containing negative (padding) entries fall
// through to the masked code further down.
160 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
163 /* Get j neighbor index, and coordinate index */
// NOTE(review): the loads of jnrA..jnrD from jjnr are not visible here.
168 j_coord_offsetA = DIM*jnrA;
169 j_coord_offsetB = DIM*jnrB;
170 j_coord_offsetC = DIM*jnrC;
171 j_coord_offsetD = DIM*jnrD;
173 /* load j atom coordinates */
174 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
175 x+j_coord_offsetC,x+j_coord_offsetD,
178 /* Calculate displacement vector */
179 dx00 = _mm_sub_ps(ix0,jx0);
180 dy00 = _mm_sub_ps(iy0,jy0);
181 dz00 = _mm_sub_ps(iz0,jz0);
182 dx10 = _mm_sub_ps(ix1,jx0);
183 dy10 = _mm_sub_ps(iy1,jy0);
184 dz10 = _mm_sub_ps(iz1,jz0);
185 dx20 = _mm_sub_ps(ix2,jx0);
186 dy20 = _mm_sub_ps(iy2,jy0);
187 dz20 = _mm_sub_ps(iz2,jz0);
189 /* Calculate squared distance and things based on it */
190 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
191 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
192 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
194 rinv00 = gmx_mm_invsqrt_ps(rsq00);
195 rinv10 = gmx_mm_invsqrt_ps(rsq10);
196 rinv20 = gmx_mm_invsqrt_ps(rsq20);
198 /* Load parameters for j particles */
199 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
200 charge+jnrC+0,charge+jnrD+0);
202 fjx0 = _mm_setzero_ps();
203 fjy0 = _mm_setzero_ps();
204 fjz0 = _mm_setzero_ps();
206 /**************************
207 * CALCULATE INTERACTIONS *
208 **************************/
// i atom 0 with j atom 0. r is obtained as rsq * (1/sqrt(rsq)).
210 r00 = _mm_mul_ps(rsq00,rinv00);
212 /* Compute parameters for interactions between i and j atoms */
213 qq00 = _mm_mul_ps(iq0,jq0);
215 /* Calculate table index by multiplying r with table scale and truncate to integer */
216 rt = _mm_mul_ps(r00,vftabscale);
217 vfitab = _mm_cvttps_epi32(rt);
// NOTE(review): two alternative ways of taking the fractional part of rt follow
// back to back; presumably a preprocessor conditional that is not visible in
// this listing selects one of them - confirm. The same pattern recurs in every
// interaction section below.
219 vfeps = _mm_frcz_ps(rt);
221 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
223 twovfeps = _mm_add_ps(vfeps,vfeps);
// Index times four: each table point is read as four consecutive floats.
224 vfitab = _mm_slli_epi32(vfitab,2);
226 /* CUBIC SPLINE TABLE ELECTROSTATICS */
// One aligned four-float load per lane, then a 4x4 transpose so that Y, F, G
// and H each hold one table coefficient for all four lanes.
227 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
228 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
229 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
230 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
231 _MM_TRANSPOSE4_PS(Y,F,G,H);
// Fp = F + eps*(G + eps*H);  VV = Y + eps*Fp;  FF = Fp + eps*(G + 2*eps*H).
232 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
233 VV = _mm_macc_ps(vfeps,Fp,Y);
234 velec = _mm_mul_ps(qq00,VV);
235 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
// felec = -(qq * FF * tabscale / r); the xor with signbit flips the sign.
236 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
238 /* Update potential sum for this i atom from the interaction with this j atom. */
239 velecsum = _mm_add_ps(velecsum,velec);
243 /* Update vectorial force */
// NOTE(review): fscal is consumed here but no assignment to it is visible in
// this listing; presumably it is set from felec on an elided line - confirm.
// The same applies to every later use of fscal in this function.
244 fix0 = _mm_macc_ps(dx00,fscal,fix0);
245 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
246 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
248 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
249 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
250 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
252 /**************************
253 * CALCULATE INTERACTIONS *
254 **************************/
// i atom 1 with j atom 0: same sequence as above.
256 r10 = _mm_mul_ps(rsq10,rinv10);
258 /* Compute parameters for interactions between i and j atoms */
259 qq10 = _mm_mul_ps(iq1,jq0);
261 /* Calculate table index by multiplying r with table scale and truncate to integer */
262 rt = _mm_mul_ps(r10,vftabscale);
263 vfitab = _mm_cvttps_epi32(rt);
265 vfeps = _mm_frcz_ps(rt);
267 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
269 twovfeps = _mm_add_ps(vfeps,vfeps);
270 vfitab = _mm_slli_epi32(vfitab,2);
272 /* CUBIC SPLINE TABLE ELECTROSTATICS */
273 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
274 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
275 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
276 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
277 _MM_TRANSPOSE4_PS(Y,F,G,H);
278 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
279 VV = _mm_macc_ps(vfeps,Fp,Y);
280 velec = _mm_mul_ps(qq10,VV);
281 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
282 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
284 /* Update potential sum for this i atom from the interaction with this j atom. */
285 velecsum = _mm_add_ps(velecsum,velec);
289 /* Update vectorial force */
290 fix1 = _mm_macc_ps(dx10,fscal,fix1);
291 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
292 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
294 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
295 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
296 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
298 /**************************
299 * CALCULATE INTERACTIONS *
300 **************************/
// i atom 2 with j atom 0: same sequence as above.
302 r20 = _mm_mul_ps(rsq20,rinv20);
304 /* Compute parameters for interactions between i and j atoms */
305 qq20 = _mm_mul_ps(iq2,jq0);
307 /* Calculate table index by multiplying r with table scale and truncate to integer */
308 rt = _mm_mul_ps(r20,vftabscale);
309 vfitab = _mm_cvttps_epi32(rt);
311 vfeps = _mm_frcz_ps(rt);
313 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
315 twovfeps = _mm_add_ps(vfeps,vfeps);
316 vfitab = _mm_slli_epi32(vfitab,2);
318 /* CUBIC SPLINE TABLE ELECTROSTATICS */
319 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
320 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
321 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
322 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
323 _MM_TRANSPOSE4_PS(Y,F,G,H);
324 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
325 VV = _mm_macc_ps(vfeps,Fp,Y);
326 velec = _mm_mul_ps(qq20,VV);
327 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
328 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
330 /* Update potential sum for this i atom from the interaction with this j atom. */
331 velecsum = _mm_add_ps(velecsum,velec);
335 /* Update vectorial force */
336 fix2 = _mm_macc_ps(dx20,fscal,fix2);
337 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
338 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
340 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
341 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
342 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
// The j force summed over the three i atoms is applied to the four j atoms
// through the decrement helper.
344 fjptrA = f+j_coord_offsetA;
345 fjptrB = f+j_coord_offsetB;
346 fjptrC = f+j_coord_offsetC;
347 fjptrD = f+j_coord_offsetD;
349 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
351 /* Inner loop uses 138 flops */
// Masked variant of the inner-loop body for a group of four j entries that
// contains negative (padding) indices.
// NOTE(review): the condition guarding this section is not visible here.
357 /* Get j neighbor index, and coordinate index */
358 jnrlistA = jjnr[jidx];
359 jnrlistB = jjnr[jidx+1];
360 jnrlistC = jjnr[jidx+2];
361 jnrlistD = jjnr[jidx+3];
362 /* Sign of each element will be negative for non-real atoms.
363 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
364 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
366 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
// Padding entries are redirected to index 0 so the coordinate and charge loads
// stay within the arrays; their contributions are masked out afterwards.
367 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
368 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
369 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
370 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
371 j_coord_offsetA = DIM*jnrA;
372 j_coord_offsetB = DIM*jnrB;
373 j_coord_offsetC = DIM*jnrC;
374 j_coord_offsetD = DIM*jnrD;
376 /* load j atom coordinates */
377 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
378 x+j_coord_offsetC,x+j_coord_offsetD,
381 /* Calculate displacement vector */
382 dx00 = _mm_sub_ps(ix0,jx0);
383 dy00 = _mm_sub_ps(iy0,jy0);
384 dz00 = _mm_sub_ps(iz0,jz0);
385 dx10 = _mm_sub_ps(ix1,jx0);
386 dy10 = _mm_sub_ps(iy1,jy0);
387 dz10 = _mm_sub_ps(iz1,jz0);
388 dx20 = _mm_sub_ps(ix2,jx0);
389 dy20 = _mm_sub_ps(iy2,jy0);
390 dz20 = _mm_sub_ps(iz2,jz0);
392 /* Calculate squared distance and things based on it */
393 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
394 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
395 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
397 rinv00 = gmx_mm_invsqrt_ps(rsq00);
398 rinv10 = gmx_mm_invsqrt_ps(rsq10);
399 rinv20 = gmx_mm_invsqrt_ps(rsq20);
401 /* Load parameters for j particles */
402 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
403 charge+jnrC+0,charge+jnrD+0);
405 fjx0 = _mm_setzero_ps();
406 fjy0 = _mm_setzero_ps();
407 fjz0 = _mm_setzero_ps();
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 r00 = _mm_mul_ps(rsq00,rinv00);
// r is zeroed in padding lanes, so their table index becomes 0.
414 r00 = _mm_andnot_ps(dummy_mask,r00);
416 /* Compute parameters for interactions between i and j atoms */
417 qq00 = _mm_mul_ps(iq0,jq0);
419 /* Calculate table index by multiplying r with table scale and truncate to integer */
420 rt = _mm_mul_ps(r00,vftabscale);
421 vfitab = _mm_cvttps_epi32(rt);
423 vfeps = _mm_frcz_ps(rt);
425 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
427 twovfeps = _mm_add_ps(vfeps,vfeps);
428 vfitab = _mm_slli_epi32(vfitab,2);
430 /* CUBIC SPLINE TABLE ELECTROSTATICS */
431 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
432 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
433 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
434 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
435 _MM_TRANSPOSE4_PS(Y,F,G,H);
436 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
437 VV = _mm_macc_ps(vfeps,Fp,Y);
438 velec = _mm_mul_ps(qq00,VV);
439 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
440 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
442 /* Update potential sum for this i atom from the interaction with this j atom. */
443 velec = _mm_andnot_ps(dummy_mask,velec);
444 velecsum = _mm_add_ps(velecsum,velec);
// Force scale is cleared in padding lanes before it is accumulated.
448 fscal = _mm_andnot_ps(dummy_mask,fscal);
450 /* Update vectorial force */
451 fix0 = _mm_macc_ps(dx00,fscal,fix0);
452 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
453 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
455 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
456 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
457 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
459 /**************************
460 * CALCULATE INTERACTIONS *
461 **************************/
463 r10 = _mm_mul_ps(rsq10,rinv10);
464 r10 = _mm_andnot_ps(dummy_mask,r10);
466 /* Compute parameters for interactions between i and j atoms */
467 qq10 = _mm_mul_ps(iq1,jq0);
469 /* Calculate table index by multiplying r with table scale and truncate to integer */
470 rt = _mm_mul_ps(r10,vftabscale);
471 vfitab = _mm_cvttps_epi32(rt);
473 vfeps = _mm_frcz_ps(rt);
475 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
477 twovfeps = _mm_add_ps(vfeps,vfeps);
478 vfitab = _mm_slli_epi32(vfitab,2);
480 /* CUBIC SPLINE TABLE ELECTROSTATICS */
481 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
482 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
483 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
484 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
485 _MM_TRANSPOSE4_PS(Y,F,G,H);
486 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
487 VV = _mm_macc_ps(vfeps,Fp,Y);
488 velec = _mm_mul_ps(qq10,VV);
489 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
490 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
492 /* Update potential sum for this i atom from the interaction with this j atom. */
493 velec = _mm_andnot_ps(dummy_mask,velec);
494 velecsum = _mm_add_ps(velecsum,velec);
498 fscal = _mm_andnot_ps(dummy_mask,fscal);
500 /* Update vectorial force */
501 fix1 = _mm_macc_ps(dx10,fscal,fix1);
502 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
503 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
505 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
506 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
507 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
509 /**************************
510 * CALCULATE INTERACTIONS *
511 **************************/
513 r20 = _mm_mul_ps(rsq20,rinv20);
514 r20 = _mm_andnot_ps(dummy_mask,r20);
516 /* Compute parameters for interactions between i and j atoms */
517 qq20 = _mm_mul_ps(iq2,jq0);
519 /* Calculate table index by multiplying r with table scale and truncate to integer */
520 rt = _mm_mul_ps(r20,vftabscale);
521 vfitab = _mm_cvttps_epi32(rt);
523 vfeps = _mm_frcz_ps(rt);
525 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
527 twovfeps = _mm_add_ps(vfeps,vfeps);
528 vfitab = _mm_slli_epi32(vfitab,2);
530 /* CUBIC SPLINE TABLE ELECTROSTATICS */
531 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
532 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
533 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
534 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
535 _MM_TRANSPOSE4_PS(Y,F,G,H);
536 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
537 VV = _mm_macc_ps(vfeps,Fp,Y);
538 velec = _mm_mul_ps(qq20,VV);
539 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
540 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 velec = _mm_andnot_ps(dummy_mask,velec);
544 velecsum = _mm_add_ps(velecsum,velec);
548 fscal = _mm_andnot_ps(dummy_mask,fscal);
550 /* Update vectorial force */
551 fix2 = _mm_macc_ps(dx20,fscal,fix2);
552 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
553 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
555 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
556 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
557 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
// Padding entries write their force update to scratch instead of the real
// force array.
559 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
560 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
561 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
562 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
564 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
566 /* Inner loop uses 141 flops */
569 /* End of innermost loop */
// Hand the accumulated i forces to the helper together with the i force and
// shift-force destinations.
571 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
572 f+i_coord_offset,fshift+i_shift_offset);
575 /* Update potential energies */
// NOTE(review): the assignment of ggid is not visible in this listing.
576 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
578 /* Increment number of inner iterations */
// Counts every list slot in the range, padding entries included.
579 inneriter += j_index_end - j_index_start;
581 /* Outer loop uses 19 flops */
584 /* Increment number of outer iterations */
587 /* Update outer/inner flops */
// NOTE(review): initialisation and increment of outeriter are not visible here.
589 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*141);
592 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single
593 * Electrostatics interaction: CubicSplineTable
594 * VdW interaction: None
595 * Geometry: Water3-Particle
596 * Calculate force/pot: Force
// Nonbonded kernel: cubic-spline tabulated electrostatics, no VdW, a three-site
// water as the i group against single j particles; force-only variant (no
// potential is accumulated in the visible code).
//
// NOTE(review): this listing is missing lines (the return type, braces, several
// declarations and short assignments are not visible), so the notes below
// describe only what can be seen here.
//
// Parameters, as far as their use is visible:
//   nlist       - neighbour list; jindex, shift and iinr[0] are read.
//   xx, ff      - coordinate and force arrays; presumably the source of the
//                 local x and f pointers (that assignment is not visible).
//   fr          - supplies shift_vec, fshift and epsfac.
//   mdatoms     - supplies chargeA.
//   kernel_data - supplies table_elec (data, scale).
//   nrnb        - flop counters, updated through inc_nrnb() at the end.
599 nb_kernel_ElecCSTab_VdwNone_GeomW3P1_F_avx_128_fma_single
600 (t_nblist * gmx_restrict nlist,
601 rvec * gmx_restrict xx,
602 rvec * gmx_restrict ff,
603 t_forcerec * gmx_restrict fr,
604 t_mdatoms * gmx_restrict mdatoms,
605 nb_kernel_data_t * gmx_restrict kernel_data,
606 t_nrnb * gmx_restrict nrnb)
608 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
609 * just 0 for non-waters.
610 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
611 * jnr indices corresponding to data put in the four positions in the SIMD register.
613 int i_shift_offset,i_coord_offset,outeriter,inneriter;
614 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
615 int jnrA,jnrB,jnrC,jnrD;
616 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
617 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
618 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
620 real *shiftvec,*fshift,*x,*f;
621 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
623 __m128 fscal,rcutoff,rcutoff2,jidxall;
625 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
627 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
629 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
630 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
631 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
632 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
633 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
634 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
635 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
638 __m128i ifour = _mm_set1_epi32(4);
639 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
641 __m128 dummy_mask,cutoff_mask;
642 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
643 __m128 one = _mm_set1_ps(1.0);
644 __m128 two = _mm_set1_ps(2.0);
// Cache list pointers and scalar parameters from the argument structs.
// NOTE(review): charge, vftab, vfitab and scratch are used below but their
// declarations are not visible in this listing.
650 jindex = nlist->jindex;
652 shiftidx = nlist->shift;
654 shiftvec = fr->shift_vec[0];
655 fshift = fr->fshift[0];
656 facel = _mm_set1_ps(fr->epsfac);
657 charge = mdatoms->chargeA;
659 vftab = kernel_data->table_elec->data;
660 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
662 /* Setup water-specific parameters */
// The three i-site charges are read once, from the first i entry of the list,
// and pre-multiplied by facel; presumably every i water in the list carries the
// same charges - confirm against the caller.
663 inr = nlist->iinr[0];
664 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
665 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
666 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
668 /* Avoid stupid compiler warnings */
669 jnrA = jnrB = jnrC = jnrD = 0;
// NOTE(review): the body of this loop is not visible; presumably it clears the
// scratch buffer that receives forces for padding entries - confirm.
678 for(iidx=0;iidx<4*DIM;iidx++)
683 /* Start outer loop over neighborlists */
684 for(iidx=0; iidx<nri; iidx++)
686 /* Load shift vector for this list */
687 i_shift_offset = DIM*shiftidx[iidx];
689 /* Load limits for loop over neighbors */
690 j_index_start = jindex[iidx];
691 j_index_end = jindex[iidx+1];
693 /* Get outer coordinate index */
// NOTE(review): the per-iteration assignment of inr is not visible here.
695 i_coord_offset = DIM*inr;
697 /* Load i particle coords and add shift vector */
698 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
699 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
// Per-i-atom force accumulators, reset for every outer iteration.
701 fix0 = _mm_setzero_ps();
702 fiy0 = _mm_setzero_ps();
703 fiz0 = _mm_setzero_ps();
704 fix1 = _mm_setzero_ps();
705 fiy1 = _mm_setzero_ps();
706 fiz1 = _mm_setzero_ps();
707 fix2 = _mm_setzero_ps();
708 fiy2 = _mm_setzero_ps();
709 fiz2 = _mm_setzero_ps();
711 /* Start inner kernel loop */
// Handles four j entries per pass and runs only while the fourth entry of the
// group is non-negative; groups containing negative (padding) entries fall
// through to the masked code further down.
712 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
715 /* Get j neighbor index, and coordinate index */
// NOTE(review): the loads of jnrA..jnrD from jjnr are not visible here.
720 j_coord_offsetA = DIM*jnrA;
721 j_coord_offsetB = DIM*jnrB;
722 j_coord_offsetC = DIM*jnrC;
723 j_coord_offsetD = DIM*jnrD;
725 /* load j atom coordinates */
726 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
727 x+j_coord_offsetC,x+j_coord_offsetD,
730 /* Calculate displacement vector */
731 dx00 = _mm_sub_ps(ix0,jx0);
732 dy00 = _mm_sub_ps(iy0,jy0);
733 dz00 = _mm_sub_ps(iz0,jz0);
734 dx10 = _mm_sub_ps(ix1,jx0);
735 dy10 = _mm_sub_ps(iy1,jy0);
736 dz10 = _mm_sub_ps(iz1,jz0);
737 dx20 = _mm_sub_ps(ix2,jx0);
738 dy20 = _mm_sub_ps(iy2,jy0);
739 dz20 = _mm_sub_ps(iz2,jz0);
741 /* Calculate squared distance and things based on it */
742 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
743 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
744 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
746 rinv00 = gmx_mm_invsqrt_ps(rsq00);
747 rinv10 = gmx_mm_invsqrt_ps(rsq10);
748 rinv20 = gmx_mm_invsqrt_ps(rsq20);
750 /* Load parameters for j particles */
751 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
752 charge+jnrC+0,charge+jnrD+0);
754 fjx0 = _mm_setzero_ps();
755 fjy0 = _mm_setzero_ps();
756 fjz0 = _mm_setzero_ps();
758 /**************************
759 * CALCULATE INTERACTIONS *
760 **************************/
// i atom 0 with j atom 0. r is obtained as rsq * (1/sqrt(rsq)).
762 r00 = _mm_mul_ps(rsq00,rinv00);
764 /* Compute parameters for interactions between i and j atoms */
765 qq00 = _mm_mul_ps(iq0,jq0);
767 /* Calculate table index by multiplying r with table scale and truncate to integer */
768 rt = _mm_mul_ps(r00,vftabscale);
769 vfitab = _mm_cvttps_epi32(rt);
// NOTE(review): two alternative ways of taking the fractional part of rt follow
// back to back; presumably a preprocessor conditional that is not visible in
// this listing selects one of them - confirm. The same pattern recurs in every
// interaction section below.
771 vfeps = _mm_frcz_ps(rt);
773 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
775 twovfeps = _mm_add_ps(vfeps,vfeps);
// Index times four: each table point is read as four consecutive floats.
776 vfitab = _mm_slli_epi32(vfitab,2);
778 /* CUBIC SPLINE TABLE ELECTROSTATICS */
// One aligned four-float load per lane, then a 4x4 transpose so that Y, F, G
// and H each hold one table coefficient for all four lanes.
779 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
780 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
781 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
782 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
783 _MM_TRANSPOSE4_PS(Y,F,G,H);
// Fp = F + eps*(G + eps*H);  FF = Fp + eps*(G + 2*eps*H). Y is loaded but not
// used in this force-only variant.
784 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
785 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
// felec = -(qq * FF * tabscale / r); the xor with signbit flips the sign.
786 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
790 /* Update vectorial force */
// NOTE(review): fscal is consumed here but no assignment to it is visible in
// this listing; presumably it is set from felec on an elided line - confirm.
// The same applies to every later use of fscal in this function.
791 fix0 = _mm_macc_ps(dx00,fscal,fix0);
792 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
793 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
795 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
796 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
797 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
799 /**************************
800 * CALCULATE INTERACTIONS *
801 **************************/
// i atom 1 with j atom 0: same sequence as above.
803 r10 = _mm_mul_ps(rsq10,rinv10);
805 /* Compute parameters for interactions between i and j atoms */
806 qq10 = _mm_mul_ps(iq1,jq0);
808 /* Calculate table index by multiplying r with table scale and truncate to integer */
809 rt = _mm_mul_ps(r10,vftabscale);
810 vfitab = _mm_cvttps_epi32(rt);
812 vfeps = _mm_frcz_ps(rt);
814 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
816 twovfeps = _mm_add_ps(vfeps,vfeps);
817 vfitab = _mm_slli_epi32(vfitab,2);
819 /* CUBIC SPLINE TABLE ELECTROSTATICS */
820 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
821 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
822 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
823 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
824 _MM_TRANSPOSE4_PS(Y,F,G,H);
825 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
826 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
827 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
831 /* Update vectorial force */
832 fix1 = _mm_macc_ps(dx10,fscal,fix1);
833 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
834 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
836 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
837 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
838 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
840 /**************************
841 * CALCULATE INTERACTIONS *
842 **************************/
// i atom 2 with j atom 0: same sequence as above.
844 r20 = _mm_mul_ps(rsq20,rinv20);
846 /* Compute parameters for interactions between i and j atoms */
847 qq20 = _mm_mul_ps(iq2,jq0);
849 /* Calculate table index by multiplying r with table scale and truncate to integer */
850 rt = _mm_mul_ps(r20,vftabscale);
851 vfitab = _mm_cvttps_epi32(rt);
853 vfeps = _mm_frcz_ps(rt);
855 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
857 twovfeps = _mm_add_ps(vfeps,vfeps);
858 vfitab = _mm_slli_epi32(vfitab,2);
860 /* CUBIC SPLINE TABLE ELECTROSTATICS */
861 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
862 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
863 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
864 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
865 _MM_TRANSPOSE4_PS(Y,F,G,H);
866 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
867 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
868 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
872 /* Update vectorial force */
873 fix2 = _mm_macc_ps(dx20,fscal,fix2);
874 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
875 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
877 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
878 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
879 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
// The j force summed over the three i atoms is applied to the four j atoms
// through the decrement helper.
881 fjptrA = f+j_coord_offsetA;
882 fjptrB = f+j_coord_offsetB;
883 fjptrC = f+j_coord_offsetC;
884 fjptrD = f+j_coord_offsetD;
886 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
888 /* Inner loop uses 126 flops */
// Masked variant of the inner-loop body for a group of four j entries that
// contains negative (padding) indices.
// NOTE(review): the condition guarding this section is not visible here.
894 /* Get j neighbor index, and coordinate index */
895 jnrlistA = jjnr[jidx];
896 jnrlistB = jjnr[jidx+1];
897 jnrlistC = jjnr[jidx+2];
898 jnrlistD = jjnr[jidx+3];
899 /* Sign of each element will be negative for non-real atoms.
900 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
901 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
903 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
// Padding entries are redirected to index 0 so the coordinate and charge loads
// stay within the arrays; their contributions are masked out afterwards.
904 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
905 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
906 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
907 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
908 j_coord_offsetA = DIM*jnrA;
909 j_coord_offsetB = DIM*jnrB;
910 j_coord_offsetC = DIM*jnrC;
911 j_coord_offsetD = DIM*jnrD;
913 /* load j atom coordinates */
914 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
915 x+j_coord_offsetC,x+j_coord_offsetD,
918 /* Calculate displacement vector */
919 dx00 = _mm_sub_ps(ix0,jx0);
920 dy00 = _mm_sub_ps(iy0,jy0);
921 dz00 = _mm_sub_ps(iz0,jz0);
922 dx10 = _mm_sub_ps(ix1,jx0);
923 dy10 = _mm_sub_ps(iy1,jy0);
924 dz10 = _mm_sub_ps(iz1,jz0);
925 dx20 = _mm_sub_ps(ix2,jx0);
926 dy20 = _mm_sub_ps(iy2,jy0);
927 dz20 = _mm_sub_ps(iz2,jz0);
929 /* Calculate squared distance and things based on it */
930 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
931 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
932 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
934 rinv00 = gmx_mm_invsqrt_ps(rsq00);
935 rinv10 = gmx_mm_invsqrt_ps(rsq10);
936 rinv20 = gmx_mm_invsqrt_ps(rsq20);
938 /* Load parameters for j particles */
939 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
940 charge+jnrC+0,charge+jnrD+0);
942 fjx0 = _mm_setzero_ps();
943 fjy0 = _mm_setzero_ps();
944 fjz0 = _mm_setzero_ps();
946 /**************************
947 * CALCULATE INTERACTIONS *
948 **************************/
950 r00 = _mm_mul_ps(rsq00,rinv00);
// r is zeroed in padding lanes, so their table index becomes 0.
951 r00 = _mm_andnot_ps(dummy_mask,r00);
953 /* Compute parameters for interactions between i and j atoms */
954 qq00 = _mm_mul_ps(iq0,jq0);
956 /* Calculate table index by multiplying r with table scale and truncate to integer */
957 rt = _mm_mul_ps(r00,vftabscale);
958 vfitab = _mm_cvttps_epi32(rt);
960 vfeps = _mm_frcz_ps(rt);
962 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
964 twovfeps = _mm_add_ps(vfeps,vfeps);
965 vfitab = _mm_slli_epi32(vfitab,2);
967 /* CUBIC SPLINE TABLE ELECTROSTATICS */
968 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
969 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
970 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
971 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
972 _MM_TRANSPOSE4_PS(Y,F,G,H);
973 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
974 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
975 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
// Force scale is cleared in padding lanes before it is accumulated.
979 fscal = _mm_andnot_ps(dummy_mask,fscal);
981 /* Update vectorial force */
982 fix0 = _mm_macc_ps(dx00,fscal,fix0);
983 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
984 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
986 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
987 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
988 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
990 /**************************
991 * CALCULATE INTERACTIONS *
992 **************************/
994 r10 = _mm_mul_ps(rsq10,rinv10);
995 r10 = _mm_andnot_ps(dummy_mask,r10);
997 /* Compute parameters for interactions between i and j atoms */
998 qq10 = _mm_mul_ps(iq1,jq0);
1000 /* Calculate table index by multiplying r with table scale and truncate to integer */
1001 rt = _mm_mul_ps(r10,vftabscale);
1002 vfitab = _mm_cvttps_epi32(rt);
1004 vfeps = _mm_frcz_ps(rt);
1006 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1008 twovfeps = _mm_add_ps(vfeps,vfeps);
1009 vfitab = _mm_slli_epi32(vfitab,2);
1011 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1012 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1013 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1014 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1015 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1016 _MM_TRANSPOSE4_PS(Y,F,G,H);
1017 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1018 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1019 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1023 fscal = _mm_andnot_ps(dummy_mask,fscal);
1025 /* Update vectorial force */
1026 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1027 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1028 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1030 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1031 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1032 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1034 /**************************
1035 * CALCULATE INTERACTIONS *
1036 **************************/
1038 r20 = _mm_mul_ps(rsq20,rinv20);
1039 r20 = _mm_andnot_ps(dummy_mask,r20);
1041 /* Compute parameters for interactions between i and j atoms */
1042 qq20 = _mm_mul_ps(iq2,jq0);
1044 /* Calculate table index by multiplying r with table scale and truncate to integer */
1045 rt = _mm_mul_ps(r20,vftabscale);
1046 vfitab = _mm_cvttps_epi32(rt);
1048 vfeps = _mm_frcz_ps(rt);
1050 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1052 twovfeps = _mm_add_ps(vfeps,vfeps);
1053 vfitab = _mm_slli_epi32(vfitab,2);
1055 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1056 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1057 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1058 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1059 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1060 _MM_TRANSPOSE4_PS(Y,F,G,H);
1061 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1062 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1063 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1067 fscal = _mm_andnot_ps(dummy_mask,fscal);
1069 /* Update vectorial force */
1070 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1071 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1072 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1074 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1075 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1076 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
// Padding entries write their force update to scratch instead of the real
// force array.
1078 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1079 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1080 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1081 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1083 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1085 /* Inner loop uses 129 flops */
1088 /* End of innermost loop */
// Hand the accumulated i forces to the helper together with the i force and
// shift-force destinations.
1090 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1091 f+i_coord_offset,fshift+i_shift_offset);
1093 /* Increment number of inner iterations */
// Counts every list slot in the range, padding entries included.
1094 inneriter += j_index_end - j_index_start;
1096 /* Outer loop uses 18 flops */
1099 /* Increment number of outer iterations */
1102 /* Update outer/inner flops */
// NOTE(review): initialisation and increment of outeriter are not visible here.
1104 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*129);