2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_128_fma_single
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_128_fma_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
118 __m128i ifour = _mm_set1_epi32(4);
119 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
121 __m128 dummy_mask,cutoff_mask;
122 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
123 __m128 one = _mm_set1_ps(1.0);
124 __m128 two = _mm_set1_ps(2.0);
130 jindex = nlist->jindex;
132 shiftidx = nlist->shift;
134 shiftvec = fr->shift_vec[0];
135 fshift = fr->fshift[0];
136 facel = _mm_set1_ps(fr->epsfac);
137 charge = mdatoms->chargeA;
138 nvdwtype = fr->ntype;
140 vdwtype = mdatoms->typeA;
142 vftab = kernel_data->table_elec->data;
143 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
145 /* Setup water-specific parameters */
146 inr = nlist->iinr[0];
147 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
148 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
149 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
150 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
152 jq1 = _mm_set1_ps(charge[inr+1]);
153 jq2 = _mm_set1_ps(charge[inr+2]);
154 jq3 = _mm_set1_ps(charge[inr+3]);
155 vdwjidx0A = 2*vdwtype[inr+0];
156 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
157 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
158 qq11 = _mm_mul_ps(iq1,jq1);
159 qq12 = _mm_mul_ps(iq1,jq2);
160 qq13 = _mm_mul_ps(iq1,jq3);
161 qq21 = _mm_mul_ps(iq2,jq1);
162 qq22 = _mm_mul_ps(iq2,jq2);
163 qq23 = _mm_mul_ps(iq2,jq3);
164 qq31 = _mm_mul_ps(iq3,jq1);
165 qq32 = _mm_mul_ps(iq3,jq2);
166 qq33 = _mm_mul_ps(iq3,jq3);
168 /* Avoid stupid compiler warnings */
169 jnrA = jnrB = jnrC = jnrD = 0;
178 for(iidx=0;iidx<4*DIM;iidx++)
183 /* Start outer loop over neighborlists */
184 for(iidx=0; iidx<nri; iidx++)
186 /* Load shift vector for this list */
187 i_shift_offset = DIM*shiftidx[iidx];
189 /* Load limits for loop over neighbors */
190 j_index_start = jindex[iidx];
191 j_index_end = jindex[iidx+1];
193 /* Get outer coordinate index */
195 i_coord_offset = DIM*inr;
197 /* Load i particle coords and add shift vector */
198 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
199 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
201 fix0 = _mm_setzero_ps();
202 fiy0 = _mm_setzero_ps();
203 fiz0 = _mm_setzero_ps();
204 fix1 = _mm_setzero_ps();
205 fiy1 = _mm_setzero_ps();
206 fiz1 = _mm_setzero_ps();
207 fix2 = _mm_setzero_ps();
208 fiy2 = _mm_setzero_ps();
209 fiz2 = _mm_setzero_ps();
210 fix3 = _mm_setzero_ps();
211 fiy3 = _mm_setzero_ps();
212 fiz3 = _mm_setzero_ps();
214 /* Reset potential sums */
215 velecsum = _mm_setzero_ps();
216 vvdwsum = _mm_setzero_ps();
218 /* Start inner kernel loop */
219 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
222 /* Get j neighbor index, and coordinate index */
227 j_coord_offsetA = DIM*jnrA;
228 j_coord_offsetB = DIM*jnrB;
229 j_coord_offsetC = DIM*jnrC;
230 j_coord_offsetD = DIM*jnrD;
232 /* load j atom coordinates */
233 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
234 x+j_coord_offsetC,x+j_coord_offsetD,
235 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
236 &jy2,&jz2,&jx3,&jy3,&jz3);
238 /* Calculate displacement vector */
239 dx00 = _mm_sub_ps(ix0,jx0);
240 dy00 = _mm_sub_ps(iy0,jy0);
241 dz00 = _mm_sub_ps(iz0,jz0);
242 dx11 = _mm_sub_ps(ix1,jx1);
243 dy11 = _mm_sub_ps(iy1,jy1);
244 dz11 = _mm_sub_ps(iz1,jz1);
245 dx12 = _mm_sub_ps(ix1,jx2);
246 dy12 = _mm_sub_ps(iy1,jy2);
247 dz12 = _mm_sub_ps(iz1,jz2);
248 dx13 = _mm_sub_ps(ix1,jx3);
249 dy13 = _mm_sub_ps(iy1,jy3);
250 dz13 = _mm_sub_ps(iz1,jz3);
251 dx21 = _mm_sub_ps(ix2,jx1);
252 dy21 = _mm_sub_ps(iy2,jy1);
253 dz21 = _mm_sub_ps(iz2,jz1);
254 dx22 = _mm_sub_ps(ix2,jx2);
255 dy22 = _mm_sub_ps(iy2,jy2);
256 dz22 = _mm_sub_ps(iz2,jz2);
257 dx23 = _mm_sub_ps(ix2,jx3);
258 dy23 = _mm_sub_ps(iy2,jy3);
259 dz23 = _mm_sub_ps(iz2,jz3);
260 dx31 = _mm_sub_ps(ix3,jx1);
261 dy31 = _mm_sub_ps(iy3,jy1);
262 dz31 = _mm_sub_ps(iz3,jz1);
263 dx32 = _mm_sub_ps(ix3,jx2);
264 dy32 = _mm_sub_ps(iy3,jy2);
265 dz32 = _mm_sub_ps(iz3,jz2);
266 dx33 = _mm_sub_ps(ix3,jx3);
267 dy33 = _mm_sub_ps(iy3,jy3);
268 dz33 = _mm_sub_ps(iz3,jz3);
270 /* Calculate squared distance and things based on it */
271 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
272 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
273 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
274 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
275 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
276 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
277 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
278 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
279 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
280 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
282 rinv11 = gmx_mm_invsqrt_ps(rsq11);
283 rinv12 = gmx_mm_invsqrt_ps(rsq12);
284 rinv13 = gmx_mm_invsqrt_ps(rsq13);
285 rinv21 = gmx_mm_invsqrt_ps(rsq21);
286 rinv22 = gmx_mm_invsqrt_ps(rsq22);
287 rinv23 = gmx_mm_invsqrt_ps(rsq23);
288 rinv31 = gmx_mm_invsqrt_ps(rsq31);
289 rinv32 = gmx_mm_invsqrt_ps(rsq32);
290 rinv33 = gmx_mm_invsqrt_ps(rsq33);
292 rinvsq00 = gmx_mm_inv_ps(rsq00);
294 fjx0 = _mm_setzero_ps();
295 fjy0 = _mm_setzero_ps();
296 fjz0 = _mm_setzero_ps();
297 fjx1 = _mm_setzero_ps();
298 fjy1 = _mm_setzero_ps();
299 fjz1 = _mm_setzero_ps();
300 fjx2 = _mm_setzero_ps();
301 fjy2 = _mm_setzero_ps();
302 fjz2 = _mm_setzero_ps();
303 fjx3 = _mm_setzero_ps();
304 fjy3 = _mm_setzero_ps();
305 fjz3 = _mm_setzero_ps();
307 /**************************
308 * CALCULATE INTERACTIONS *
309 **************************/
311 /* LENNARD-JONES DISPERSION/REPULSION */
313 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
314 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
315 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
316 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
317 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
319 /* Update potential sum for this i atom from the interaction with this j atom. */
320 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
324 /* Update vectorial force */
325 fix0 = _mm_macc_ps(dx00,fscal,fix0);
326 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
327 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
329 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
330 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
331 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
333 /**************************
334 * CALCULATE INTERACTIONS *
335 **************************/
337 r11 = _mm_mul_ps(rsq11,rinv11);
339 /* Calculate table index by multiplying r with table scale and truncate to integer */
340 rt = _mm_mul_ps(r11,vftabscale);
341 vfitab = _mm_cvttps_epi32(rt);
343 vfeps = _mm_frcz_ps(rt);
345 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
347 twovfeps = _mm_add_ps(vfeps,vfeps);
348 vfitab = _mm_slli_epi32(vfitab,2);
350 /* CUBIC SPLINE TABLE ELECTROSTATICS */
351 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
352 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
353 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
354 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
355 _MM_TRANSPOSE4_PS(Y,F,G,H);
356 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
357 VV = _mm_macc_ps(vfeps,Fp,Y);
358 velec = _mm_mul_ps(qq11,VV);
359 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
360 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velecsum = _mm_add_ps(velecsum,velec);
367 /* Update vectorial force */
368 fix1 = _mm_macc_ps(dx11,fscal,fix1);
369 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
370 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
372 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
373 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
374 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r12 = _mm_mul_ps(rsq12,rinv12);
382 /* Calculate table index by multiplying r with table scale and truncate to integer */
383 rt = _mm_mul_ps(r12,vftabscale);
384 vfitab = _mm_cvttps_epi32(rt);
386 vfeps = _mm_frcz_ps(rt);
388 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
390 twovfeps = _mm_add_ps(vfeps,vfeps);
391 vfitab = _mm_slli_epi32(vfitab,2);
393 /* CUBIC SPLINE TABLE ELECTROSTATICS */
394 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
395 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
396 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
397 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
398 _MM_TRANSPOSE4_PS(Y,F,G,H);
399 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
400 VV = _mm_macc_ps(vfeps,Fp,Y);
401 velec = _mm_mul_ps(qq12,VV);
402 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
403 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
405 /* Update potential sum for this i atom from the interaction with this j atom. */
406 velecsum = _mm_add_ps(velecsum,velec);
410 /* Update vectorial force */
411 fix1 = _mm_macc_ps(dx12,fscal,fix1);
412 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
413 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
415 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
416 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
417 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 r13 = _mm_mul_ps(rsq13,rinv13);
425 /* Calculate table index by multiplying r with table scale and truncate to integer */
426 rt = _mm_mul_ps(r13,vftabscale);
427 vfitab = _mm_cvttps_epi32(rt);
429 vfeps = _mm_frcz_ps(rt);
431 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
433 twovfeps = _mm_add_ps(vfeps,vfeps);
434 vfitab = _mm_slli_epi32(vfitab,2);
436 /* CUBIC SPLINE TABLE ELECTROSTATICS */
437 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
438 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
439 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
440 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
441 _MM_TRANSPOSE4_PS(Y,F,G,H);
442 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
443 VV = _mm_macc_ps(vfeps,Fp,Y);
444 velec = _mm_mul_ps(qq13,VV);
445 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
446 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
448 /* Update potential sum for this i atom from the interaction with this j atom. */
449 velecsum = _mm_add_ps(velecsum,velec);
453 /* Update vectorial force */
454 fix1 = _mm_macc_ps(dx13,fscal,fix1);
455 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
456 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
458 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
459 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
460 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
466 r21 = _mm_mul_ps(rsq21,rinv21);
468 /* Calculate table index by multiplying r with table scale and truncate to integer */
469 rt = _mm_mul_ps(r21,vftabscale);
470 vfitab = _mm_cvttps_epi32(rt);
472 vfeps = _mm_frcz_ps(rt);
474 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
476 twovfeps = _mm_add_ps(vfeps,vfeps);
477 vfitab = _mm_slli_epi32(vfitab,2);
479 /* CUBIC SPLINE TABLE ELECTROSTATICS */
480 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
481 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
482 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
483 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
484 _MM_TRANSPOSE4_PS(Y,F,G,H);
485 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
486 VV = _mm_macc_ps(vfeps,Fp,Y);
487 velec = _mm_mul_ps(qq21,VV);
488 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
489 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
491 /* Update potential sum for this i atom from the interaction with this j atom. */
492 velecsum = _mm_add_ps(velecsum,velec);
496 /* Update vectorial force */
497 fix2 = _mm_macc_ps(dx21,fscal,fix2);
498 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
499 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
501 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
502 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
503 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 r22 = _mm_mul_ps(rsq22,rinv22);
511 /* Calculate table index by multiplying r with table scale and truncate to integer */
512 rt = _mm_mul_ps(r22,vftabscale);
513 vfitab = _mm_cvttps_epi32(rt);
515 vfeps = _mm_frcz_ps(rt);
517 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
519 twovfeps = _mm_add_ps(vfeps,vfeps);
520 vfitab = _mm_slli_epi32(vfitab,2);
522 /* CUBIC SPLINE TABLE ELECTROSTATICS */
523 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
524 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
525 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
526 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
527 _MM_TRANSPOSE4_PS(Y,F,G,H);
528 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
529 VV = _mm_macc_ps(vfeps,Fp,Y);
530 velec = _mm_mul_ps(qq22,VV);
531 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
532 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 velecsum = _mm_add_ps(velecsum,velec);
539 /* Update vectorial force */
540 fix2 = _mm_macc_ps(dx22,fscal,fix2);
541 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
542 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
544 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
545 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
546 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
548 /**************************
549 * CALCULATE INTERACTIONS *
550 **************************/
552 r23 = _mm_mul_ps(rsq23,rinv23);
554 /* Calculate table index by multiplying r with table scale and truncate to integer */
555 rt = _mm_mul_ps(r23,vftabscale);
556 vfitab = _mm_cvttps_epi32(rt);
558 vfeps = _mm_frcz_ps(rt);
560 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
562 twovfeps = _mm_add_ps(vfeps,vfeps);
563 vfitab = _mm_slli_epi32(vfitab,2);
565 /* CUBIC SPLINE TABLE ELECTROSTATICS */
566 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
567 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
568 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
569 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
570 _MM_TRANSPOSE4_PS(Y,F,G,H);
571 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
572 VV = _mm_macc_ps(vfeps,Fp,Y);
573 velec = _mm_mul_ps(qq23,VV);
574 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
575 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
577 /* Update potential sum for this i atom from the interaction with this j atom. */
578 velecsum = _mm_add_ps(velecsum,velec);
582 /* Update vectorial force */
583 fix2 = _mm_macc_ps(dx23,fscal,fix2);
584 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
585 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
587 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
588 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
589 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
591 /**************************
592 * CALCULATE INTERACTIONS *
593 **************************/
595 r31 = _mm_mul_ps(rsq31,rinv31);
597 /* Calculate table index by multiplying r with table scale and truncate to integer */
598 rt = _mm_mul_ps(r31,vftabscale);
599 vfitab = _mm_cvttps_epi32(rt);
601 vfeps = _mm_frcz_ps(rt);
603 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
605 twovfeps = _mm_add_ps(vfeps,vfeps);
606 vfitab = _mm_slli_epi32(vfitab,2);
608 /* CUBIC SPLINE TABLE ELECTROSTATICS */
609 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
610 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
611 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
612 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
613 _MM_TRANSPOSE4_PS(Y,F,G,H);
614 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
615 VV = _mm_macc_ps(vfeps,Fp,Y);
616 velec = _mm_mul_ps(qq31,VV);
617 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
618 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
620 /* Update potential sum for this i atom from the interaction with this j atom. */
621 velecsum = _mm_add_ps(velecsum,velec);
625 /* Update vectorial force */
626 fix3 = _mm_macc_ps(dx31,fscal,fix3);
627 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
628 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
630 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
631 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
632 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
634 /**************************
635 * CALCULATE INTERACTIONS *
636 **************************/
638 r32 = _mm_mul_ps(rsq32,rinv32);
640 /* Calculate table index by multiplying r with table scale and truncate to integer */
641 rt = _mm_mul_ps(r32,vftabscale);
642 vfitab = _mm_cvttps_epi32(rt);
644 vfeps = _mm_frcz_ps(rt);
646 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
648 twovfeps = _mm_add_ps(vfeps,vfeps);
649 vfitab = _mm_slli_epi32(vfitab,2);
651 /* CUBIC SPLINE TABLE ELECTROSTATICS */
652 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
653 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
654 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
655 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
656 _MM_TRANSPOSE4_PS(Y,F,G,H);
657 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
658 VV = _mm_macc_ps(vfeps,Fp,Y);
659 velec = _mm_mul_ps(qq32,VV);
660 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
661 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
663 /* Update potential sum for this i atom from the interaction with this j atom. */
664 velecsum = _mm_add_ps(velecsum,velec);
668 /* Update vectorial force */
669 fix3 = _mm_macc_ps(dx32,fscal,fix3);
670 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
671 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
673 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
674 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
675 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
677 /**************************
678 * CALCULATE INTERACTIONS *
679 **************************/
681 r33 = _mm_mul_ps(rsq33,rinv33);
683 /* Calculate table index by multiplying r with table scale and truncate to integer */
684 rt = _mm_mul_ps(r33,vftabscale);
685 vfitab = _mm_cvttps_epi32(rt);
687 vfeps = _mm_frcz_ps(rt);
689 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
691 twovfeps = _mm_add_ps(vfeps,vfeps);
692 vfitab = _mm_slli_epi32(vfitab,2);
694 /* CUBIC SPLINE TABLE ELECTROSTATICS */
695 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
696 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
697 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
698 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
699 _MM_TRANSPOSE4_PS(Y,F,G,H);
700 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
701 VV = _mm_macc_ps(vfeps,Fp,Y);
702 velec = _mm_mul_ps(qq33,VV);
703 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
704 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
706 /* Update potential sum for this i atom from the interaction with this j atom. */
707 velecsum = _mm_add_ps(velecsum,velec);
711 /* Update vectorial force */
712 fix3 = _mm_macc_ps(dx33,fscal,fix3);
713 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
714 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
716 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
717 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
718 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
720 fjptrA = f+j_coord_offsetA;
721 fjptrB = f+j_coord_offsetB;
722 fjptrC = f+j_coord_offsetC;
723 fjptrD = f+j_coord_offsetD;
725 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
726 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
727 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
729 /* Inner loop uses 452 flops */
735 /* Get j neighbor index, and coordinate index */
736 jnrlistA = jjnr[jidx];
737 jnrlistB = jjnr[jidx+1];
738 jnrlistC = jjnr[jidx+2];
739 jnrlistD = jjnr[jidx+3];
740 /* Sign of each element will be negative for non-real atoms.
741 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
742 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
744 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
745 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
746 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
747 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
748 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
749 j_coord_offsetA = DIM*jnrA;
750 j_coord_offsetB = DIM*jnrB;
751 j_coord_offsetC = DIM*jnrC;
752 j_coord_offsetD = DIM*jnrD;
754 /* load j atom coordinates */
755 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
756 x+j_coord_offsetC,x+j_coord_offsetD,
757 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
758 &jy2,&jz2,&jx3,&jy3,&jz3);
760 /* Calculate displacement vector */
761 dx00 = _mm_sub_ps(ix0,jx0);
762 dy00 = _mm_sub_ps(iy0,jy0);
763 dz00 = _mm_sub_ps(iz0,jz0);
764 dx11 = _mm_sub_ps(ix1,jx1);
765 dy11 = _mm_sub_ps(iy1,jy1);
766 dz11 = _mm_sub_ps(iz1,jz1);
767 dx12 = _mm_sub_ps(ix1,jx2);
768 dy12 = _mm_sub_ps(iy1,jy2);
769 dz12 = _mm_sub_ps(iz1,jz2);
770 dx13 = _mm_sub_ps(ix1,jx3);
771 dy13 = _mm_sub_ps(iy1,jy3);
772 dz13 = _mm_sub_ps(iz1,jz3);
773 dx21 = _mm_sub_ps(ix2,jx1);
774 dy21 = _mm_sub_ps(iy2,jy1);
775 dz21 = _mm_sub_ps(iz2,jz1);
776 dx22 = _mm_sub_ps(ix2,jx2);
777 dy22 = _mm_sub_ps(iy2,jy2);
778 dz22 = _mm_sub_ps(iz2,jz2);
779 dx23 = _mm_sub_ps(ix2,jx3);
780 dy23 = _mm_sub_ps(iy2,jy3);
781 dz23 = _mm_sub_ps(iz2,jz3);
782 dx31 = _mm_sub_ps(ix3,jx1);
783 dy31 = _mm_sub_ps(iy3,jy1);
784 dz31 = _mm_sub_ps(iz3,jz1);
785 dx32 = _mm_sub_ps(ix3,jx2);
786 dy32 = _mm_sub_ps(iy3,jy2);
787 dz32 = _mm_sub_ps(iz3,jz2);
788 dx33 = _mm_sub_ps(ix3,jx3);
789 dy33 = _mm_sub_ps(iy3,jy3);
790 dz33 = _mm_sub_ps(iz3,jz3);
792 /* Calculate squared distance and things based on it */
793 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
794 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
795 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
796 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
797 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
798 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
799 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
800 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
801 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
802 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
804 rinv11 = gmx_mm_invsqrt_ps(rsq11);
805 rinv12 = gmx_mm_invsqrt_ps(rsq12);
806 rinv13 = gmx_mm_invsqrt_ps(rsq13);
807 rinv21 = gmx_mm_invsqrt_ps(rsq21);
808 rinv22 = gmx_mm_invsqrt_ps(rsq22);
809 rinv23 = gmx_mm_invsqrt_ps(rsq23);
810 rinv31 = gmx_mm_invsqrt_ps(rsq31);
811 rinv32 = gmx_mm_invsqrt_ps(rsq32);
812 rinv33 = gmx_mm_invsqrt_ps(rsq33);
814 rinvsq00 = gmx_mm_inv_ps(rsq00);
816 fjx0 = _mm_setzero_ps();
817 fjy0 = _mm_setzero_ps();
818 fjz0 = _mm_setzero_ps();
819 fjx1 = _mm_setzero_ps();
820 fjy1 = _mm_setzero_ps();
821 fjz1 = _mm_setzero_ps();
822 fjx2 = _mm_setzero_ps();
823 fjy2 = _mm_setzero_ps();
824 fjz2 = _mm_setzero_ps();
825 fjx3 = _mm_setzero_ps();
826 fjy3 = _mm_setzero_ps();
827 fjz3 = _mm_setzero_ps();
829 /**************************
830 * CALCULATE INTERACTIONS *
831 **************************/
833 /* LENNARD-JONES DISPERSION/REPULSION */
835 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
836 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
837 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
838 vvdw = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
839 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
841 /* Update potential sum for this i atom from the interaction with this j atom. */
842 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
843 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
847 fscal = _mm_andnot_ps(dummy_mask,fscal);
849 /* Update vectorial force */
850 fix0 = _mm_macc_ps(dx00,fscal,fix0);
851 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
852 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
854 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
855 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
856 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
858 /**************************
859 * CALCULATE INTERACTIONS *
860 **************************/
862 r11 = _mm_mul_ps(rsq11,rinv11);
863 r11 = _mm_andnot_ps(dummy_mask,r11);
865 /* Calculate table index by multiplying r with table scale and truncate to integer */
866 rt = _mm_mul_ps(r11,vftabscale);
867 vfitab = _mm_cvttps_epi32(rt);
869 vfeps = _mm_frcz_ps(rt);
871 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
873 twovfeps = _mm_add_ps(vfeps,vfeps);
874 vfitab = _mm_slli_epi32(vfitab,2);
876 /* CUBIC SPLINE TABLE ELECTROSTATICS */
877 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
878 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
879 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
880 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
881 _MM_TRANSPOSE4_PS(Y,F,G,H);
882 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
883 VV = _mm_macc_ps(vfeps,Fp,Y);
884 velec = _mm_mul_ps(qq11,VV);
885 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
886 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
888 /* Update potential sum for this i atom from the interaction with this j atom. */
889 velec = _mm_andnot_ps(dummy_mask,velec);
890 velecsum = _mm_add_ps(velecsum,velec);
894 fscal = _mm_andnot_ps(dummy_mask,fscal);
896 /* Update vectorial force */
897 fix1 = _mm_macc_ps(dx11,fscal,fix1);
898 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
899 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
901 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
902 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
903 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
905 /**************************
906 * CALCULATE INTERACTIONS *
907 **************************/
909 r12 = _mm_mul_ps(rsq12,rinv12);
910 r12 = _mm_andnot_ps(dummy_mask,r12);
912 /* Calculate table index by multiplying r with table scale and truncate to integer */
913 rt = _mm_mul_ps(r12,vftabscale);
914 vfitab = _mm_cvttps_epi32(rt);
916 vfeps = _mm_frcz_ps(rt);
918 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
920 twovfeps = _mm_add_ps(vfeps,vfeps);
921 vfitab = _mm_slli_epi32(vfitab,2);
923 /* CUBIC SPLINE TABLE ELECTROSTATICS */
924 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
925 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
926 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
927 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
928 _MM_TRANSPOSE4_PS(Y,F,G,H);
929 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
930 VV = _mm_macc_ps(vfeps,Fp,Y);
931 velec = _mm_mul_ps(qq12,VV);
932 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
933 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
935 /* Update potential sum for this i atom from the interaction with this j atom. */
936 velec = _mm_andnot_ps(dummy_mask,velec);
937 velecsum = _mm_add_ps(velecsum,velec);
941 fscal = _mm_andnot_ps(dummy_mask,fscal);
943 /* Update vectorial force */
944 fix1 = _mm_macc_ps(dx12,fscal,fix1);
945 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
946 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
948 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
949 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
950 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
952 /**************************
953 * CALCULATE INTERACTIONS *
954 **************************/
956 r13 = _mm_mul_ps(rsq13,rinv13);
957 r13 = _mm_andnot_ps(dummy_mask,r13);
959 /* Calculate table index by multiplying r with table scale and truncate to integer */
960 rt = _mm_mul_ps(r13,vftabscale);
961 vfitab = _mm_cvttps_epi32(rt);
963 vfeps = _mm_frcz_ps(rt);
965 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
967 twovfeps = _mm_add_ps(vfeps,vfeps);
968 vfitab = _mm_slli_epi32(vfitab,2);
970 /* CUBIC SPLINE TABLE ELECTROSTATICS */
971 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
972 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
973 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
974 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
975 _MM_TRANSPOSE4_PS(Y,F,G,H);
976 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
977 VV = _mm_macc_ps(vfeps,Fp,Y);
978 velec = _mm_mul_ps(qq13,VV);
979 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
980 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
982 /* Update potential sum for this i atom from the interaction with this j atom. */
983 velec = _mm_andnot_ps(dummy_mask,velec);
984 velecsum = _mm_add_ps(velecsum,velec);
988 fscal = _mm_andnot_ps(dummy_mask,fscal);
990 /* Update vectorial force */
991 fix1 = _mm_macc_ps(dx13,fscal,fix1);
992 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
993 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
995 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
996 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
997 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
999 /**************************
1000 * CALCULATE INTERACTIONS *
1001 **************************/
1003 r21 = _mm_mul_ps(rsq21,rinv21);
1004 r21 = _mm_andnot_ps(dummy_mask,r21);
1006 /* Calculate table index by multiplying r with table scale and truncate to integer */
1007 rt = _mm_mul_ps(r21,vftabscale);
1008 vfitab = _mm_cvttps_epi32(rt);
1010 vfeps = _mm_frcz_ps(rt);
1012 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1014 twovfeps = _mm_add_ps(vfeps,vfeps);
1015 vfitab = _mm_slli_epi32(vfitab,2);
1017 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1018 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1019 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1020 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1021 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1022 _MM_TRANSPOSE4_PS(Y,F,G,H);
1023 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1024 VV = _mm_macc_ps(vfeps,Fp,Y);
1025 velec = _mm_mul_ps(qq21,VV);
1026 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1027 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1029 /* Update potential sum for this i atom from the interaction with this j atom. */
1030 velec = _mm_andnot_ps(dummy_mask,velec);
1031 velecsum = _mm_add_ps(velecsum,velec);
1035 fscal = _mm_andnot_ps(dummy_mask,fscal);
1037 /* Update vectorial force */
1038 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1039 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1040 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1042 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1043 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1044 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1046 /**************************
1047 * CALCULATE INTERACTIONS *
1048 **************************/
1050 r22 = _mm_mul_ps(rsq22,rinv22);
1051 r22 = _mm_andnot_ps(dummy_mask,r22);
1053 /* Calculate table index by multiplying r with table scale and truncate to integer */
1054 rt = _mm_mul_ps(r22,vftabscale);
1055 vfitab = _mm_cvttps_epi32(rt);
1057 vfeps = _mm_frcz_ps(rt);
1059 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1061 twovfeps = _mm_add_ps(vfeps,vfeps);
1062 vfitab = _mm_slli_epi32(vfitab,2);
1064 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1065 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1066 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1067 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1068 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1069 _MM_TRANSPOSE4_PS(Y,F,G,H);
1070 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1071 VV = _mm_macc_ps(vfeps,Fp,Y);
1072 velec = _mm_mul_ps(qq22,VV);
1073 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1074 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1076 /* Update potential sum for this i atom from the interaction with this j atom. */
1077 velec = _mm_andnot_ps(dummy_mask,velec);
1078 velecsum = _mm_add_ps(velecsum,velec);
1082 fscal = _mm_andnot_ps(dummy_mask,fscal);
1084 /* Update vectorial force */
1085 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1086 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1087 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1089 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1090 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1091 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1093 /**************************
1094 * CALCULATE INTERACTIONS *
1095 **************************/
1097 r23 = _mm_mul_ps(rsq23,rinv23);
1098 r23 = _mm_andnot_ps(dummy_mask,r23);
1100 /* Calculate table index by multiplying r with table scale and truncate to integer */
1101 rt = _mm_mul_ps(r23,vftabscale);
1102 vfitab = _mm_cvttps_epi32(rt);
1104 vfeps = _mm_frcz_ps(rt);
1106 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1108 twovfeps = _mm_add_ps(vfeps,vfeps);
1109 vfitab = _mm_slli_epi32(vfitab,2);
1111 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1112 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1113 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1114 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1115 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1116 _MM_TRANSPOSE4_PS(Y,F,G,H);
1117 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1118 VV = _mm_macc_ps(vfeps,Fp,Y);
1119 velec = _mm_mul_ps(qq23,VV);
1120 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1121 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1123 /* Update potential sum for this i atom from the interaction with this j atom. */
1124 velec = _mm_andnot_ps(dummy_mask,velec);
1125 velecsum = _mm_add_ps(velecsum,velec);
1129 fscal = _mm_andnot_ps(dummy_mask,fscal);
1131 /* Update vectorial force */
1132 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1133 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1134 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1136 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1137 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1138 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1140 /**************************
1141 * CALCULATE INTERACTIONS *
1142 **************************/
1144 r31 = _mm_mul_ps(rsq31,rinv31);
1145 r31 = _mm_andnot_ps(dummy_mask,r31);
1147 /* Calculate table index by multiplying r with table scale and truncate to integer */
1148 rt = _mm_mul_ps(r31,vftabscale);
1149 vfitab = _mm_cvttps_epi32(rt);
1151 vfeps = _mm_frcz_ps(rt);
1153 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1155 twovfeps = _mm_add_ps(vfeps,vfeps);
1156 vfitab = _mm_slli_epi32(vfitab,2);
1158 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1159 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1160 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1161 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1162 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1163 _MM_TRANSPOSE4_PS(Y,F,G,H);
1164 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1165 VV = _mm_macc_ps(vfeps,Fp,Y);
1166 velec = _mm_mul_ps(qq31,VV);
1167 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1168 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1170 /* Update potential sum for this i atom from the interaction with this j atom. */
1171 velec = _mm_andnot_ps(dummy_mask,velec);
1172 velecsum = _mm_add_ps(velecsum,velec);
1176 fscal = _mm_andnot_ps(dummy_mask,fscal);
1178 /* Update vectorial force */
1179 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1180 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1181 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1183 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1184 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1185 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1187 /**************************
1188 * CALCULATE INTERACTIONS *
1189 **************************/
1191 r32 = _mm_mul_ps(rsq32,rinv32);
1192 r32 = _mm_andnot_ps(dummy_mask,r32);
1194 /* Calculate table index by multiplying r with table scale and truncate to integer */
1195 rt = _mm_mul_ps(r32,vftabscale);
1196 vfitab = _mm_cvttps_epi32(rt);
1198 vfeps = _mm_frcz_ps(rt);
1200 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1202 twovfeps = _mm_add_ps(vfeps,vfeps);
1203 vfitab = _mm_slli_epi32(vfitab,2);
1205 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1206 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1207 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1208 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1209 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1210 _MM_TRANSPOSE4_PS(Y,F,G,H);
1211 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1212 VV = _mm_macc_ps(vfeps,Fp,Y);
1213 velec = _mm_mul_ps(qq32,VV);
1214 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1215 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1217 /* Update potential sum for this i atom from the interaction with this j atom. */
1218 velec = _mm_andnot_ps(dummy_mask,velec);
1219 velecsum = _mm_add_ps(velecsum,velec);
1223 fscal = _mm_andnot_ps(dummy_mask,fscal);
1225 /* Update vectorial force */
1226 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1227 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1228 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1230 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1231 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1232 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1234 /**************************
1235 * CALCULATE INTERACTIONS *
1236 **************************/
1238 r33 = _mm_mul_ps(rsq33,rinv33);
1239 r33 = _mm_andnot_ps(dummy_mask,r33);
1241 /* Calculate table index by multiplying r with table scale and truncate to integer */
1242 rt = _mm_mul_ps(r33,vftabscale);
1243 vfitab = _mm_cvttps_epi32(rt);
1245 vfeps = _mm_frcz_ps(rt);
1247 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1249 twovfeps = _mm_add_ps(vfeps,vfeps);
1250 vfitab = _mm_slli_epi32(vfitab,2);
1252 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1253 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1254 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1255 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1256 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1257 _MM_TRANSPOSE4_PS(Y,F,G,H);
1258 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1259 VV = _mm_macc_ps(vfeps,Fp,Y);
1260 velec = _mm_mul_ps(qq33,VV);
1261 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1262 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1264 /* Update potential sum for this i atom from the interaction with this j atom. */
1265 velec = _mm_andnot_ps(dummy_mask,velec);
1266 velecsum = _mm_add_ps(velecsum,velec);
1270 fscal = _mm_andnot_ps(dummy_mask,fscal);
1272 /* Update vectorial force */
1273 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1274 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1275 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1277 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1278 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1279 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1281 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1282 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1283 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1284 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1286 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1287 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1288 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1290 /* Inner loop uses 461 flops */
1293 /* End of innermost loop */
1295 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1296 f+i_coord_offset,fshift+i_shift_offset);
1299 /* Update potential energies */
1300 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1301 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1303 /* Increment number of inner iterations */
1304 inneriter += j_index_end - j_index_start;
1306 /* Outer loop uses 26 flops */
1309 /* Increment number of outer iterations */
1312 /* Update outer/inner flops */
1314 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*461);
1317 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_128_fma_single
1318 * Electrostatics interaction: CubicSplineTable
1319 * VdW interaction: LennardJones
1320 * Geometry: Water4-Water4
1321 * Calculate force/pot: Force
1324 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_128_fma_single
1325 (t_nblist * gmx_restrict nlist,
1326 rvec * gmx_restrict xx,
1327 rvec * gmx_restrict ff,
1328 t_forcerec * gmx_restrict fr,
1329 t_mdatoms * gmx_restrict mdatoms,
1330 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1331 t_nrnb * gmx_restrict nrnb)
1333 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1334 * just 0 for non-waters.
1335 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1336 * jnr indices corresponding to data put in the four positions in the SIMD register.
1338 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1339 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1340 int jnrA,jnrB,jnrC,jnrD;
1341 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1342 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1343 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1344 real rcutoff_scalar;
1345 real *shiftvec,*fshift,*x,*f;
1346 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1347 real scratch[4*DIM];
1348 __m128 fscal,rcutoff,rcutoff2,jidxall;
1350 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1352 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1354 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1356 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1357 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1358 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1359 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1360 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1361 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1362 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1363 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1364 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1365 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1366 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1367 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1368 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1369 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1370 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1371 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1372 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1373 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1374 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1375 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1378 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1381 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1382 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1384 __m128i ifour = _mm_set1_epi32(4);
1385 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1387 __m128 dummy_mask,cutoff_mask;
1388 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1389 __m128 one = _mm_set1_ps(1.0);
1390 __m128 two = _mm_set1_ps(2.0);
1396 jindex = nlist->jindex;
1398 shiftidx = nlist->shift;
1400 shiftvec = fr->shift_vec[0];
1401 fshift = fr->fshift[0];
1402 facel = _mm_set1_ps(fr->epsfac);
1403 charge = mdatoms->chargeA;
1404 nvdwtype = fr->ntype;
1405 vdwparam = fr->nbfp;
1406 vdwtype = mdatoms->typeA;
1408 vftab = kernel_data->table_elec->data;
1409 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
1411 /* Setup water-specific parameters */
1412 inr = nlist->iinr[0];
1413 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1414 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1415 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1416 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1418 jq1 = _mm_set1_ps(charge[inr+1]);
1419 jq2 = _mm_set1_ps(charge[inr+2]);
1420 jq3 = _mm_set1_ps(charge[inr+3]);
1421 vdwjidx0A = 2*vdwtype[inr+0];
1422 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1423 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1424 qq11 = _mm_mul_ps(iq1,jq1);
1425 qq12 = _mm_mul_ps(iq1,jq2);
1426 qq13 = _mm_mul_ps(iq1,jq3);
1427 qq21 = _mm_mul_ps(iq2,jq1);
1428 qq22 = _mm_mul_ps(iq2,jq2);
1429 qq23 = _mm_mul_ps(iq2,jq3);
1430 qq31 = _mm_mul_ps(iq3,jq1);
1431 qq32 = _mm_mul_ps(iq3,jq2);
1432 qq33 = _mm_mul_ps(iq3,jq3);
1434 /* Avoid stupid compiler warnings */
1435 jnrA = jnrB = jnrC = jnrD = 0;
1436 j_coord_offsetA = 0;
1437 j_coord_offsetB = 0;
1438 j_coord_offsetC = 0;
1439 j_coord_offsetD = 0;
1444 for(iidx=0;iidx<4*DIM;iidx++)
1446 scratch[iidx] = 0.0;
1449 /* Start outer loop over neighborlists */
1450 for(iidx=0; iidx<nri; iidx++)
1452 /* Load shift vector for this list */
1453 i_shift_offset = DIM*shiftidx[iidx];
1455 /* Load limits for loop over neighbors */
1456 j_index_start = jindex[iidx];
1457 j_index_end = jindex[iidx+1];
1459 /* Get outer coordinate index */
1461 i_coord_offset = DIM*inr;
1463 /* Load i particle coords and add shift vector */
1464 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1465 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1467 fix0 = _mm_setzero_ps();
1468 fiy0 = _mm_setzero_ps();
1469 fiz0 = _mm_setzero_ps();
1470 fix1 = _mm_setzero_ps();
1471 fiy1 = _mm_setzero_ps();
1472 fiz1 = _mm_setzero_ps();
1473 fix2 = _mm_setzero_ps();
1474 fiy2 = _mm_setzero_ps();
1475 fiz2 = _mm_setzero_ps();
1476 fix3 = _mm_setzero_ps();
1477 fiy3 = _mm_setzero_ps();
1478 fiz3 = _mm_setzero_ps();
1480 /* Start inner kernel loop */
1481 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1484 /* Get j neighbor index, and coordinate index */
1486 jnrB = jjnr[jidx+1];
1487 jnrC = jjnr[jidx+2];
1488 jnrD = jjnr[jidx+3];
1489 j_coord_offsetA = DIM*jnrA;
1490 j_coord_offsetB = DIM*jnrB;
1491 j_coord_offsetC = DIM*jnrC;
1492 j_coord_offsetD = DIM*jnrD;
1494 /* load j atom coordinates */
1495 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1496 x+j_coord_offsetC,x+j_coord_offsetD,
1497 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1498 &jy2,&jz2,&jx3,&jy3,&jz3);
1500 /* Calculate displacement vector */
1501 dx00 = _mm_sub_ps(ix0,jx0);
1502 dy00 = _mm_sub_ps(iy0,jy0);
1503 dz00 = _mm_sub_ps(iz0,jz0);
1504 dx11 = _mm_sub_ps(ix1,jx1);
1505 dy11 = _mm_sub_ps(iy1,jy1);
1506 dz11 = _mm_sub_ps(iz1,jz1);
1507 dx12 = _mm_sub_ps(ix1,jx2);
1508 dy12 = _mm_sub_ps(iy1,jy2);
1509 dz12 = _mm_sub_ps(iz1,jz2);
1510 dx13 = _mm_sub_ps(ix1,jx3);
1511 dy13 = _mm_sub_ps(iy1,jy3);
1512 dz13 = _mm_sub_ps(iz1,jz3);
1513 dx21 = _mm_sub_ps(ix2,jx1);
1514 dy21 = _mm_sub_ps(iy2,jy1);
1515 dz21 = _mm_sub_ps(iz2,jz1);
1516 dx22 = _mm_sub_ps(ix2,jx2);
1517 dy22 = _mm_sub_ps(iy2,jy2);
1518 dz22 = _mm_sub_ps(iz2,jz2);
1519 dx23 = _mm_sub_ps(ix2,jx3);
1520 dy23 = _mm_sub_ps(iy2,jy3);
1521 dz23 = _mm_sub_ps(iz2,jz3);
1522 dx31 = _mm_sub_ps(ix3,jx1);
1523 dy31 = _mm_sub_ps(iy3,jy1);
1524 dz31 = _mm_sub_ps(iz3,jz1);
1525 dx32 = _mm_sub_ps(ix3,jx2);
1526 dy32 = _mm_sub_ps(iy3,jy2);
1527 dz32 = _mm_sub_ps(iz3,jz2);
1528 dx33 = _mm_sub_ps(ix3,jx3);
1529 dy33 = _mm_sub_ps(iy3,jy3);
1530 dz33 = _mm_sub_ps(iz3,jz3);
1532 /* Calculate squared distance and things based on it */
1533 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1534 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1535 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1536 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1537 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1538 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1539 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1540 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1541 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1542 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1544 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1545 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1546 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1547 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1548 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1549 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1550 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1551 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1552 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1554 rinvsq00 = gmx_mm_inv_ps(rsq00);
1556 fjx0 = _mm_setzero_ps();
1557 fjy0 = _mm_setzero_ps();
1558 fjz0 = _mm_setzero_ps();
1559 fjx1 = _mm_setzero_ps();
1560 fjy1 = _mm_setzero_ps();
1561 fjz1 = _mm_setzero_ps();
1562 fjx2 = _mm_setzero_ps();
1563 fjy2 = _mm_setzero_ps();
1564 fjz2 = _mm_setzero_ps();
1565 fjx3 = _mm_setzero_ps();
1566 fjy3 = _mm_setzero_ps();
1567 fjz3 = _mm_setzero_ps();
1569 /**************************
1570 * CALCULATE INTERACTIONS *
1571 **************************/
1573 /* LENNARD-JONES DISPERSION/REPULSION */
1575 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1576 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1580 /* Update vectorial force */
1581 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1582 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1583 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1585 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1586 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1587 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1589 /**************************
1590 * CALCULATE INTERACTIONS *
1591 **************************/
1593 r11 = _mm_mul_ps(rsq11,rinv11);
1595 /* Calculate table index by multiplying r with table scale and truncate to integer */
1596 rt = _mm_mul_ps(r11,vftabscale);
1597 vfitab = _mm_cvttps_epi32(rt);
1599 vfeps = _mm_frcz_ps(rt);
1601 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1603 twovfeps = _mm_add_ps(vfeps,vfeps);
1604 vfitab = _mm_slli_epi32(vfitab,2);
1606 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1607 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1608 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1609 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1610 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1611 _MM_TRANSPOSE4_PS(Y,F,G,H);
1612 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1613 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1614 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1618 /* Update vectorial force */
1619 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1620 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1621 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1623 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1624 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1625 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1627 /**************************
1628 * CALCULATE INTERACTIONS *
1629 **************************/
1631 r12 = _mm_mul_ps(rsq12,rinv12);
1633 /* Calculate table index by multiplying r with table scale and truncate to integer */
1634 rt = _mm_mul_ps(r12,vftabscale);
1635 vfitab = _mm_cvttps_epi32(rt);
1637 vfeps = _mm_frcz_ps(rt);
1639 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1641 twovfeps = _mm_add_ps(vfeps,vfeps);
1642 vfitab = _mm_slli_epi32(vfitab,2);
1644 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1645 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1646 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1647 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1648 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1649 _MM_TRANSPOSE4_PS(Y,F,G,H);
1650 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1651 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1652 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1656 /* Update vectorial force */
1657 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1658 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1659 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1661 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1662 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1663 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1665 /**************************
1666 * CALCULATE INTERACTIONS *
1667 **************************/
1669 r13 = _mm_mul_ps(rsq13,rinv13);
1671 /* Calculate table index by multiplying r with table scale and truncate to integer */
1672 rt = _mm_mul_ps(r13,vftabscale);
1673 vfitab = _mm_cvttps_epi32(rt);
1675 vfeps = _mm_frcz_ps(rt);
1677 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1679 twovfeps = _mm_add_ps(vfeps,vfeps);
1680 vfitab = _mm_slli_epi32(vfitab,2);
1682 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1683 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1684 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1685 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1686 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1687 _MM_TRANSPOSE4_PS(Y,F,G,H);
1688 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1689 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1690 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1694 /* Update vectorial force */
1695 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1696 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1697 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1699 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1700 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1701 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1703 /**************************
1704 * CALCULATE INTERACTIONS *
1705 **************************/
1707 r21 = _mm_mul_ps(rsq21,rinv21);
1709 /* Calculate table index by multiplying r with table scale and truncate to integer */
1710 rt = _mm_mul_ps(r21,vftabscale);
1711 vfitab = _mm_cvttps_epi32(rt);
1713 vfeps = _mm_frcz_ps(rt);
1715 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1717 twovfeps = _mm_add_ps(vfeps,vfeps);
1718 vfitab = _mm_slli_epi32(vfitab,2);
1720 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1721 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1722 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1723 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1724 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1725 _MM_TRANSPOSE4_PS(Y,F,G,H);
1726 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1727 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1728 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1732 /* Update vectorial force */
1733 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1734 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1735 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1737 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1738 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1739 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1741 /**************************
1742 * CALCULATE INTERACTIONS *
1743 **************************/
1745 r22 = _mm_mul_ps(rsq22,rinv22);
1747 /* Calculate table index by multiplying r with table scale and truncate to integer */
1748 rt = _mm_mul_ps(r22,vftabscale);
1749 vfitab = _mm_cvttps_epi32(rt);
1751 vfeps = _mm_frcz_ps(rt);
1753 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1755 twovfeps = _mm_add_ps(vfeps,vfeps);
1756 vfitab = _mm_slli_epi32(vfitab,2);
1758 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1759 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1760 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1761 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1762 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1763 _MM_TRANSPOSE4_PS(Y,F,G,H);
1764 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1765 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1766 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1770 /* Update vectorial force */
1771 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1772 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1773 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1775 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1776 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1777 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1779 /**************************
1780 * CALCULATE INTERACTIONS *
1781 **************************/
1783 r23 = _mm_mul_ps(rsq23,rinv23);
1785 /* Calculate table index by multiplying r with table scale and truncate to integer */
1786 rt = _mm_mul_ps(r23,vftabscale);
1787 vfitab = _mm_cvttps_epi32(rt);
1789 vfeps = _mm_frcz_ps(rt);
1791 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1793 twovfeps = _mm_add_ps(vfeps,vfeps);
1794 vfitab = _mm_slli_epi32(vfitab,2);
1796 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1797 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1798 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1799 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1800 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1801 _MM_TRANSPOSE4_PS(Y,F,G,H);
1802 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1803 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1804 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1808 /* Update vectorial force */
1809 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1810 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1811 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1813 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1814 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1815 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1817 /**************************
1818 * CALCULATE INTERACTIONS *
1819 **************************/
1821 r31 = _mm_mul_ps(rsq31,rinv31);
1823 /* Calculate table index by multiplying r with table scale and truncate to integer */
1824 rt = _mm_mul_ps(r31,vftabscale);
1825 vfitab = _mm_cvttps_epi32(rt);
1827 vfeps = _mm_frcz_ps(rt);
1829 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1831 twovfeps = _mm_add_ps(vfeps,vfeps);
1832 vfitab = _mm_slli_epi32(vfitab,2);
1834 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1835 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1836 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1837 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1838 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1839 _MM_TRANSPOSE4_PS(Y,F,G,H);
1840 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1841 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1842 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1846 /* Update vectorial force */
1847 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1848 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1849 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1851 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1852 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1853 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1855 /**************************
1856 * CALCULATE INTERACTIONS *
1857 **************************/
1859 r32 = _mm_mul_ps(rsq32,rinv32);
1861 /* Calculate table index by multiplying r with table scale and truncate to integer */
1862 rt = _mm_mul_ps(r32,vftabscale);
1863 vfitab = _mm_cvttps_epi32(rt);
1865 vfeps = _mm_frcz_ps(rt);
1867 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1869 twovfeps = _mm_add_ps(vfeps,vfeps);
1870 vfitab = _mm_slli_epi32(vfitab,2);
1872 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1873 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1874 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1875 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1876 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1877 _MM_TRANSPOSE4_PS(Y,F,G,H);
1878 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1879 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1880 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1884 /* Update vectorial force */
1885 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1886 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1887 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1889 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1890 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1891 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1893 /**************************
1894 * CALCULATE INTERACTIONS *
1895 **************************/
1897 r33 = _mm_mul_ps(rsq33,rinv33);
1899 /* Calculate table index by multiplying r with table scale and truncate to integer */
1900 rt = _mm_mul_ps(r33,vftabscale);
1901 vfitab = _mm_cvttps_epi32(rt);
1903 vfeps = _mm_frcz_ps(rt);
1905 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1907 twovfeps = _mm_add_ps(vfeps,vfeps);
1908 vfitab = _mm_slli_epi32(vfitab,2);
1910 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1911 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1912 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1913 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1914 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1915 _MM_TRANSPOSE4_PS(Y,F,G,H);
1916 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1917 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1918 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1922 /* Update vectorial force */
1923 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1924 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1925 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1927 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1928 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1929 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1931 fjptrA = f+j_coord_offsetA;
1932 fjptrB = f+j_coord_offsetB;
1933 fjptrC = f+j_coord_offsetC;
1934 fjptrD = f+j_coord_offsetD;
1936 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1937 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1938 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1940 /* Inner loop uses 411 flops */
1943 if(jidx<j_index_end)
1946 /* Get j neighbor index, and coordinate index */
1947 jnrlistA = jjnr[jidx];
1948 jnrlistB = jjnr[jidx+1];
1949 jnrlistC = jjnr[jidx+2];
1950 jnrlistD = jjnr[jidx+3];
1951 /* Sign of each element will be negative for non-real atoms.
1952 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1953 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1955 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1956 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1957 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1958 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1959 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1960 j_coord_offsetA = DIM*jnrA;
1961 j_coord_offsetB = DIM*jnrB;
1962 j_coord_offsetC = DIM*jnrC;
1963 j_coord_offsetD = DIM*jnrD;
1965 /* load j atom coordinates */
1966 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1967 x+j_coord_offsetC,x+j_coord_offsetD,
1968 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1969 &jy2,&jz2,&jx3,&jy3,&jz3);
1971 /* Calculate displacement vector */
1972 dx00 = _mm_sub_ps(ix0,jx0);
1973 dy00 = _mm_sub_ps(iy0,jy0);
1974 dz00 = _mm_sub_ps(iz0,jz0);
1975 dx11 = _mm_sub_ps(ix1,jx1);
1976 dy11 = _mm_sub_ps(iy1,jy1);
1977 dz11 = _mm_sub_ps(iz1,jz1);
1978 dx12 = _mm_sub_ps(ix1,jx2);
1979 dy12 = _mm_sub_ps(iy1,jy2);
1980 dz12 = _mm_sub_ps(iz1,jz2);
1981 dx13 = _mm_sub_ps(ix1,jx3);
1982 dy13 = _mm_sub_ps(iy1,jy3);
1983 dz13 = _mm_sub_ps(iz1,jz3);
1984 dx21 = _mm_sub_ps(ix2,jx1);
1985 dy21 = _mm_sub_ps(iy2,jy1);
1986 dz21 = _mm_sub_ps(iz2,jz1);
1987 dx22 = _mm_sub_ps(ix2,jx2);
1988 dy22 = _mm_sub_ps(iy2,jy2);
1989 dz22 = _mm_sub_ps(iz2,jz2);
1990 dx23 = _mm_sub_ps(ix2,jx3);
1991 dy23 = _mm_sub_ps(iy2,jy3);
1992 dz23 = _mm_sub_ps(iz2,jz3);
1993 dx31 = _mm_sub_ps(ix3,jx1);
1994 dy31 = _mm_sub_ps(iy3,jy1);
1995 dz31 = _mm_sub_ps(iz3,jz1);
1996 dx32 = _mm_sub_ps(ix3,jx2);
1997 dy32 = _mm_sub_ps(iy3,jy2);
1998 dz32 = _mm_sub_ps(iz3,jz2);
1999 dx33 = _mm_sub_ps(ix3,jx3);
2000 dy33 = _mm_sub_ps(iy3,jy3);
2001 dz33 = _mm_sub_ps(iz3,jz3);
2003 /* Calculate squared distance and things based on it */
2004 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2005 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2006 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2007 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2008 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2009 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2010 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2011 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2012 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2013 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2015 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2016 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2017 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2018 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2019 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2020 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2021 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2022 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2023 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2025 rinvsq00 = gmx_mm_inv_ps(rsq00);
2027 fjx0 = _mm_setzero_ps();
2028 fjy0 = _mm_setzero_ps();
2029 fjz0 = _mm_setzero_ps();
2030 fjx1 = _mm_setzero_ps();
2031 fjy1 = _mm_setzero_ps();
2032 fjz1 = _mm_setzero_ps();
2033 fjx2 = _mm_setzero_ps();
2034 fjy2 = _mm_setzero_ps();
2035 fjz2 = _mm_setzero_ps();
2036 fjx3 = _mm_setzero_ps();
2037 fjy3 = _mm_setzero_ps();
2038 fjz3 = _mm_setzero_ps();
2040 /**************************
2041 * CALCULATE INTERACTIONS *
2042 **************************/
2044 /* LENNARD-JONES DISPERSION/REPULSION */
2046 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2047 fvdw = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2051 fscal = _mm_andnot_ps(dummy_mask,fscal);
2053 /* Update vectorial force */
2054 fix0 = _mm_macc_ps(dx00,fscal,fix0);
2055 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
2056 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
2058 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
2059 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
2060 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
2062 /**************************
2063 * CALCULATE INTERACTIONS *
2064 **************************/
2066 r11 = _mm_mul_ps(rsq11,rinv11);
2067 r11 = _mm_andnot_ps(dummy_mask,r11);
2069 /* Calculate table index by multiplying r with table scale and truncate to integer */
2070 rt = _mm_mul_ps(r11,vftabscale);
2071 vfitab = _mm_cvttps_epi32(rt);
2073 vfeps = _mm_frcz_ps(rt);
2075 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2077 twovfeps = _mm_add_ps(vfeps,vfeps);
2078 vfitab = _mm_slli_epi32(vfitab,2);
2080 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2081 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2082 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2083 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2084 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2085 _MM_TRANSPOSE4_PS(Y,F,G,H);
2086 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2087 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2088 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2092 fscal = _mm_andnot_ps(dummy_mask,fscal);
2094 /* Update vectorial force */
2095 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2096 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2097 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2099 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2100 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2101 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2103 /**************************
2104 * CALCULATE INTERACTIONS *
2105 **************************/
2107 r12 = _mm_mul_ps(rsq12,rinv12);
2108 r12 = _mm_andnot_ps(dummy_mask,r12);
2110 /* Calculate table index by multiplying r with table scale and truncate to integer */
2111 rt = _mm_mul_ps(r12,vftabscale);
2112 vfitab = _mm_cvttps_epi32(rt);
2114 vfeps = _mm_frcz_ps(rt);
2116 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2118 twovfeps = _mm_add_ps(vfeps,vfeps);
2119 vfitab = _mm_slli_epi32(vfitab,2);
2121 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2122 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2123 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2124 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2125 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2126 _MM_TRANSPOSE4_PS(Y,F,G,H);
2127 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2128 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2129 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2133 fscal = _mm_andnot_ps(dummy_mask,fscal);
2135 /* Update vectorial force */
2136 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2137 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2138 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2140 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2141 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2142 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2144 /**************************
2145 * CALCULATE INTERACTIONS *
2146 **************************/
2148 r13 = _mm_mul_ps(rsq13,rinv13);
2149 r13 = _mm_andnot_ps(dummy_mask,r13);
2151 /* Calculate table index by multiplying r with table scale and truncate to integer */
2152 rt = _mm_mul_ps(r13,vftabscale);
2153 vfitab = _mm_cvttps_epi32(rt);
2155 vfeps = _mm_frcz_ps(rt);
2157 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2159 twovfeps = _mm_add_ps(vfeps,vfeps);
2160 vfitab = _mm_slli_epi32(vfitab,2);
2162 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2163 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2164 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2165 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2166 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2167 _MM_TRANSPOSE4_PS(Y,F,G,H);
2168 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2169 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2170 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2174 fscal = _mm_andnot_ps(dummy_mask,fscal);
2176 /* Update vectorial force */
2177 fix1 = _mm_macc_ps(dx13,fscal,fix1);
2178 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
2179 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
2181 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
2182 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
2183 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
2185 /**************************
2186 * CALCULATE INTERACTIONS *
2187 **************************/
2189 r21 = _mm_mul_ps(rsq21,rinv21);
2190 r21 = _mm_andnot_ps(dummy_mask,r21);
2192 /* Calculate table index by multiplying r with table scale and truncate to integer */
2193 rt = _mm_mul_ps(r21,vftabscale);
2194 vfitab = _mm_cvttps_epi32(rt);
2196 vfeps = _mm_frcz_ps(rt);
2198 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2200 twovfeps = _mm_add_ps(vfeps,vfeps);
2201 vfitab = _mm_slli_epi32(vfitab,2);
2203 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2204 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2205 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2206 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2207 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2208 _MM_TRANSPOSE4_PS(Y,F,G,H);
2209 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2210 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2211 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2215 fscal = _mm_andnot_ps(dummy_mask,fscal);
2217 /* Update vectorial force */
2218 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2219 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2220 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2222 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2223 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2224 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2226 /**************************
2227 * CALCULATE INTERACTIONS *
2228 **************************/
2230 r22 = _mm_mul_ps(rsq22,rinv22);
2231 r22 = _mm_andnot_ps(dummy_mask,r22);
2233 /* Calculate table index by multiplying r with table scale and truncate to integer */
2234 rt = _mm_mul_ps(r22,vftabscale);
2235 vfitab = _mm_cvttps_epi32(rt);
2237 vfeps = _mm_frcz_ps(rt);
2239 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2241 twovfeps = _mm_add_ps(vfeps,vfeps);
2242 vfitab = _mm_slli_epi32(vfitab,2);
2244 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2245 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2246 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2247 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2248 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2249 _MM_TRANSPOSE4_PS(Y,F,G,H);
2250 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2251 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2252 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2256 fscal = _mm_andnot_ps(dummy_mask,fscal);
2258 /* Update vectorial force */
2259 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2260 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2261 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2263 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2264 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2265 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2267 /**************************
2268 * CALCULATE INTERACTIONS *
2269 **************************/
2271 r23 = _mm_mul_ps(rsq23,rinv23);
2272 r23 = _mm_andnot_ps(dummy_mask,r23);
2274 /* Calculate table index by multiplying r with table scale and truncate to integer */
2275 rt = _mm_mul_ps(r23,vftabscale);
2276 vfitab = _mm_cvttps_epi32(rt);
2278 vfeps = _mm_frcz_ps(rt);
2280 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2282 twovfeps = _mm_add_ps(vfeps,vfeps);
2283 vfitab = _mm_slli_epi32(vfitab,2);
2285 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2286 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2287 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2288 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2289 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2290 _MM_TRANSPOSE4_PS(Y,F,G,H);
2291 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2292 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2293 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2297 fscal = _mm_andnot_ps(dummy_mask,fscal);
2299 /* Update vectorial force */
2300 fix2 = _mm_macc_ps(dx23,fscal,fix2);
2301 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
2302 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
2304 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
2305 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
2306 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
2308 /**************************
2309 * CALCULATE INTERACTIONS *
2310 **************************/
2312 r31 = _mm_mul_ps(rsq31,rinv31);
2313 r31 = _mm_andnot_ps(dummy_mask,r31);
2315 /* Calculate table index by multiplying r with table scale and truncate to integer */
2316 rt = _mm_mul_ps(r31,vftabscale);
2317 vfitab = _mm_cvttps_epi32(rt);
2319 vfeps = _mm_frcz_ps(rt);
2321 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2323 twovfeps = _mm_add_ps(vfeps,vfeps);
2324 vfitab = _mm_slli_epi32(vfitab,2);
2326 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2327 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2328 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2329 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2330 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2331 _MM_TRANSPOSE4_PS(Y,F,G,H);
2332 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2333 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2334 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2338 fscal = _mm_andnot_ps(dummy_mask,fscal);
2340 /* Update vectorial force */
2341 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2342 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2343 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2345 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2346 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2347 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2349 /**************************
2350 * CALCULATE INTERACTIONS *
2351 **************************/
2353 r32 = _mm_mul_ps(rsq32,rinv32);
2354 r32 = _mm_andnot_ps(dummy_mask,r32);
2356 /* Calculate table index by multiplying r with table scale and truncate to integer */
2357 rt = _mm_mul_ps(r32,vftabscale);
2358 vfitab = _mm_cvttps_epi32(rt);
2360 vfeps = _mm_frcz_ps(rt);
2362 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2364 twovfeps = _mm_add_ps(vfeps,vfeps);
2365 vfitab = _mm_slli_epi32(vfitab,2);
2367 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2368 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2369 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2370 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2371 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2372 _MM_TRANSPOSE4_PS(Y,F,G,H);
2373 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2374 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2375 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2379 fscal = _mm_andnot_ps(dummy_mask,fscal);
2381 /* Update vectorial force */
2382 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2383 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2384 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2386 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2387 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2388 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2390 /**************************
2391 * CALCULATE INTERACTIONS *
2392 **************************/
2394 r33 = _mm_mul_ps(rsq33,rinv33);
2395 r33 = _mm_andnot_ps(dummy_mask,r33);
2397 /* Calculate table index by multiplying r with table scale and truncate to integer */
2398 rt = _mm_mul_ps(r33,vftabscale);
2399 vfitab = _mm_cvttps_epi32(rt);
2401 vfeps = _mm_frcz_ps(rt);
2403 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2405 twovfeps = _mm_add_ps(vfeps,vfeps);
2406 vfitab = _mm_slli_epi32(vfitab,2);
2408 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2409 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2410 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2411 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2412 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2413 _MM_TRANSPOSE4_PS(Y,F,G,H);
2414 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2415 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2416 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2420 fscal = _mm_andnot_ps(dummy_mask,fscal);
2422 /* Update vectorial force */
2423 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2424 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2425 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2427 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2428 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2429 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2431 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2432 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2433 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2434 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2436 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2437 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2438 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2440 /* Inner loop uses 420 flops */
2443 /* End of innermost loop */
2445 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2446 f+i_coord_offset,fshift+i_shift_offset);
2448 /* Increment number of inner iterations */
2449 inneriter += j_index_end - j_index_start;
2451 /* Outer loop uses 24 flops */
2454 /* Increment number of outer iterations */
2457 /* Update outer/inner flops */
2459 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*420);