2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_single
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
111 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
113 __m128i ifour = _mm_set1_epi32(4);
114 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
116 __m128 dummy_mask,cutoff_mask;
117 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118 __m128 one = _mm_set1_ps(1.0);
119 __m128 two = _mm_set1_ps(2.0);
125 jindex = nlist->jindex;
127 shiftidx = nlist->shift;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = _mm_set1_ps(fr->epsfac);
132 charge = mdatoms->chargeA;
133 nvdwtype = fr->ntype;
135 vdwtype = mdatoms->typeA;
137 vftab = kernel_data->table_elec_vdw->data;
138 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
140 /* Setup water-specific parameters */
141 inr = nlist->iinr[0];
142 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
143 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
144 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
145 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
147 jq0 = _mm_set1_ps(charge[inr+0]);
148 jq1 = _mm_set1_ps(charge[inr+1]);
149 jq2 = _mm_set1_ps(charge[inr+2]);
150 vdwjidx0A = 2*vdwtype[inr+0];
151 qq00 = _mm_mul_ps(iq0,jq0);
152 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq01 = _mm_mul_ps(iq0,jq1);
155 qq02 = _mm_mul_ps(iq0,jq2);
156 qq10 = _mm_mul_ps(iq1,jq0);
157 qq11 = _mm_mul_ps(iq1,jq1);
158 qq12 = _mm_mul_ps(iq1,jq2);
159 qq20 = _mm_mul_ps(iq2,jq0);
160 qq21 = _mm_mul_ps(iq2,jq1);
161 qq22 = _mm_mul_ps(iq2,jq2);
163 /* Avoid stupid compiler warnings */
164 jnrA = jnrB = jnrC = jnrD = 0;
173 for(iidx=0;iidx<4*DIM;iidx++)
178 /* Start outer loop over neighborlists */
179 for(iidx=0; iidx<nri; iidx++)
181 /* Load shift vector for this list */
182 i_shift_offset = DIM*shiftidx[iidx];
184 /* Load limits for loop over neighbors */
185 j_index_start = jindex[iidx];
186 j_index_end = jindex[iidx+1];
188 /* Get outer coordinate index */
190 i_coord_offset = DIM*inr;
192 /* Load i particle coords and add shift vector */
193 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
194 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
196 fix0 = _mm_setzero_ps();
197 fiy0 = _mm_setzero_ps();
198 fiz0 = _mm_setzero_ps();
199 fix1 = _mm_setzero_ps();
200 fiy1 = _mm_setzero_ps();
201 fiz1 = _mm_setzero_ps();
202 fix2 = _mm_setzero_ps();
203 fiy2 = _mm_setzero_ps();
204 fiz2 = _mm_setzero_ps();
206 /* Reset potential sums */
207 velecsum = _mm_setzero_ps();
208 vvdwsum = _mm_setzero_ps();
210 /* Start inner kernel loop */
211 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
214 /* Get j neighbor index, and coordinate index */
219 j_coord_offsetA = DIM*jnrA;
220 j_coord_offsetB = DIM*jnrB;
221 j_coord_offsetC = DIM*jnrC;
222 j_coord_offsetD = DIM*jnrD;
224 /* load j atom coordinates */
225 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
226 x+j_coord_offsetC,x+j_coord_offsetD,
227 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
229 /* Calculate displacement vector */
230 dx00 = _mm_sub_ps(ix0,jx0);
231 dy00 = _mm_sub_ps(iy0,jy0);
232 dz00 = _mm_sub_ps(iz0,jz0);
233 dx01 = _mm_sub_ps(ix0,jx1);
234 dy01 = _mm_sub_ps(iy0,jy1);
235 dz01 = _mm_sub_ps(iz0,jz1);
236 dx02 = _mm_sub_ps(ix0,jx2);
237 dy02 = _mm_sub_ps(iy0,jy2);
238 dz02 = _mm_sub_ps(iz0,jz2);
239 dx10 = _mm_sub_ps(ix1,jx0);
240 dy10 = _mm_sub_ps(iy1,jy0);
241 dz10 = _mm_sub_ps(iz1,jz0);
242 dx11 = _mm_sub_ps(ix1,jx1);
243 dy11 = _mm_sub_ps(iy1,jy1);
244 dz11 = _mm_sub_ps(iz1,jz1);
245 dx12 = _mm_sub_ps(ix1,jx2);
246 dy12 = _mm_sub_ps(iy1,jy2);
247 dz12 = _mm_sub_ps(iz1,jz2);
248 dx20 = _mm_sub_ps(ix2,jx0);
249 dy20 = _mm_sub_ps(iy2,jy0);
250 dz20 = _mm_sub_ps(iz2,jz0);
251 dx21 = _mm_sub_ps(ix2,jx1);
252 dy21 = _mm_sub_ps(iy2,jy1);
253 dz21 = _mm_sub_ps(iz2,jz1);
254 dx22 = _mm_sub_ps(ix2,jx2);
255 dy22 = _mm_sub_ps(iy2,jy2);
256 dz22 = _mm_sub_ps(iz2,jz2);
258 /* Calculate squared distance and things based on it */
259 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
260 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
261 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
262 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
263 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
264 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
265 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
266 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
267 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
269 rinv00 = gmx_mm_invsqrt_ps(rsq00);
270 rinv01 = gmx_mm_invsqrt_ps(rsq01);
271 rinv02 = gmx_mm_invsqrt_ps(rsq02);
272 rinv10 = gmx_mm_invsqrt_ps(rsq10);
273 rinv11 = gmx_mm_invsqrt_ps(rsq11);
274 rinv12 = gmx_mm_invsqrt_ps(rsq12);
275 rinv20 = gmx_mm_invsqrt_ps(rsq20);
276 rinv21 = gmx_mm_invsqrt_ps(rsq21);
277 rinv22 = gmx_mm_invsqrt_ps(rsq22);
279 fjx0 = _mm_setzero_ps();
280 fjy0 = _mm_setzero_ps();
281 fjz0 = _mm_setzero_ps();
282 fjx1 = _mm_setzero_ps();
283 fjy1 = _mm_setzero_ps();
284 fjz1 = _mm_setzero_ps();
285 fjx2 = _mm_setzero_ps();
286 fjy2 = _mm_setzero_ps();
287 fjz2 = _mm_setzero_ps();
289 /**************************
290 * CALCULATE INTERACTIONS *
291 **************************/
293 r00 = _mm_mul_ps(rsq00,rinv00);
295 /* Calculate table index by multiplying r with table scale and truncate to integer */
296 rt = _mm_mul_ps(r00,vftabscale);
297 vfitab = _mm_cvttps_epi32(rt);
299 vfeps = _mm_frcz_ps(rt);
301 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
303 twovfeps = _mm_add_ps(vfeps,vfeps);
304 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
306 /* CUBIC SPLINE TABLE ELECTROSTATICS */
307 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
308 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
309 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
310 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
311 _MM_TRANSPOSE4_PS(Y,F,G,H);
312 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
313 VV = _mm_macc_ps(vfeps,Fp,Y);
314 velec = _mm_mul_ps(qq00,VV);
315 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
316 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
318 /* CUBIC SPLINE TABLE DISPERSION */
319 vfitab = _mm_add_epi32(vfitab,ifour);
320 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
321 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
322 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
323 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
324 _MM_TRANSPOSE4_PS(Y,F,G,H);
325 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
326 VV = _mm_macc_ps(vfeps,Fp,Y);
327 vvdw6 = _mm_mul_ps(c6_00,VV);
328 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
329 fvdw6 = _mm_mul_ps(c6_00,FF);
331 /* CUBIC SPLINE TABLE REPULSION */
332 vfitab = _mm_add_epi32(vfitab,ifour);
333 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
334 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
335 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
336 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
337 _MM_TRANSPOSE4_PS(Y,F,G,H);
338 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
339 VV = _mm_macc_ps(vfeps,Fp,Y);
340 vvdw12 = _mm_mul_ps(c12_00,VV);
341 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
342 fvdw12 = _mm_mul_ps(c12_00,FF);
343 vvdw = _mm_add_ps(vvdw12,vvdw6);
344 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
346 /* Update potential sum for this i atom from the interaction with this j atom. */
347 velecsum = _mm_add_ps(velecsum,velec);
348 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
350 fscal = _mm_add_ps(felec,fvdw);
352 /* Update vectorial force */
353 fix0 = _mm_macc_ps(dx00,fscal,fix0);
354 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
355 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
357 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
358 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
359 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
361 /**************************
362 * CALCULATE INTERACTIONS *
363 **************************/
365 r01 = _mm_mul_ps(rsq01,rinv01);
367 /* Calculate table index by multiplying r with table scale and truncate to integer */
368 rt = _mm_mul_ps(r01,vftabscale);
369 vfitab = _mm_cvttps_epi32(rt);
371 vfeps = _mm_frcz_ps(rt);
373 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
375 twovfeps = _mm_add_ps(vfeps,vfeps);
376 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
378 /* CUBIC SPLINE TABLE ELECTROSTATICS */
379 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
380 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
381 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
382 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
383 _MM_TRANSPOSE4_PS(Y,F,G,H);
384 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
385 VV = _mm_macc_ps(vfeps,Fp,Y);
386 velec = _mm_mul_ps(qq01,VV);
387 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
388 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
390 /* Update potential sum for this i atom from the interaction with this j atom. */
391 velecsum = _mm_add_ps(velecsum,velec);
395 /* Update vectorial force */
396 fix0 = _mm_macc_ps(dx01,fscal,fix0);
397 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
398 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
400 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
401 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
402 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
404 /**************************
405 * CALCULATE INTERACTIONS *
406 **************************/
408 r02 = _mm_mul_ps(rsq02,rinv02);
410 /* Calculate table index by multiplying r with table scale and truncate to integer */
411 rt = _mm_mul_ps(r02,vftabscale);
412 vfitab = _mm_cvttps_epi32(rt);
414 vfeps = _mm_frcz_ps(rt);
416 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
418 twovfeps = _mm_add_ps(vfeps,vfeps);
419 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
421 /* CUBIC SPLINE TABLE ELECTROSTATICS */
422 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
423 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
424 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
425 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
426 _MM_TRANSPOSE4_PS(Y,F,G,H);
427 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
428 VV = _mm_macc_ps(vfeps,Fp,Y);
429 velec = _mm_mul_ps(qq02,VV);
430 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
431 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
433 /* Update potential sum for this i atom from the interaction with this j atom. */
434 velecsum = _mm_add_ps(velecsum,velec);
438 /* Update vectorial force */
439 fix0 = _mm_macc_ps(dx02,fscal,fix0);
440 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
441 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
443 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
444 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
445 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
447 /**************************
448 * CALCULATE INTERACTIONS *
449 **************************/
451 r10 = _mm_mul_ps(rsq10,rinv10);
453 /* Calculate table index by multiplying r with table scale and truncate to integer */
454 rt = _mm_mul_ps(r10,vftabscale);
455 vfitab = _mm_cvttps_epi32(rt);
457 vfeps = _mm_frcz_ps(rt);
459 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
461 twovfeps = _mm_add_ps(vfeps,vfeps);
462 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
464 /* CUBIC SPLINE TABLE ELECTROSTATICS */
465 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
466 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
467 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
468 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
469 _MM_TRANSPOSE4_PS(Y,F,G,H);
470 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
471 VV = _mm_macc_ps(vfeps,Fp,Y);
472 velec = _mm_mul_ps(qq10,VV);
473 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
474 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
476 /* Update potential sum for this i atom from the interaction with this j atom. */
477 velecsum = _mm_add_ps(velecsum,velec);
481 /* Update vectorial force */
482 fix1 = _mm_macc_ps(dx10,fscal,fix1);
483 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
484 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
486 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
487 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
488 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
490 /**************************
491 * CALCULATE INTERACTIONS *
492 **************************/
494 r11 = _mm_mul_ps(rsq11,rinv11);
496 /* Calculate table index by multiplying r with table scale and truncate to integer */
497 rt = _mm_mul_ps(r11,vftabscale);
498 vfitab = _mm_cvttps_epi32(rt);
500 vfeps = _mm_frcz_ps(rt);
502 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
504 twovfeps = _mm_add_ps(vfeps,vfeps);
505 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
507 /* CUBIC SPLINE TABLE ELECTROSTATICS */
508 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
509 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
510 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
511 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
512 _MM_TRANSPOSE4_PS(Y,F,G,H);
513 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
514 VV = _mm_macc_ps(vfeps,Fp,Y);
515 velec = _mm_mul_ps(qq11,VV);
516 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
517 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
519 /* Update potential sum for this i atom from the interaction with this j atom. */
520 velecsum = _mm_add_ps(velecsum,velec);
524 /* Update vectorial force */
525 fix1 = _mm_macc_ps(dx11,fscal,fix1);
526 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
527 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
529 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
530 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
531 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
533 /**************************
534 * CALCULATE INTERACTIONS *
535 **************************/
537 r12 = _mm_mul_ps(rsq12,rinv12);
539 /* Calculate table index by multiplying r with table scale and truncate to integer */
540 rt = _mm_mul_ps(r12,vftabscale);
541 vfitab = _mm_cvttps_epi32(rt);
543 vfeps = _mm_frcz_ps(rt);
545 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
547 twovfeps = _mm_add_ps(vfeps,vfeps);
548 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
550 /* CUBIC SPLINE TABLE ELECTROSTATICS */
551 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
552 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
553 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
554 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
555 _MM_TRANSPOSE4_PS(Y,F,G,H);
556 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
557 VV = _mm_macc_ps(vfeps,Fp,Y);
558 velec = _mm_mul_ps(qq12,VV);
559 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
560 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
562 /* Update potential sum for this i atom from the interaction with this j atom. */
563 velecsum = _mm_add_ps(velecsum,velec);
567 /* Update vectorial force */
568 fix1 = _mm_macc_ps(dx12,fscal,fix1);
569 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
570 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
572 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
573 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
574 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
576 /**************************
577 * CALCULATE INTERACTIONS *
578 **************************/
580 r20 = _mm_mul_ps(rsq20,rinv20);
582 /* Calculate table index by multiplying r with table scale and truncate to integer */
583 rt = _mm_mul_ps(r20,vftabscale);
584 vfitab = _mm_cvttps_epi32(rt);
586 vfeps = _mm_frcz_ps(rt);
588 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
590 twovfeps = _mm_add_ps(vfeps,vfeps);
591 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
593 /* CUBIC SPLINE TABLE ELECTROSTATICS */
594 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
595 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
596 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
597 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
598 _MM_TRANSPOSE4_PS(Y,F,G,H);
599 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
600 VV = _mm_macc_ps(vfeps,Fp,Y);
601 velec = _mm_mul_ps(qq20,VV);
602 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
603 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
605 /* Update potential sum for this i atom from the interaction with this j atom. */
606 velecsum = _mm_add_ps(velecsum,velec);
610 /* Update vectorial force */
611 fix2 = _mm_macc_ps(dx20,fscal,fix2);
612 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
613 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
615 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
616 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
617 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
619 /**************************
620 * CALCULATE INTERACTIONS *
621 **************************/
623 r21 = _mm_mul_ps(rsq21,rinv21);
625 /* Calculate table index by multiplying r with table scale and truncate to integer */
626 rt = _mm_mul_ps(r21,vftabscale);
627 vfitab = _mm_cvttps_epi32(rt);
629 vfeps = _mm_frcz_ps(rt);
631 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
633 twovfeps = _mm_add_ps(vfeps,vfeps);
634 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
636 /* CUBIC SPLINE TABLE ELECTROSTATICS */
637 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
638 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
639 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
640 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
641 _MM_TRANSPOSE4_PS(Y,F,G,H);
642 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
643 VV = _mm_macc_ps(vfeps,Fp,Y);
644 velec = _mm_mul_ps(qq21,VV);
645 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
646 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
648 /* Update potential sum for this i atom from the interaction with this j atom. */
649 velecsum = _mm_add_ps(velecsum,velec);
653 /* Update vectorial force */
654 fix2 = _mm_macc_ps(dx21,fscal,fix2);
655 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
656 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
658 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
659 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
660 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
662 /**************************
663 * CALCULATE INTERACTIONS *
664 **************************/
666 r22 = _mm_mul_ps(rsq22,rinv22);
668 /* Calculate table index by multiplying r with table scale and truncate to integer */
669 rt = _mm_mul_ps(r22,vftabscale);
670 vfitab = _mm_cvttps_epi32(rt);
672 vfeps = _mm_frcz_ps(rt);
674 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
676 twovfeps = _mm_add_ps(vfeps,vfeps);
677 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
679 /* CUBIC SPLINE TABLE ELECTROSTATICS */
680 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
681 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
682 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
683 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
684 _MM_TRANSPOSE4_PS(Y,F,G,H);
685 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
686 VV = _mm_macc_ps(vfeps,Fp,Y);
687 velec = _mm_mul_ps(qq22,VV);
688 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
689 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
691 /* Update potential sum for this i atom from the interaction with this j atom. */
692 velecsum = _mm_add_ps(velecsum,velec);
696 /* Update vectorial force */
697 fix2 = _mm_macc_ps(dx22,fscal,fix2);
698 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
699 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
701 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
702 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
703 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
705 fjptrA = f+j_coord_offsetA;
706 fjptrB = f+j_coord_offsetB;
707 fjptrC = f+j_coord_offsetC;
708 fjptrD = f+j_coord_offsetD;
710 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
711 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
713 /* Inner loop uses 444 flops */
719 /* Get j neighbor index, and coordinate index */
720 jnrlistA = jjnr[jidx];
721 jnrlistB = jjnr[jidx+1];
722 jnrlistC = jjnr[jidx+2];
723 jnrlistD = jjnr[jidx+3];
724 /* Sign of each element will be negative for non-real atoms.
725 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
726 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
728 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
729 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
730 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
731 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
732 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
733 j_coord_offsetA = DIM*jnrA;
734 j_coord_offsetB = DIM*jnrB;
735 j_coord_offsetC = DIM*jnrC;
736 j_coord_offsetD = DIM*jnrD;
738 /* load j atom coordinates */
739 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
740 x+j_coord_offsetC,x+j_coord_offsetD,
741 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
743 /* Calculate displacement vector */
744 dx00 = _mm_sub_ps(ix0,jx0);
745 dy00 = _mm_sub_ps(iy0,jy0);
746 dz00 = _mm_sub_ps(iz0,jz0);
747 dx01 = _mm_sub_ps(ix0,jx1);
748 dy01 = _mm_sub_ps(iy0,jy1);
749 dz01 = _mm_sub_ps(iz0,jz1);
750 dx02 = _mm_sub_ps(ix0,jx2);
751 dy02 = _mm_sub_ps(iy0,jy2);
752 dz02 = _mm_sub_ps(iz0,jz2);
753 dx10 = _mm_sub_ps(ix1,jx0);
754 dy10 = _mm_sub_ps(iy1,jy0);
755 dz10 = _mm_sub_ps(iz1,jz0);
756 dx11 = _mm_sub_ps(ix1,jx1);
757 dy11 = _mm_sub_ps(iy1,jy1);
758 dz11 = _mm_sub_ps(iz1,jz1);
759 dx12 = _mm_sub_ps(ix1,jx2);
760 dy12 = _mm_sub_ps(iy1,jy2);
761 dz12 = _mm_sub_ps(iz1,jz2);
762 dx20 = _mm_sub_ps(ix2,jx0);
763 dy20 = _mm_sub_ps(iy2,jy0);
764 dz20 = _mm_sub_ps(iz2,jz0);
765 dx21 = _mm_sub_ps(ix2,jx1);
766 dy21 = _mm_sub_ps(iy2,jy1);
767 dz21 = _mm_sub_ps(iz2,jz1);
768 dx22 = _mm_sub_ps(ix2,jx2);
769 dy22 = _mm_sub_ps(iy2,jy2);
770 dz22 = _mm_sub_ps(iz2,jz2);
772 /* Calculate squared distance and things based on it */
773 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
774 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
775 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
776 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
777 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
778 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
779 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
780 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
781 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
783 rinv00 = gmx_mm_invsqrt_ps(rsq00);
784 rinv01 = gmx_mm_invsqrt_ps(rsq01);
785 rinv02 = gmx_mm_invsqrt_ps(rsq02);
786 rinv10 = gmx_mm_invsqrt_ps(rsq10);
787 rinv11 = gmx_mm_invsqrt_ps(rsq11);
788 rinv12 = gmx_mm_invsqrt_ps(rsq12);
789 rinv20 = gmx_mm_invsqrt_ps(rsq20);
790 rinv21 = gmx_mm_invsqrt_ps(rsq21);
791 rinv22 = gmx_mm_invsqrt_ps(rsq22);
793 fjx0 = _mm_setzero_ps();
794 fjy0 = _mm_setzero_ps();
795 fjz0 = _mm_setzero_ps();
796 fjx1 = _mm_setzero_ps();
797 fjy1 = _mm_setzero_ps();
798 fjz1 = _mm_setzero_ps();
799 fjx2 = _mm_setzero_ps();
800 fjy2 = _mm_setzero_ps();
801 fjz2 = _mm_setzero_ps();
803 /**************************
804 * CALCULATE INTERACTIONS *
805 **************************/
807 r00 = _mm_mul_ps(rsq00,rinv00);
808 r00 = _mm_andnot_ps(dummy_mask,r00);
810 /* Calculate table index by multiplying r with table scale and truncate to integer */
811 rt = _mm_mul_ps(r00,vftabscale);
812 vfitab = _mm_cvttps_epi32(rt);
814 vfeps = _mm_frcz_ps(rt);
816 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
818 twovfeps = _mm_add_ps(vfeps,vfeps);
819 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
821 /* CUBIC SPLINE TABLE ELECTROSTATICS */
822 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
823 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
824 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
825 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
826 _MM_TRANSPOSE4_PS(Y,F,G,H);
827 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
828 VV = _mm_macc_ps(vfeps,Fp,Y);
829 velec = _mm_mul_ps(qq00,VV);
830 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
831 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
833 /* CUBIC SPLINE TABLE DISPERSION */
834 vfitab = _mm_add_epi32(vfitab,ifour);
835 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
836 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
837 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
838 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
839 _MM_TRANSPOSE4_PS(Y,F,G,H);
840 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
841 VV = _mm_macc_ps(vfeps,Fp,Y);
842 vvdw6 = _mm_mul_ps(c6_00,VV);
843 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
844 fvdw6 = _mm_mul_ps(c6_00,FF);
846 /* CUBIC SPLINE TABLE REPULSION */
847 vfitab = _mm_add_epi32(vfitab,ifour);
848 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
849 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
850 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
851 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
852 _MM_TRANSPOSE4_PS(Y,F,G,H);
853 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
854 VV = _mm_macc_ps(vfeps,Fp,Y);
855 vvdw12 = _mm_mul_ps(c12_00,VV);
856 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
857 fvdw12 = _mm_mul_ps(c12_00,FF);
858 vvdw = _mm_add_ps(vvdw12,vvdw6);
859 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
861 /* Update potential sum for this i atom from the interaction with this j atom. */
862 velec = _mm_andnot_ps(dummy_mask,velec);
863 velecsum = _mm_add_ps(velecsum,velec);
864 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
865 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
867 fscal = _mm_add_ps(felec,fvdw);
869 fscal = _mm_andnot_ps(dummy_mask,fscal);
871 /* Update vectorial force */
872 fix0 = _mm_macc_ps(dx00,fscal,fix0);
873 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
874 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
876 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
877 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
878 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
880 /**************************
881 * CALCULATE INTERACTIONS *
882 **************************/
884 r01 = _mm_mul_ps(rsq01,rinv01);
885 r01 = _mm_andnot_ps(dummy_mask,r01);
887 /* Calculate table index by multiplying r with table scale and truncate to integer */
888 rt = _mm_mul_ps(r01,vftabscale);
889 vfitab = _mm_cvttps_epi32(rt);
891 vfeps = _mm_frcz_ps(rt);
893 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
895 twovfeps = _mm_add_ps(vfeps,vfeps);
896 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
898 /* CUBIC SPLINE TABLE ELECTROSTATICS */
899 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
900 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
901 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
902 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
903 _MM_TRANSPOSE4_PS(Y,F,G,H);
904 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
905 VV = _mm_macc_ps(vfeps,Fp,Y);
906 velec = _mm_mul_ps(qq01,VV);
907 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
908 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
910 /* Update potential sum for this i atom from the interaction with this j atom. */
911 velec = _mm_andnot_ps(dummy_mask,velec);
912 velecsum = _mm_add_ps(velecsum,velec);
916 fscal = _mm_andnot_ps(dummy_mask,fscal);
918 /* Update vectorial force */
919 fix0 = _mm_macc_ps(dx01,fscal,fix0);
920 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
921 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
923 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
924 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
925 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
927 /**************************
928 * CALCULATE INTERACTIONS *
929 **************************/
931 r02 = _mm_mul_ps(rsq02,rinv02);
932 r02 = _mm_andnot_ps(dummy_mask,r02);
934 /* Calculate table index by multiplying r with table scale and truncate to integer */
935 rt = _mm_mul_ps(r02,vftabscale);
936 vfitab = _mm_cvttps_epi32(rt);
938 vfeps = _mm_frcz_ps(rt);
940 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
942 twovfeps = _mm_add_ps(vfeps,vfeps);
943 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
945 /* CUBIC SPLINE TABLE ELECTROSTATICS */
946 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
947 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
948 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
949 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
950 _MM_TRANSPOSE4_PS(Y,F,G,H);
951 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
952 VV = _mm_macc_ps(vfeps,Fp,Y);
953 velec = _mm_mul_ps(qq02,VV);
954 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
955 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
957 /* Update potential sum for this i atom from the interaction with this j atom. */
958 velec = _mm_andnot_ps(dummy_mask,velec);
959 velecsum = _mm_add_ps(velecsum,velec);
963 fscal = _mm_andnot_ps(dummy_mask,fscal);
965 /* Update vectorial force */
966 fix0 = _mm_macc_ps(dx02,fscal,fix0);
967 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
968 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
970 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
971 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
972 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
974 /**************************
975 * CALCULATE INTERACTIONS *
976 **************************/
978 r10 = _mm_mul_ps(rsq10,rinv10);
979 r10 = _mm_andnot_ps(dummy_mask,r10);
981 /* Calculate table index by multiplying r with table scale and truncate to integer */
982 rt = _mm_mul_ps(r10,vftabscale);
983 vfitab = _mm_cvttps_epi32(rt);
985 vfeps = _mm_frcz_ps(rt);
987 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
989 twovfeps = _mm_add_ps(vfeps,vfeps);
990 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
992 /* CUBIC SPLINE TABLE ELECTROSTATICS */
993 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
994 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
995 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
996 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
997 _MM_TRANSPOSE4_PS(Y,F,G,H);
998 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
999 VV = _mm_macc_ps(vfeps,Fp,Y);
1000 velec = _mm_mul_ps(qq10,VV);
1001 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1002 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1004 /* Update potential sum for this i atom from the interaction with this j atom. */
1005 velec = _mm_andnot_ps(dummy_mask,velec);
1006 velecsum = _mm_add_ps(velecsum,velec);
1010 fscal = _mm_andnot_ps(dummy_mask,fscal);
1012 /* Update vectorial force */
1013 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1014 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1015 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1017 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1018 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1019 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1021 /**************************
1022 * CALCULATE INTERACTIONS *
1023 **************************/
1025 r11 = _mm_mul_ps(rsq11,rinv11);
1026 r11 = _mm_andnot_ps(dummy_mask,r11);
1028 /* Calculate table index by multiplying r with table scale and truncate to integer */
1029 rt = _mm_mul_ps(r11,vftabscale);
1030 vfitab = _mm_cvttps_epi32(rt);
1032 vfeps = _mm_frcz_ps(rt);
1034 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1036 twovfeps = _mm_add_ps(vfeps,vfeps);
1037 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1039 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1040 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1041 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1042 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1043 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1044 _MM_TRANSPOSE4_PS(Y,F,G,H);
1045 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1046 VV = _mm_macc_ps(vfeps,Fp,Y);
1047 velec = _mm_mul_ps(qq11,VV);
1048 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1049 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1051 /* Update potential sum for this i atom from the interaction with this j atom. */
1052 velec = _mm_andnot_ps(dummy_mask,velec);
1053 velecsum = _mm_add_ps(velecsum,velec);
1057 fscal = _mm_andnot_ps(dummy_mask,fscal);
1059 /* Update vectorial force */
1060 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1061 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1062 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1064 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1065 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1066 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1068 /**************************
1069 * CALCULATE INTERACTIONS *
1070 **************************/
1072 r12 = _mm_mul_ps(rsq12,rinv12);
1073 r12 = _mm_andnot_ps(dummy_mask,r12);
1075 /* Calculate table index by multiplying r with table scale and truncate to integer */
1076 rt = _mm_mul_ps(r12,vftabscale);
1077 vfitab = _mm_cvttps_epi32(rt);
1079 vfeps = _mm_frcz_ps(rt);
1081 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1083 twovfeps = _mm_add_ps(vfeps,vfeps);
1084 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1086 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1087 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1088 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1089 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1090 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1091 _MM_TRANSPOSE4_PS(Y,F,G,H);
1092 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1093 VV = _mm_macc_ps(vfeps,Fp,Y);
1094 velec = _mm_mul_ps(qq12,VV);
1095 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1096 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1098 /* Update potential sum for this i atom from the interaction with this j atom. */
1099 velec = _mm_andnot_ps(dummy_mask,velec);
1100 velecsum = _mm_add_ps(velecsum,velec);
1104 fscal = _mm_andnot_ps(dummy_mask,fscal);
1106 /* Update vectorial force */
1107 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1108 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1109 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1111 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1112 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1113 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1115 /**************************
1116 * CALCULATE INTERACTIONS *
1117 **************************/
1119 r20 = _mm_mul_ps(rsq20,rinv20);
1120 r20 = _mm_andnot_ps(dummy_mask,r20);
1122 /* Calculate table index by multiplying r with table scale and truncate to integer */
1123 rt = _mm_mul_ps(r20,vftabscale);
1124 vfitab = _mm_cvttps_epi32(rt);
1126 vfeps = _mm_frcz_ps(rt);
1128 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1130 twovfeps = _mm_add_ps(vfeps,vfeps);
1131 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1133 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1134 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1135 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1136 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1137 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1138 _MM_TRANSPOSE4_PS(Y,F,G,H);
1139 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1140 VV = _mm_macc_ps(vfeps,Fp,Y);
1141 velec = _mm_mul_ps(qq20,VV);
1142 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1143 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1145 /* Update potential sum for this i atom from the interaction with this j atom. */
1146 velec = _mm_andnot_ps(dummy_mask,velec);
1147 velecsum = _mm_add_ps(velecsum,velec);
1151 fscal = _mm_andnot_ps(dummy_mask,fscal);
1153 /* Update vectorial force */
1154 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1155 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1156 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1158 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1159 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1160 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1162 /**************************
1163 * CALCULATE INTERACTIONS *
1164 **************************/
1166 r21 = _mm_mul_ps(rsq21,rinv21);
1167 r21 = _mm_andnot_ps(dummy_mask,r21);
1169 /* Calculate table index by multiplying r with table scale and truncate to integer */
1170 rt = _mm_mul_ps(r21,vftabscale);
1171 vfitab = _mm_cvttps_epi32(rt);
1173 vfeps = _mm_frcz_ps(rt);
1175 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1177 twovfeps = _mm_add_ps(vfeps,vfeps);
1178 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1180 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1181 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1182 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1183 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1184 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1185 _MM_TRANSPOSE4_PS(Y,F,G,H);
1186 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1187 VV = _mm_macc_ps(vfeps,Fp,Y);
1188 velec = _mm_mul_ps(qq21,VV);
1189 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1190 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1192 /* Update potential sum for this i atom from the interaction with this j atom. */
1193 velec = _mm_andnot_ps(dummy_mask,velec);
1194 velecsum = _mm_add_ps(velecsum,velec);
1198 fscal = _mm_andnot_ps(dummy_mask,fscal);
1200 /* Update vectorial force */
1201 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1202 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1203 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1205 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1206 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1207 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1209 /**************************
1210 * CALCULATE INTERACTIONS *
1211 **************************/
1213 r22 = _mm_mul_ps(rsq22,rinv22);
1214 r22 = _mm_andnot_ps(dummy_mask,r22);
1216 /* Calculate table index by multiplying r with table scale and truncate to integer */
1217 rt = _mm_mul_ps(r22,vftabscale);
1218 vfitab = _mm_cvttps_epi32(rt);
1220 vfeps = _mm_frcz_ps(rt);
1222 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1224 twovfeps = _mm_add_ps(vfeps,vfeps);
1225 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1227 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1228 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1229 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1230 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1231 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1232 _MM_TRANSPOSE4_PS(Y,F,G,H);
1233 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1234 VV = _mm_macc_ps(vfeps,Fp,Y);
1235 velec = _mm_mul_ps(qq22,VV);
1236 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1237 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1239 /* Update potential sum for this i atom from the interaction with this j atom. */
1240 velec = _mm_andnot_ps(dummy_mask,velec);
1241 velecsum = _mm_add_ps(velecsum,velec);
1245 fscal = _mm_andnot_ps(dummy_mask,fscal);
1247 /* Update vectorial force */
1248 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1249 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1250 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1252 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1253 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1254 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1256 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1257 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1258 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1259 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1261 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1262 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1264 /* Inner loop uses 453 flops */
1267 /* End of innermost loop */
1269 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1270 f+i_coord_offset,fshift+i_shift_offset);
1273 /* Update potential energies */
1274 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1275 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1277 /* Increment number of inner iterations */
1278 inneriter += j_index_end - j_index_start;
1280 /* Outer loop uses 20 flops */
1283 /* Increment number of outer iterations */
1286 /* Update outer/inner flops */
1288 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*453);
1291 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_single
1292 * Electrostatics interaction: CubicSplineTable
1293 * VdW interaction: CubicSplineTable
1294 * Geometry: Water3-Water3
1295 * Calculate force/pot: Force
1298 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_single
1299 (t_nblist * gmx_restrict nlist,
1300 rvec * gmx_restrict xx,
1301 rvec * gmx_restrict ff,
1302 t_forcerec * gmx_restrict fr,
1303 t_mdatoms * gmx_restrict mdatoms,
1304 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1305 t_nrnb * gmx_restrict nrnb)
1307 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1308 * just 0 for non-waters.
1309 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1310 * jnr indices corresponding to data put in the four positions in the SIMD register.
1312 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1313 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1314 int jnrA,jnrB,jnrC,jnrD;
1315 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1316 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1317 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1318 real rcutoff_scalar;
1319 real *shiftvec,*fshift,*x,*f;
1320 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1321 real scratch[4*DIM];
1322 __m128 fscal,rcutoff,rcutoff2,jidxall;
1324 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1326 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1328 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1329 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1330 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1331 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1332 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1333 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1334 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1335 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1336 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1337 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1338 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1339 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1340 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1341 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1342 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1343 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1344 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1347 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1350 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1351 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1353 __m128i ifour = _mm_set1_epi32(4);
1354 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1356 __m128 dummy_mask,cutoff_mask;
1357 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1358 __m128 one = _mm_set1_ps(1.0);
1359 __m128 two = _mm_set1_ps(2.0);
1365 jindex = nlist->jindex;
1367 shiftidx = nlist->shift;
1369 shiftvec = fr->shift_vec[0];
1370 fshift = fr->fshift[0];
1371 facel = _mm_set1_ps(fr->epsfac);
1372 charge = mdatoms->chargeA;
1373 nvdwtype = fr->ntype;
1374 vdwparam = fr->nbfp;
1375 vdwtype = mdatoms->typeA;
1377 vftab = kernel_data->table_elec_vdw->data;
1378 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1380 /* Setup water-specific parameters */
1381 inr = nlist->iinr[0];
1382 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1383 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1384 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1385 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1387 jq0 = _mm_set1_ps(charge[inr+0]);
1388 jq1 = _mm_set1_ps(charge[inr+1]);
1389 jq2 = _mm_set1_ps(charge[inr+2]);
1390 vdwjidx0A = 2*vdwtype[inr+0];
1391 qq00 = _mm_mul_ps(iq0,jq0);
1392 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1393 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1394 qq01 = _mm_mul_ps(iq0,jq1);
1395 qq02 = _mm_mul_ps(iq0,jq2);
1396 qq10 = _mm_mul_ps(iq1,jq0);
1397 qq11 = _mm_mul_ps(iq1,jq1);
1398 qq12 = _mm_mul_ps(iq1,jq2);
1399 qq20 = _mm_mul_ps(iq2,jq0);
1400 qq21 = _mm_mul_ps(iq2,jq1);
1401 qq22 = _mm_mul_ps(iq2,jq2);
1403 /* Avoid stupid compiler warnings */
1404 jnrA = jnrB = jnrC = jnrD = 0;
1405 j_coord_offsetA = 0;
1406 j_coord_offsetB = 0;
1407 j_coord_offsetC = 0;
1408 j_coord_offsetD = 0;
1413 for(iidx=0;iidx<4*DIM;iidx++)
1415 scratch[iidx] = 0.0;
1418 /* Start outer loop over neighborlists */
1419 for(iidx=0; iidx<nri; iidx++)
1421 /* Load shift vector for this list */
1422 i_shift_offset = DIM*shiftidx[iidx];
1424 /* Load limits for loop over neighbors */
1425 j_index_start = jindex[iidx];
1426 j_index_end = jindex[iidx+1];
1428 /* Get outer coordinate index */
1430 i_coord_offset = DIM*inr;
1432 /* Load i particle coords and add shift vector */
1433 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1434 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1436 fix0 = _mm_setzero_ps();
1437 fiy0 = _mm_setzero_ps();
1438 fiz0 = _mm_setzero_ps();
1439 fix1 = _mm_setzero_ps();
1440 fiy1 = _mm_setzero_ps();
1441 fiz1 = _mm_setzero_ps();
1442 fix2 = _mm_setzero_ps();
1443 fiy2 = _mm_setzero_ps();
1444 fiz2 = _mm_setzero_ps();
1446 /* Start inner kernel loop */
1447 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1450 /* Get j neighbor index, and coordinate index */
1452 jnrB = jjnr[jidx+1];
1453 jnrC = jjnr[jidx+2];
1454 jnrD = jjnr[jidx+3];
1455 j_coord_offsetA = DIM*jnrA;
1456 j_coord_offsetB = DIM*jnrB;
1457 j_coord_offsetC = DIM*jnrC;
1458 j_coord_offsetD = DIM*jnrD;
1460 /* load j atom coordinates */
1461 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1462 x+j_coord_offsetC,x+j_coord_offsetD,
1463 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1465 /* Calculate displacement vector */
1466 dx00 = _mm_sub_ps(ix0,jx0);
1467 dy00 = _mm_sub_ps(iy0,jy0);
1468 dz00 = _mm_sub_ps(iz0,jz0);
1469 dx01 = _mm_sub_ps(ix0,jx1);
1470 dy01 = _mm_sub_ps(iy0,jy1);
1471 dz01 = _mm_sub_ps(iz0,jz1);
1472 dx02 = _mm_sub_ps(ix0,jx2);
1473 dy02 = _mm_sub_ps(iy0,jy2);
1474 dz02 = _mm_sub_ps(iz0,jz2);
1475 dx10 = _mm_sub_ps(ix1,jx0);
1476 dy10 = _mm_sub_ps(iy1,jy0);
1477 dz10 = _mm_sub_ps(iz1,jz0);
1478 dx11 = _mm_sub_ps(ix1,jx1);
1479 dy11 = _mm_sub_ps(iy1,jy1);
1480 dz11 = _mm_sub_ps(iz1,jz1);
1481 dx12 = _mm_sub_ps(ix1,jx2);
1482 dy12 = _mm_sub_ps(iy1,jy2);
1483 dz12 = _mm_sub_ps(iz1,jz2);
1484 dx20 = _mm_sub_ps(ix2,jx0);
1485 dy20 = _mm_sub_ps(iy2,jy0);
1486 dz20 = _mm_sub_ps(iz2,jz0);
1487 dx21 = _mm_sub_ps(ix2,jx1);
1488 dy21 = _mm_sub_ps(iy2,jy1);
1489 dz21 = _mm_sub_ps(iz2,jz1);
1490 dx22 = _mm_sub_ps(ix2,jx2);
1491 dy22 = _mm_sub_ps(iy2,jy2);
1492 dz22 = _mm_sub_ps(iz2,jz2);
1494 /* Calculate squared distance and things based on it */
1495 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1496 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1497 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1498 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1499 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1500 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1501 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1502 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1503 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1505 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1506 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1507 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1508 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1509 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1510 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1511 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1512 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1513 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1515 fjx0 = _mm_setzero_ps();
1516 fjy0 = _mm_setzero_ps();
1517 fjz0 = _mm_setzero_ps();
1518 fjx1 = _mm_setzero_ps();
1519 fjy1 = _mm_setzero_ps();
1520 fjz1 = _mm_setzero_ps();
1521 fjx2 = _mm_setzero_ps();
1522 fjy2 = _mm_setzero_ps();
1523 fjz2 = _mm_setzero_ps();
1525 /**************************
1526 * CALCULATE INTERACTIONS *
1527 **************************/
1529 r00 = _mm_mul_ps(rsq00,rinv00);
1531 /* Calculate table index by multiplying r with table scale and truncate to integer */
1532 rt = _mm_mul_ps(r00,vftabscale);
1533 vfitab = _mm_cvttps_epi32(rt);
1535 vfeps = _mm_frcz_ps(rt);
1537 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1539 twovfeps = _mm_add_ps(vfeps,vfeps);
1540 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1542 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1543 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1544 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1545 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1546 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1547 _MM_TRANSPOSE4_PS(Y,F,G,H);
1548 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1549 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1550 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1552 /* CUBIC SPLINE TABLE DISPERSION */
1553 vfitab = _mm_add_epi32(vfitab,ifour);
1554 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1555 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1556 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1557 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1558 _MM_TRANSPOSE4_PS(Y,F,G,H);
1559 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1560 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1561 fvdw6 = _mm_mul_ps(c6_00,FF);
1563 /* CUBIC SPLINE TABLE REPULSION */
1564 vfitab = _mm_add_epi32(vfitab,ifour);
1565 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1566 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1567 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1568 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1569 _MM_TRANSPOSE4_PS(Y,F,G,H);
1570 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1571 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1572 fvdw12 = _mm_mul_ps(c12_00,FF);
1573 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1575 fscal = _mm_add_ps(felec,fvdw);
1577 /* Update vectorial force */
1578 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1579 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1580 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1582 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1583 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1584 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1586 /**************************
1587 * CALCULATE INTERACTIONS *
1588 **************************/
1590 r01 = _mm_mul_ps(rsq01,rinv01);
1592 /* Calculate table index by multiplying r with table scale and truncate to integer */
1593 rt = _mm_mul_ps(r01,vftabscale);
1594 vfitab = _mm_cvttps_epi32(rt);
1596 vfeps = _mm_frcz_ps(rt);
1598 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1600 twovfeps = _mm_add_ps(vfeps,vfeps);
1601 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1603 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1604 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1605 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1606 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1607 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1608 _MM_TRANSPOSE4_PS(Y,F,G,H);
1609 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1610 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1611 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1615 /* Update vectorial force */
1616 fix0 = _mm_macc_ps(dx01,fscal,fix0);
1617 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
1618 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
1620 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
1621 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
1622 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
1624 /**************************
1625 * CALCULATE INTERACTIONS *
1626 **************************/
1628 r02 = _mm_mul_ps(rsq02,rinv02);
1630 /* Calculate table index by multiplying r with table scale and truncate to integer */
1631 rt = _mm_mul_ps(r02,vftabscale);
1632 vfitab = _mm_cvttps_epi32(rt);
1634 vfeps = _mm_frcz_ps(rt);
1636 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1638 twovfeps = _mm_add_ps(vfeps,vfeps);
1639 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1641 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1642 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1643 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1644 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1645 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1646 _MM_TRANSPOSE4_PS(Y,F,G,H);
1647 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1648 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1649 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1653 /* Update vectorial force */
1654 fix0 = _mm_macc_ps(dx02,fscal,fix0);
1655 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
1656 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
1658 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
1659 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
1660 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
1662 /**************************
1663 * CALCULATE INTERACTIONS *
1664 **************************/
1666 r10 = _mm_mul_ps(rsq10,rinv10);
1668 /* Calculate table index by multiplying r with table scale and truncate to integer */
1669 rt = _mm_mul_ps(r10,vftabscale);
1670 vfitab = _mm_cvttps_epi32(rt);
1672 vfeps = _mm_frcz_ps(rt);
1674 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1676 twovfeps = _mm_add_ps(vfeps,vfeps);
1677 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1679 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1680 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1681 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1682 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1683 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1684 _MM_TRANSPOSE4_PS(Y,F,G,H);
1685 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1686 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1687 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1691 /* Update vectorial force */
1692 fix1 = _mm_macc_ps(dx10,fscal,fix1);
1693 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
1694 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
1696 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
1697 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
1698 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
1700 /**************************
1701 * CALCULATE INTERACTIONS *
1702 **************************/
1704 r11 = _mm_mul_ps(rsq11,rinv11);
1706 /* Calculate table index by multiplying r with table scale and truncate to integer */
1707 rt = _mm_mul_ps(r11,vftabscale);
1708 vfitab = _mm_cvttps_epi32(rt);
1710 vfeps = _mm_frcz_ps(rt);
1712 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1714 twovfeps = _mm_add_ps(vfeps,vfeps);
1715 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1717 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1718 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1719 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1720 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1721 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1722 _MM_TRANSPOSE4_PS(Y,F,G,H);
1723 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1724 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1725 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1729 /* Update vectorial force */
1730 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1731 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1732 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1734 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1735 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1736 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1738 /**************************
1739 * CALCULATE INTERACTIONS *
1740 **************************/
1742 r12 = _mm_mul_ps(rsq12,rinv12);
1744 /* Calculate table index by multiplying r with table scale and truncate to integer */
1745 rt = _mm_mul_ps(r12,vftabscale);
1746 vfitab = _mm_cvttps_epi32(rt);
1748 vfeps = _mm_frcz_ps(rt);
1750 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1752 twovfeps = _mm_add_ps(vfeps,vfeps);
1753 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1755 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1756 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1757 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1758 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1759 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1760 _MM_TRANSPOSE4_PS(Y,F,G,H);
1761 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1762 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1763 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1767 /* Update vectorial force */
1768 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1769 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1770 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1772 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1773 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1774 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1776 /**************************
1777 * CALCULATE INTERACTIONS *
1778 **************************/
1780 r20 = _mm_mul_ps(rsq20,rinv20);
1782 /* Calculate table index by multiplying r with table scale and truncate to integer */
1783 rt = _mm_mul_ps(r20,vftabscale);
1784 vfitab = _mm_cvttps_epi32(rt);
1786 vfeps = _mm_frcz_ps(rt);
1788 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1790 twovfeps = _mm_add_ps(vfeps,vfeps);
1791 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1793 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1794 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1795 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1796 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1797 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1798 _MM_TRANSPOSE4_PS(Y,F,G,H);
1799 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1800 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1801 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1805 /* Update vectorial force */
1806 fix2 = _mm_macc_ps(dx20,fscal,fix2);
1807 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
1808 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
1810 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
1811 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
1812 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
1814 /**************************
1815 * CALCULATE INTERACTIONS *
1816 **************************/
1818 r21 = _mm_mul_ps(rsq21,rinv21);
1820 /* Calculate table index by multiplying r with table scale and truncate to integer */
1821 rt = _mm_mul_ps(r21,vftabscale);
1822 vfitab = _mm_cvttps_epi32(rt);
1824 vfeps = _mm_frcz_ps(rt);
1826 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1828 twovfeps = _mm_add_ps(vfeps,vfeps);
1829 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1831 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1832 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1833 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1834 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1835 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1836 _MM_TRANSPOSE4_PS(Y,F,G,H);
1837 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1838 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1839 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1843 /* Update vectorial force */
1844 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1845 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1846 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1848 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1849 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1850 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1852 /**************************
1853 * CALCULATE INTERACTIONS *
1854 **************************/
1856 r22 = _mm_mul_ps(rsq22,rinv22);
1858 /* Calculate table index by multiplying r with table scale and truncate to integer */
1859 rt = _mm_mul_ps(r22,vftabscale);
1860 vfitab = _mm_cvttps_epi32(rt);
1862 vfeps = _mm_frcz_ps(rt);
1864 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1866 twovfeps = _mm_add_ps(vfeps,vfeps);
1867 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1869 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1870 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1871 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1872 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1873 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1874 _MM_TRANSPOSE4_PS(Y,F,G,H);
1875 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1876 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1877 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1881 /* Update vectorial force */
1882 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1883 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1884 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1886 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1887 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1888 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1890 fjptrA = f+j_coord_offsetA;
1891 fjptrB = f+j_coord_offsetB;
1892 fjptrC = f+j_coord_offsetC;
1893 fjptrD = f+j_coord_offsetD;
1895 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1896 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1898 /* Inner loop uses 400 flops */
1901 if(jidx<j_index_end)
1904 /* Get j neighbor index, and coordinate index */
1905 jnrlistA = jjnr[jidx];
1906 jnrlistB = jjnr[jidx+1];
1907 jnrlistC = jjnr[jidx+2];
1908 jnrlistD = jjnr[jidx+3];
1909 /* Sign of each element will be negative for non-real atoms.
1910 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1911 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1913 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1914 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1915 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1916 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1917 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1918 j_coord_offsetA = DIM*jnrA;
1919 j_coord_offsetB = DIM*jnrB;
1920 j_coord_offsetC = DIM*jnrC;
1921 j_coord_offsetD = DIM*jnrD;
1923 /* load j atom coordinates */
1924 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1925 x+j_coord_offsetC,x+j_coord_offsetD,
1926 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1928 /* Calculate displacement vector */
1929 dx00 = _mm_sub_ps(ix0,jx0);
1930 dy00 = _mm_sub_ps(iy0,jy0);
1931 dz00 = _mm_sub_ps(iz0,jz0);
1932 dx01 = _mm_sub_ps(ix0,jx1);
1933 dy01 = _mm_sub_ps(iy0,jy1);
1934 dz01 = _mm_sub_ps(iz0,jz1);
1935 dx02 = _mm_sub_ps(ix0,jx2);
1936 dy02 = _mm_sub_ps(iy0,jy2);
1937 dz02 = _mm_sub_ps(iz0,jz2);
1938 dx10 = _mm_sub_ps(ix1,jx0);
1939 dy10 = _mm_sub_ps(iy1,jy0);
1940 dz10 = _mm_sub_ps(iz1,jz0);
1941 dx11 = _mm_sub_ps(ix1,jx1);
1942 dy11 = _mm_sub_ps(iy1,jy1);
1943 dz11 = _mm_sub_ps(iz1,jz1);
1944 dx12 = _mm_sub_ps(ix1,jx2);
1945 dy12 = _mm_sub_ps(iy1,jy2);
1946 dz12 = _mm_sub_ps(iz1,jz2);
1947 dx20 = _mm_sub_ps(ix2,jx0);
1948 dy20 = _mm_sub_ps(iy2,jy0);
1949 dz20 = _mm_sub_ps(iz2,jz0);
1950 dx21 = _mm_sub_ps(ix2,jx1);
1951 dy21 = _mm_sub_ps(iy2,jy1);
1952 dz21 = _mm_sub_ps(iz2,jz1);
1953 dx22 = _mm_sub_ps(ix2,jx2);
1954 dy22 = _mm_sub_ps(iy2,jy2);
1955 dz22 = _mm_sub_ps(iz2,jz2);
1957 /* Calculate squared distance and things based on it */
1958 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1959 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1960 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1961 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1962 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1963 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1964 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1965 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1966 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1968 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1969 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1970 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1971 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1972 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1973 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1974 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1975 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1976 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1978 fjx0 = _mm_setzero_ps();
1979 fjy0 = _mm_setzero_ps();
1980 fjz0 = _mm_setzero_ps();
1981 fjx1 = _mm_setzero_ps();
1982 fjy1 = _mm_setzero_ps();
1983 fjz1 = _mm_setzero_ps();
1984 fjx2 = _mm_setzero_ps();
1985 fjy2 = _mm_setzero_ps();
1986 fjz2 = _mm_setzero_ps();
1988 /**************************
1989 * CALCULATE INTERACTIONS *
1990 **************************/
1992 r00 = _mm_mul_ps(rsq00,rinv00);
1993 r00 = _mm_andnot_ps(dummy_mask,r00);
1995 /* Calculate table index by multiplying r with table scale and truncate to integer */
1996 rt = _mm_mul_ps(r00,vftabscale);
1997 vfitab = _mm_cvttps_epi32(rt);
1999 vfeps = _mm_frcz_ps(rt);
2001 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2003 twovfeps = _mm_add_ps(vfeps,vfeps);
2004 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2006 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2007 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2008 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2009 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2010 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2011 _MM_TRANSPOSE4_PS(Y,F,G,H);
2012 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2013 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2014 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
2016 /* CUBIC SPLINE TABLE DISPERSION */
2017 vfitab = _mm_add_epi32(vfitab,ifour);
2018 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2019 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2020 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2021 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2022 _MM_TRANSPOSE4_PS(Y,F,G,H);
2023 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2024 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2025 fvdw6 = _mm_mul_ps(c6_00,FF);
2027 /* CUBIC SPLINE TABLE REPULSION */
2028 vfitab = _mm_add_epi32(vfitab,ifour);
2029 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2030 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2031 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2032 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2033 _MM_TRANSPOSE4_PS(Y,F,G,H);
2034 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2035 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2036 fvdw12 = _mm_mul_ps(c12_00,FF);
2037 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2039 fscal = _mm_add_ps(felec,fvdw);
2041 fscal = _mm_andnot_ps(dummy_mask,fscal);
2043 /* Update vectorial force */
2044 fix0 = _mm_macc_ps(dx00,fscal,fix0);
2045 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
2046 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
2048 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
2049 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
2050 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
2052 /**************************
2053 * CALCULATE INTERACTIONS *
2054 **************************/
2056 r01 = _mm_mul_ps(rsq01,rinv01);
2057 r01 = _mm_andnot_ps(dummy_mask,r01);
2059 /* Calculate table index by multiplying r with table scale and truncate to integer */
2060 rt = _mm_mul_ps(r01,vftabscale);
2061 vfitab = _mm_cvttps_epi32(rt);
2063 vfeps = _mm_frcz_ps(rt);
2065 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2067 twovfeps = _mm_add_ps(vfeps,vfeps);
2068 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2070 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2071 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2072 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2073 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2074 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2075 _MM_TRANSPOSE4_PS(Y,F,G,H);
2076 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2077 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2078 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2082 fscal = _mm_andnot_ps(dummy_mask,fscal);
2084 /* Update vectorial force */
2085 fix0 = _mm_macc_ps(dx01,fscal,fix0);
2086 fiy0 = _mm_macc_ps(dy01,fscal,fiy0);
2087 fiz0 = _mm_macc_ps(dz01,fscal,fiz0);
2089 fjx1 = _mm_macc_ps(dx01,fscal,fjx1);
2090 fjy1 = _mm_macc_ps(dy01,fscal,fjy1);
2091 fjz1 = _mm_macc_ps(dz01,fscal,fjz1);
2093 /**************************
2094 * CALCULATE INTERACTIONS *
2095 **************************/
2097 r02 = _mm_mul_ps(rsq02,rinv02);
2098 r02 = _mm_andnot_ps(dummy_mask,r02);
2100 /* Calculate table index by multiplying r with table scale and truncate to integer */
2101 rt = _mm_mul_ps(r02,vftabscale);
2102 vfitab = _mm_cvttps_epi32(rt);
2104 vfeps = _mm_frcz_ps(rt);
2106 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2108 twovfeps = _mm_add_ps(vfeps,vfeps);
2109 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2111 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2112 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2113 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2114 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2115 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2116 _MM_TRANSPOSE4_PS(Y,F,G,H);
2117 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2118 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2119 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2123 fscal = _mm_andnot_ps(dummy_mask,fscal);
2125 /* Update vectorial force */
2126 fix0 = _mm_macc_ps(dx02,fscal,fix0);
2127 fiy0 = _mm_macc_ps(dy02,fscal,fiy0);
2128 fiz0 = _mm_macc_ps(dz02,fscal,fiz0);
2130 fjx2 = _mm_macc_ps(dx02,fscal,fjx2);
2131 fjy2 = _mm_macc_ps(dy02,fscal,fjy2);
2132 fjz2 = _mm_macc_ps(dz02,fscal,fjz2);
2134 /**************************
2135 * CALCULATE INTERACTIONS *
2136 **************************/
2138 r10 = _mm_mul_ps(rsq10,rinv10);
2139 r10 = _mm_andnot_ps(dummy_mask,r10);
2141 /* Calculate table index by multiplying r with table scale and truncate to integer */
2142 rt = _mm_mul_ps(r10,vftabscale);
2143 vfitab = _mm_cvttps_epi32(rt);
2145 vfeps = _mm_frcz_ps(rt);
2147 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2149 twovfeps = _mm_add_ps(vfeps,vfeps);
2150 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2152 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2153 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2154 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2155 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2156 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2157 _MM_TRANSPOSE4_PS(Y,F,G,H);
2158 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2159 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2160 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2164 fscal = _mm_andnot_ps(dummy_mask,fscal);
2166 /* Update vectorial force */
2167 fix1 = _mm_macc_ps(dx10,fscal,fix1);
2168 fiy1 = _mm_macc_ps(dy10,fscal,fiy1);
2169 fiz1 = _mm_macc_ps(dz10,fscal,fiz1);
2171 fjx0 = _mm_macc_ps(dx10,fscal,fjx0);
2172 fjy0 = _mm_macc_ps(dy10,fscal,fjy0);
2173 fjz0 = _mm_macc_ps(dz10,fscal,fjz0);
2175 /**************************
2176 * CALCULATE INTERACTIONS *
2177 **************************/
2179 r11 = _mm_mul_ps(rsq11,rinv11);
2180 r11 = _mm_andnot_ps(dummy_mask,r11);
2182 /* Calculate table index by multiplying r with table scale and truncate to integer */
2183 rt = _mm_mul_ps(r11,vftabscale);
2184 vfitab = _mm_cvttps_epi32(rt);
2186 vfeps = _mm_frcz_ps(rt);
2188 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2190 twovfeps = _mm_add_ps(vfeps,vfeps);
2191 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2193 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2194 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2195 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2196 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2197 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2198 _MM_TRANSPOSE4_PS(Y,F,G,H);
2199 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2200 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2201 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2205 fscal = _mm_andnot_ps(dummy_mask,fscal);
2207 /* Update vectorial force */
2208 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2209 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2210 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2212 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2213 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2214 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2216 /**************************
2217 * CALCULATE INTERACTIONS *
2218 **************************/
2220 r12 = _mm_mul_ps(rsq12,rinv12);
2221 r12 = _mm_andnot_ps(dummy_mask,r12);
2223 /* Calculate table index by multiplying r with table scale and truncate to integer */
2224 rt = _mm_mul_ps(r12,vftabscale);
2225 vfitab = _mm_cvttps_epi32(rt);
2227 vfeps = _mm_frcz_ps(rt);
2229 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2231 twovfeps = _mm_add_ps(vfeps,vfeps);
2232 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2234 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2235 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2236 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2237 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2238 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2239 _MM_TRANSPOSE4_PS(Y,F,G,H);
2240 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2241 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2242 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2246 fscal = _mm_andnot_ps(dummy_mask,fscal);
2248 /* Update vectorial force */
2249 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2250 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2251 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2253 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2254 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2255 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2257 /**************************
2258 * CALCULATE INTERACTIONS *
2259 **************************/
2261 r20 = _mm_mul_ps(rsq20,rinv20);
2262 r20 = _mm_andnot_ps(dummy_mask,r20);
2264 /* Calculate table index by multiplying r with table scale and truncate to integer */
2265 rt = _mm_mul_ps(r20,vftabscale);
2266 vfitab = _mm_cvttps_epi32(rt);
2268 vfeps = _mm_frcz_ps(rt);
2270 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2272 twovfeps = _mm_add_ps(vfeps,vfeps);
2273 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2275 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2276 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2277 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2278 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2279 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2280 _MM_TRANSPOSE4_PS(Y,F,G,H);
2281 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2282 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2283 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2287 fscal = _mm_andnot_ps(dummy_mask,fscal);
2289 /* Update vectorial force */
2290 fix2 = _mm_macc_ps(dx20,fscal,fix2);
2291 fiy2 = _mm_macc_ps(dy20,fscal,fiy2);
2292 fiz2 = _mm_macc_ps(dz20,fscal,fiz2);
2294 fjx0 = _mm_macc_ps(dx20,fscal,fjx0);
2295 fjy0 = _mm_macc_ps(dy20,fscal,fjy0);
2296 fjz0 = _mm_macc_ps(dz20,fscal,fjz0);
2298 /**************************
2299 * CALCULATE INTERACTIONS *
2300 **************************/
2302 r21 = _mm_mul_ps(rsq21,rinv21);
2303 r21 = _mm_andnot_ps(dummy_mask,r21);
2305 /* Calculate table index by multiplying r with table scale and truncate to integer */
2306 rt = _mm_mul_ps(r21,vftabscale);
2307 vfitab = _mm_cvttps_epi32(rt);
2309 vfeps = _mm_frcz_ps(rt);
2311 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2313 twovfeps = _mm_add_ps(vfeps,vfeps);
2314 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2316 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2317 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2318 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2319 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2320 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2321 _MM_TRANSPOSE4_PS(Y,F,G,H);
2322 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2323 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2324 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2328 fscal = _mm_andnot_ps(dummy_mask,fscal);
2330 /* Update vectorial force */
2331 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2332 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2333 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2335 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2336 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2337 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2339 /**************************
2340 * CALCULATE INTERACTIONS *
2341 **************************/
2343 r22 = _mm_mul_ps(rsq22,rinv22);
2344 r22 = _mm_andnot_ps(dummy_mask,r22);
2346 /* Calculate table index by multiplying r with table scale and truncate to integer */
2347 rt = _mm_mul_ps(r22,vftabscale);
2348 vfitab = _mm_cvttps_epi32(rt);
2350 vfeps = _mm_frcz_ps(rt);
2352 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2354 twovfeps = _mm_add_ps(vfeps,vfeps);
2355 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2357 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2358 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2359 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2360 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2361 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2362 _MM_TRANSPOSE4_PS(Y,F,G,H);
2363 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2364 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2365 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2369 fscal = _mm_andnot_ps(dummy_mask,fscal);
2371 /* Update vectorial force */
2372 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2373 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2374 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2376 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2377 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2378 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2380 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2381 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2382 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2383 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2385 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2386 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2388 /* Inner loop uses 409 flops */
2391 /* End of innermost loop */
2393 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2394 f+i_coord_offset,fshift+i_shift_offset);
2396 /* Increment number of inner iterations */
2397 inneriter += j_index_end - j_index_start;
2399 /* Outer loop uses 18 flops */
2402 /* Increment number of outer iterations */
2405 /* Update outer/inner flops */
2407 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*409);