2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse4_1_single
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse4_1_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
97 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
111 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
114 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
115 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
117 __m128i ifour = _mm_set1_epi32(4);
118 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
120 __m128 dummy_mask,cutoff_mask;
121 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
122 __m128 one = _mm_set1_ps(1.0);
123 __m128 two = _mm_set1_ps(2.0);
129 jindex = nlist->jindex;
131 shiftidx = nlist->shift;
133 shiftvec = fr->shift_vec[0];
134 fshift = fr->fshift[0];
135 facel = _mm_set1_ps(fr->ic->epsfac);
136 charge = mdatoms->chargeA;
137 nvdwtype = fr->ntype;
139 vdwtype = mdatoms->typeA;
141 vftab = kernel_data->table_elec_vdw->data;
142 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
144 /* Setup water-specific parameters */
145 inr = nlist->iinr[0];
146 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
147 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
148 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
149 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
151 jq1 = _mm_set1_ps(charge[inr+1]);
152 jq2 = _mm_set1_ps(charge[inr+2]);
153 jq3 = _mm_set1_ps(charge[inr+3]);
154 vdwjidx0A = 2*vdwtype[inr+0];
155 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
156 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
157 qq11 = _mm_mul_ps(iq1,jq1);
158 qq12 = _mm_mul_ps(iq1,jq2);
159 qq13 = _mm_mul_ps(iq1,jq3);
160 qq21 = _mm_mul_ps(iq2,jq1);
161 qq22 = _mm_mul_ps(iq2,jq2);
162 qq23 = _mm_mul_ps(iq2,jq3);
163 qq31 = _mm_mul_ps(iq3,jq1);
164 qq32 = _mm_mul_ps(iq3,jq2);
165 qq33 = _mm_mul_ps(iq3,jq3);
167 /* Avoid stupid compiler warnings */
168 jnrA = jnrB = jnrC = jnrD = 0;
177 for(iidx=0;iidx<4*DIM;iidx++)
182 /* Start outer loop over neighborlists */
183 for(iidx=0; iidx<nri; iidx++)
185 /* Load shift vector for this list */
186 i_shift_offset = DIM*shiftidx[iidx];
188 /* Load limits for loop over neighbors */
189 j_index_start = jindex[iidx];
190 j_index_end = jindex[iidx+1];
192 /* Get outer coordinate index */
194 i_coord_offset = DIM*inr;
196 /* Load i particle coords and add shift vector */
197 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
198 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
200 fix0 = _mm_setzero_ps();
201 fiy0 = _mm_setzero_ps();
202 fiz0 = _mm_setzero_ps();
203 fix1 = _mm_setzero_ps();
204 fiy1 = _mm_setzero_ps();
205 fiz1 = _mm_setzero_ps();
206 fix2 = _mm_setzero_ps();
207 fiy2 = _mm_setzero_ps();
208 fiz2 = _mm_setzero_ps();
209 fix3 = _mm_setzero_ps();
210 fiy3 = _mm_setzero_ps();
211 fiz3 = _mm_setzero_ps();
213 /* Reset potential sums */
214 velecsum = _mm_setzero_ps();
215 vvdwsum = _mm_setzero_ps();
217 /* Start inner kernel loop */
218 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
221 /* Get j neighbor index, and coordinate index */
226 j_coord_offsetA = DIM*jnrA;
227 j_coord_offsetB = DIM*jnrB;
228 j_coord_offsetC = DIM*jnrC;
229 j_coord_offsetD = DIM*jnrD;
231 /* load j atom coordinates */
232 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
233 x+j_coord_offsetC,x+j_coord_offsetD,
234 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
235 &jy2,&jz2,&jx3,&jy3,&jz3);
237 /* Calculate displacement vector */
238 dx00 = _mm_sub_ps(ix0,jx0);
239 dy00 = _mm_sub_ps(iy0,jy0);
240 dz00 = _mm_sub_ps(iz0,jz0);
241 dx11 = _mm_sub_ps(ix1,jx1);
242 dy11 = _mm_sub_ps(iy1,jy1);
243 dz11 = _mm_sub_ps(iz1,jz1);
244 dx12 = _mm_sub_ps(ix1,jx2);
245 dy12 = _mm_sub_ps(iy1,jy2);
246 dz12 = _mm_sub_ps(iz1,jz2);
247 dx13 = _mm_sub_ps(ix1,jx3);
248 dy13 = _mm_sub_ps(iy1,jy3);
249 dz13 = _mm_sub_ps(iz1,jz3);
250 dx21 = _mm_sub_ps(ix2,jx1);
251 dy21 = _mm_sub_ps(iy2,jy1);
252 dz21 = _mm_sub_ps(iz2,jz1);
253 dx22 = _mm_sub_ps(ix2,jx2);
254 dy22 = _mm_sub_ps(iy2,jy2);
255 dz22 = _mm_sub_ps(iz2,jz2);
256 dx23 = _mm_sub_ps(ix2,jx3);
257 dy23 = _mm_sub_ps(iy2,jy3);
258 dz23 = _mm_sub_ps(iz2,jz3);
259 dx31 = _mm_sub_ps(ix3,jx1);
260 dy31 = _mm_sub_ps(iy3,jy1);
261 dz31 = _mm_sub_ps(iz3,jz1);
262 dx32 = _mm_sub_ps(ix3,jx2);
263 dy32 = _mm_sub_ps(iy3,jy2);
264 dz32 = _mm_sub_ps(iz3,jz2);
265 dx33 = _mm_sub_ps(ix3,jx3);
266 dy33 = _mm_sub_ps(iy3,jy3);
267 dz33 = _mm_sub_ps(iz3,jz3);
269 /* Calculate squared distance and things based on it */
270 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
271 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
272 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
273 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
274 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
275 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
276 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
277 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
278 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
279 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
281 rinv00 = sse41_invsqrt_f(rsq00);
282 rinv11 = sse41_invsqrt_f(rsq11);
283 rinv12 = sse41_invsqrt_f(rsq12);
284 rinv13 = sse41_invsqrt_f(rsq13);
285 rinv21 = sse41_invsqrt_f(rsq21);
286 rinv22 = sse41_invsqrt_f(rsq22);
287 rinv23 = sse41_invsqrt_f(rsq23);
288 rinv31 = sse41_invsqrt_f(rsq31);
289 rinv32 = sse41_invsqrt_f(rsq32);
290 rinv33 = sse41_invsqrt_f(rsq33);
292 fjx0 = _mm_setzero_ps();
293 fjy0 = _mm_setzero_ps();
294 fjz0 = _mm_setzero_ps();
295 fjx1 = _mm_setzero_ps();
296 fjy1 = _mm_setzero_ps();
297 fjz1 = _mm_setzero_ps();
298 fjx2 = _mm_setzero_ps();
299 fjy2 = _mm_setzero_ps();
300 fjz2 = _mm_setzero_ps();
301 fjx3 = _mm_setzero_ps();
302 fjy3 = _mm_setzero_ps();
303 fjz3 = _mm_setzero_ps();
305 /**************************
306 * CALCULATE INTERACTIONS *
307 **************************/
309 r00 = _mm_mul_ps(rsq00,rinv00);
311 /* Calculate table index by multiplying r with table scale and truncate to integer */
312 rt = _mm_mul_ps(r00,vftabscale);
313 vfitab = _mm_cvttps_epi32(rt);
314 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
315 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
317 /* CUBIC SPLINE TABLE DISPERSION */
318 vfitab = _mm_add_epi32(vfitab,ifour);
319 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
320 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
321 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
322 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
323 _MM_TRANSPOSE4_PS(Y,F,G,H);
324 Heps = _mm_mul_ps(vfeps,H);
325 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
326 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
327 vvdw6 = _mm_mul_ps(c6_00,VV);
328 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
329 fvdw6 = _mm_mul_ps(c6_00,FF);
331 /* CUBIC SPLINE TABLE REPULSION */
332 vfitab = _mm_add_epi32(vfitab,ifour);
333 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
334 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
335 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
336 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
337 _MM_TRANSPOSE4_PS(Y,F,G,H);
338 Heps = _mm_mul_ps(vfeps,H);
339 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
340 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
341 vvdw12 = _mm_mul_ps(c12_00,VV);
342 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
343 fvdw12 = _mm_mul_ps(c12_00,FF);
344 vvdw = _mm_add_ps(vvdw12,vvdw6);
345 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
347 /* Update potential sum for this i atom from the interaction with this j atom. */
348 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
352 /* Calculate temporary vectorial force */
353 tx = _mm_mul_ps(fscal,dx00);
354 ty = _mm_mul_ps(fscal,dy00);
355 tz = _mm_mul_ps(fscal,dz00);
357 /* Update vectorial force */
358 fix0 = _mm_add_ps(fix0,tx);
359 fiy0 = _mm_add_ps(fiy0,ty);
360 fiz0 = _mm_add_ps(fiz0,tz);
362 fjx0 = _mm_add_ps(fjx0,tx);
363 fjy0 = _mm_add_ps(fjy0,ty);
364 fjz0 = _mm_add_ps(fjz0,tz);
366 /**************************
367 * CALCULATE INTERACTIONS *
368 **************************/
370 r11 = _mm_mul_ps(rsq11,rinv11);
372 /* Calculate table index by multiplying r with table scale and truncate to integer */
373 rt = _mm_mul_ps(r11,vftabscale);
374 vfitab = _mm_cvttps_epi32(rt);
375 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
376 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
378 /* CUBIC SPLINE TABLE ELECTROSTATICS */
379 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
380 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
381 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
382 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
383 _MM_TRANSPOSE4_PS(Y,F,G,H);
384 Heps = _mm_mul_ps(vfeps,H);
385 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
386 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
387 velec = _mm_mul_ps(qq11,VV);
388 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
389 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
391 /* Update potential sum for this i atom from the interaction with this j atom. */
392 velecsum = _mm_add_ps(velecsum,velec);
396 /* Calculate temporary vectorial force */
397 tx = _mm_mul_ps(fscal,dx11);
398 ty = _mm_mul_ps(fscal,dy11);
399 tz = _mm_mul_ps(fscal,dz11);
401 /* Update vectorial force */
402 fix1 = _mm_add_ps(fix1,tx);
403 fiy1 = _mm_add_ps(fiy1,ty);
404 fiz1 = _mm_add_ps(fiz1,tz);
406 fjx1 = _mm_add_ps(fjx1,tx);
407 fjy1 = _mm_add_ps(fjy1,ty);
408 fjz1 = _mm_add_ps(fjz1,tz);
410 /**************************
411 * CALCULATE INTERACTIONS *
412 **************************/
414 r12 = _mm_mul_ps(rsq12,rinv12);
416 /* Calculate table index by multiplying r with table scale and truncate to integer */
417 rt = _mm_mul_ps(r12,vftabscale);
418 vfitab = _mm_cvttps_epi32(rt);
419 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
420 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
422 /* CUBIC SPLINE TABLE ELECTROSTATICS */
423 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
424 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
425 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
426 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
427 _MM_TRANSPOSE4_PS(Y,F,G,H);
428 Heps = _mm_mul_ps(vfeps,H);
429 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
430 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
431 velec = _mm_mul_ps(qq12,VV);
432 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
433 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
435 /* Update potential sum for this i atom from the interaction with this j atom. */
436 velecsum = _mm_add_ps(velecsum,velec);
440 /* Calculate temporary vectorial force */
441 tx = _mm_mul_ps(fscal,dx12);
442 ty = _mm_mul_ps(fscal,dy12);
443 tz = _mm_mul_ps(fscal,dz12);
445 /* Update vectorial force */
446 fix1 = _mm_add_ps(fix1,tx);
447 fiy1 = _mm_add_ps(fiy1,ty);
448 fiz1 = _mm_add_ps(fiz1,tz);
450 fjx2 = _mm_add_ps(fjx2,tx);
451 fjy2 = _mm_add_ps(fjy2,ty);
452 fjz2 = _mm_add_ps(fjz2,tz);
454 /**************************
455 * CALCULATE INTERACTIONS *
456 **************************/
458 r13 = _mm_mul_ps(rsq13,rinv13);
460 /* Calculate table index by multiplying r with table scale and truncate to integer */
461 rt = _mm_mul_ps(r13,vftabscale);
462 vfitab = _mm_cvttps_epi32(rt);
463 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
464 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
466 /* CUBIC SPLINE TABLE ELECTROSTATICS */
467 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
468 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
469 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
470 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
471 _MM_TRANSPOSE4_PS(Y,F,G,H);
472 Heps = _mm_mul_ps(vfeps,H);
473 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
474 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
475 velec = _mm_mul_ps(qq13,VV);
476 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
477 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
479 /* Update potential sum for this i atom from the interaction with this j atom. */
480 velecsum = _mm_add_ps(velecsum,velec);
484 /* Calculate temporary vectorial force */
485 tx = _mm_mul_ps(fscal,dx13);
486 ty = _mm_mul_ps(fscal,dy13);
487 tz = _mm_mul_ps(fscal,dz13);
489 /* Update vectorial force */
490 fix1 = _mm_add_ps(fix1,tx);
491 fiy1 = _mm_add_ps(fiy1,ty);
492 fiz1 = _mm_add_ps(fiz1,tz);
494 fjx3 = _mm_add_ps(fjx3,tx);
495 fjy3 = _mm_add_ps(fjy3,ty);
496 fjz3 = _mm_add_ps(fjz3,tz);
498 /**************************
499 * CALCULATE INTERACTIONS *
500 **************************/
502 r21 = _mm_mul_ps(rsq21,rinv21);
504 /* Calculate table index by multiplying r with table scale and truncate to integer */
505 rt = _mm_mul_ps(r21,vftabscale);
506 vfitab = _mm_cvttps_epi32(rt);
507 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
508 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
510 /* CUBIC SPLINE TABLE ELECTROSTATICS */
511 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
512 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
513 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
514 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
515 _MM_TRANSPOSE4_PS(Y,F,G,H);
516 Heps = _mm_mul_ps(vfeps,H);
517 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
518 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
519 velec = _mm_mul_ps(qq21,VV);
520 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
521 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
523 /* Update potential sum for this i atom from the interaction with this j atom. */
524 velecsum = _mm_add_ps(velecsum,velec);
528 /* Calculate temporary vectorial force */
529 tx = _mm_mul_ps(fscal,dx21);
530 ty = _mm_mul_ps(fscal,dy21);
531 tz = _mm_mul_ps(fscal,dz21);
533 /* Update vectorial force */
534 fix2 = _mm_add_ps(fix2,tx);
535 fiy2 = _mm_add_ps(fiy2,ty);
536 fiz2 = _mm_add_ps(fiz2,tz);
538 fjx1 = _mm_add_ps(fjx1,tx);
539 fjy1 = _mm_add_ps(fjy1,ty);
540 fjz1 = _mm_add_ps(fjz1,tz);
542 /**************************
543 * CALCULATE INTERACTIONS *
544 **************************/
546 r22 = _mm_mul_ps(rsq22,rinv22);
548 /* Calculate table index by multiplying r with table scale and truncate to integer */
549 rt = _mm_mul_ps(r22,vftabscale);
550 vfitab = _mm_cvttps_epi32(rt);
551 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
552 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
554 /* CUBIC SPLINE TABLE ELECTROSTATICS */
555 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
556 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
557 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
558 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
559 _MM_TRANSPOSE4_PS(Y,F,G,H);
560 Heps = _mm_mul_ps(vfeps,H);
561 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
562 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
563 velec = _mm_mul_ps(qq22,VV);
564 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
565 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
567 /* Update potential sum for this i atom from the interaction with this j atom. */
568 velecsum = _mm_add_ps(velecsum,velec);
572 /* Calculate temporary vectorial force */
573 tx = _mm_mul_ps(fscal,dx22);
574 ty = _mm_mul_ps(fscal,dy22);
575 tz = _mm_mul_ps(fscal,dz22);
577 /* Update vectorial force */
578 fix2 = _mm_add_ps(fix2,tx);
579 fiy2 = _mm_add_ps(fiy2,ty);
580 fiz2 = _mm_add_ps(fiz2,tz);
582 fjx2 = _mm_add_ps(fjx2,tx);
583 fjy2 = _mm_add_ps(fjy2,ty);
584 fjz2 = _mm_add_ps(fjz2,tz);
586 /**************************
587 * CALCULATE INTERACTIONS *
588 **************************/
590 r23 = _mm_mul_ps(rsq23,rinv23);
592 /* Calculate table index by multiplying r with table scale and truncate to integer */
593 rt = _mm_mul_ps(r23,vftabscale);
594 vfitab = _mm_cvttps_epi32(rt);
595 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
596 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
598 /* CUBIC SPLINE TABLE ELECTROSTATICS */
599 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
600 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
601 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
602 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
603 _MM_TRANSPOSE4_PS(Y,F,G,H);
604 Heps = _mm_mul_ps(vfeps,H);
605 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
606 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
607 velec = _mm_mul_ps(qq23,VV);
608 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
609 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velecsum = _mm_add_ps(velecsum,velec);
616 /* Calculate temporary vectorial force */
617 tx = _mm_mul_ps(fscal,dx23);
618 ty = _mm_mul_ps(fscal,dy23);
619 tz = _mm_mul_ps(fscal,dz23);
621 /* Update vectorial force */
622 fix2 = _mm_add_ps(fix2,tx);
623 fiy2 = _mm_add_ps(fiy2,ty);
624 fiz2 = _mm_add_ps(fiz2,tz);
626 fjx3 = _mm_add_ps(fjx3,tx);
627 fjy3 = _mm_add_ps(fjy3,ty);
628 fjz3 = _mm_add_ps(fjz3,tz);
630 /**************************
631 * CALCULATE INTERACTIONS *
632 **************************/
634 r31 = _mm_mul_ps(rsq31,rinv31);
636 /* Calculate table index by multiplying r with table scale and truncate to integer */
637 rt = _mm_mul_ps(r31,vftabscale);
638 vfitab = _mm_cvttps_epi32(rt);
639 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
640 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
642 /* CUBIC SPLINE TABLE ELECTROSTATICS */
643 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
644 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
645 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
646 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
647 _MM_TRANSPOSE4_PS(Y,F,G,H);
648 Heps = _mm_mul_ps(vfeps,H);
649 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
650 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
651 velec = _mm_mul_ps(qq31,VV);
652 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
653 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
655 /* Update potential sum for this i atom from the interaction with this j atom. */
656 velecsum = _mm_add_ps(velecsum,velec);
660 /* Calculate temporary vectorial force */
661 tx = _mm_mul_ps(fscal,dx31);
662 ty = _mm_mul_ps(fscal,dy31);
663 tz = _mm_mul_ps(fscal,dz31);
665 /* Update vectorial force */
666 fix3 = _mm_add_ps(fix3,tx);
667 fiy3 = _mm_add_ps(fiy3,ty);
668 fiz3 = _mm_add_ps(fiz3,tz);
670 fjx1 = _mm_add_ps(fjx1,tx);
671 fjy1 = _mm_add_ps(fjy1,ty);
672 fjz1 = _mm_add_ps(fjz1,tz);
674 /**************************
675 * CALCULATE INTERACTIONS *
676 **************************/
678 r32 = _mm_mul_ps(rsq32,rinv32);
680 /* Calculate table index by multiplying r with table scale and truncate to integer */
681 rt = _mm_mul_ps(r32,vftabscale);
682 vfitab = _mm_cvttps_epi32(rt);
683 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
684 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
686 /* CUBIC SPLINE TABLE ELECTROSTATICS */
687 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
688 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
689 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
690 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
691 _MM_TRANSPOSE4_PS(Y,F,G,H);
692 Heps = _mm_mul_ps(vfeps,H);
693 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
694 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
695 velec = _mm_mul_ps(qq32,VV);
696 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
697 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
699 /* Update potential sum for this i atom from the interaction with this j atom. */
700 velecsum = _mm_add_ps(velecsum,velec);
704 /* Calculate temporary vectorial force */
705 tx = _mm_mul_ps(fscal,dx32);
706 ty = _mm_mul_ps(fscal,dy32);
707 tz = _mm_mul_ps(fscal,dz32);
709 /* Update vectorial force */
710 fix3 = _mm_add_ps(fix3,tx);
711 fiy3 = _mm_add_ps(fiy3,ty);
712 fiz3 = _mm_add_ps(fiz3,tz);
714 fjx2 = _mm_add_ps(fjx2,tx);
715 fjy2 = _mm_add_ps(fjy2,ty);
716 fjz2 = _mm_add_ps(fjz2,tz);
718 /**************************
719 * CALCULATE INTERACTIONS *
720 **************************/
722 r33 = _mm_mul_ps(rsq33,rinv33);
724 /* Calculate table index by multiplying r with table scale and truncate to integer */
725 rt = _mm_mul_ps(r33,vftabscale);
726 vfitab = _mm_cvttps_epi32(rt);
727 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
728 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
730 /* CUBIC SPLINE TABLE ELECTROSTATICS */
731 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
732 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
733 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
734 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
735 _MM_TRANSPOSE4_PS(Y,F,G,H);
736 Heps = _mm_mul_ps(vfeps,H);
737 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
738 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
739 velec = _mm_mul_ps(qq33,VV);
740 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
741 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
743 /* Update potential sum for this i atom from the interaction with this j atom. */
744 velecsum = _mm_add_ps(velecsum,velec);
748 /* Calculate temporary vectorial force */
749 tx = _mm_mul_ps(fscal,dx33);
750 ty = _mm_mul_ps(fscal,dy33);
751 tz = _mm_mul_ps(fscal,dz33);
753 /* Update vectorial force */
754 fix3 = _mm_add_ps(fix3,tx);
755 fiy3 = _mm_add_ps(fiy3,ty);
756 fiz3 = _mm_add_ps(fiz3,tz);
758 fjx3 = _mm_add_ps(fjx3,tx);
759 fjy3 = _mm_add_ps(fjy3,ty);
760 fjz3 = _mm_add_ps(fjz3,tz);
762 fjptrA = f+j_coord_offsetA;
763 fjptrB = f+j_coord_offsetB;
764 fjptrC = f+j_coord_offsetC;
765 fjptrD = f+j_coord_offsetD;
767 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
768 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
769 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
771 /* Inner loop uses 446 flops */
777 /* Get j neighbor index, and coordinate index */
778 jnrlistA = jjnr[jidx];
779 jnrlistB = jjnr[jidx+1];
780 jnrlistC = jjnr[jidx+2];
781 jnrlistD = jjnr[jidx+3];
782 /* Sign of each element will be negative for non-real atoms.
783 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
784 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
786 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
787 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
788 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
789 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
790 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
791 j_coord_offsetA = DIM*jnrA;
792 j_coord_offsetB = DIM*jnrB;
793 j_coord_offsetC = DIM*jnrC;
794 j_coord_offsetD = DIM*jnrD;
796 /* load j atom coordinates */
797 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
798 x+j_coord_offsetC,x+j_coord_offsetD,
799 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
800 &jy2,&jz2,&jx3,&jy3,&jz3);
802 /* Calculate displacement vector */
803 dx00 = _mm_sub_ps(ix0,jx0);
804 dy00 = _mm_sub_ps(iy0,jy0);
805 dz00 = _mm_sub_ps(iz0,jz0);
806 dx11 = _mm_sub_ps(ix1,jx1);
807 dy11 = _mm_sub_ps(iy1,jy1);
808 dz11 = _mm_sub_ps(iz1,jz1);
809 dx12 = _mm_sub_ps(ix1,jx2);
810 dy12 = _mm_sub_ps(iy1,jy2);
811 dz12 = _mm_sub_ps(iz1,jz2);
812 dx13 = _mm_sub_ps(ix1,jx3);
813 dy13 = _mm_sub_ps(iy1,jy3);
814 dz13 = _mm_sub_ps(iz1,jz3);
815 dx21 = _mm_sub_ps(ix2,jx1);
816 dy21 = _mm_sub_ps(iy2,jy1);
817 dz21 = _mm_sub_ps(iz2,jz1);
818 dx22 = _mm_sub_ps(ix2,jx2);
819 dy22 = _mm_sub_ps(iy2,jy2);
820 dz22 = _mm_sub_ps(iz2,jz2);
821 dx23 = _mm_sub_ps(ix2,jx3);
822 dy23 = _mm_sub_ps(iy2,jy3);
823 dz23 = _mm_sub_ps(iz2,jz3);
824 dx31 = _mm_sub_ps(ix3,jx1);
825 dy31 = _mm_sub_ps(iy3,jy1);
826 dz31 = _mm_sub_ps(iz3,jz1);
827 dx32 = _mm_sub_ps(ix3,jx2);
828 dy32 = _mm_sub_ps(iy3,jy2);
829 dz32 = _mm_sub_ps(iz3,jz2);
830 dx33 = _mm_sub_ps(ix3,jx3);
831 dy33 = _mm_sub_ps(iy3,jy3);
832 dz33 = _mm_sub_ps(iz3,jz3);
834 /* Calculate squared distance and things based on it */
835 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
836 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
837 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
838 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
839 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
840 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
841 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
842 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
843 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
844 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
846 rinv00 = sse41_invsqrt_f(rsq00);
847 rinv11 = sse41_invsqrt_f(rsq11);
848 rinv12 = sse41_invsqrt_f(rsq12);
849 rinv13 = sse41_invsqrt_f(rsq13);
850 rinv21 = sse41_invsqrt_f(rsq21);
851 rinv22 = sse41_invsqrt_f(rsq22);
852 rinv23 = sse41_invsqrt_f(rsq23);
853 rinv31 = sse41_invsqrt_f(rsq31);
854 rinv32 = sse41_invsqrt_f(rsq32);
855 rinv33 = sse41_invsqrt_f(rsq33);
857 fjx0 = _mm_setzero_ps();
858 fjy0 = _mm_setzero_ps();
859 fjz0 = _mm_setzero_ps();
860 fjx1 = _mm_setzero_ps();
861 fjy1 = _mm_setzero_ps();
862 fjz1 = _mm_setzero_ps();
863 fjx2 = _mm_setzero_ps();
864 fjy2 = _mm_setzero_ps();
865 fjz2 = _mm_setzero_ps();
866 fjx3 = _mm_setzero_ps();
867 fjy3 = _mm_setzero_ps();
868 fjz3 = _mm_setzero_ps();
870 /**************************
871 * CALCULATE INTERACTIONS *
872 **************************/
874 r00 = _mm_mul_ps(rsq00,rinv00);
875 r00 = _mm_andnot_ps(dummy_mask,r00);
877 /* Calculate table index by multiplying r with table scale and truncate to integer */
878 rt = _mm_mul_ps(r00,vftabscale);
879 vfitab = _mm_cvttps_epi32(rt);
880 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
881 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
883 /* CUBIC SPLINE TABLE DISPERSION */
884 vfitab = _mm_add_epi32(vfitab,ifour);
885 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
886 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
887 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
888 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
889 _MM_TRANSPOSE4_PS(Y,F,G,H);
890 Heps = _mm_mul_ps(vfeps,H);
891 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
892 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
893 vvdw6 = _mm_mul_ps(c6_00,VV);
894 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
895 fvdw6 = _mm_mul_ps(c6_00,FF);
897 /* CUBIC SPLINE TABLE REPULSION */
898 vfitab = _mm_add_epi32(vfitab,ifour);
899 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
900 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
901 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
902 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
903 _MM_TRANSPOSE4_PS(Y,F,G,H);
904 Heps = _mm_mul_ps(vfeps,H);
905 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
906 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
907 vvdw12 = _mm_mul_ps(c12_00,VV);
908 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
909 fvdw12 = _mm_mul_ps(c12_00,FF);
910 vvdw = _mm_add_ps(vvdw12,vvdw6);
911 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
913 /* Update potential sum for this i atom from the interaction with this j atom. */
914 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
915 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
919 fscal = _mm_andnot_ps(dummy_mask,fscal);
921 /* Calculate temporary vectorial force */
922 tx = _mm_mul_ps(fscal,dx00);
923 ty = _mm_mul_ps(fscal,dy00);
924 tz = _mm_mul_ps(fscal,dz00);
926 /* Update vectorial force */
927 fix0 = _mm_add_ps(fix0,tx);
928 fiy0 = _mm_add_ps(fiy0,ty);
929 fiz0 = _mm_add_ps(fiz0,tz);
931 fjx0 = _mm_add_ps(fjx0,tx);
932 fjy0 = _mm_add_ps(fjy0,ty);
933 fjz0 = _mm_add_ps(fjz0,tz);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 r11 = _mm_mul_ps(rsq11,rinv11);
940 r11 = _mm_andnot_ps(dummy_mask,r11);
942 /* Calculate table index by multiplying r with table scale and truncate to integer */
943 rt = _mm_mul_ps(r11,vftabscale);
944 vfitab = _mm_cvttps_epi32(rt);
945 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
946 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
948 /* CUBIC SPLINE TABLE ELECTROSTATICS */
949 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
950 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
951 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
952 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
953 _MM_TRANSPOSE4_PS(Y,F,G,H);
954 Heps = _mm_mul_ps(vfeps,H);
955 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
956 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
957 velec = _mm_mul_ps(qq11,VV);
958 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
959 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
961 /* Update potential sum for this i atom from the interaction with this j atom. */
962 velec = _mm_andnot_ps(dummy_mask,velec);
963 velecsum = _mm_add_ps(velecsum,velec);
967 fscal = _mm_andnot_ps(dummy_mask,fscal);
969 /* Calculate temporary vectorial force */
970 tx = _mm_mul_ps(fscal,dx11);
971 ty = _mm_mul_ps(fscal,dy11);
972 tz = _mm_mul_ps(fscal,dz11);
974 /* Update vectorial force */
975 fix1 = _mm_add_ps(fix1,tx);
976 fiy1 = _mm_add_ps(fiy1,ty);
977 fiz1 = _mm_add_ps(fiz1,tz);
979 fjx1 = _mm_add_ps(fjx1,tx);
980 fjy1 = _mm_add_ps(fjy1,ty);
981 fjz1 = _mm_add_ps(fjz1,tz);
983 /**************************
984 * CALCULATE INTERACTIONS *
985 **************************/
987 r12 = _mm_mul_ps(rsq12,rinv12);
988 r12 = _mm_andnot_ps(dummy_mask,r12);
990 /* Calculate table index by multiplying r with table scale and truncate to integer */
991 rt = _mm_mul_ps(r12,vftabscale);
992 vfitab = _mm_cvttps_epi32(rt);
993 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
994 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
996 /* CUBIC SPLINE TABLE ELECTROSTATICS */
997 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
998 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
999 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1000 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1001 _MM_TRANSPOSE4_PS(Y,F,G,H);
1002 Heps = _mm_mul_ps(vfeps,H);
1003 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1004 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1005 velec = _mm_mul_ps(qq12,VV);
1006 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1007 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1009 /* Update potential sum for this i atom from the interaction with this j atom. */
1010 velec = _mm_andnot_ps(dummy_mask,velec);
1011 velecsum = _mm_add_ps(velecsum,velec);
1015 fscal = _mm_andnot_ps(dummy_mask,fscal);
1017 /* Calculate temporary vectorial force */
1018 tx = _mm_mul_ps(fscal,dx12);
1019 ty = _mm_mul_ps(fscal,dy12);
1020 tz = _mm_mul_ps(fscal,dz12);
1022 /* Update vectorial force */
1023 fix1 = _mm_add_ps(fix1,tx);
1024 fiy1 = _mm_add_ps(fiy1,ty);
1025 fiz1 = _mm_add_ps(fiz1,tz);
1027 fjx2 = _mm_add_ps(fjx2,tx);
1028 fjy2 = _mm_add_ps(fjy2,ty);
1029 fjz2 = _mm_add_ps(fjz2,tz);
1031 /**************************
1032 * CALCULATE INTERACTIONS *
1033 **************************/
1035 r13 = _mm_mul_ps(rsq13,rinv13);
1036 r13 = _mm_andnot_ps(dummy_mask,r13);
1038 /* Calculate table index by multiplying r with table scale and truncate to integer */
1039 rt = _mm_mul_ps(r13,vftabscale);
1040 vfitab = _mm_cvttps_epi32(rt);
1041 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1042 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1044 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1045 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1046 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1047 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1048 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1049 _MM_TRANSPOSE4_PS(Y,F,G,H);
1050 Heps = _mm_mul_ps(vfeps,H);
1051 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1052 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1053 velec = _mm_mul_ps(qq13,VV);
1054 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1055 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1057 /* Update potential sum for this i atom from the interaction with this j atom. */
1058 velec = _mm_andnot_ps(dummy_mask,velec);
1059 velecsum = _mm_add_ps(velecsum,velec);
1063 fscal = _mm_andnot_ps(dummy_mask,fscal);
1065 /* Calculate temporary vectorial force */
1066 tx = _mm_mul_ps(fscal,dx13);
1067 ty = _mm_mul_ps(fscal,dy13);
1068 tz = _mm_mul_ps(fscal,dz13);
1070 /* Update vectorial force */
1071 fix1 = _mm_add_ps(fix1,tx);
1072 fiy1 = _mm_add_ps(fiy1,ty);
1073 fiz1 = _mm_add_ps(fiz1,tz);
1075 fjx3 = _mm_add_ps(fjx3,tx);
1076 fjy3 = _mm_add_ps(fjy3,ty);
1077 fjz3 = _mm_add_ps(fjz3,tz);
1079 /**************************
1080 * CALCULATE INTERACTIONS *
1081 **************************/
1083 r21 = _mm_mul_ps(rsq21,rinv21);
1084 r21 = _mm_andnot_ps(dummy_mask,r21);
1086 /* Calculate table index by multiplying r with table scale and truncate to integer */
1087 rt = _mm_mul_ps(r21,vftabscale);
1088 vfitab = _mm_cvttps_epi32(rt);
1089 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1090 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1092 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1093 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1094 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1095 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1096 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1097 _MM_TRANSPOSE4_PS(Y,F,G,H);
1098 Heps = _mm_mul_ps(vfeps,H);
1099 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1100 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1101 velec = _mm_mul_ps(qq21,VV);
1102 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1103 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1105 /* Update potential sum for this i atom from the interaction with this j atom. */
1106 velec = _mm_andnot_ps(dummy_mask,velec);
1107 velecsum = _mm_add_ps(velecsum,velec);
1111 fscal = _mm_andnot_ps(dummy_mask,fscal);
1113 /* Calculate temporary vectorial force */
1114 tx = _mm_mul_ps(fscal,dx21);
1115 ty = _mm_mul_ps(fscal,dy21);
1116 tz = _mm_mul_ps(fscal,dz21);
1118 /* Update vectorial force */
1119 fix2 = _mm_add_ps(fix2,tx);
1120 fiy2 = _mm_add_ps(fiy2,ty);
1121 fiz2 = _mm_add_ps(fiz2,tz);
1123 fjx1 = _mm_add_ps(fjx1,tx);
1124 fjy1 = _mm_add_ps(fjy1,ty);
1125 fjz1 = _mm_add_ps(fjz1,tz);
1127 /**************************
1128 * CALCULATE INTERACTIONS *
1129 **************************/
1131 r22 = _mm_mul_ps(rsq22,rinv22);
1132 r22 = _mm_andnot_ps(dummy_mask,r22);
1134 /* Calculate table index by multiplying r with table scale and truncate to integer */
1135 rt = _mm_mul_ps(r22,vftabscale);
1136 vfitab = _mm_cvttps_epi32(rt);
1137 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1138 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1140 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1141 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1142 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1143 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1144 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1145 _MM_TRANSPOSE4_PS(Y,F,G,H);
1146 Heps = _mm_mul_ps(vfeps,H);
1147 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1148 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1149 velec = _mm_mul_ps(qq22,VV);
1150 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1151 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1153 /* Update potential sum for this i atom from the interaction with this j atom. */
1154 velec = _mm_andnot_ps(dummy_mask,velec);
1155 velecsum = _mm_add_ps(velecsum,velec);
1159 fscal = _mm_andnot_ps(dummy_mask,fscal);
1161 /* Calculate temporary vectorial force */
1162 tx = _mm_mul_ps(fscal,dx22);
1163 ty = _mm_mul_ps(fscal,dy22);
1164 tz = _mm_mul_ps(fscal,dz22);
1166 /* Update vectorial force */
1167 fix2 = _mm_add_ps(fix2,tx);
1168 fiy2 = _mm_add_ps(fiy2,ty);
1169 fiz2 = _mm_add_ps(fiz2,tz);
1171 fjx2 = _mm_add_ps(fjx2,tx);
1172 fjy2 = _mm_add_ps(fjy2,ty);
1173 fjz2 = _mm_add_ps(fjz2,tz);
1175 /**************************
1176 * CALCULATE INTERACTIONS *
1177 **************************/
1179 r23 = _mm_mul_ps(rsq23,rinv23);
1180 r23 = _mm_andnot_ps(dummy_mask,r23);
1182 /* Calculate table index by multiplying r with table scale and truncate to integer */
1183 rt = _mm_mul_ps(r23,vftabscale);
1184 vfitab = _mm_cvttps_epi32(rt);
1185 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1186 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1188 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1189 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1190 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1191 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1192 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1193 _MM_TRANSPOSE4_PS(Y,F,G,H);
1194 Heps = _mm_mul_ps(vfeps,H);
1195 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1196 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1197 velec = _mm_mul_ps(qq23,VV);
1198 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1199 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1201 /* Update potential sum for this i atom from the interaction with this j atom. */
1202 velec = _mm_andnot_ps(dummy_mask,velec);
1203 velecsum = _mm_add_ps(velecsum,velec);
1207 fscal = _mm_andnot_ps(dummy_mask,fscal);
1209 /* Calculate temporary vectorial force */
1210 tx = _mm_mul_ps(fscal,dx23);
1211 ty = _mm_mul_ps(fscal,dy23);
1212 tz = _mm_mul_ps(fscal,dz23);
1214 /* Update vectorial force */
1215 fix2 = _mm_add_ps(fix2,tx);
1216 fiy2 = _mm_add_ps(fiy2,ty);
1217 fiz2 = _mm_add_ps(fiz2,tz);
1219 fjx3 = _mm_add_ps(fjx3,tx);
1220 fjy3 = _mm_add_ps(fjy3,ty);
1221 fjz3 = _mm_add_ps(fjz3,tz);
1223 /**************************
1224 * CALCULATE INTERACTIONS *
1225 **************************/
1227 r31 = _mm_mul_ps(rsq31,rinv31);
1228 r31 = _mm_andnot_ps(dummy_mask,r31);
1230 /* Calculate table index by multiplying r with table scale and truncate to integer */
1231 rt = _mm_mul_ps(r31,vftabscale);
1232 vfitab = _mm_cvttps_epi32(rt);
1233 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1234 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1236 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1237 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1238 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1239 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1240 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1241 _MM_TRANSPOSE4_PS(Y,F,G,H);
1242 Heps = _mm_mul_ps(vfeps,H);
1243 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1244 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1245 velec = _mm_mul_ps(qq31,VV);
1246 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1247 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1249 /* Update potential sum for this i atom from the interaction with this j atom. */
1250 velec = _mm_andnot_ps(dummy_mask,velec);
1251 velecsum = _mm_add_ps(velecsum,velec);
1255 fscal = _mm_andnot_ps(dummy_mask,fscal);
1257 /* Calculate temporary vectorial force */
1258 tx = _mm_mul_ps(fscal,dx31);
1259 ty = _mm_mul_ps(fscal,dy31);
1260 tz = _mm_mul_ps(fscal,dz31);
1262 /* Update vectorial force */
1263 fix3 = _mm_add_ps(fix3,tx);
1264 fiy3 = _mm_add_ps(fiy3,ty);
1265 fiz3 = _mm_add_ps(fiz3,tz);
1267 fjx1 = _mm_add_ps(fjx1,tx);
1268 fjy1 = _mm_add_ps(fjy1,ty);
1269 fjz1 = _mm_add_ps(fjz1,tz);
1271 /**************************
1272 * CALCULATE INTERACTIONS *
1273 **************************/
1275 r32 = _mm_mul_ps(rsq32,rinv32);
1276 r32 = _mm_andnot_ps(dummy_mask,r32);
1278 /* Calculate table index by multiplying r with table scale and truncate to integer */
1279 rt = _mm_mul_ps(r32,vftabscale);
1280 vfitab = _mm_cvttps_epi32(rt);
1281 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1282 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1284 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1285 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1286 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1287 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1288 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1289 _MM_TRANSPOSE4_PS(Y,F,G,H);
1290 Heps = _mm_mul_ps(vfeps,H);
1291 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1292 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1293 velec = _mm_mul_ps(qq32,VV);
1294 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1295 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1297 /* Update potential sum for this i atom from the interaction with this j atom. */
1298 velec = _mm_andnot_ps(dummy_mask,velec);
1299 velecsum = _mm_add_ps(velecsum,velec);
1303 fscal = _mm_andnot_ps(dummy_mask,fscal);
1305 /* Calculate temporary vectorial force */
1306 tx = _mm_mul_ps(fscal,dx32);
1307 ty = _mm_mul_ps(fscal,dy32);
1308 tz = _mm_mul_ps(fscal,dz32);
1310 /* Update vectorial force */
1311 fix3 = _mm_add_ps(fix3,tx);
1312 fiy3 = _mm_add_ps(fiy3,ty);
1313 fiz3 = _mm_add_ps(fiz3,tz);
1315 fjx2 = _mm_add_ps(fjx2,tx);
1316 fjy2 = _mm_add_ps(fjy2,ty);
1317 fjz2 = _mm_add_ps(fjz2,tz);
1319 /**************************
1320 * CALCULATE INTERACTIONS *
1321 **************************/
1323 r33 = _mm_mul_ps(rsq33,rinv33);
1324 r33 = _mm_andnot_ps(dummy_mask,r33);
1326 /* Calculate table index by multiplying r with table scale and truncate to integer */
1327 rt = _mm_mul_ps(r33,vftabscale);
1328 vfitab = _mm_cvttps_epi32(rt);
1329 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1330 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1332 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1333 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1334 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1335 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1336 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1337 _MM_TRANSPOSE4_PS(Y,F,G,H);
1338 Heps = _mm_mul_ps(vfeps,H);
1339 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1340 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1341 velec = _mm_mul_ps(qq33,VV);
1342 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1343 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1345 /* Update potential sum for this i atom from the interaction with this j atom. */
1346 velec = _mm_andnot_ps(dummy_mask,velec);
1347 velecsum = _mm_add_ps(velecsum,velec);
1351 fscal = _mm_andnot_ps(dummy_mask,fscal);
1353 /* Calculate temporary vectorial force */
1354 tx = _mm_mul_ps(fscal,dx33);
1355 ty = _mm_mul_ps(fscal,dy33);
1356 tz = _mm_mul_ps(fscal,dz33);
1358 /* Update vectorial force */
1359 fix3 = _mm_add_ps(fix3,tx);
1360 fiy3 = _mm_add_ps(fiy3,ty);
1361 fiz3 = _mm_add_ps(fiz3,tz);
1363 fjx3 = _mm_add_ps(fjx3,tx);
1364 fjy3 = _mm_add_ps(fjy3,ty);
1365 fjz3 = _mm_add_ps(fjz3,tz);
1367 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1368 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1369 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1370 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1372 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1373 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1374 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1376 /* Inner loop uses 456 flops */
1379 /* End of innermost loop */
1381 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1382 f+i_coord_offset,fshift+i_shift_offset);
1385 /* Update potential energies */
1386 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1387 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1389 /* Increment number of inner iterations */
1390 inneriter += j_index_end - j_index_start;
1392 /* Outer loop uses 26 flops */
1395 /* Increment number of outer iterations */
1398 /* Update outer/inner flops */
1400 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*456);
1403 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_single
1404 * Electrostatics interaction: CubicSplineTable
1405 * VdW interaction: CubicSplineTable
1406 * Geometry: Water4-Water4
1407 * Calculate force/pot: Force
1410 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_single
1411 (t_nblist * gmx_restrict nlist,
1412 rvec * gmx_restrict xx,
1413 rvec * gmx_restrict ff,
1414 struct t_forcerec * gmx_restrict fr,
1415 t_mdatoms * gmx_restrict mdatoms,
1416 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1417 t_nrnb * gmx_restrict nrnb)
1419 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1420 * just 0 for non-waters.
1421 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1422 * jnr indices corresponding to data put in the four positions in the SIMD register.
1424 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1425 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1426 int jnrA,jnrB,jnrC,jnrD;
1427 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1428 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1429 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1430 real rcutoff_scalar;
1431 real *shiftvec,*fshift,*x,*f;
1432 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1433 real scratch[4*DIM];
1434 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1436 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1438 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1440 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1442 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1443 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1444 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1445 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1446 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1447 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1448 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1449 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1450 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1451 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1452 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1453 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1454 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1455 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1456 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1457 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1458 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1459 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1460 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1461 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1464 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1467 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1468 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1470 __m128i ifour = _mm_set1_epi32(4);
1471 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1473 __m128 dummy_mask,cutoff_mask;
1474 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1475 __m128 one = _mm_set1_ps(1.0);
1476 __m128 two = _mm_set1_ps(2.0);
1482 jindex = nlist->jindex;
1484 shiftidx = nlist->shift;
1486 shiftvec = fr->shift_vec[0];
1487 fshift = fr->fshift[0];
1488 facel = _mm_set1_ps(fr->ic->epsfac);
1489 charge = mdatoms->chargeA;
1490 nvdwtype = fr->ntype;
1491 vdwparam = fr->nbfp;
1492 vdwtype = mdatoms->typeA;
1494 vftab = kernel_data->table_elec_vdw->data;
1495 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1497 /* Setup water-specific parameters */
1498 inr = nlist->iinr[0];
1499 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1500 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1501 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1502 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1504 jq1 = _mm_set1_ps(charge[inr+1]);
1505 jq2 = _mm_set1_ps(charge[inr+2]);
1506 jq3 = _mm_set1_ps(charge[inr+3]);
1507 vdwjidx0A = 2*vdwtype[inr+0];
1508 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1509 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1510 qq11 = _mm_mul_ps(iq1,jq1);
1511 qq12 = _mm_mul_ps(iq1,jq2);
1512 qq13 = _mm_mul_ps(iq1,jq3);
1513 qq21 = _mm_mul_ps(iq2,jq1);
1514 qq22 = _mm_mul_ps(iq2,jq2);
1515 qq23 = _mm_mul_ps(iq2,jq3);
1516 qq31 = _mm_mul_ps(iq3,jq1);
1517 qq32 = _mm_mul_ps(iq3,jq2);
1518 qq33 = _mm_mul_ps(iq3,jq3);
1520 /* Avoid stupid compiler warnings */
1521 jnrA = jnrB = jnrC = jnrD = 0;
1522 j_coord_offsetA = 0;
1523 j_coord_offsetB = 0;
1524 j_coord_offsetC = 0;
1525 j_coord_offsetD = 0;
1530 for(iidx=0;iidx<4*DIM;iidx++)
1532 scratch[iidx] = 0.0;
1535 /* Start outer loop over neighborlists */
1536 for(iidx=0; iidx<nri; iidx++)
1538 /* Load shift vector for this list */
1539 i_shift_offset = DIM*shiftidx[iidx];
1541 /* Load limits for loop over neighbors */
1542 j_index_start = jindex[iidx];
1543 j_index_end = jindex[iidx+1];
1545 /* Get outer coordinate index */
1547 i_coord_offset = DIM*inr;
1549 /* Load i particle coords and add shift vector */
1550 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1551 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1553 fix0 = _mm_setzero_ps();
1554 fiy0 = _mm_setzero_ps();
1555 fiz0 = _mm_setzero_ps();
1556 fix1 = _mm_setzero_ps();
1557 fiy1 = _mm_setzero_ps();
1558 fiz1 = _mm_setzero_ps();
1559 fix2 = _mm_setzero_ps();
1560 fiy2 = _mm_setzero_ps();
1561 fiz2 = _mm_setzero_ps();
1562 fix3 = _mm_setzero_ps();
1563 fiy3 = _mm_setzero_ps();
1564 fiz3 = _mm_setzero_ps();
1566 /* Start inner kernel loop */
1567 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1570 /* Get j neighbor index, and coordinate index */
1572 jnrB = jjnr[jidx+1];
1573 jnrC = jjnr[jidx+2];
1574 jnrD = jjnr[jidx+3];
1575 j_coord_offsetA = DIM*jnrA;
1576 j_coord_offsetB = DIM*jnrB;
1577 j_coord_offsetC = DIM*jnrC;
1578 j_coord_offsetD = DIM*jnrD;
1580 /* load j atom coordinates */
1581 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1582 x+j_coord_offsetC,x+j_coord_offsetD,
1583 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1584 &jy2,&jz2,&jx3,&jy3,&jz3);
1586 /* Calculate displacement vector */
1587 dx00 = _mm_sub_ps(ix0,jx0);
1588 dy00 = _mm_sub_ps(iy0,jy0);
1589 dz00 = _mm_sub_ps(iz0,jz0);
1590 dx11 = _mm_sub_ps(ix1,jx1);
1591 dy11 = _mm_sub_ps(iy1,jy1);
1592 dz11 = _mm_sub_ps(iz1,jz1);
1593 dx12 = _mm_sub_ps(ix1,jx2);
1594 dy12 = _mm_sub_ps(iy1,jy2);
1595 dz12 = _mm_sub_ps(iz1,jz2);
1596 dx13 = _mm_sub_ps(ix1,jx3);
1597 dy13 = _mm_sub_ps(iy1,jy3);
1598 dz13 = _mm_sub_ps(iz1,jz3);
1599 dx21 = _mm_sub_ps(ix2,jx1);
1600 dy21 = _mm_sub_ps(iy2,jy1);
1601 dz21 = _mm_sub_ps(iz2,jz1);
1602 dx22 = _mm_sub_ps(ix2,jx2);
1603 dy22 = _mm_sub_ps(iy2,jy2);
1604 dz22 = _mm_sub_ps(iz2,jz2);
1605 dx23 = _mm_sub_ps(ix2,jx3);
1606 dy23 = _mm_sub_ps(iy2,jy3);
1607 dz23 = _mm_sub_ps(iz2,jz3);
1608 dx31 = _mm_sub_ps(ix3,jx1);
1609 dy31 = _mm_sub_ps(iy3,jy1);
1610 dz31 = _mm_sub_ps(iz3,jz1);
1611 dx32 = _mm_sub_ps(ix3,jx2);
1612 dy32 = _mm_sub_ps(iy3,jy2);
1613 dz32 = _mm_sub_ps(iz3,jz2);
1614 dx33 = _mm_sub_ps(ix3,jx3);
1615 dy33 = _mm_sub_ps(iy3,jy3);
1616 dz33 = _mm_sub_ps(iz3,jz3);
1618 /* Calculate squared distance and things based on it */
1619 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1620 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1621 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1622 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1623 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1624 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1625 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1626 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1627 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1628 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1630 rinv00 = sse41_invsqrt_f(rsq00);
1631 rinv11 = sse41_invsqrt_f(rsq11);
1632 rinv12 = sse41_invsqrt_f(rsq12);
1633 rinv13 = sse41_invsqrt_f(rsq13);
1634 rinv21 = sse41_invsqrt_f(rsq21);
1635 rinv22 = sse41_invsqrt_f(rsq22);
1636 rinv23 = sse41_invsqrt_f(rsq23);
1637 rinv31 = sse41_invsqrt_f(rsq31);
1638 rinv32 = sse41_invsqrt_f(rsq32);
1639 rinv33 = sse41_invsqrt_f(rsq33);
1641 fjx0 = _mm_setzero_ps();
1642 fjy0 = _mm_setzero_ps();
1643 fjz0 = _mm_setzero_ps();
1644 fjx1 = _mm_setzero_ps();
1645 fjy1 = _mm_setzero_ps();
1646 fjz1 = _mm_setzero_ps();
1647 fjx2 = _mm_setzero_ps();
1648 fjy2 = _mm_setzero_ps();
1649 fjz2 = _mm_setzero_ps();
1650 fjx3 = _mm_setzero_ps();
1651 fjy3 = _mm_setzero_ps();
1652 fjz3 = _mm_setzero_ps();
1654 /**************************
1655 * CALCULATE INTERACTIONS *
1656 **************************/
1658 r00 = _mm_mul_ps(rsq00,rinv00);
1660 /* Calculate table index by multiplying r with table scale and truncate to integer */
1661 rt = _mm_mul_ps(r00,vftabscale);
1662 vfitab = _mm_cvttps_epi32(rt);
1663 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1664 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1666 /* CUBIC SPLINE TABLE DISPERSION */
1667 vfitab = _mm_add_epi32(vfitab,ifour);
1668 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1669 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1670 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1671 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1672 _MM_TRANSPOSE4_PS(Y,F,G,H);
1673 Heps = _mm_mul_ps(vfeps,H);
1674 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1675 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1676 fvdw6 = _mm_mul_ps(c6_00,FF);
1678 /* CUBIC SPLINE TABLE REPULSION */
1679 vfitab = _mm_add_epi32(vfitab,ifour);
1680 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1681 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1682 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1683 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1684 _MM_TRANSPOSE4_PS(Y,F,G,H);
1685 Heps = _mm_mul_ps(vfeps,H);
1686 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1687 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1688 fvdw12 = _mm_mul_ps(c12_00,FF);
1689 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1693 /* Calculate temporary vectorial force */
1694 tx = _mm_mul_ps(fscal,dx00);
1695 ty = _mm_mul_ps(fscal,dy00);
1696 tz = _mm_mul_ps(fscal,dz00);
1698 /* Update vectorial force */
1699 fix0 = _mm_add_ps(fix0,tx);
1700 fiy0 = _mm_add_ps(fiy0,ty);
1701 fiz0 = _mm_add_ps(fiz0,tz);
1703 fjx0 = _mm_add_ps(fjx0,tx);
1704 fjy0 = _mm_add_ps(fjy0,ty);
1705 fjz0 = _mm_add_ps(fjz0,tz);
1707 /**************************
1708 * CALCULATE INTERACTIONS *
1709 **************************/
1711 r11 = _mm_mul_ps(rsq11,rinv11);
1713 /* Calculate table index by multiplying r with table scale and truncate to integer */
1714 rt = _mm_mul_ps(r11,vftabscale);
1715 vfitab = _mm_cvttps_epi32(rt);
1716 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1717 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1719 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1720 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1721 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1722 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1723 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1724 _MM_TRANSPOSE4_PS(Y,F,G,H);
1725 Heps = _mm_mul_ps(vfeps,H);
1726 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1727 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1728 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1732 /* Calculate temporary vectorial force */
1733 tx = _mm_mul_ps(fscal,dx11);
1734 ty = _mm_mul_ps(fscal,dy11);
1735 tz = _mm_mul_ps(fscal,dz11);
1737 /* Update vectorial force */
1738 fix1 = _mm_add_ps(fix1,tx);
1739 fiy1 = _mm_add_ps(fiy1,ty);
1740 fiz1 = _mm_add_ps(fiz1,tz);
1742 fjx1 = _mm_add_ps(fjx1,tx);
1743 fjy1 = _mm_add_ps(fjy1,ty);
1744 fjz1 = _mm_add_ps(fjz1,tz);
1746 /**************************
1747 * CALCULATE INTERACTIONS *
1748 **************************/
1750 r12 = _mm_mul_ps(rsq12,rinv12);
1752 /* Calculate table index by multiplying r with table scale and truncate to integer */
1753 rt = _mm_mul_ps(r12,vftabscale);
1754 vfitab = _mm_cvttps_epi32(rt);
1755 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1756 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1758 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1759 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1760 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1761 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1762 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1763 _MM_TRANSPOSE4_PS(Y,F,G,H);
1764 Heps = _mm_mul_ps(vfeps,H);
1765 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1766 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1767 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1771 /* Calculate temporary vectorial force */
1772 tx = _mm_mul_ps(fscal,dx12);
1773 ty = _mm_mul_ps(fscal,dy12);
1774 tz = _mm_mul_ps(fscal,dz12);
1776 /* Update vectorial force */
1777 fix1 = _mm_add_ps(fix1,tx);
1778 fiy1 = _mm_add_ps(fiy1,ty);
1779 fiz1 = _mm_add_ps(fiz1,tz);
1781 fjx2 = _mm_add_ps(fjx2,tx);
1782 fjy2 = _mm_add_ps(fjy2,ty);
1783 fjz2 = _mm_add_ps(fjz2,tz);
1785 /**************************
1786 * CALCULATE INTERACTIONS *
1787 **************************/
1789 r13 = _mm_mul_ps(rsq13,rinv13);
1791 /* Calculate table index by multiplying r with table scale and truncate to integer */
1792 rt = _mm_mul_ps(r13,vftabscale);
1793 vfitab = _mm_cvttps_epi32(rt);
1794 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1795 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1797 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1798 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1799 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1800 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1801 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1802 _MM_TRANSPOSE4_PS(Y,F,G,H);
1803 Heps = _mm_mul_ps(vfeps,H);
1804 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1805 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1806 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1810 /* Calculate temporary vectorial force */
1811 tx = _mm_mul_ps(fscal,dx13);
1812 ty = _mm_mul_ps(fscal,dy13);
1813 tz = _mm_mul_ps(fscal,dz13);
1815 /* Update vectorial force */
1816 fix1 = _mm_add_ps(fix1,tx);
1817 fiy1 = _mm_add_ps(fiy1,ty);
1818 fiz1 = _mm_add_ps(fiz1,tz);
1820 fjx3 = _mm_add_ps(fjx3,tx);
1821 fjy3 = _mm_add_ps(fjy3,ty);
1822 fjz3 = _mm_add_ps(fjz3,tz);
1824 /**************************
1825 * CALCULATE INTERACTIONS *
1826 **************************/
1828 r21 = _mm_mul_ps(rsq21,rinv21);
1830 /* Calculate table index by multiplying r with table scale and truncate to integer */
1831 rt = _mm_mul_ps(r21,vftabscale);
1832 vfitab = _mm_cvttps_epi32(rt);
1833 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1834 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1836 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1837 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1838 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1839 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1840 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1841 _MM_TRANSPOSE4_PS(Y,F,G,H);
1842 Heps = _mm_mul_ps(vfeps,H);
1843 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1844 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1845 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1849 /* Calculate temporary vectorial force */
1850 tx = _mm_mul_ps(fscal,dx21);
1851 ty = _mm_mul_ps(fscal,dy21);
1852 tz = _mm_mul_ps(fscal,dz21);
1854 /* Update vectorial force */
1855 fix2 = _mm_add_ps(fix2,tx);
1856 fiy2 = _mm_add_ps(fiy2,ty);
1857 fiz2 = _mm_add_ps(fiz2,tz);
1859 fjx1 = _mm_add_ps(fjx1,tx);
1860 fjy1 = _mm_add_ps(fjy1,ty);
1861 fjz1 = _mm_add_ps(fjz1,tz);
1863 /**************************
1864 * CALCULATE INTERACTIONS *
1865 **************************/
1867 r22 = _mm_mul_ps(rsq22,rinv22);
1869 /* Calculate table index by multiplying r with table scale and truncate to integer */
1870 rt = _mm_mul_ps(r22,vftabscale);
1871 vfitab = _mm_cvttps_epi32(rt);
1872 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1873 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1875 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1876 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1877 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1878 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1879 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1880 _MM_TRANSPOSE4_PS(Y,F,G,H);
1881 Heps = _mm_mul_ps(vfeps,H);
1882 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1883 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1884 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1888 /* Calculate temporary vectorial force */
1889 tx = _mm_mul_ps(fscal,dx22);
1890 ty = _mm_mul_ps(fscal,dy22);
1891 tz = _mm_mul_ps(fscal,dz22);
1893 /* Update vectorial force */
1894 fix2 = _mm_add_ps(fix2,tx);
1895 fiy2 = _mm_add_ps(fiy2,ty);
1896 fiz2 = _mm_add_ps(fiz2,tz);
1898 fjx2 = _mm_add_ps(fjx2,tx);
1899 fjy2 = _mm_add_ps(fjy2,ty);
1900 fjz2 = _mm_add_ps(fjz2,tz);
1902 /**************************
1903 * CALCULATE INTERACTIONS *
1904 **************************/
1906 r23 = _mm_mul_ps(rsq23,rinv23);
1908 /* Calculate table index by multiplying r with table scale and truncate to integer */
1909 rt = _mm_mul_ps(r23,vftabscale);
1910 vfitab = _mm_cvttps_epi32(rt);
1911 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1912 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1914 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1915 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1916 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1917 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1918 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1919 _MM_TRANSPOSE4_PS(Y,F,G,H);
1920 Heps = _mm_mul_ps(vfeps,H);
1921 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1922 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1923 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1927 /* Calculate temporary vectorial force */
1928 tx = _mm_mul_ps(fscal,dx23);
1929 ty = _mm_mul_ps(fscal,dy23);
1930 tz = _mm_mul_ps(fscal,dz23);
1932 /* Update vectorial force */
1933 fix2 = _mm_add_ps(fix2,tx);
1934 fiy2 = _mm_add_ps(fiy2,ty);
1935 fiz2 = _mm_add_ps(fiz2,tz);
1937 fjx3 = _mm_add_ps(fjx3,tx);
1938 fjy3 = _mm_add_ps(fjy3,ty);
1939 fjz3 = _mm_add_ps(fjz3,tz);
1941 /**************************
1942 * CALCULATE INTERACTIONS *
1943 **************************/
1945 r31 = _mm_mul_ps(rsq31,rinv31);
1947 /* Calculate table index by multiplying r with table scale and truncate to integer */
1948 rt = _mm_mul_ps(r31,vftabscale);
1949 vfitab = _mm_cvttps_epi32(rt);
1950 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1951 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1953 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1954 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1955 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1956 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1957 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1958 _MM_TRANSPOSE4_PS(Y,F,G,H);
1959 Heps = _mm_mul_ps(vfeps,H);
1960 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1961 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1962 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1966 /* Calculate temporary vectorial force */
1967 tx = _mm_mul_ps(fscal,dx31);
1968 ty = _mm_mul_ps(fscal,dy31);
1969 tz = _mm_mul_ps(fscal,dz31);
1971 /* Update vectorial force */
1972 fix3 = _mm_add_ps(fix3,tx);
1973 fiy3 = _mm_add_ps(fiy3,ty);
1974 fiz3 = _mm_add_ps(fiz3,tz);
1976 fjx1 = _mm_add_ps(fjx1,tx);
1977 fjy1 = _mm_add_ps(fjy1,ty);
1978 fjz1 = _mm_add_ps(fjz1,tz);
1980 /**************************
1981 * CALCULATE INTERACTIONS *
1982 **************************/
1984 r32 = _mm_mul_ps(rsq32,rinv32);
1986 /* Calculate table index by multiplying r with table scale and truncate to integer */
1987 rt = _mm_mul_ps(r32,vftabscale);
1988 vfitab = _mm_cvttps_epi32(rt);
1989 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1990 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1992 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1993 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1994 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1995 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1996 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1997 _MM_TRANSPOSE4_PS(Y,F,G,H);
1998 Heps = _mm_mul_ps(vfeps,H);
1999 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2000 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2001 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2005 /* Calculate temporary vectorial force */
2006 tx = _mm_mul_ps(fscal,dx32);
2007 ty = _mm_mul_ps(fscal,dy32);
2008 tz = _mm_mul_ps(fscal,dz32);
2010 /* Update vectorial force */
2011 fix3 = _mm_add_ps(fix3,tx);
2012 fiy3 = _mm_add_ps(fiy3,ty);
2013 fiz3 = _mm_add_ps(fiz3,tz);
2015 fjx2 = _mm_add_ps(fjx2,tx);
2016 fjy2 = _mm_add_ps(fjy2,ty);
2017 fjz2 = _mm_add_ps(fjz2,tz);
2019 /**************************
2020 * CALCULATE INTERACTIONS *
2021 **************************/
2023 r33 = _mm_mul_ps(rsq33,rinv33);
2025 /* Calculate table index by multiplying r with table scale and truncate to integer */
2026 rt = _mm_mul_ps(r33,vftabscale);
2027 vfitab = _mm_cvttps_epi32(rt);
2028 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2029 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2031 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2032 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2033 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2034 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2035 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2036 _MM_TRANSPOSE4_PS(Y,F,G,H);
2037 Heps = _mm_mul_ps(vfeps,H);
2038 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2039 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2040 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2044 /* Calculate temporary vectorial force */
2045 tx = _mm_mul_ps(fscal,dx33);
2046 ty = _mm_mul_ps(fscal,dy33);
2047 tz = _mm_mul_ps(fscal,dz33);
2049 /* Update vectorial force */
2050 fix3 = _mm_add_ps(fix3,tx);
2051 fiy3 = _mm_add_ps(fiy3,ty);
2052 fiz3 = _mm_add_ps(fiz3,tz);
2054 fjx3 = _mm_add_ps(fjx3,tx);
2055 fjy3 = _mm_add_ps(fjy3,ty);
2056 fjz3 = _mm_add_ps(fjz3,tz);
2058 fjptrA = f+j_coord_offsetA;
2059 fjptrB = f+j_coord_offsetB;
2060 fjptrC = f+j_coord_offsetC;
2061 fjptrD = f+j_coord_offsetD;
2063 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2064 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2065 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2067 /* Inner loop uses 402 flops */
2070 if(jidx<j_index_end)
2073 /* Get j neighbor index, and coordinate index */
2074 jnrlistA = jjnr[jidx];
2075 jnrlistB = jjnr[jidx+1];
2076 jnrlistC = jjnr[jidx+2];
2077 jnrlistD = jjnr[jidx+3];
2078 /* Sign of each element will be negative for non-real atoms.
2079 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2080 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2082 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2083 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2084 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2085 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2086 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2087 j_coord_offsetA = DIM*jnrA;
2088 j_coord_offsetB = DIM*jnrB;
2089 j_coord_offsetC = DIM*jnrC;
2090 j_coord_offsetD = DIM*jnrD;
2092 /* load j atom coordinates */
2093 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2094 x+j_coord_offsetC,x+j_coord_offsetD,
2095 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2096 &jy2,&jz2,&jx3,&jy3,&jz3);
2098 /* Calculate displacement vector */
2099 dx00 = _mm_sub_ps(ix0,jx0);
2100 dy00 = _mm_sub_ps(iy0,jy0);
2101 dz00 = _mm_sub_ps(iz0,jz0);
2102 dx11 = _mm_sub_ps(ix1,jx1);
2103 dy11 = _mm_sub_ps(iy1,jy1);
2104 dz11 = _mm_sub_ps(iz1,jz1);
2105 dx12 = _mm_sub_ps(ix1,jx2);
2106 dy12 = _mm_sub_ps(iy1,jy2);
2107 dz12 = _mm_sub_ps(iz1,jz2);
2108 dx13 = _mm_sub_ps(ix1,jx3);
2109 dy13 = _mm_sub_ps(iy1,jy3);
2110 dz13 = _mm_sub_ps(iz1,jz3);
2111 dx21 = _mm_sub_ps(ix2,jx1);
2112 dy21 = _mm_sub_ps(iy2,jy1);
2113 dz21 = _mm_sub_ps(iz2,jz1);
2114 dx22 = _mm_sub_ps(ix2,jx2);
2115 dy22 = _mm_sub_ps(iy2,jy2);
2116 dz22 = _mm_sub_ps(iz2,jz2);
2117 dx23 = _mm_sub_ps(ix2,jx3);
2118 dy23 = _mm_sub_ps(iy2,jy3);
2119 dz23 = _mm_sub_ps(iz2,jz3);
2120 dx31 = _mm_sub_ps(ix3,jx1);
2121 dy31 = _mm_sub_ps(iy3,jy1);
2122 dz31 = _mm_sub_ps(iz3,jz1);
2123 dx32 = _mm_sub_ps(ix3,jx2);
2124 dy32 = _mm_sub_ps(iy3,jy2);
2125 dz32 = _mm_sub_ps(iz3,jz2);
2126 dx33 = _mm_sub_ps(ix3,jx3);
2127 dy33 = _mm_sub_ps(iy3,jy3);
2128 dz33 = _mm_sub_ps(iz3,jz3);
2130 /* Calculate squared distance and things based on it */
2131 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2132 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2133 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2134 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2135 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2136 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2137 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2138 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2139 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2140 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2142 rinv00 = sse41_invsqrt_f(rsq00);
2143 rinv11 = sse41_invsqrt_f(rsq11);
2144 rinv12 = sse41_invsqrt_f(rsq12);
2145 rinv13 = sse41_invsqrt_f(rsq13);
2146 rinv21 = sse41_invsqrt_f(rsq21);
2147 rinv22 = sse41_invsqrt_f(rsq22);
2148 rinv23 = sse41_invsqrt_f(rsq23);
2149 rinv31 = sse41_invsqrt_f(rsq31);
2150 rinv32 = sse41_invsqrt_f(rsq32);
2151 rinv33 = sse41_invsqrt_f(rsq33);
2153 fjx0 = _mm_setzero_ps();
2154 fjy0 = _mm_setzero_ps();
2155 fjz0 = _mm_setzero_ps();
2156 fjx1 = _mm_setzero_ps();
2157 fjy1 = _mm_setzero_ps();
2158 fjz1 = _mm_setzero_ps();
2159 fjx2 = _mm_setzero_ps();
2160 fjy2 = _mm_setzero_ps();
2161 fjz2 = _mm_setzero_ps();
2162 fjx3 = _mm_setzero_ps();
2163 fjy3 = _mm_setzero_ps();
2164 fjz3 = _mm_setzero_ps();
2166 /**************************
2167 * CALCULATE INTERACTIONS *
2168 **************************/
2170 r00 = _mm_mul_ps(rsq00,rinv00);
2171 r00 = _mm_andnot_ps(dummy_mask,r00);
2173 /* Calculate table index by multiplying r with table scale and truncate to integer */
2174 rt = _mm_mul_ps(r00,vftabscale);
2175 vfitab = _mm_cvttps_epi32(rt);
2176 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2177 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2179 /* CUBIC SPLINE TABLE DISPERSION */
2180 vfitab = _mm_add_epi32(vfitab,ifour);
2181 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2182 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2183 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2184 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2185 _MM_TRANSPOSE4_PS(Y,F,G,H);
2186 Heps = _mm_mul_ps(vfeps,H);
2187 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2188 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2189 fvdw6 = _mm_mul_ps(c6_00,FF);
2191 /* CUBIC SPLINE TABLE REPULSION */
2192 vfitab = _mm_add_epi32(vfitab,ifour);
2193 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2194 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2195 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2196 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2197 _MM_TRANSPOSE4_PS(Y,F,G,H);
2198 Heps = _mm_mul_ps(vfeps,H);
2199 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2200 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2201 fvdw12 = _mm_mul_ps(c12_00,FF);
2202 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2206 fscal = _mm_andnot_ps(dummy_mask,fscal);
2208 /* Calculate temporary vectorial force */
2209 tx = _mm_mul_ps(fscal,dx00);
2210 ty = _mm_mul_ps(fscal,dy00);
2211 tz = _mm_mul_ps(fscal,dz00);
2213 /* Update vectorial force */
2214 fix0 = _mm_add_ps(fix0,tx);
2215 fiy0 = _mm_add_ps(fiy0,ty);
2216 fiz0 = _mm_add_ps(fiz0,tz);
2218 fjx0 = _mm_add_ps(fjx0,tx);
2219 fjy0 = _mm_add_ps(fjy0,ty);
2220 fjz0 = _mm_add_ps(fjz0,tz);
2222 /**************************
2223 * CALCULATE INTERACTIONS *
2224 **************************/
2226 r11 = _mm_mul_ps(rsq11,rinv11);
2227 r11 = _mm_andnot_ps(dummy_mask,r11);
2229 /* Calculate table index by multiplying r with table scale and truncate to integer */
2230 rt = _mm_mul_ps(r11,vftabscale);
2231 vfitab = _mm_cvttps_epi32(rt);
2232 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2233 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2235 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2236 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2237 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2238 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2239 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2240 _MM_TRANSPOSE4_PS(Y,F,G,H);
2241 Heps = _mm_mul_ps(vfeps,H);
2242 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2243 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2244 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2248 fscal = _mm_andnot_ps(dummy_mask,fscal);
2250 /* Calculate temporary vectorial force */
2251 tx = _mm_mul_ps(fscal,dx11);
2252 ty = _mm_mul_ps(fscal,dy11);
2253 tz = _mm_mul_ps(fscal,dz11);
2255 /* Update vectorial force */
2256 fix1 = _mm_add_ps(fix1,tx);
2257 fiy1 = _mm_add_ps(fiy1,ty);
2258 fiz1 = _mm_add_ps(fiz1,tz);
2260 fjx1 = _mm_add_ps(fjx1,tx);
2261 fjy1 = _mm_add_ps(fjy1,ty);
2262 fjz1 = _mm_add_ps(fjz1,tz);
2264 /**************************
2265 * CALCULATE INTERACTIONS *
2266 **************************/
2268 r12 = _mm_mul_ps(rsq12,rinv12);
2269 r12 = _mm_andnot_ps(dummy_mask,r12);
2271 /* Calculate table index by multiplying r with table scale and truncate to integer */
2272 rt = _mm_mul_ps(r12,vftabscale);
2273 vfitab = _mm_cvttps_epi32(rt);
2274 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2275 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2277 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2278 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2279 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2280 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2281 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2282 _MM_TRANSPOSE4_PS(Y,F,G,H);
2283 Heps = _mm_mul_ps(vfeps,H);
2284 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2285 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2286 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2290 fscal = _mm_andnot_ps(dummy_mask,fscal);
2292 /* Calculate temporary vectorial force */
2293 tx = _mm_mul_ps(fscal,dx12);
2294 ty = _mm_mul_ps(fscal,dy12);
2295 tz = _mm_mul_ps(fscal,dz12);
2297 /* Update vectorial force */
2298 fix1 = _mm_add_ps(fix1,tx);
2299 fiy1 = _mm_add_ps(fiy1,ty);
2300 fiz1 = _mm_add_ps(fiz1,tz);
2302 fjx2 = _mm_add_ps(fjx2,tx);
2303 fjy2 = _mm_add_ps(fjy2,ty);
2304 fjz2 = _mm_add_ps(fjz2,tz);
2306 /**************************
2307 * CALCULATE INTERACTIONS *
2308 **************************/
2310 r13 = _mm_mul_ps(rsq13,rinv13);
2311 r13 = _mm_andnot_ps(dummy_mask,r13);
2313 /* Calculate table index by multiplying r with table scale and truncate to integer */
2314 rt = _mm_mul_ps(r13,vftabscale);
2315 vfitab = _mm_cvttps_epi32(rt);
2316 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2317 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2319 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2320 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2321 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2322 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2323 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2324 _MM_TRANSPOSE4_PS(Y,F,G,H);
2325 Heps = _mm_mul_ps(vfeps,H);
2326 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2327 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2328 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2332 fscal = _mm_andnot_ps(dummy_mask,fscal);
2334 /* Calculate temporary vectorial force */
2335 tx = _mm_mul_ps(fscal,dx13);
2336 ty = _mm_mul_ps(fscal,dy13);
2337 tz = _mm_mul_ps(fscal,dz13);
2339 /* Update vectorial force */
2340 fix1 = _mm_add_ps(fix1,tx);
2341 fiy1 = _mm_add_ps(fiy1,ty);
2342 fiz1 = _mm_add_ps(fiz1,tz);
2344 fjx3 = _mm_add_ps(fjx3,tx);
2345 fjy3 = _mm_add_ps(fjy3,ty);
2346 fjz3 = _mm_add_ps(fjz3,tz);
2348 /**************************
2349 * CALCULATE INTERACTIONS *
2350 **************************/
2352 r21 = _mm_mul_ps(rsq21,rinv21);
2353 r21 = _mm_andnot_ps(dummy_mask,r21);
2355 /* Calculate table index by multiplying r with table scale and truncate to integer */
2356 rt = _mm_mul_ps(r21,vftabscale);
2357 vfitab = _mm_cvttps_epi32(rt);
2358 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2359 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2361 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2362 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2363 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2364 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2365 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2366 _MM_TRANSPOSE4_PS(Y,F,G,H);
2367 Heps = _mm_mul_ps(vfeps,H);
2368 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2369 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2370 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2374 fscal = _mm_andnot_ps(dummy_mask,fscal);
2376 /* Calculate temporary vectorial force */
2377 tx = _mm_mul_ps(fscal,dx21);
2378 ty = _mm_mul_ps(fscal,dy21);
2379 tz = _mm_mul_ps(fscal,dz21);
2381 /* Update vectorial force */
2382 fix2 = _mm_add_ps(fix2,tx);
2383 fiy2 = _mm_add_ps(fiy2,ty);
2384 fiz2 = _mm_add_ps(fiz2,tz);
2386 fjx1 = _mm_add_ps(fjx1,tx);
2387 fjy1 = _mm_add_ps(fjy1,ty);
2388 fjz1 = _mm_add_ps(fjz1,tz);
2390 /**************************
2391 * CALCULATE INTERACTIONS *
2392 **************************/
2394 r22 = _mm_mul_ps(rsq22,rinv22);
2395 r22 = _mm_andnot_ps(dummy_mask,r22);
2397 /* Calculate table index by multiplying r with table scale and truncate to integer */
2398 rt = _mm_mul_ps(r22,vftabscale);
2399 vfitab = _mm_cvttps_epi32(rt);
2400 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2401 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2403 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2404 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2405 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2406 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2407 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2408 _MM_TRANSPOSE4_PS(Y,F,G,H);
2409 Heps = _mm_mul_ps(vfeps,H);
2410 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2411 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2412 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2416 fscal = _mm_andnot_ps(dummy_mask,fscal);
2418 /* Calculate temporary vectorial force */
2419 tx = _mm_mul_ps(fscal,dx22);
2420 ty = _mm_mul_ps(fscal,dy22);
2421 tz = _mm_mul_ps(fscal,dz22);
2423 /* Update vectorial force */
2424 fix2 = _mm_add_ps(fix2,tx);
2425 fiy2 = _mm_add_ps(fiy2,ty);
2426 fiz2 = _mm_add_ps(fiz2,tz);
2428 fjx2 = _mm_add_ps(fjx2,tx);
2429 fjy2 = _mm_add_ps(fjy2,ty);
2430 fjz2 = _mm_add_ps(fjz2,tz);
2432 /**************************
2433 * CALCULATE INTERACTIONS *
2434 **************************/
2436 r23 = _mm_mul_ps(rsq23,rinv23);
2437 r23 = _mm_andnot_ps(dummy_mask,r23);
2439 /* Calculate table index by multiplying r with table scale and truncate to integer */
2440 rt = _mm_mul_ps(r23,vftabscale);
2441 vfitab = _mm_cvttps_epi32(rt);
2442 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2443 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2445 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2446 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2447 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2448 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2449 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2450 _MM_TRANSPOSE4_PS(Y,F,G,H);
2451 Heps = _mm_mul_ps(vfeps,H);
2452 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2453 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2454 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2458 fscal = _mm_andnot_ps(dummy_mask,fscal);
2460 /* Calculate temporary vectorial force */
2461 tx = _mm_mul_ps(fscal,dx23);
2462 ty = _mm_mul_ps(fscal,dy23);
2463 tz = _mm_mul_ps(fscal,dz23);
2465 /* Update vectorial force */
2466 fix2 = _mm_add_ps(fix2,tx);
2467 fiy2 = _mm_add_ps(fiy2,ty);
2468 fiz2 = _mm_add_ps(fiz2,tz);
2470 fjx3 = _mm_add_ps(fjx3,tx);
2471 fjy3 = _mm_add_ps(fjy3,ty);
2472 fjz3 = _mm_add_ps(fjz3,tz);
2474 /**************************
2475 * CALCULATE INTERACTIONS *
2476 **************************/
2478 r31 = _mm_mul_ps(rsq31,rinv31);
2479 r31 = _mm_andnot_ps(dummy_mask,r31);
2481 /* Calculate table index by multiplying r with table scale and truncate to integer */
2482 rt = _mm_mul_ps(r31,vftabscale);
2483 vfitab = _mm_cvttps_epi32(rt);
2484 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2485 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2487 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2488 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2489 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2490 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2491 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2492 _MM_TRANSPOSE4_PS(Y,F,G,H);
2493 Heps = _mm_mul_ps(vfeps,H);
2494 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2495 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2496 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2500 fscal = _mm_andnot_ps(dummy_mask,fscal);
2502 /* Calculate temporary vectorial force */
2503 tx = _mm_mul_ps(fscal,dx31);
2504 ty = _mm_mul_ps(fscal,dy31);
2505 tz = _mm_mul_ps(fscal,dz31);
2507 /* Update vectorial force */
2508 fix3 = _mm_add_ps(fix3,tx);
2509 fiy3 = _mm_add_ps(fiy3,ty);
2510 fiz3 = _mm_add_ps(fiz3,tz);
2512 fjx1 = _mm_add_ps(fjx1,tx);
2513 fjy1 = _mm_add_ps(fjy1,ty);
2514 fjz1 = _mm_add_ps(fjz1,tz);
2516 /**************************
2517 * CALCULATE INTERACTIONS *
2518 **************************/
2520 r32 = _mm_mul_ps(rsq32,rinv32);
2521 r32 = _mm_andnot_ps(dummy_mask,r32);
2523 /* Calculate table index by multiplying r with table scale and truncate to integer */
2524 rt = _mm_mul_ps(r32,vftabscale);
2525 vfitab = _mm_cvttps_epi32(rt);
2526 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2527 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2529 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2530 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2531 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2532 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2533 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2534 _MM_TRANSPOSE4_PS(Y,F,G,H);
2535 Heps = _mm_mul_ps(vfeps,H);
2536 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2537 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2538 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2542 fscal = _mm_andnot_ps(dummy_mask,fscal);
2544 /* Calculate temporary vectorial force */
2545 tx = _mm_mul_ps(fscal,dx32);
2546 ty = _mm_mul_ps(fscal,dy32);
2547 tz = _mm_mul_ps(fscal,dz32);
2549 /* Update vectorial force */
2550 fix3 = _mm_add_ps(fix3,tx);
2551 fiy3 = _mm_add_ps(fiy3,ty);
2552 fiz3 = _mm_add_ps(fiz3,tz);
2554 fjx2 = _mm_add_ps(fjx2,tx);
2555 fjy2 = _mm_add_ps(fjy2,ty);
2556 fjz2 = _mm_add_ps(fjz2,tz);
2558 /**************************
2559 * CALCULATE INTERACTIONS *
2560 **************************/
2562 r33 = _mm_mul_ps(rsq33,rinv33);
2563 r33 = _mm_andnot_ps(dummy_mask,r33);
2565 /* Calculate table index by multiplying r with table scale and truncate to integer */
2566 rt = _mm_mul_ps(r33,vftabscale);
2567 vfitab = _mm_cvttps_epi32(rt);
2568 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2569 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2571 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2572 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2573 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2574 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2575 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2576 _MM_TRANSPOSE4_PS(Y,F,G,H);
2577 Heps = _mm_mul_ps(vfeps,H);
2578 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2579 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2580 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2584 fscal = _mm_andnot_ps(dummy_mask,fscal);
2586 /* Calculate temporary vectorial force */
2587 tx = _mm_mul_ps(fscal,dx33);
2588 ty = _mm_mul_ps(fscal,dy33);
2589 tz = _mm_mul_ps(fscal,dz33);
2591 /* Update vectorial force */
2592 fix3 = _mm_add_ps(fix3,tx);
2593 fiy3 = _mm_add_ps(fiy3,ty);
2594 fiz3 = _mm_add_ps(fiz3,tz);
2596 fjx3 = _mm_add_ps(fjx3,tx);
2597 fjy3 = _mm_add_ps(fjy3,ty);
2598 fjz3 = _mm_add_ps(fjz3,tz);
2600 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2601 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2602 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2603 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2605 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2606 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2607 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2609 /* Inner loop uses 412 flops */
2612 /* End of innermost loop */
2614 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2615 f+i_coord_offset,fshift+i_shift_offset);
2617 /* Increment number of inner iterations */
2618 inneriter += j_index_end - j_index_start;
2620 /* Outer loop uses 24 flops */
2623 /* Increment number of outer iterations */
2626 /* Update outer/inner flops */
2628 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*412);