2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
111 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
113 __m128i ifour = _mm_set1_epi32(4);
114 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
116 __m128 dummy_mask,cutoff_mask;
117 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118 __m128 one = _mm_set1_ps(1.0);
119 __m128 two = _mm_set1_ps(2.0);
125 jindex = nlist->jindex;
127 shiftidx = nlist->shift;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = _mm_set1_ps(fr->epsfac);
132 charge = mdatoms->chargeA;
133 nvdwtype = fr->ntype;
135 vdwtype = mdatoms->typeA;
137 vftab = kernel_data->table_elec_vdw->data;
138 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
140 /* Setup water-specific parameters */
141 inr = nlist->iinr[0];
142 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
143 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
144 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
145 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
147 jq0 = _mm_set1_ps(charge[inr+0]);
148 jq1 = _mm_set1_ps(charge[inr+1]);
149 jq2 = _mm_set1_ps(charge[inr+2]);
150 vdwjidx0A = 2*vdwtype[inr+0];
151 qq00 = _mm_mul_ps(iq0,jq0);
152 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq01 = _mm_mul_ps(iq0,jq1);
155 qq02 = _mm_mul_ps(iq0,jq2);
156 qq10 = _mm_mul_ps(iq1,jq0);
157 qq11 = _mm_mul_ps(iq1,jq1);
158 qq12 = _mm_mul_ps(iq1,jq2);
159 qq20 = _mm_mul_ps(iq2,jq0);
160 qq21 = _mm_mul_ps(iq2,jq1);
161 qq22 = _mm_mul_ps(iq2,jq2);
163 /* Avoid stupid compiler warnings */
164 jnrA = jnrB = jnrC = jnrD = 0;
173 for(iidx=0;iidx<4*DIM;iidx++)
178 /* Start outer loop over neighborlists */
179 for(iidx=0; iidx<nri; iidx++)
181 /* Load shift vector for this list */
182 i_shift_offset = DIM*shiftidx[iidx];
184 /* Load limits for loop over neighbors */
185 j_index_start = jindex[iidx];
186 j_index_end = jindex[iidx+1];
188 /* Get outer coordinate index */
190 i_coord_offset = DIM*inr;
192 /* Load i particle coords and add shift vector */
193 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
194 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
196 fix0 = _mm_setzero_ps();
197 fiy0 = _mm_setzero_ps();
198 fiz0 = _mm_setzero_ps();
199 fix1 = _mm_setzero_ps();
200 fiy1 = _mm_setzero_ps();
201 fiz1 = _mm_setzero_ps();
202 fix2 = _mm_setzero_ps();
203 fiy2 = _mm_setzero_ps();
204 fiz2 = _mm_setzero_ps();
206 /* Reset potential sums */
207 velecsum = _mm_setzero_ps();
208 vvdwsum = _mm_setzero_ps();
210 /* Start inner kernel loop */
211 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
214 /* Get j neighbor index, and coordinate index */
219 j_coord_offsetA = DIM*jnrA;
220 j_coord_offsetB = DIM*jnrB;
221 j_coord_offsetC = DIM*jnrC;
222 j_coord_offsetD = DIM*jnrD;
224 /* load j atom coordinates */
225 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
226 x+j_coord_offsetC,x+j_coord_offsetD,
227 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
229 /* Calculate displacement vector */
230 dx00 = _mm_sub_ps(ix0,jx0);
231 dy00 = _mm_sub_ps(iy0,jy0);
232 dz00 = _mm_sub_ps(iz0,jz0);
233 dx01 = _mm_sub_ps(ix0,jx1);
234 dy01 = _mm_sub_ps(iy0,jy1);
235 dz01 = _mm_sub_ps(iz0,jz1);
236 dx02 = _mm_sub_ps(ix0,jx2);
237 dy02 = _mm_sub_ps(iy0,jy2);
238 dz02 = _mm_sub_ps(iz0,jz2);
239 dx10 = _mm_sub_ps(ix1,jx0);
240 dy10 = _mm_sub_ps(iy1,jy0);
241 dz10 = _mm_sub_ps(iz1,jz0);
242 dx11 = _mm_sub_ps(ix1,jx1);
243 dy11 = _mm_sub_ps(iy1,jy1);
244 dz11 = _mm_sub_ps(iz1,jz1);
245 dx12 = _mm_sub_ps(ix1,jx2);
246 dy12 = _mm_sub_ps(iy1,jy2);
247 dz12 = _mm_sub_ps(iz1,jz2);
248 dx20 = _mm_sub_ps(ix2,jx0);
249 dy20 = _mm_sub_ps(iy2,jy0);
250 dz20 = _mm_sub_ps(iz2,jz0);
251 dx21 = _mm_sub_ps(ix2,jx1);
252 dy21 = _mm_sub_ps(iy2,jy1);
253 dz21 = _mm_sub_ps(iz2,jz1);
254 dx22 = _mm_sub_ps(ix2,jx2);
255 dy22 = _mm_sub_ps(iy2,jy2);
256 dz22 = _mm_sub_ps(iz2,jz2);
258 /* Calculate squared distance and things based on it */
259 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
260 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
261 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
262 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
263 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
264 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
265 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
266 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
267 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
269 rinv00 = gmx_mm_invsqrt_ps(rsq00);
270 rinv01 = gmx_mm_invsqrt_ps(rsq01);
271 rinv02 = gmx_mm_invsqrt_ps(rsq02);
272 rinv10 = gmx_mm_invsqrt_ps(rsq10);
273 rinv11 = gmx_mm_invsqrt_ps(rsq11);
274 rinv12 = gmx_mm_invsqrt_ps(rsq12);
275 rinv20 = gmx_mm_invsqrt_ps(rsq20);
276 rinv21 = gmx_mm_invsqrt_ps(rsq21);
277 rinv22 = gmx_mm_invsqrt_ps(rsq22);
279 fjx0 = _mm_setzero_ps();
280 fjy0 = _mm_setzero_ps();
281 fjz0 = _mm_setzero_ps();
282 fjx1 = _mm_setzero_ps();
283 fjy1 = _mm_setzero_ps();
284 fjz1 = _mm_setzero_ps();
285 fjx2 = _mm_setzero_ps();
286 fjy2 = _mm_setzero_ps();
287 fjz2 = _mm_setzero_ps();
289 /**************************
290 * CALCULATE INTERACTIONS *
291 **************************/
293 r00 = _mm_mul_ps(rsq00,rinv00);
295 /* Calculate table index by multiplying r with table scale and truncate to integer */
296 rt = _mm_mul_ps(r00,vftabscale);
297 vfitab = _mm_cvttps_epi32(rt);
298 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
299 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
301 /* CUBIC SPLINE TABLE ELECTROSTATICS */
302 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
303 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
304 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
305 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
306 _MM_TRANSPOSE4_PS(Y,F,G,H);
307 Heps = _mm_mul_ps(vfeps,H);
308 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
309 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
310 velec = _mm_mul_ps(qq00,VV);
311 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
312 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
314 /* CUBIC SPLINE TABLE DISPERSION */
315 vfitab = _mm_add_epi32(vfitab,ifour);
316 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
317 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
318 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
319 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
320 _MM_TRANSPOSE4_PS(Y,F,G,H);
321 Heps = _mm_mul_ps(vfeps,H);
322 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
323 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
324 vvdw6 = _mm_mul_ps(c6_00,VV);
325 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
326 fvdw6 = _mm_mul_ps(c6_00,FF);
328 /* CUBIC SPLINE TABLE REPULSION */
329 vfitab = _mm_add_epi32(vfitab,ifour);
330 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
331 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
332 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
333 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
334 _MM_TRANSPOSE4_PS(Y,F,G,H);
335 Heps = _mm_mul_ps(vfeps,H);
336 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
337 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
338 vvdw12 = _mm_mul_ps(c12_00,VV);
339 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
340 fvdw12 = _mm_mul_ps(c12_00,FF);
341 vvdw = _mm_add_ps(vvdw12,vvdw6);
342 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
344 /* Update potential sum for this i atom from the interaction with this j atom. */
345 velecsum = _mm_add_ps(velecsum,velec);
346 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
348 fscal = _mm_add_ps(felec,fvdw);
350 /* Calculate temporary vectorial force */
351 tx = _mm_mul_ps(fscal,dx00);
352 ty = _mm_mul_ps(fscal,dy00);
353 tz = _mm_mul_ps(fscal,dz00);
355 /* Update vectorial force */
356 fix0 = _mm_add_ps(fix0,tx);
357 fiy0 = _mm_add_ps(fiy0,ty);
358 fiz0 = _mm_add_ps(fiz0,tz);
360 fjx0 = _mm_add_ps(fjx0,tx);
361 fjy0 = _mm_add_ps(fjy0,ty);
362 fjz0 = _mm_add_ps(fjz0,tz);
364 /**************************
365 * CALCULATE INTERACTIONS *
366 **************************/
368 r01 = _mm_mul_ps(rsq01,rinv01);
370 /* Calculate table index by multiplying r with table scale and truncate to integer */
371 rt = _mm_mul_ps(r01,vftabscale);
372 vfitab = _mm_cvttps_epi32(rt);
373 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
374 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
376 /* CUBIC SPLINE TABLE ELECTROSTATICS */
377 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
378 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
379 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
380 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
381 _MM_TRANSPOSE4_PS(Y,F,G,H);
382 Heps = _mm_mul_ps(vfeps,H);
383 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
384 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
385 velec = _mm_mul_ps(qq01,VV);
386 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
387 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
389 /* Update potential sum for this i atom from the interaction with this j atom. */
390 velecsum = _mm_add_ps(velecsum,velec);
394 /* Calculate temporary vectorial force */
395 tx = _mm_mul_ps(fscal,dx01);
396 ty = _mm_mul_ps(fscal,dy01);
397 tz = _mm_mul_ps(fscal,dz01);
399 /* Update vectorial force */
400 fix0 = _mm_add_ps(fix0,tx);
401 fiy0 = _mm_add_ps(fiy0,ty);
402 fiz0 = _mm_add_ps(fiz0,tz);
404 fjx1 = _mm_add_ps(fjx1,tx);
405 fjy1 = _mm_add_ps(fjy1,ty);
406 fjz1 = _mm_add_ps(fjz1,tz);
408 /**************************
409 * CALCULATE INTERACTIONS *
410 **************************/
412 r02 = _mm_mul_ps(rsq02,rinv02);
414 /* Calculate table index by multiplying r with table scale and truncate to integer */
415 rt = _mm_mul_ps(r02,vftabscale);
416 vfitab = _mm_cvttps_epi32(rt);
417 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
418 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
420 /* CUBIC SPLINE TABLE ELECTROSTATICS */
421 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
422 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
423 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
424 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
425 _MM_TRANSPOSE4_PS(Y,F,G,H);
426 Heps = _mm_mul_ps(vfeps,H);
427 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
428 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
429 velec = _mm_mul_ps(qq02,VV);
430 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
431 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
433 /* Update potential sum for this i atom from the interaction with this j atom. */
434 velecsum = _mm_add_ps(velecsum,velec);
438 /* Calculate temporary vectorial force */
439 tx = _mm_mul_ps(fscal,dx02);
440 ty = _mm_mul_ps(fscal,dy02);
441 tz = _mm_mul_ps(fscal,dz02);
443 /* Update vectorial force */
444 fix0 = _mm_add_ps(fix0,tx);
445 fiy0 = _mm_add_ps(fiy0,ty);
446 fiz0 = _mm_add_ps(fiz0,tz);
448 fjx2 = _mm_add_ps(fjx2,tx);
449 fjy2 = _mm_add_ps(fjy2,ty);
450 fjz2 = _mm_add_ps(fjz2,tz);
452 /**************************
453 * CALCULATE INTERACTIONS *
454 **************************/
456 r10 = _mm_mul_ps(rsq10,rinv10);
458 /* Calculate table index by multiplying r with table scale and truncate to integer */
459 rt = _mm_mul_ps(r10,vftabscale);
460 vfitab = _mm_cvttps_epi32(rt);
461 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
462 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
464 /* CUBIC SPLINE TABLE ELECTROSTATICS */
465 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
466 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
467 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
468 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
469 _MM_TRANSPOSE4_PS(Y,F,G,H);
470 Heps = _mm_mul_ps(vfeps,H);
471 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
472 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
473 velec = _mm_mul_ps(qq10,VV);
474 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
475 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
477 /* Update potential sum for this i atom from the interaction with this j atom. */
478 velecsum = _mm_add_ps(velecsum,velec);
482 /* Calculate temporary vectorial force */
483 tx = _mm_mul_ps(fscal,dx10);
484 ty = _mm_mul_ps(fscal,dy10);
485 tz = _mm_mul_ps(fscal,dz10);
487 /* Update vectorial force */
488 fix1 = _mm_add_ps(fix1,tx);
489 fiy1 = _mm_add_ps(fiy1,ty);
490 fiz1 = _mm_add_ps(fiz1,tz);
492 fjx0 = _mm_add_ps(fjx0,tx);
493 fjy0 = _mm_add_ps(fjy0,ty);
494 fjz0 = _mm_add_ps(fjz0,tz);
496 /**************************
497 * CALCULATE INTERACTIONS *
498 **************************/
500 r11 = _mm_mul_ps(rsq11,rinv11);
502 /* Calculate table index by multiplying r with table scale and truncate to integer */
503 rt = _mm_mul_ps(r11,vftabscale);
504 vfitab = _mm_cvttps_epi32(rt);
505 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
506 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
508 /* CUBIC SPLINE TABLE ELECTROSTATICS */
509 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
510 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
511 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
512 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
513 _MM_TRANSPOSE4_PS(Y,F,G,H);
514 Heps = _mm_mul_ps(vfeps,H);
515 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
516 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
517 velec = _mm_mul_ps(qq11,VV);
518 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
519 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
521 /* Update potential sum for this i atom from the interaction with this j atom. */
522 velecsum = _mm_add_ps(velecsum,velec);
526 /* Calculate temporary vectorial force */
527 tx = _mm_mul_ps(fscal,dx11);
528 ty = _mm_mul_ps(fscal,dy11);
529 tz = _mm_mul_ps(fscal,dz11);
531 /* Update vectorial force */
532 fix1 = _mm_add_ps(fix1,tx);
533 fiy1 = _mm_add_ps(fiy1,ty);
534 fiz1 = _mm_add_ps(fiz1,tz);
536 fjx1 = _mm_add_ps(fjx1,tx);
537 fjy1 = _mm_add_ps(fjy1,ty);
538 fjz1 = _mm_add_ps(fjz1,tz);
540 /**************************
541 * CALCULATE INTERACTIONS *
542 **************************/
544 r12 = _mm_mul_ps(rsq12,rinv12);
546 /* Calculate table index by multiplying r with table scale and truncate to integer */
547 rt = _mm_mul_ps(r12,vftabscale);
548 vfitab = _mm_cvttps_epi32(rt);
549 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
550 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
552 /* CUBIC SPLINE TABLE ELECTROSTATICS */
553 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
554 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
555 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
556 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
557 _MM_TRANSPOSE4_PS(Y,F,G,H);
558 Heps = _mm_mul_ps(vfeps,H);
559 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
560 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
561 velec = _mm_mul_ps(qq12,VV);
562 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
563 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
565 /* Update potential sum for this i atom from the interaction with this j atom. */
566 velecsum = _mm_add_ps(velecsum,velec);
570 /* Calculate temporary vectorial force */
571 tx = _mm_mul_ps(fscal,dx12);
572 ty = _mm_mul_ps(fscal,dy12);
573 tz = _mm_mul_ps(fscal,dz12);
575 /* Update vectorial force */
576 fix1 = _mm_add_ps(fix1,tx);
577 fiy1 = _mm_add_ps(fiy1,ty);
578 fiz1 = _mm_add_ps(fiz1,tz);
580 fjx2 = _mm_add_ps(fjx2,tx);
581 fjy2 = _mm_add_ps(fjy2,ty);
582 fjz2 = _mm_add_ps(fjz2,tz);
584 /**************************
585 * CALCULATE INTERACTIONS *
586 **************************/
588 r20 = _mm_mul_ps(rsq20,rinv20);
590 /* Calculate table index by multiplying r with table scale and truncate to integer */
591 rt = _mm_mul_ps(r20,vftabscale);
592 vfitab = _mm_cvttps_epi32(rt);
593 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
594 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
596 /* CUBIC SPLINE TABLE ELECTROSTATICS */
597 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
598 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
599 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
600 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
601 _MM_TRANSPOSE4_PS(Y,F,G,H);
602 Heps = _mm_mul_ps(vfeps,H);
603 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
604 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
605 velec = _mm_mul_ps(qq20,VV);
606 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
607 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velecsum = _mm_add_ps(velecsum,velec);
614 /* Calculate temporary vectorial force */
615 tx = _mm_mul_ps(fscal,dx20);
616 ty = _mm_mul_ps(fscal,dy20);
617 tz = _mm_mul_ps(fscal,dz20);
619 /* Update vectorial force */
620 fix2 = _mm_add_ps(fix2,tx);
621 fiy2 = _mm_add_ps(fiy2,ty);
622 fiz2 = _mm_add_ps(fiz2,tz);
624 fjx0 = _mm_add_ps(fjx0,tx);
625 fjy0 = _mm_add_ps(fjy0,ty);
626 fjz0 = _mm_add_ps(fjz0,tz);
628 /**************************
629 * CALCULATE INTERACTIONS *
630 **************************/
632 r21 = _mm_mul_ps(rsq21,rinv21);
634 /* Calculate table index by multiplying r with table scale and truncate to integer */
635 rt = _mm_mul_ps(r21,vftabscale);
636 vfitab = _mm_cvttps_epi32(rt);
637 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
638 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
640 /* CUBIC SPLINE TABLE ELECTROSTATICS */
641 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
642 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
643 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
644 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
645 _MM_TRANSPOSE4_PS(Y,F,G,H);
646 Heps = _mm_mul_ps(vfeps,H);
647 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
648 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
649 velec = _mm_mul_ps(qq21,VV);
650 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
651 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
653 /* Update potential sum for this i atom from the interaction with this j atom. */
654 velecsum = _mm_add_ps(velecsum,velec);
658 /* Calculate temporary vectorial force */
659 tx = _mm_mul_ps(fscal,dx21);
660 ty = _mm_mul_ps(fscal,dy21);
661 tz = _mm_mul_ps(fscal,dz21);
663 /* Update vectorial force */
664 fix2 = _mm_add_ps(fix2,tx);
665 fiy2 = _mm_add_ps(fiy2,ty);
666 fiz2 = _mm_add_ps(fiz2,tz);
668 fjx1 = _mm_add_ps(fjx1,tx);
669 fjy1 = _mm_add_ps(fjy1,ty);
670 fjz1 = _mm_add_ps(fjz1,tz);
672 /**************************
673 * CALCULATE INTERACTIONS *
674 **************************/
676 r22 = _mm_mul_ps(rsq22,rinv22);
678 /* Calculate table index by multiplying r with table scale and truncate to integer */
679 rt = _mm_mul_ps(r22,vftabscale);
680 vfitab = _mm_cvttps_epi32(rt);
681 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
682 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
684 /* CUBIC SPLINE TABLE ELECTROSTATICS */
685 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
686 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
687 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
688 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
689 _MM_TRANSPOSE4_PS(Y,F,G,H);
690 Heps = _mm_mul_ps(vfeps,H);
691 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
692 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
693 velec = _mm_mul_ps(qq22,VV);
694 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
695 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
697 /* Update potential sum for this i atom from the interaction with this j atom. */
698 velecsum = _mm_add_ps(velecsum,velec);
702 /* Calculate temporary vectorial force */
703 tx = _mm_mul_ps(fscal,dx22);
704 ty = _mm_mul_ps(fscal,dy22);
705 tz = _mm_mul_ps(fscal,dz22);
707 /* Update vectorial force */
708 fix2 = _mm_add_ps(fix2,tx);
709 fiy2 = _mm_add_ps(fiy2,ty);
710 fiz2 = _mm_add_ps(fiz2,tz);
712 fjx2 = _mm_add_ps(fjx2,tx);
713 fjy2 = _mm_add_ps(fjy2,ty);
714 fjz2 = _mm_add_ps(fjz2,tz);
716 fjptrA = f+j_coord_offsetA;
717 fjptrB = f+j_coord_offsetB;
718 fjptrC = f+j_coord_offsetC;
719 fjptrD = f+j_coord_offsetD;
721 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
722 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
724 /* Inner loop uses 417 flops */
730 /* Get j neighbor index, and coordinate index */
731 jnrlistA = jjnr[jidx];
732 jnrlistB = jjnr[jidx+1];
733 jnrlistC = jjnr[jidx+2];
734 jnrlistD = jjnr[jidx+3];
735 /* Sign of each element will be negative for non-real atoms.
736 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
737 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
739 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
740 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
741 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
742 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
743 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
744 j_coord_offsetA = DIM*jnrA;
745 j_coord_offsetB = DIM*jnrB;
746 j_coord_offsetC = DIM*jnrC;
747 j_coord_offsetD = DIM*jnrD;
749 /* load j atom coordinates */
750 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
751 x+j_coord_offsetC,x+j_coord_offsetD,
752 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
754 /* Calculate displacement vector */
755 dx00 = _mm_sub_ps(ix0,jx0);
756 dy00 = _mm_sub_ps(iy0,jy0);
757 dz00 = _mm_sub_ps(iz0,jz0);
758 dx01 = _mm_sub_ps(ix0,jx1);
759 dy01 = _mm_sub_ps(iy0,jy1);
760 dz01 = _mm_sub_ps(iz0,jz1);
761 dx02 = _mm_sub_ps(ix0,jx2);
762 dy02 = _mm_sub_ps(iy0,jy2);
763 dz02 = _mm_sub_ps(iz0,jz2);
764 dx10 = _mm_sub_ps(ix1,jx0);
765 dy10 = _mm_sub_ps(iy1,jy0);
766 dz10 = _mm_sub_ps(iz1,jz0);
767 dx11 = _mm_sub_ps(ix1,jx1);
768 dy11 = _mm_sub_ps(iy1,jy1);
769 dz11 = _mm_sub_ps(iz1,jz1);
770 dx12 = _mm_sub_ps(ix1,jx2);
771 dy12 = _mm_sub_ps(iy1,jy2);
772 dz12 = _mm_sub_ps(iz1,jz2);
773 dx20 = _mm_sub_ps(ix2,jx0);
774 dy20 = _mm_sub_ps(iy2,jy0);
775 dz20 = _mm_sub_ps(iz2,jz0);
776 dx21 = _mm_sub_ps(ix2,jx1);
777 dy21 = _mm_sub_ps(iy2,jy1);
778 dz21 = _mm_sub_ps(iz2,jz1);
779 dx22 = _mm_sub_ps(ix2,jx2);
780 dy22 = _mm_sub_ps(iy2,jy2);
781 dz22 = _mm_sub_ps(iz2,jz2);
783 /* Calculate squared distance and things based on it */
784 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
785 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
786 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
787 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
788 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
789 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
790 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
791 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
792 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
794 rinv00 = gmx_mm_invsqrt_ps(rsq00);
795 rinv01 = gmx_mm_invsqrt_ps(rsq01);
796 rinv02 = gmx_mm_invsqrt_ps(rsq02);
797 rinv10 = gmx_mm_invsqrt_ps(rsq10);
798 rinv11 = gmx_mm_invsqrt_ps(rsq11);
799 rinv12 = gmx_mm_invsqrt_ps(rsq12);
800 rinv20 = gmx_mm_invsqrt_ps(rsq20);
801 rinv21 = gmx_mm_invsqrt_ps(rsq21);
802 rinv22 = gmx_mm_invsqrt_ps(rsq22);
804 fjx0 = _mm_setzero_ps();
805 fjy0 = _mm_setzero_ps();
806 fjz0 = _mm_setzero_ps();
807 fjx1 = _mm_setzero_ps();
808 fjy1 = _mm_setzero_ps();
809 fjz1 = _mm_setzero_ps();
810 fjx2 = _mm_setzero_ps();
811 fjy2 = _mm_setzero_ps();
812 fjz2 = _mm_setzero_ps();
814 /**************************
815 * CALCULATE INTERACTIONS *
816 **************************/
818 r00 = _mm_mul_ps(rsq00,rinv00);
819 r00 = _mm_andnot_ps(dummy_mask,r00);
821 /* Calculate table index by multiplying r with table scale and truncate to integer */
822 rt = _mm_mul_ps(r00,vftabscale);
823 vfitab = _mm_cvttps_epi32(rt);
824 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
825 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
827 /* CUBIC SPLINE TABLE ELECTROSTATICS */
828 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
829 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
830 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
831 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
832 _MM_TRANSPOSE4_PS(Y,F,G,H);
833 Heps = _mm_mul_ps(vfeps,H);
834 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
835 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
836 velec = _mm_mul_ps(qq00,VV);
837 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
838 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
840 /* CUBIC SPLINE TABLE DISPERSION */
841 vfitab = _mm_add_epi32(vfitab,ifour);
842 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
843 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
844 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
845 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
846 _MM_TRANSPOSE4_PS(Y,F,G,H);
847 Heps = _mm_mul_ps(vfeps,H);
848 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
849 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
850 vvdw6 = _mm_mul_ps(c6_00,VV);
851 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
852 fvdw6 = _mm_mul_ps(c6_00,FF);
854 /* CUBIC SPLINE TABLE REPULSION */
855 vfitab = _mm_add_epi32(vfitab,ifour);
856 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
857 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
858 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
859 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
860 _MM_TRANSPOSE4_PS(Y,F,G,H);
861 Heps = _mm_mul_ps(vfeps,H);
862 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
863 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
864 vvdw12 = _mm_mul_ps(c12_00,VV);
865 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
866 fvdw12 = _mm_mul_ps(c12_00,FF);
867 vvdw = _mm_add_ps(vvdw12,vvdw6);
868 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec = _mm_andnot_ps(dummy_mask,velec);
872 velecsum = _mm_add_ps(velecsum,velec);
873 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
874 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
876 fscal = _mm_add_ps(felec,fvdw);
878 fscal = _mm_andnot_ps(dummy_mask,fscal);
880 /* Calculate temporary vectorial force */
881 tx = _mm_mul_ps(fscal,dx00);
882 ty = _mm_mul_ps(fscal,dy00);
883 tz = _mm_mul_ps(fscal,dz00);
885 /* Update vectorial force */
886 fix0 = _mm_add_ps(fix0,tx);
887 fiy0 = _mm_add_ps(fiy0,ty);
888 fiz0 = _mm_add_ps(fiz0,tz);
890 fjx0 = _mm_add_ps(fjx0,tx);
891 fjy0 = _mm_add_ps(fjy0,ty);
892 fjz0 = _mm_add_ps(fjz0,tz);
894 /**************************
895 * CALCULATE INTERACTIONS *
896 **************************/
898 r01 = _mm_mul_ps(rsq01,rinv01);
899 r01 = _mm_andnot_ps(dummy_mask,r01);
901 /* Calculate table index by multiplying r with table scale and truncate to integer */
902 rt = _mm_mul_ps(r01,vftabscale);
903 vfitab = _mm_cvttps_epi32(rt);
904 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
905 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
907 /* CUBIC SPLINE TABLE ELECTROSTATICS */
908 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
909 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
910 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
911 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
912 _MM_TRANSPOSE4_PS(Y,F,G,H);
913 Heps = _mm_mul_ps(vfeps,H);
914 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
915 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
916 velec = _mm_mul_ps(qq01,VV);
917 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
918 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
920 /* Update potential sum for this i atom from the interaction with this j atom. */
921 velec = _mm_andnot_ps(dummy_mask,velec);
922 velecsum = _mm_add_ps(velecsum,velec);
926 fscal = _mm_andnot_ps(dummy_mask,fscal);
928 /* Calculate temporary vectorial force */
929 tx = _mm_mul_ps(fscal,dx01);
930 ty = _mm_mul_ps(fscal,dy01);
931 tz = _mm_mul_ps(fscal,dz01);
933 /* Update vectorial force */
934 fix0 = _mm_add_ps(fix0,tx);
935 fiy0 = _mm_add_ps(fiy0,ty);
936 fiz0 = _mm_add_ps(fiz0,tz);
938 fjx1 = _mm_add_ps(fjx1,tx);
939 fjy1 = _mm_add_ps(fjy1,ty);
940 fjz1 = _mm_add_ps(fjz1,tz);
942 /**************************
943 * CALCULATE INTERACTIONS *
944 **************************/
946 r02 = _mm_mul_ps(rsq02,rinv02);
947 r02 = _mm_andnot_ps(dummy_mask,r02);
949 /* Calculate table index by multiplying r with table scale and truncate to integer */
950 rt = _mm_mul_ps(r02,vftabscale);
951 vfitab = _mm_cvttps_epi32(rt);
952 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
953 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
955 /* CUBIC SPLINE TABLE ELECTROSTATICS */
956 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
957 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
958 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
959 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
960 _MM_TRANSPOSE4_PS(Y,F,G,H);
961 Heps = _mm_mul_ps(vfeps,H);
962 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
963 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
964 velec = _mm_mul_ps(qq02,VV);
965 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
966 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
968 /* Update potential sum for this i atom from the interaction with this j atom. */
969 velec = _mm_andnot_ps(dummy_mask,velec);
970 velecsum = _mm_add_ps(velecsum,velec);
974 fscal = _mm_andnot_ps(dummy_mask,fscal);
976 /* Calculate temporary vectorial force */
977 tx = _mm_mul_ps(fscal,dx02);
978 ty = _mm_mul_ps(fscal,dy02);
979 tz = _mm_mul_ps(fscal,dz02);
981 /* Update vectorial force */
982 fix0 = _mm_add_ps(fix0,tx);
983 fiy0 = _mm_add_ps(fiy0,ty);
984 fiz0 = _mm_add_ps(fiz0,tz);
986 fjx2 = _mm_add_ps(fjx2,tx);
987 fjy2 = _mm_add_ps(fjy2,ty);
988 fjz2 = _mm_add_ps(fjz2,tz);
990 /**************************
991 * CALCULATE INTERACTIONS *
992 **************************/
994 r10 = _mm_mul_ps(rsq10,rinv10);
995 r10 = _mm_andnot_ps(dummy_mask,r10);
997 /* Calculate table index by multiplying r with table scale and truncate to integer */
998 rt = _mm_mul_ps(r10,vftabscale);
999 vfitab = _mm_cvttps_epi32(rt);
1000 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1001 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1003 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1004 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1005 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1006 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1007 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1008 _MM_TRANSPOSE4_PS(Y,F,G,H);
1009 Heps = _mm_mul_ps(vfeps,H);
1010 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1011 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1012 velec = _mm_mul_ps(qq10,VV);
1013 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1014 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1016 /* Update potential sum for this i atom from the interaction with this j atom. */
1017 velec = _mm_andnot_ps(dummy_mask,velec);
1018 velecsum = _mm_add_ps(velecsum,velec);
1022 fscal = _mm_andnot_ps(dummy_mask,fscal);
1024 /* Calculate temporary vectorial force */
1025 tx = _mm_mul_ps(fscal,dx10);
1026 ty = _mm_mul_ps(fscal,dy10);
1027 tz = _mm_mul_ps(fscal,dz10);
1029 /* Update vectorial force */
1030 fix1 = _mm_add_ps(fix1,tx);
1031 fiy1 = _mm_add_ps(fiy1,ty);
1032 fiz1 = _mm_add_ps(fiz1,tz);
1034 fjx0 = _mm_add_ps(fjx0,tx);
1035 fjy0 = _mm_add_ps(fjy0,ty);
1036 fjz0 = _mm_add_ps(fjz0,tz);
1038 /**************************
1039 * CALCULATE INTERACTIONS *
1040 **************************/
1042 r11 = _mm_mul_ps(rsq11,rinv11);
1043 r11 = _mm_andnot_ps(dummy_mask,r11);
1045 /* Calculate table index by multiplying r with table scale and truncate to integer */
1046 rt = _mm_mul_ps(r11,vftabscale);
1047 vfitab = _mm_cvttps_epi32(rt);
1048 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1049 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1051 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1052 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1053 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1054 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1055 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1056 _MM_TRANSPOSE4_PS(Y,F,G,H);
1057 Heps = _mm_mul_ps(vfeps,H);
1058 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1059 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1060 velec = _mm_mul_ps(qq11,VV);
1061 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1062 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1064 /* Update potential sum for this i atom from the interaction with this j atom. */
1065 velec = _mm_andnot_ps(dummy_mask,velec);
1066 velecsum = _mm_add_ps(velecsum,velec);
1070 fscal = _mm_andnot_ps(dummy_mask,fscal);
1072 /* Calculate temporary vectorial force */
1073 tx = _mm_mul_ps(fscal,dx11);
1074 ty = _mm_mul_ps(fscal,dy11);
1075 tz = _mm_mul_ps(fscal,dz11);
1077 /* Update vectorial force */
1078 fix1 = _mm_add_ps(fix1,tx);
1079 fiy1 = _mm_add_ps(fiy1,ty);
1080 fiz1 = _mm_add_ps(fiz1,tz);
1082 fjx1 = _mm_add_ps(fjx1,tx);
1083 fjy1 = _mm_add_ps(fjy1,ty);
1084 fjz1 = _mm_add_ps(fjz1,tz);
1086 /**************************
1087 * CALCULATE INTERACTIONS *
1088 **************************/
1090 r12 = _mm_mul_ps(rsq12,rinv12);
1091 r12 = _mm_andnot_ps(dummy_mask,r12);
1093 /* Calculate table index by multiplying r with table scale and truncate to integer */
1094 rt = _mm_mul_ps(r12,vftabscale);
1095 vfitab = _mm_cvttps_epi32(rt);
1096 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1097 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1099 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1100 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1101 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1102 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1103 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1104 _MM_TRANSPOSE4_PS(Y,F,G,H);
1105 Heps = _mm_mul_ps(vfeps,H);
1106 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1107 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1108 velec = _mm_mul_ps(qq12,VV);
1109 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1110 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1112 /* Update potential sum for this i atom from the interaction with this j atom. */
1113 velec = _mm_andnot_ps(dummy_mask,velec);
1114 velecsum = _mm_add_ps(velecsum,velec);
1118 fscal = _mm_andnot_ps(dummy_mask,fscal);
1120 /* Calculate temporary vectorial force */
1121 tx = _mm_mul_ps(fscal,dx12);
1122 ty = _mm_mul_ps(fscal,dy12);
1123 tz = _mm_mul_ps(fscal,dz12);
1125 /* Update vectorial force */
1126 fix1 = _mm_add_ps(fix1,tx);
1127 fiy1 = _mm_add_ps(fiy1,ty);
1128 fiz1 = _mm_add_ps(fiz1,tz);
1130 fjx2 = _mm_add_ps(fjx2,tx);
1131 fjy2 = _mm_add_ps(fjy2,ty);
1132 fjz2 = _mm_add_ps(fjz2,tz);
1134 /**************************
1135 * CALCULATE INTERACTIONS *
1136 **************************/
1138 r20 = _mm_mul_ps(rsq20,rinv20);
1139 r20 = _mm_andnot_ps(dummy_mask,r20);
1141 /* Calculate table index by multiplying r with table scale and truncate to integer */
1142 rt = _mm_mul_ps(r20,vftabscale);
1143 vfitab = _mm_cvttps_epi32(rt);
1144 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1145 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1147 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1148 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1149 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1150 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1151 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1152 _MM_TRANSPOSE4_PS(Y,F,G,H);
1153 Heps = _mm_mul_ps(vfeps,H);
1154 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1155 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1156 velec = _mm_mul_ps(qq20,VV);
1157 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1158 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1160 /* Update potential sum for this i atom from the interaction with this j atom. */
1161 velec = _mm_andnot_ps(dummy_mask,velec);
1162 velecsum = _mm_add_ps(velecsum,velec);
1166 fscal = _mm_andnot_ps(dummy_mask,fscal);
1168 /* Calculate temporary vectorial force */
1169 tx = _mm_mul_ps(fscal,dx20);
1170 ty = _mm_mul_ps(fscal,dy20);
1171 tz = _mm_mul_ps(fscal,dz20);
1173 /* Update vectorial force */
1174 fix2 = _mm_add_ps(fix2,tx);
1175 fiy2 = _mm_add_ps(fiy2,ty);
1176 fiz2 = _mm_add_ps(fiz2,tz);
1178 fjx0 = _mm_add_ps(fjx0,tx);
1179 fjy0 = _mm_add_ps(fjy0,ty);
1180 fjz0 = _mm_add_ps(fjz0,tz);
1182 /**************************
1183 * CALCULATE INTERACTIONS *
1184 **************************/
1186 r21 = _mm_mul_ps(rsq21,rinv21);
1187 r21 = _mm_andnot_ps(dummy_mask,r21);
1189 /* Calculate table index by multiplying r with table scale and truncate to integer */
1190 rt = _mm_mul_ps(r21,vftabscale);
1191 vfitab = _mm_cvttps_epi32(rt);
1192 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1193 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1195 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1196 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1197 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1198 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1199 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1200 _MM_TRANSPOSE4_PS(Y,F,G,H);
1201 Heps = _mm_mul_ps(vfeps,H);
1202 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1203 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1204 velec = _mm_mul_ps(qq21,VV);
1205 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1206 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1208 /* Update potential sum for this i atom from the interaction with this j atom. */
1209 velec = _mm_andnot_ps(dummy_mask,velec);
1210 velecsum = _mm_add_ps(velecsum,velec);
1214 fscal = _mm_andnot_ps(dummy_mask,fscal);
1216 /* Calculate temporary vectorial force */
1217 tx = _mm_mul_ps(fscal,dx21);
1218 ty = _mm_mul_ps(fscal,dy21);
1219 tz = _mm_mul_ps(fscal,dz21);
1221 /* Update vectorial force */
1222 fix2 = _mm_add_ps(fix2,tx);
1223 fiy2 = _mm_add_ps(fiy2,ty);
1224 fiz2 = _mm_add_ps(fiz2,tz);
1226 fjx1 = _mm_add_ps(fjx1,tx);
1227 fjy1 = _mm_add_ps(fjy1,ty);
1228 fjz1 = _mm_add_ps(fjz1,tz);
1230 /**************************
1231 * CALCULATE INTERACTIONS *
1232 **************************/
1234 r22 = _mm_mul_ps(rsq22,rinv22);
1235 r22 = _mm_andnot_ps(dummy_mask,r22);
1237 /* Calculate table index by multiplying r with table scale and truncate to integer */
1238 rt = _mm_mul_ps(r22,vftabscale);
1239 vfitab = _mm_cvttps_epi32(rt);
1240 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1241 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1243 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1244 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1245 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1246 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1247 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1248 _MM_TRANSPOSE4_PS(Y,F,G,H);
1249 Heps = _mm_mul_ps(vfeps,H);
1250 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1251 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1252 velec = _mm_mul_ps(qq22,VV);
1253 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1254 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1256 /* Update potential sum for this i atom from the interaction with this j atom. */
1257 velec = _mm_andnot_ps(dummy_mask,velec);
1258 velecsum = _mm_add_ps(velecsum,velec);
1262 fscal = _mm_andnot_ps(dummy_mask,fscal);
1264 /* Calculate temporary vectorial force */
1265 tx = _mm_mul_ps(fscal,dx22);
1266 ty = _mm_mul_ps(fscal,dy22);
1267 tz = _mm_mul_ps(fscal,dz22);
1269 /* Update vectorial force */
1270 fix2 = _mm_add_ps(fix2,tx);
1271 fiy2 = _mm_add_ps(fiy2,ty);
1272 fiz2 = _mm_add_ps(fiz2,tz);
1274 fjx2 = _mm_add_ps(fjx2,tx);
1275 fjy2 = _mm_add_ps(fjy2,ty);
1276 fjz2 = _mm_add_ps(fjz2,tz);
1278 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1279 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1280 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1281 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1283 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1284 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1286 /* Inner loop uses 426 flops */
1289 /* End of innermost loop */
1291 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1292 f+i_coord_offset,fshift+i_shift_offset);
1295 /* Update potential energies */
1296 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1297 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1299 /* Increment number of inner iterations */
1300 inneriter += j_index_end - j_index_start;
1302 /* Outer loop uses 20 flops */
1305 /* Increment number of outer iterations */
1308 /* Update outer/inner flops */
1310 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
1313 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1314 * Electrostatics interaction: CubicSplineTable
1315 * VdW interaction: CubicSplineTable
1316 * Geometry: Water3-Water3
1317 * Calculate force/pot: Force
1320 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1321 (t_nblist * gmx_restrict nlist,
1322 rvec * gmx_restrict xx,
1323 rvec * gmx_restrict ff,
1324 t_forcerec * gmx_restrict fr,
1325 t_mdatoms * gmx_restrict mdatoms,
1326 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1327 t_nrnb * gmx_restrict nrnb)
1329 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1330 * just 0 for non-waters.
1331 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1332 * jnr indices corresponding to data put in the four positions in the SIMD register.
1334 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1335 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1336 int jnrA,jnrB,jnrC,jnrD;
1337 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1338 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1339 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1340 real rcutoff_scalar;
1341 real *shiftvec,*fshift,*x,*f;
1342 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1343 real scratch[4*DIM];
1344 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1346 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1348 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1350 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1351 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1352 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1353 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1354 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1355 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1356 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1357 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1358 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1359 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1360 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1361 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1362 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1363 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1364 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1365 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1366 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1369 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1372 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1373 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1375 __m128i ifour = _mm_set1_epi32(4);
1376 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1378 __m128 dummy_mask,cutoff_mask;
1379 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1380 __m128 one = _mm_set1_ps(1.0);
1381 __m128 two = _mm_set1_ps(2.0);
1387 jindex = nlist->jindex;
1389 shiftidx = nlist->shift;
1391 shiftvec = fr->shift_vec[0];
1392 fshift = fr->fshift[0];
1393 facel = _mm_set1_ps(fr->epsfac);
1394 charge = mdatoms->chargeA;
1395 nvdwtype = fr->ntype;
1396 vdwparam = fr->nbfp;
1397 vdwtype = mdatoms->typeA;
1399 vftab = kernel_data->table_elec_vdw->data;
1400 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1402 /* Setup water-specific parameters */
1403 inr = nlist->iinr[0];
1404 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1405 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1406 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1407 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1409 jq0 = _mm_set1_ps(charge[inr+0]);
1410 jq1 = _mm_set1_ps(charge[inr+1]);
1411 jq2 = _mm_set1_ps(charge[inr+2]);
1412 vdwjidx0A = 2*vdwtype[inr+0];
1413 qq00 = _mm_mul_ps(iq0,jq0);
1414 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1415 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1416 qq01 = _mm_mul_ps(iq0,jq1);
1417 qq02 = _mm_mul_ps(iq0,jq2);
1418 qq10 = _mm_mul_ps(iq1,jq0);
1419 qq11 = _mm_mul_ps(iq1,jq1);
1420 qq12 = _mm_mul_ps(iq1,jq2);
1421 qq20 = _mm_mul_ps(iq2,jq0);
1422 qq21 = _mm_mul_ps(iq2,jq1);
1423 qq22 = _mm_mul_ps(iq2,jq2);
1425 /* Avoid stupid compiler warnings */
1426 jnrA = jnrB = jnrC = jnrD = 0;
1427 j_coord_offsetA = 0;
1428 j_coord_offsetB = 0;
1429 j_coord_offsetC = 0;
1430 j_coord_offsetD = 0;
1435 for(iidx=0;iidx<4*DIM;iidx++)
1437 scratch[iidx] = 0.0;
1440 /* Start outer loop over neighborlists */
1441 for(iidx=0; iidx<nri; iidx++)
1443 /* Load shift vector for this list */
1444 i_shift_offset = DIM*shiftidx[iidx];
1446 /* Load limits for loop over neighbors */
1447 j_index_start = jindex[iidx];
1448 j_index_end = jindex[iidx+1];
1450 /* Get outer coordinate index */
1452 i_coord_offset = DIM*inr;
1454 /* Load i particle coords and add shift vector */
1455 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1456 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1458 fix0 = _mm_setzero_ps();
1459 fiy0 = _mm_setzero_ps();
1460 fiz0 = _mm_setzero_ps();
1461 fix1 = _mm_setzero_ps();
1462 fiy1 = _mm_setzero_ps();
1463 fiz1 = _mm_setzero_ps();
1464 fix2 = _mm_setzero_ps();
1465 fiy2 = _mm_setzero_ps();
1466 fiz2 = _mm_setzero_ps();
1468 /* Start inner kernel loop */
1469 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1472 /* Get j neighbor index, and coordinate index */
1474 jnrB = jjnr[jidx+1];
1475 jnrC = jjnr[jidx+2];
1476 jnrD = jjnr[jidx+3];
1477 j_coord_offsetA = DIM*jnrA;
1478 j_coord_offsetB = DIM*jnrB;
1479 j_coord_offsetC = DIM*jnrC;
1480 j_coord_offsetD = DIM*jnrD;
1482 /* load j atom coordinates */
1483 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1484 x+j_coord_offsetC,x+j_coord_offsetD,
1485 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1487 /* Calculate displacement vector */
1488 dx00 = _mm_sub_ps(ix0,jx0);
1489 dy00 = _mm_sub_ps(iy0,jy0);
1490 dz00 = _mm_sub_ps(iz0,jz0);
1491 dx01 = _mm_sub_ps(ix0,jx1);
1492 dy01 = _mm_sub_ps(iy0,jy1);
1493 dz01 = _mm_sub_ps(iz0,jz1);
1494 dx02 = _mm_sub_ps(ix0,jx2);
1495 dy02 = _mm_sub_ps(iy0,jy2);
1496 dz02 = _mm_sub_ps(iz0,jz2);
1497 dx10 = _mm_sub_ps(ix1,jx0);
1498 dy10 = _mm_sub_ps(iy1,jy0);
1499 dz10 = _mm_sub_ps(iz1,jz0);
1500 dx11 = _mm_sub_ps(ix1,jx1);
1501 dy11 = _mm_sub_ps(iy1,jy1);
1502 dz11 = _mm_sub_ps(iz1,jz1);
1503 dx12 = _mm_sub_ps(ix1,jx2);
1504 dy12 = _mm_sub_ps(iy1,jy2);
1505 dz12 = _mm_sub_ps(iz1,jz2);
1506 dx20 = _mm_sub_ps(ix2,jx0);
1507 dy20 = _mm_sub_ps(iy2,jy0);
1508 dz20 = _mm_sub_ps(iz2,jz0);
1509 dx21 = _mm_sub_ps(ix2,jx1);
1510 dy21 = _mm_sub_ps(iy2,jy1);
1511 dz21 = _mm_sub_ps(iz2,jz1);
1512 dx22 = _mm_sub_ps(ix2,jx2);
1513 dy22 = _mm_sub_ps(iy2,jy2);
1514 dz22 = _mm_sub_ps(iz2,jz2);
1516 /* Calculate squared distance and things based on it */
1517 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1518 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1519 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1520 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1521 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1522 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1523 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1524 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1525 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1527 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1528 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1529 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1530 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1531 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1532 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1533 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1534 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1535 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1537 fjx0 = _mm_setzero_ps();
1538 fjy0 = _mm_setzero_ps();
1539 fjz0 = _mm_setzero_ps();
1540 fjx1 = _mm_setzero_ps();
1541 fjy1 = _mm_setzero_ps();
1542 fjz1 = _mm_setzero_ps();
1543 fjx2 = _mm_setzero_ps();
1544 fjy2 = _mm_setzero_ps();
1545 fjz2 = _mm_setzero_ps();
1547 /**************************
1548 * CALCULATE INTERACTIONS *
1549 **************************/
1551 r00 = _mm_mul_ps(rsq00,rinv00);
1553 /* Calculate table index by multiplying r with table scale and truncate to integer */
1554 rt = _mm_mul_ps(r00,vftabscale);
1555 vfitab = _mm_cvttps_epi32(rt);
1556 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1557 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1559 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1560 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1561 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1562 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1563 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1564 _MM_TRANSPOSE4_PS(Y,F,G,H);
1565 Heps = _mm_mul_ps(vfeps,H);
1566 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1567 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1568 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1570 /* CUBIC SPLINE TABLE DISPERSION */
1571 vfitab = _mm_add_epi32(vfitab,ifour);
1572 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1573 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1574 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1575 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1576 _MM_TRANSPOSE4_PS(Y,F,G,H);
1577 Heps = _mm_mul_ps(vfeps,H);
1578 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1579 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1580 fvdw6 = _mm_mul_ps(c6_00,FF);
1582 /* CUBIC SPLINE TABLE REPULSION */
1583 vfitab = _mm_add_epi32(vfitab,ifour);
1584 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1585 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1586 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1587 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1588 _MM_TRANSPOSE4_PS(Y,F,G,H);
1589 Heps = _mm_mul_ps(vfeps,H);
1590 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1591 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1592 fvdw12 = _mm_mul_ps(c12_00,FF);
1593 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1595 fscal = _mm_add_ps(felec,fvdw);
1597 /* Calculate temporary vectorial force */
1598 tx = _mm_mul_ps(fscal,dx00);
1599 ty = _mm_mul_ps(fscal,dy00);
1600 tz = _mm_mul_ps(fscal,dz00);
1602 /* Update vectorial force */
1603 fix0 = _mm_add_ps(fix0,tx);
1604 fiy0 = _mm_add_ps(fiy0,ty);
1605 fiz0 = _mm_add_ps(fiz0,tz);
1607 fjx0 = _mm_add_ps(fjx0,tx);
1608 fjy0 = _mm_add_ps(fjy0,ty);
1609 fjz0 = _mm_add_ps(fjz0,tz);
1611 /**************************
1612 * CALCULATE INTERACTIONS *
1613 **************************/
1615 r01 = _mm_mul_ps(rsq01,rinv01);
1617 /* Calculate table index by multiplying r with table scale and truncate to integer */
1618 rt = _mm_mul_ps(r01,vftabscale);
1619 vfitab = _mm_cvttps_epi32(rt);
1620 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1621 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1623 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1624 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1625 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1626 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1627 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1628 _MM_TRANSPOSE4_PS(Y,F,G,H);
1629 Heps = _mm_mul_ps(vfeps,H);
1630 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1631 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1632 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1636 /* Calculate temporary vectorial force */
1637 tx = _mm_mul_ps(fscal,dx01);
1638 ty = _mm_mul_ps(fscal,dy01);
1639 tz = _mm_mul_ps(fscal,dz01);
1641 /* Update vectorial force */
1642 fix0 = _mm_add_ps(fix0,tx);
1643 fiy0 = _mm_add_ps(fiy0,ty);
1644 fiz0 = _mm_add_ps(fiz0,tz);
1646 fjx1 = _mm_add_ps(fjx1,tx);
1647 fjy1 = _mm_add_ps(fjy1,ty);
1648 fjz1 = _mm_add_ps(fjz1,tz);
1650 /**************************
1651 * CALCULATE INTERACTIONS *
1652 **************************/
1654 r02 = _mm_mul_ps(rsq02,rinv02);
1656 /* Calculate table index by multiplying r with table scale and truncate to integer */
1657 rt = _mm_mul_ps(r02,vftabscale);
1658 vfitab = _mm_cvttps_epi32(rt);
1659 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1660 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1662 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1663 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1664 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1665 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1666 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1667 _MM_TRANSPOSE4_PS(Y,F,G,H);
1668 Heps = _mm_mul_ps(vfeps,H);
1669 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1670 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1671 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1675 /* Calculate temporary vectorial force */
1676 tx = _mm_mul_ps(fscal,dx02);
1677 ty = _mm_mul_ps(fscal,dy02);
1678 tz = _mm_mul_ps(fscal,dz02);
1680 /* Update vectorial force */
1681 fix0 = _mm_add_ps(fix0,tx);
1682 fiy0 = _mm_add_ps(fiy0,ty);
1683 fiz0 = _mm_add_ps(fiz0,tz);
1685 fjx2 = _mm_add_ps(fjx2,tx);
1686 fjy2 = _mm_add_ps(fjy2,ty);
1687 fjz2 = _mm_add_ps(fjz2,tz);
1689 /**************************
1690 * CALCULATE INTERACTIONS *
1691 **************************/
1693 r10 = _mm_mul_ps(rsq10,rinv10);
1695 /* Calculate table index by multiplying r with table scale and truncate to integer */
1696 rt = _mm_mul_ps(r10,vftabscale);
1697 vfitab = _mm_cvttps_epi32(rt);
1698 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1699 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1701 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1702 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1703 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1704 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1705 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1706 _MM_TRANSPOSE4_PS(Y,F,G,H);
1707 Heps = _mm_mul_ps(vfeps,H);
1708 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1709 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1710 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1714 /* Calculate temporary vectorial force */
1715 tx = _mm_mul_ps(fscal,dx10);
1716 ty = _mm_mul_ps(fscal,dy10);
1717 tz = _mm_mul_ps(fscal,dz10);
1719 /* Update vectorial force */
1720 fix1 = _mm_add_ps(fix1,tx);
1721 fiy1 = _mm_add_ps(fiy1,ty);
1722 fiz1 = _mm_add_ps(fiz1,tz);
1724 fjx0 = _mm_add_ps(fjx0,tx);
1725 fjy0 = _mm_add_ps(fjy0,ty);
1726 fjz0 = _mm_add_ps(fjz0,tz);
1728 /**************************
1729 * CALCULATE INTERACTIONS *
1730 **************************/
1732 r11 = _mm_mul_ps(rsq11,rinv11);
1734 /* Calculate table index by multiplying r with table scale and truncate to integer */
1735 rt = _mm_mul_ps(r11,vftabscale);
1736 vfitab = _mm_cvttps_epi32(rt);
1737 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1738 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1740 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1741 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1742 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1743 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1744 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1745 _MM_TRANSPOSE4_PS(Y,F,G,H);
1746 Heps = _mm_mul_ps(vfeps,H);
1747 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1748 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1749 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1753 /* Calculate temporary vectorial force */
1754 tx = _mm_mul_ps(fscal,dx11);
1755 ty = _mm_mul_ps(fscal,dy11);
1756 tz = _mm_mul_ps(fscal,dz11);
1758 /* Update vectorial force */
1759 fix1 = _mm_add_ps(fix1,tx);
1760 fiy1 = _mm_add_ps(fiy1,ty);
1761 fiz1 = _mm_add_ps(fiz1,tz);
1763 fjx1 = _mm_add_ps(fjx1,tx);
1764 fjy1 = _mm_add_ps(fjy1,ty);
1765 fjz1 = _mm_add_ps(fjz1,tz);
1767 /**************************
1768 * CALCULATE INTERACTIONS *
1769 **************************/
1771 r12 = _mm_mul_ps(rsq12,rinv12);
1773 /* Calculate table index by multiplying r with table scale and truncate to integer */
1774 rt = _mm_mul_ps(r12,vftabscale);
1775 vfitab = _mm_cvttps_epi32(rt);
1776 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1777 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1779 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1780 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1781 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1782 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1783 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1784 _MM_TRANSPOSE4_PS(Y,F,G,H);
1785 Heps = _mm_mul_ps(vfeps,H);
1786 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1787 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1788 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1792 /* Calculate temporary vectorial force */
1793 tx = _mm_mul_ps(fscal,dx12);
1794 ty = _mm_mul_ps(fscal,dy12);
1795 tz = _mm_mul_ps(fscal,dz12);
1797 /* Update vectorial force */
1798 fix1 = _mm_add_ps(fix1,tx);
1799 fiy1 = _mm_add_ps(fiy1,ty);
1800 fiz1 = _mm_add_ps(fiz1,tz);
1802 fjx2 = _mm_add_ps(fjx2,tx);
1803 fjy2 = _mm_add_ps(fjy2,ty);
1804 fjz2 = _mm_add_ps(fjz2,tz);
1806 /**************************
1807 * CALCULATE INTERACTIONS *
1808 **************************/
1810 r20 = _mm_mul_ps(rsq20,rinv20);
1812 /* Calculate table index by multiplying r with table scale and truncate to integer */
1813 rt = _mm_mul_ps(r20,vftabscale);
1814 vfitab = _mm_cvttps_epi32(rt);
1815 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1816 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1818 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1819 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1820 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1821 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1822 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1823 _MM_TRANSPOSE4_PS(Y,F,G,H);
1824 Heps = _mm_mul_ps(vfeps,H);
1825 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1826 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1827 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1831 /* Calculate temporary vectorial force */
1832 tx = _mm_mul_ps(fscal,dx20);
1833 ty = _mm_mul_ps(fscal,dy20);
1834 tz = _mm_mul_ps(fscal,dz20);
1836 /* Update vectorial force */
1837 fix2 = _mm_add_ps(fix2,tx);
1838 fiy2 = _mm_add_ps(fiy2,ty);
1839 fiz2 = _mm_add_ps(fiz2,tz);
1841 fjx0 = _mm_add_ps(fjx0,tx);
1842 fjy0 = _mm_add_ps(fjy0,ty);
1843 fjz0 = _mm_add_ps(fjz0,tz);
1845 /**************************
1846 * CALCULATE INTERACTIONS *
1847 **************************/
1849 r21 = _mm_mul_ps(rsq21,rinv21);
1851 /* Calculate table index by multiplying r with table scale and truncate to integer */
1852 rt = _mm_mul_ps(r21,vftabscale);
1853 vfitab = _mm_cvttps_epi32(rt);
1854 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1855 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1857 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1858 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1859 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1860 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1861 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1862 _MM_TRANSPOSE4_PS(Y,F,G,H);
1863 Heps = _mm_mul_ps(vfeps,H);
1864 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1865 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1866 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1870 /* Calculate temporary vectorial force */
1871 tx = _mm_mul_ps(fscal,dx21);
1872 ty = _mm_mul_ps(fscal,dy21);
1873 tz = _mm_mul_ps(fscal,dz21);
1875 /* Update vectorial force */
1876 fix2 = _mm_add_ps(fix2,tx);
1877 fiy2 = _mm_add_ps(fiy2,ty);
1878 fiz2 = _mm_add_ps(fiz2,tz);
1880 fjx1 = _mm_add_ps(fjx1,tx);
1881 fjy1 = _mm_add_ps(fjy1,ty);
1882 fjz1 = _mm_add_ps(fjz1,tz);
1884 /**************************
1885 * CALCULATE INTERACTIONS *
1886 **************************/
1888 r22 = _mm_mul_ps(rsq22,rinv22);
1890 /* Calculate table index by multiplying r with table scale and truncate to integer */
1891 rt = _mm_mul_ps(r22,vftabscale);
1892 vfitab = _mm_cvttps_epi32(rt);
1893 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1894 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1896 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1897 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1898 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1899 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1900 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1901 _MM_TRANSPOSE4_PS(Y,F,G,H);
1902 Heps = _mm_mul_ps(vfeps,H);
1903 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1904 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1905 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1909 /* Calculate temporary vectorial force */
1910 tx = _mm_mul_ps(fscal,dx22);
1911 ty = _mm_mul_ps(fscal,dy22);
1912 tz = _mm_mul_ps(fscal,dz22);
1914 /* Update vectorial force */
1915 fix2 = _mm_add_ps(fix2,tx);
1916 fiy2 = _mm_add_ps(fiy2,ty);
1917 fiz2 = _mm_add_ps(fiz2,tz);
1919 fjx2 = _mm_add_ps(fjx2,tx);
1920 fjy2 = _mm_add_ps(fjy2,ty);
1921 fjz2 = _mm_add_ps(fjz2,tz);
1923 fjptrA = f+j_coord_offsetA;
1924 fjptrB = f+j_coord_offsetB;
1925 fjptrC = f+j_coord_offsetC;
1926 fjptrD = f+j_coord_offsetD;
1928 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1929 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1931 /* Inner loop uses 373 flops */
1934 if(jidx<j_index_end)
1937 /* Get j neighbor index, and coordinate index */
1938 jnrlistA = jjnr[jidx];
1939 jnrlistB = jjnr[jidx+1];
1940 jnrlistC = jjnr[jidx+2];
1941 jnrlistD = jjnr[jidx+3];
1942 /* Sign of each element will be negative for non-real atoms.
1943 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1944 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1946 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1947 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1948 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1949 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1950 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1951 j_coord_offsetA = DIM*jnrA;
1952 j_coord_offsetB = DIM*jnrB;
1953 j_coord_offsetC = DIM*jnrC;
1954 j_coord_offsetD = DIM*jnrD;
1956 /* load j atom coordinates */
1957 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1958 x+j_coord_offsetC,x+j_coord_offsetD,
1959 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1961 /* Calculate displacement vector */
1962 dx00 = _mm_sub_ps(ix0,jx0);
1963 dy00 = _mm_sub_ps(iy0,jy0);
1964 dz00 = _mm_sub_ps(iz0,jz0);
1965 dx01 = _mm_sub_ps(ix0,jx1);
1966 dy01 = _mm_sub_ps(iy0,jy1);
1967 dz01 = _mm_sub_ps(iz0,jz1);
1968 dx02 = _mm_sub_ps(ix0,jx2);
1969 dy02 = _mm_sub_ps(iy0,jy2);
1970 dz02 = _mm_sub_ps(iz0,jz2);
1971 dx10 = _mm_sub_ps(ix1,jx0);
1972 dy10 = _mm_sub_ps(iy1,jy0);
1973 dz10 = _mm_sub_ps(iz1,jz0);
1974 dx11 = _mm_sub_ps(ix1,jx1);
1975 dy11 = _mm_sub_ps(iy1,jy1);
1976 dz11 = _mm_sub_ps(iz1,jz1);
1977 dx12 = _mm_sub_ps(ix1,jx2);
1978 dy12 = _mm_sub_ps(iy1,jy2);
1979 dz12 = _mm_sub_ps(iz1,jz2);
1980 dx20 = _mm_sub_ps(ix2,jx0);
1981 dy20 = _mm_sub_ps(iy2,jy0);
1982 dz20 = _mm_sub_ps(iz2,jz0);
1983 dx21 = _mm_sub_ps(ix2,jx1);
1984 dy21 = _mm_sub_ps(iy2,jy1);
1985 dz21 = _mm_sub_ps(iz2,jz1);
1986 dx22 = _mm_sub_ps(ix2,jx2);
1987 dy22 = _mm_sub_ps(iy2,jy2);
1988 dz22 = _mm_sub_ps(iz2,jz2);
1990 /* Calculate squared distance and things based on it */
1991 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1992 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1993 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1994 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1995 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1996 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1997 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1998 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1999 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2001 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2002 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2003 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2004 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2005 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2006 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2007 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2008 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2009 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2011 fjx0 = _mm_setzero_ps();
2012 fjy0 = _mm_setzero_ps();
2013 fjz0 = _mm_setzero_ps();
2014 fjx1 = _mm_setzero_ps();
2015 fjy1 = _mm_setzero_ps();
2016 fjz1 = _mm_setzero_ps();
2017 fjx2 = _mm_setzero_ps();
2018 fjy2 = _mm_setzero_ps();
2019 fjz2 = _mm_setzero_ps();
2021 /**************************
2022 * CALCULATE INTERACTIONS *
2023 **************************/
2025 r00 = _mm_mul_ps(rsq00,rinv00);
2026 r00 = _mm_andnot_ps(dummy_mask,r00);
2028 /* Calculate table index by multiplying r with table scale and truncate to integer */
2029 rt = _mm_mul_ps(r00,vftabscale);
2030 vfitab = _mm_cvttps_epi32(rt);
2031 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2032 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2034 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2035 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2036 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2037 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2038 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2039 _MM_TRANSPOSE4_PS(Y,F,G,H);
2040 Heps = _mm_mul_ps(vfeps,H);
2041 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2042 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2043 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
2045 /* CUBIC SPLINE TABLE DISPERSION */
2046 vfitab = _mm_add_epi32(vfitab,ifour);
2047 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2048 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2049 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2050 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2051 _MM_TRANSPOSE4_PS(Y,F,G,H);
2052 Heps = _mm_mul_ps(vfeps,H);
2053 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2054 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2055 fvdw6 = _mm_mul_ps(c6_00,FF);
2057 /* CUBIC SPLINE TABLE REPULSION */
2058 vfitab = _mm_add_epi32(vfitab,ifour);
2059 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2060 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2061 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2062 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2063 _MM_TRANSPOSE4_PS(Y,F,G,H);
2064 Heps = _mm_mul_ps(vfeps,H);
2065 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2066 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2067 fvdw12 = _mm_mul_ps(c12_00,FF);
2068 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2070 fscal = _mm_add_ps(felec,fvdw);
2072 fscal = _mm_andnot_ps(dummy_mask,fscal);
2074 /* Calculate temporary vectorial force */
2075 tx = _mm_mul_ps(fscal,dx00);
2076 ty = _mm_mul_ps(fscal,dy00);
2077 tz = _mm_mul_ps(fscal,dz00);
2079 /* Update vectorial force */
2080 fix0 = _mm_add_ps(fix0,tx);
2081 fiy0 = _mm_add_ps(fiy0,ty);
2082 fiz0 = _mm_add_ps(fiz0,tz);
2084 fjx0 = _mm_add_ps(fjx0,tx);
2085 fjy0 = _mm_add_ps(fjy0,ty);
2086 fjz0 = _mm_add_ps(fjz0,tz);
2088 /**************************
2089 * CALCULATE INTERACTIONS *
2090 **************************/
2092 r01 = _mm_mul_ps(rsq01,rinv01);
2093 r01 = _mm_andnot_ps(dummy_mask,r01);
2095 /* Calculate table index by multiplying r with table scale and truncate to integer */
2096 rt = _mm_mul_ps(r01,vftabscale);
2097 vfitab = _mm_cvttps_epi32(rt);
2098 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2099 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2101 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2102 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2103 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2104 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2105 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2106 _MM_TRANSPOSE4_PS(Y,F,G,H);
2107 Heps = _mm_mul_ps(vfeps,H);
2108 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2109 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2110 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2114 fscal = _mm_andnot_ps(dummy_mask,fscal);
2116 /* Calculate temporary vectorial force */
2117 tx = _mm_mul_ps(fscal,dx01);
2118 ty = _mm_mul_ps(fscal,dy01);
2119 tz = _mm_mul_ps(fscal,dz01);
2121 /* Update vectorial force */
2122 fix0 = _mm_add_ps(fix0,tx);
2123 fiy0 = _mm_add_ps(fiy0,ty);
2124 fiz0 = _mm_add_ps(fiz0,tz);
2126 fjx1 = _mm_add_ps(fjx1,tx);
2127 fjy1 = _mm_add_ps(fjy1,ty);
2128 fjz1 = _mm_add_ps(fjz1,tz);
2130 /**************************
2131 * CALCULATE INTERACTIONS *
2132 **************************/
2134 r02 = _mm_mul_ps(rsq02,rinv02);
2135 r02 = _mm_andnot_ps(dummy_mask,r02);
2137 /* Calculate table index by multiplying r with table scale and truncate to integer */
2138 rt = _mm_mul_ps(r02,vftabscale);
2139 vfitab = _mm_cvttps_epi32(rt);
2140 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2141 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2143 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2144 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2145 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2146 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2147 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2148 _MM_TRANSPOSE4_PS(Y,F,G,H);
2149 Heps = _mm_mul_ps(vfeps,H);
2150 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2151 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2152 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2156 fscal = _mm_andnot_ps(dummy_mask,fscal);
2158 /* Calculate temporary vectorial force */
2159 tx = _mm_mul_ps(fscal,dx02);
2160 ty = _mm_mul_ps(fscal,dy02);
2161 tz = _mm_mul_ps(fscal,dz02);
2163 /* Update vectorial force */
2164 fix0 = _mm_add_ps(fix0,tx);
2165 fiy0 = _mm_add_ps(fiy0,ty);
2166 fiz0 = _mm_add_ps(fiz0,tz);
2168 fjx2 = _mm_add_ps(fjx2,tx);
2169 fjy2 = _mm_add_ps(fjy2,ty);
2170 fjz2 = _mm_add_ps(fjz2,tz);
2172 /**************************
2173 * CALCULATE INTERACTIONS *
2174 **************************/
2176 r10 = _mm_mul_ps(rsq10,rinv10);
2177 r10 = _mm_andnot_ps(dummy_mask,r10);
2179 /* Calculate table index by multiplying r with table scale and truncate to integer */
2180 rt = _mm_mul_ps(r10,vftabscale);
2181 vfitab = _mm_cvttps_epi32(rt);
2182 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2183 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2185 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2186 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2187 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2188 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2189 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2190 _MM_TRANSPOSE4_PS(Y,F,G,H);
2191 Heps = _mm_mul_ps(vfeps,H);
2192 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2193 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2194 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2198 fscal = _mm_andnot_ps(dummy_mask,fscal);
2200 /* Calculate temporary vectorial force */
2201 tx = _mm_mul_ps(fscal,dx10);
2202 ty = _mm_mul_ps(fscal,dy10);
2203 tz = _mm_mul_ps(fscal,dz10);
2205 /* Update vectorial force */
2206 fix1 = _mm_add_ps(fix1,tx);
2207 fiy1 = _mm_add_ps(fiy1,ty);
2208 fiz1 = _mm_add_ps(fiz1,tz);
2210 fjx0 = _mm_add_ps(fjx0,tx);
2211 fjy0 = _mm_add_ps(fjy0,ty);
2212 fjz0 = _mm_add_ps(fjz0,tz);
2214 /**************************
2215 * CALCULATE INTERACTIONS *
2216 **************************/
2218 r11 = _mm_mul_ps(rsq11,rinv11);
2219 r11 = _mm_andnot_ps(dummy_mask,r11);
2221 /* Calculate table index by multiplying r with table scale and truncate to integer */
2222 rt = _mm_mul_ps(r11,vftabscale);
2223 vfitab = _mm_cvttps_epi32(rt);
2224 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2225 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2227 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2228 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2229 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2230 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2231 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2232 _MM_TRANSPOSE4_PS(Y,F,G,H);
2233 Heps = _mm_mul_ps(vfeps,H);
2234 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2235 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2236 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2240 fscal = _mm_andnot_ps(dummy_mask,fscal);
2242 /* Calculate temporary vectorial force */
2243 tx = _mm_mul_ps(fscal,dx11);
2244 ty = _mm_mul_ps(fscal,dy11);
2245 tz = _mm_mul_ps(fscal,dz11);
2247 /* Update vectorial force */
2248 fix1 = _mm_add_ps(fix1,tx);
2249 fiy1 = _mm_add_ps(fiy1,ty);
2250 fiz1 = _mm_add_ps(fiz1,tz);
2252 fjx1 = _mm_add_ps(fjx1,tx);
2253 fjy1 = _mm_add_ps(fjy1,ty);
2254 fjz1 = _mm_add_ps(fjz1,tz);
2256 /**************************
2257 * CALCULATE INTERACTIONS *
2258 **************************/
2260 r12 = _mm_mul_ps(rsq12,rinv12);
2261 r12 = _mm_andnot_ps(dummy_mask,r12);
2263 /* Calculate table index by multiplying r with table scale and truncate to integer */
2264 rt = _mm_mul_ps(r12,vftabscale);
2265 vfitab = _mm_cvttps_epi32(rt);
2266 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2267 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2269 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2270 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2271 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2272 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2273 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2274 _MM_TRANSPOSE4_PS(Y,F,G,H);
2275 Heps = _mm_mul_ps(vfeps,H);
2276 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2277 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2278 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2282 fscal = _mm_andnot_ps(dummy_mask,fscal);
2284 /* Calculate temporary vectorial force */
2285 tx = _mm_mul_ps(fscal,dx12);
2286 ty = _mm_mul_ps(fscal,dy12);
2287 tz = _mm_mul_ps(fscal,dz12);
2289 /* Update vectorial force */
2290 fix1 = _mm_add_ps(fix1,tx);
2291 fiy1 = _mm_add_ps(fiy1,ty);
2292 fiz1 = _mm_add_ps(fiz1,tz);
2294 fjx2 = _mm_add_ps(fjx2,tx);
2295 fjy2 = _mm_add_ps(fjy2,ty);
2296 fjz2 = _mm_add_ps(fjz2,tz);
2298 /**************************
2299 * CALCULATE INTERACTIONS *
2300 **************************/
2302 r20 = _mm_mul_ps(rsq20,rinv20);
2303 r20 = _mm_andnot_ps(dummy_mask,r20);
2305 /* Calculate table index by multiplying r with table scale and truncate to integer */
2306 rt = _mm_mul_ps(r20,vftabscale);
2307 vfitab = _mm_cvttps_epi32(rt);
2308 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2309 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2311 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2312 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2313 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2314 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2315 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2316 _MM_TRANSPOSE4_PS(Y,F,G,H);
2317 Heps = _mm_mul_ps(vfeps,H);
2318 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2319 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2320 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2324 fscal = _mm_andnot_ps(dummy_mask,fscal);
2326 /* Calculate temporary vectorial force */
2327 tx = _mm_mul_ps(fscal,dx20);
2328 ty = _mm_mul_ps(fscal,dy20);
2329 tz = _mm_mul_ps(fscal,dz20);
2331 /* Update vectorial force */
2332 fix2 = _mm_add_ps(fix2,tx);
2333 fiy2 = _mm_add_ps(fiy2,ty);
2334 fiz2 = _mm_add_ps(fiz2,tz);
2336 fjx0 = _mm_add_ps(fjx0,tx);
2337 fjy0 = _mm_add_ps(fjy0,ty);
2338 fjz0 = _mm_add_ps(fjz0,tz);
2340 /**************************
2341 * CALCULATE INTERACTIONS *
2342 **************************/
2344 r21 = _mm_mul_ps(rsq21,rinv21);
2345 r21 = _mm_andnot_ps(dummy_mask,r21);
2347 /* Calculate table index by multiplying r with table scale and truncate to integer */
2348 rt = _mm_mul_ps(r21,vftabscale);
2349 vfitab = _mm_cvttps_epi32(rt);
2350 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2351 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2353 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2354 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2355 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2356 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2357 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2358 _MM_TRANSPOSE4_PS(Y,F,G,H);
2359 Heps = _mm_mul_ps(vfeps,H);
2360 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2361 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2362 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2366 fscal = _mm_andnot_ps(dummy_mask,fscal);
2368 /* Calculate temporary vectorial force */
2369 tx = _mm_mul_ps(fscal,dx21);
2370 ty = _mm_mul_ps(fscal,dy21);
2371 tz = _mm_mul_ps(fscal,dz21);
2373 /* Update vectorial force */
2374 fix2 = _mm_add_ps(fix2,tx);
2375 fiy2 = _mm_add_ps(fiy2,ty);
2376 fiz2 = _mm_add_ps(fiz2,tz);
2378 fjx1 = _mm_add_ps(fjx1,tx);
2379 fjy1 = _mm_add_ps(fjy1,ty);
2380 fjz1 = _mm_add_ps(fjz1,tz);
2382 /**************************
2383 * CALCULATE INTERACTIONS *
2384 **************************/
2386 r22 = _mm_mul_ps(rsq22,rinv22);
2387 r22 = _mm_andnot_ps(dummy_mask,r22);
2389 /* Calculate table index by multiplying r with table scale and truncate to integer */
2390 rt = _mm_mul_ps(r22,vftabscale);
2391 vfitab = _mm_cvttps_epi32(rt);
2392 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2393 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2395 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2396 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2397 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2398 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2399 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2400 _MM_TRANSPOSE4_PS(Y,F,G,H);
2401 Heps = _mm_mul_ps(vfeps,H);
2402 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2403 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2404 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2408 fscal = _mm_andnot_ps(dummy_mask,fscal);
2410 /* Calculate temporary vectorial force */
2411 tx = _mm_mul_ps(fscal,dx22);
2412 ty = _mm_mul_ps(fscal,dy22);
2413 tz = _mm_mul_ps(fscal,dz22);
2415 /* Update vectorial force */
2416 fix2 = _mm_add_ps(fix2,tx);
2417 fiy2 = _mm_add_ps(fiy2,ty);
2418 fiz2 = _mm_add_ps(fiz2,tz);
2420 fjx2 = _mm_add_ps(fjx2,tx);
2421 fjy2 = _mm_add_ps(fjy2,ty);
2422 fjz2 = _mm_add_ps(fjz2,tz);
2424 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2425 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2426 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2427 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2429 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2430 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2432 /* Inner loop uses 382 flops */
2435 /* End of innermost loop */
2437 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2438 f+i_coord_offset,fshift+i_shift_offset);
2440 /* Increment number of inner iterations */
2441 inneriter += j_index_end - j_index_start;
2443 /* Outer loop uses 18 flops */
2446 /* Increment number of outer iterations */
2449 /* Update outer/inner flops */
2451 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);