2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
111 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
113 __m128i ifour = _mm_set1_epi32(4);
114 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
116 __m128 dummy_mask,cutoff_mask;
117 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118 __m128 one = _mm_set1_ps(1.0);
119 __m128 two = _mm_set1_ps(2.0);
125 jindex = nlist->jindex;
127 shiftidx = nlist->shift;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = _mm_set1_ps(fr->epsfac);
132 charge = mdatoms->chargeA;
133 nvdwtype = fr->ntype;
135 vdwtype = mdatoms->typeA;
137 vftab = kernel_data->table_elec->data;
138 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
140 /* Setup water-specific parameters */
141 inr = nlist->iinr[0];
142 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
143 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
144 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
145 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
147 jq0 = _mm_set1_ps(charge[inr+0]);
148 jq1 = _mm_set1_ps(charge[inr+1]);
149 jq2 = _mm_set1_ps(charge[inr+2]);
150 vdwjidx0A = 2*vdwtype[inr+0];
151 qq00 = _mm_mul_ps(iq0,jq0);
152 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq01 = _mm_mul_ps(iq0,jq1);
155 qq02 = _mm_mul_ps(iq0,jq2);
156 qq10 = _mm_mul_ps(iq1,jq0);
157 qq11 = _mm_mul_ps(iq1,jq1);
158 qq12 = _mm_mul_ps(iq1,jq2);
159 qq20 = _mm_mul_ps(iq2,jq0);
160 qq21 = _mm_mul_ps(iq2,jq1);
161 qq22 = _mm_mul_ps(iq2,jq2);
163 /* Avoid stupid compiler warnings */
164 jnrA = jnrB = jnrC = jnrD = 0;
173 for(iidx=0;iidx<4*DIM;iidx++)
178 /* Start outer loop over neighborlists */
179 for(iidx=0; iidx<nri; iidx++)
181 /* Load shift vector for this list */
182 i_shift_offset = DIM*shiftidx[iidx];
184 /* Load limits for loop over neighbors */
185 j_index_start = jindex[iidx];
186 j_index_end = jindex[iidx+1];
188 /* Get outer coordinate index */
190 i_coord_offset = DIM*inr;
192 /* Load i particle coords and add shift vector */
193 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
194 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
196 fix0 = _mm_setzero_ps();
197 fiy0 = _mm_setzero_ps();
198 fiz0 = _mm_setzero_ps();
199 fix1 = _mm_setzero_ps();
200 fiy1 = _mm_setzero_ps();
201 fiz1 = _mm_setzero_ps();
202 fix2 = _mm_setzero_ps();
203 fiy2 = _mm_setzero_ps();
204 fiz2 = _mm_setzero_ps();
206 /* Reset potential sums */
207 velecsum = _mm_setzero_ps();
208 vvdwsum = _mm_setzero_ps();
210 /* Start inner kernel loop */
211 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
214 /* Get j neighbor index, and coordinate index */
219 j_coord_offsetA = DIM*jnrA;
220 j_coord_offsetB = DIM*jnrB;
221 j_coord_offsetC = DIM*jnrC;
222 j_coord_offsetD = DIM*jnrD;
224 /* load j atom coordinates */
225 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
226 x+j_coord_offsetC,x+j_coord_offsetD,
227 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
229 /* Calculate displacement vector */
230 dx00 = _mm_sub_ps(ix0,jx0);
231 dy00 = _mm_sub_ps(iy0,jy0);
232 dz00 = _mm_sub_ps(iz0,jz0);
233 dx01 = _mm_sub_ps(ix0,jx1);
234 dy01 = _mm_sub_ps(iy0,jy1);
235 dz01 = _mm_sub_ps(iz0,jz1);
236 dx02 = _mm_sub_ps(ix0,jx2);
237 dy02 = _mm_sub_ps(iy0,jy2);
238 dz02 = _mm_sub_ps(iz0,jz2);
239 dx10 = _mm_sub_ps(ix1,jx0);
240 dy10 = _mm_sub_ps(iy1,jy0);
241 dz10 = _mm_sub_ps(iz1,jz0);
242 dx11 = _mm_sub_ps(ix1,jx1);
243 dy11 = _mm_sub_ps(iy1,jy1);
244 dz11 = _mm_sub_ps(iz1,jz1);
245 dx12 = _mm_sub_ps(ix1,jx2);
246 dy12 = _mm_sub_ps(iy1,jy2);
247 dz12 = _mm_sub_ps(iz1,jz2);
248 dx20 = _mm_sub_ps(ix2,jx0);
249 dy20 = _mm_sub_ps(iy2,jy0);
250 dz20 = _mm_sub_ps(iz2,jz0);
251 dx21 = _mm_sub_ps(ix2,jx1);
252 dy21 = _mm_sub_ps(iy2,jy1);
253 dz21 = _mm_sub_ps(iz2,jz1);
254 dx22 = _mm_sub_ps(ix2,jx2);
255 dy22 = _mm_sub_ps(iy2,jy2);
256 dz22 = _mm_sub_ps(iz2,jz2);
258 /* Calculate squared distance and things based on it */
259 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
260 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
261 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
262 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
263 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
264 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
265 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
266 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
267 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
269 rinv00 = gmx_mm_invsqrt_ps(rsq00);
270 rinv01 = gmx_mm_invsqrt_ps(rsq01);
271 rinv02 = gmx_mm_invsqrt_ps(rsq02);
272 rinv10 = gmx_mm_invsqrt_ps(rsq10);
273 rinv11 = gmx_mm_invsqrt_ps(rsq11);
274 rinv12 = gmx_mm_invsqrt_ps(rsq12);
275 rinv20 = gmx_mm_invsqrt_ps(rsq20);
276 rinv21 = gmx_mm_invsqrt_ps(rsq21);
277 rinv22 = gmx_mm_invsqrt_ps(rsq22);
279 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
281 fjx0 = _mm_setzero_ps();
282 fjy0 = _mm_setzero_ps();
283 fjz0 = _mm_setzero_ps();
284 fjx1 = _mm_setzero_ps();
285 fjy1 = _mm_setzero_ps();
286 fjz1 = _mm_setzero_ps();
287 fjx2 = _mm_setzero_ps();
288 fjy2 = _mm_setzero_ps();
289 fjz2 = _mm_setzero_ps();
291 /**************************
292 * CALCULATE INTERACTIONS *
293 **************************/
295 r00 = _mm_mul_ps(rsq00,rinv00);
297 /* Calculate table index by multiplying r with table scale and truncate to integer */
298 rt = _mm_mul_ps(r00,vftabscale);
299 vfitab = _mm_cvttps_epi32(rt);
300 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
301 vfitab = _mm_slli_epi32(vfitab,2);
303 /* CUBIC SPLINE TABLE ELECTROSTATICS */
304 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
305 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
306 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
307 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
308 _MM_TRANSPOSE4_PS(Y,F,G,H);
309 Heps = _mm_mul_ps(vfeps,H);
310 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
311 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
312 velec = _mm_mul_ps(qq00,VV);
313 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
314 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
316 /* LENNARD-JONES DISPERSION/REPULSION */
318 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
319 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
320 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
321 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
322 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
324 /* Update potential sum for this i atom from the interaction with this j atom. */
325 velecsum = _mm_add_ps(velecsum,velec);
326 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
328 fscal = _mm_add_ps(felec,fvdw);
330 /* Calculate temporary vectorial force */
331 tx = _mm_mul_ps(fscal,dx00);
332 ty = _mm_mul_ps(fscal,dy00);
333 tz = _mm_mul_ps(fscal,dz00);
335 /* Update vectorial force */
336 fix0 = _mm_add_ps(fix0,tx);
337 fiy0 = _mm_add_ps(fiy0,ty);
338 fiz0 = _mm_add_ps(fiz0,tz);
340 fjx0 = _mm_add_ps(fjx0,tx);
341 fjy0 = _mm_add_ps(fjy0,ty);
342 fjz0 = _mm_add_ps(fjz0,tz);
344 /**************************
345 * CALCULATE INTERACTIONS *
346 **************************/
348 r01 = _mm_mul_ps(rsq01,rinv01);
350 /* Calculate table index by multiplying r with table scale and truncate to integer */
351 rt = _mm_mul_ps(r01,vftabscale);
352 vfitab = _mm_cvttps_epi32(rt);
353 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
354 vfitab = _mm_slli_epi32(vfitab,2);
356 /* CUBIC SPLINE TABLE ELECTROSTATICS */
357 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
358 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
359 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
360 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
361 _MM_TRANSPOSE4_PS(Y,F,G,H);
362 Heps = _mm_mul_ps(vfeps,H);
363 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
364 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
365 velec = _mm_mul_ps(qq01,VV);
366 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
367 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
369 /* Update potential sum for this i atom from the interaction with this j atom. */
370 velecsum = _mm_add_ps(velecsum,velec);
374 /* Calculate temporary vectorial force */
375 tx = _mm_mul_ps(fscal,dx01);
376 ty = _mm_mul_ps(fscal,dy01);
377 tz = _mm_mul_ps(fscal,dz01);
379 /* Update vectorial force */
380 fix0 = _mm_add_ps(fix0,tx);
381 fiy0 = _mm_add_ps(fiy0,ty);
382 fiz0 = _mm_add_ps(fiz0,tz);
384 fjx1 = _mm_add_ps(fjx1,tx);
385 fjy1 = _mm_add_ps(fjy1,ty);
386 fjz1 = _mm_add_ps(fjz1,tz);
388 /**************************
389 * CALCULATE INTERACTIONS *
390 **************************/
392 r02 = _mm_mul_ps(rsq02,rinv02);
394 /* Calculate table index by multiplying r with table scale and truncate to integer */
395 rt = _mm_mul_ps(r02,vftabscale);
396 vfitab = _mm_cvttps_epi32(rt);
397 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
398 vfitab = _mm_slli_epi32(vfitab,2);
400 /* CUBIC SPLINE TABLE ELECTROSTATICS */
401 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
402 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
403 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
404 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
405 _MM_TRANSPOSE4_PS(Y,F,G,H);
406 Heps = _mm_mul_ps(vfeps,H);
407 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
408 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
409 velec = _mm_mul_ps(qq02,VV);
410 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
411 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
413 /* Update potential sum for this i atom from the interaction with this j atom. */
414 velecsum = _mm_add_ps(velecsum,velec);
418 /* Calculate temporary vectorial force */
419 tx = _mm_mul_ps(fscal,dx02);
420 ty = _mm_mul_ps(fscal,dy02);
421 tz = _mm_mul_ps(fscal,dz02);
423 /* Update vectorial force */
424 fix0 = _mm_add_ps(fix0,tx);
425 fiy0 = _mm_add_ps(fiy0,ty);
426 fiz0 = _mm_add_ps(fiz0,tz);
428 fjx2 = _mm_add_ps(fjx2,tx);
429 fjy2 = _mm_add_ps(fjy2,ty);
430 fjz2 = _mm_add_ps(fjz2,tz);
432 /**************************
433 * CALCULATE INTERACTIONS *
434 **************************/
436 r10 = _mm_mul_ps(rsq10,rinv10);
438 /* Calculate table index by multiplying r with table scale and truncate to integer */
439 rt = _mm_mul_ps(r10,vftabscale);
440 vfitab = _mm_cvttps_epi32(rt);
441 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
442 vfitab = _mm_slli_epi32(vfitab,2);
444 /* CUBIC SPLINE TABLE ELECTROSTATICS */
445 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
446 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
447 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
448 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
449 _MM_TRANSPOSE4_PS(Y,F,G,H);
450 Heps = _mm_mul_ps(vfeps,H);
451 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
452 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
453 velec = _mm_mul_ps(qq10,VV);
454 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
455 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
457 /* Update potential sum for this i atom from the interaction with this j atom. */
458 velecsum = _mm_add_ps(velecsum,velec);
462 /* Calculate temporary vectorial force */
463 tx = _mm_mul_ps(fscal,dx10);
464 ty = _mm_mul_ps(fscal,dy10);
465 tz = _mm_mul_ps(fscal,dz10);
467 /* Update vectorial force */
468 fix1 = _mm_add_ps(fix1,tx);
469 fiy1 = _mm_add_ps(fiy1,ty);
470 fiz1 = _mm_add_ps(fiz1,tz);
472 fjx0 = _mm_add_ps(fjx0,tx);
473 fjy0 = _mm_add_ps(fjy0,ty);
474 fjz0 = _mm_add_ps(fjz0,tz);
476 /**************************
477 * CALCULATE INTERACTIONS *
478 **************************/
480 r11 = _mm_mul_ps(rsq11,rinv11);
482 /* Calculate table index by multiplying r with table scale and truncate to integer */
483 rt = _mm_mul_ps(r11,vftabscale);
484 vfitab = _mm_cvttps_epi32(rt);
485 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
486 vfitab = _mm_slli_epi32(vfitab,2);
488 /* CUBIC SPLINE TABLE ELECTROSTATICS */
489 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
490 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
491 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
492 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
493 _MM_TRANSPOSE4_PS(Y,F,G,H);
494 Heps = _mm_mul_ps(vfeps,H);
495 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
496 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
497 velec = _mm_mul_ps(qq11,VV);
498 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
499 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
501 /* Update potential sum for this i atom from the interaction with this j atom. */
502 velecsum = _mm_add_ps(velecsum,velec);
506 /* Calculate temporary vectorial force */
507 tx = _mm_mul_ps(fscal,dx11);
508 ty = _mm_mul_ps(fscal,dy11);
509 tz = _mm_mul_ps(fscal,dz11);
511 /* Update vectorial force */
512 fix1 = _mm_add_ps(fix1,tx);
513 fiy1 = _mm_add_ps(fiy1,ty);
514 fiz1 = _mm_add_ps(fiz1,tz);
516 fjx1 = _mm_add_ps(fjx1,tx);
517 fjy1 = _mm_add_ps(fjy1,ty);
518 fjz1 = _mm_add_ps(fjz1,tz);
520 /**************************
521 * CALCULATE INTERACTIONS *
522 **************************/
524 r12 = _mm_mul_ps(rsq12,rinv12);
526 /* Calculate table index by multiplying r with table scale and truncate to integer */
527 rt = _mm_mul_ps(r12,vftabscale);
528 vfitab = _mm_cvttps_epi32(rt);
529 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
530 vfitab = _mm_slli_epi32(vfitab,2);
532 /* CUBIC SPLINE TABLE ELECTROSTATICS */
533 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
534 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
535 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
536 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
537 _MM_TRANSPOSE4_PS(Y,F,G,H);
538 Heps = _mm_mul_ps(vfeps,H);
539 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
540 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
541 velec = _mm_mul_ps(qq12,VV);
542 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
543 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
545 /* Update potential sum for this i atom from the interaction with this j atom. */
546 velecsum = _mm_add_ps(velecsum,velec);
550 /* Calculate temporary vectorial force */
551 tx = _mm_mul_ps(fscal,dx12);
552 ty = _mm_mul_ps(fscal,dy12);
553 tz = _mm_mul_ps(fscal,dz12);
555 /* Update vectorial force */
556 fix1 = _mm_add_ps(fix1,tx);
557 fiy1 = _mm_add_ps(fiy1,ty);
558 fiz1 = _mm_add_ps(fiz1,tz);
560 fjx2 = _mm_add_ps(fjx2,tx);
561 fjy2 = _mm_add_ps(fjy2,ty);
562 fjz2 = _mm_add_ps(fjz2,tz);
564 /**************************
565 * CALCULATE INTERACTIONS *
566 **************************/
568 r20 = _mm_mul_ps(rsq20,rinv20);
570 /* Calculate table index by multiplying r with table scale and truncate to integer */
571 rt = _mm_mul_ps(r20,vftabscale);
572 vfitab = _mm_cvttps_epi32(rt);
573 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
574 vfitab = _mm_slli_epi32(vfitab,2);
576 /* CUBIC SPLINE TABLE ELECTROSTATICS */
577 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
578 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
579 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
580 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
581 _MM_TRANSPOSE4_PS(Y,F,G,H);
582 Heps = _mm_mul_ps(vfeps,H);
583 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
584 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
585 velec = _mm_mul_ps(qq20,VV);
586 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
587 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
589 /* Update potential sum for this i atom from the interaction with this j atom. */
590 velecsum = _mm_add_ps(velecsum,velec);
594 /* Calculate temporary vectorial force */
595 tx = _mm_mul_ps(fscal,dx20);
596 ty = _mm_mul_ps(fscal,dy20);
597 tz = _mm_mul_ps(fscal,dz20);
599 /* Update vectorial force */
600 fix2 = _mm_add_ps(fix2,tx);
601 fiy2 = _mm_add_ps(fiy2,ty);
602 fiz2 = _mm_add_ps(fiz2,tz);
604 fjx0 = _mm_add_ps(fjx0,tx);
605 fjy0 = _mm_add_ps(fjy0,ty);
606 fjz0 = _mm_add_ps(fjz0,tz);
608 /**************************
609 * CALCULATE INTERACTIONS *
610 **************************/
612 r21 = _mm_mul_ps(rsq21,rinv21);
614 /* Calculate table index by multiplying r with table scale and truncate to integer */
615 rt = _mm_mul_ps(r21,vftabscale);
616 vfitab = _mm_cvttps_epi32(rt);
617 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
618 vfitab = _mm_slli_epi32(vfitab,2);
620 /* CUBIC SPLINE TABLE ELECTROSTATICS */
621 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
622 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
623 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
624 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
625 _MM_TRANSPOSE4_PS(Y,F,G,H);
626 Heps = _mm_mul_ps(vfeps,H);
627 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
628 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
629 velec = _mm_mul_ps(qq21,VV);
630 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
631 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
633 /* Update potential sum for this i atom from the interaction with this j atom. */
634 velecsum = _mm_add_ps(velecsum,velec);
638 /* Calculate temporary vectorial force */
639 tx = _mm_mul_ps(fscal,dx21);
640 ty = _mm_mul_ps(fscal,dy21);
641 tz = _mm_mul_ps(fscal,dz21);
643 /* Update vectorial force */
644 fix2 = _mm_add_ps(fix2,tx);
645 fiy2 = _mm_add_ps(fiy2,ty);
646 fiz2 = _mm_add_ps(fiz2,tz);
648 fjx1 = _mm_add_ps(fjx1,tx);
649 fjy1 = _mm_add_ps(fjy1,ty);
650 fjz1 = _mm_add_ps(fjz1,tz);
652 /**************************
653 * CALCULATE INTERACTIONS *
654 **************************/
656 r22 = _mm_mul_ps(rsq22,rinv22);
658 /* Calculate table index by multiplying r with table scale and truncate to integer */
659 rt = _mm_mul_ps(r22,vftabscale);
660 vfitab = _mm_cvttps_epi32(rt);
661 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
662 vfitab = _mm_slli_epi32(vfitab,2);
664 /* CUBIC SPLINE TABLE ELECTROSTATICS */
665 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
666 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
667 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
668 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
669 _MM_TRANSPOSE4_PS(Y,F,G,H);
670 Heps = _mm_mul_ps(vfeps,H);
671 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
672 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
673 velec = _mm_mul_ps(qq22,VV);
674 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
675 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
677 /* Update potential sum for this i atom from the interaction with this j atom. */
678 velecsum = _mm_add_ps(velecsum,velec);
682 /* Calculate temporary vectorial force */
683 tx = _mm_mul_ps(fscal,dx22);
684 ty = _mm_mul_ps(fscal,dy22);
685 tz = _mm_mul_ps(fscal,dz22);
687 /* Update vectorial force */
688 fix2 = _mm_add_ps(fix2,tx);
689 fiy2 = _mm_add_ps(fiy2,ty);
690 fiz2 = _mm_add_ps(fiz2,tz);
692 fjx2 = _mm_add_ps(fjx2,tx);
693 fjy2 = _mm_add_ps(fjy2,ty);
694 fjz2 = _mm_add_ps(fjz2,tz);
696 fjptrA = f+j_coord_offsetA;
697 fjptrB = f+j_coord_offsetB;
698 fjptrC = f+j_coord_offsetC;
699 fjptrD = f+j_coord_offsetD;
701 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
702 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
704 /* Inner loop uses 400 flops */
710 /* Get j neighbor index, and coordinate index */
711 jnrlistA = jjnr[jidx];
712 jnrlistB = jjnr[jidx+1];
713 jnrlistC = jjnr[jidx+2];
714 jnrlistD = jjnr[jidx+3];
715 /* Sign of each element will be negative for non-real atoms.
716 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
717 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
719 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
720 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
721 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
722 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
723 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
724 j_coord_offsetA = DIM*jnrA;
725 j_coord_offsetB = DIM*jnrB;
726 j_coord_offsetC = DIM*jnrC;
727 j_coord_offsetD = DIM*jnrD;
729 /* load j atom coordinates */
730 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
731 x+j_coord_offsetC,x+j_coord_offsetD,
732 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
734 /* Calculate displacement vector */
735 dx00 = _mm_sub_ps(ix0,jx0);
736 dy00 = _mm_sub_ps(iy0,jy0);
737 dz00 = _mm_sub_ps(iz0,jz0);
738 dx01 = _mm_sub_ps(ix0,jx1);
739 dy01 = _mm_sub_ps(iy0,jy1);
740 dz01 = _mm_sub_ps(iz0,jz1);
741 dx02 = _mm_sub_ps(ix0,jx2);
742 dy02 = _mm_sub_ps(iy0,jy2);
743 dz02 = _mm_sub_ps(iz0,jz2);
744 dx10 = _mm_sub_ps(ix1,jx0);
745 dy10 = _mm_sub_ps(iy1,jy0);
746 dz10 = _mm_sub_ps(iz1,jz0);
747 dx11 = _mm_sub_ps(ix1,jx1);
748 dy11 = _mm_sub_ps(iy1,jy1);
749 dz11 = _mm_sub_ps(iz1,jz1);
750 dx12 = _mm_sub_ps(ix1,jx2);
751 dy12 = _mm_sub_ps(iy1,jy2);
752 dz12 = _mm_sub_ps(iz1,jz2);
753 dx20 = _mm_sub_ps(ix2,jx0);
754 dy20 = _mm_sub_ps(iy2,jy0);
755 dz20 = _mm_sub_ps(iz2,jz0);
756 dx21 = _mm_sub_ps(ix2,jx1);
757 dy21 = _mm_sub_ps(iy2,jy1);
758 dz21 = _mm_sub_ps(iz2,jz1);
759 dx22 = _mm_sub_ps(ix2,jx2);
760 dy22 = _mm_sub_ps(iy2,jy2);
761 dz22 = _mm_sub_ps(iz2,jz2);
763 /* Calculate squared distance and things based on it */
764 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
765 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
766 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
767 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
768 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
769 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
770 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
771 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
772 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
774 rinv00 = gmx_mm_invsqrt_ps(rsq00);
775 rinv01 = gmx_mm_invsqrt_ps(rsq01);
776 rinv02 = gmx_mm_invsqrt_ps(rsq02);
777 rinv10 = gmx_mm_invsqrt_ps(rsq10);
778 rinv11 = gmx_mm_invsqrt_ps(rsq11);
779 rinv12 = gmx_mm_invsqrt_ps(rsq12);
780 rinv20 = gmx_mm_invsqrt_ps(rsq20);
781 rinv21 = gmx_mm_invsqrt_ps(rsq21);
782 rinv22 = gmx_mm_invsqrt_ps(rsq22);
784 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
786 fjx0 = _mm_setzero_ps();
787 fjy0 = _mm_setzero_ps();
788 fjz0 = _mm_setzero_ps();
789 fjx1 = _mm_setzero_ps();
790 fjy1 = _mm_setzero_ps();
791 fjz1 = _mm_setzero_ps();
792 fjx2 = _mm_setzero_ps();
793 fjy2 = _mm_setzero_ps();
794 fjz2 = _mm_setzero_ps();
796 /**************************
797 * CALCULATE INTERACTIONS *
798 **************************/
800 r00 = _mm_mul_ps(rsq00,rinv00);
801 r00 = _mm_andnot_ps(dummy_mask,r00);
803 /* Calculate table index by multiplying r with table scale and truncate to integer */
804 rt = _mm_mul_ps(r00,vftabscale);
805 vfitab = _mm_cvttps_epi32(rt);
806 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
807 vfitab = _mm_slli_epi32(vfitab,2);
809 /* CUBIC SPLINE TABLE ELECTROSTATICS */
810 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
811 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
812 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
813 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
814 _MM_TRANSPOSE4_PS(Y,F,G,H);
815 Heps = _mm_mul_ps(vfeps,H);
816 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
817 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
818 velec = _mm_mul_ps(qq00,VV);
819 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
820 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
822 /* LENNARD-JONES DISPERSION/REPULSION */
824 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
825 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
826 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
827 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
828 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
830 /* Update potential sum for this i atom from the interaction with this j atom. */
831 velec = _mm_andnot_ps(dummy_mask,velec);
832 velecsum = _mm_add_ps(velecsum,velec);
833 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
834 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
836 fscal = _mm_add_ps(felec,fvdw);
838 fscal = _mm_andnot_ps(dummy_mask,fscal);
840 /* Calculate temporary vectorial force */
841 tx = _mm_mul_ps(fscal,dx00);
842 ty = _mm_mul_ps(fscal,dy00);
843 tz = _mm_mul_ps(fscal,dz00);
845 /* Update vectorial force */
846 fix0 = _mm_add_ps(fix0,tx);
847 fiy0 = _mm_add_ps(fiy0,ty);
848 fiz0 = _mm_add_ps(fiz0,tz);
850 fjx0 = _mm_add_ps(fjx0,tx);
851 fjy0 = _mm_add_ps(fjy0,ty);
852 fjz0 = _mm_add_ps(fjz0,tz);
854 /**************************
855 * CALCULATE INTERACTIONS *
856 **************************/
858 r01 = _mm_mul_ps(rsq01,rinv01);
859 r01 = _mm_andnot_ps(dummy_mask,r01);
861 /* Calculate table index by multiplying r with table scale and truncate to integer */
862 rt = _mm_mul_ps(r01,vftabscale);
863 vfitab = _mm_cvttps_epi32(rt);
864 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
865 vfitab = _mm_slli_epi32(vfitab,2);
867 /* CUBIC SPLINE TABLE ELECTROSTATICS */
868 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
869 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
870 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
871 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
872 _MM_TRANSPOSE4_PS(Y,F,G,H);
873 Heps = _mm_mul_ps(vfeps,H);
874 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
875 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
876 velec = _mm_mul_ps(qq01,VV);
877 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
878 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
880 /* Update potential sum for this i atom from the interaction with this j atom. */
881 velec = _mm_andnot_ps(dummy_mask,velec);
882 velecsum = _mm_add_ps(velecsum,velec);
886 fscal = _mm_andnot_ps(dummy_mask,fscal);
888 /* Calculate temporary vectorial force */
889 tx = _mm_mul_ps(fscal,dx01);
890 ty = _mm_mul_ps(fscal,dy01);
891 tz = _mm_mul_ps(fscal,dz01);
893 /* Update vectorial force */
894 fix0 = _mm_add_ps(fix0,tx);
895 fiy0 = _mm_add_ps(fiy0,ty);
896 fiz0 = _mm_add_ps(fiz0,tz);
898 fjx1 = _mm_add_ps(fjx1,tx);
899 fjy1 = _mm_add_ps(fjy1,ty);
900 fjz1 = _mm_add_ps(fjz1,tz);
902 /**************************
903 * CALCULATE INTERACTIONS *
904 **************************/
906 r02 = _mm_mul_ps(rsq02,rinv02);
907 r02 = _mm_andnot_ps(dummy_mask,r02);
909 /* Calculate table index by multiplying r with table scale and truncate to integer */
910 rt = _mm_mul_ps(r02,vftabscale);
911 vfitab = _mm_cvttps_epi32(rt);
912 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
913 vfitab = _mm_slli_epi32(vfitab,2);
915 /* CUBIC SPLINE TABLE ELECTROSTATICS */
916 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
917 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
918 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
919 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
920 _MM_TRANSPOSE4_PS(Y,F,G,H);
921 Heps = _mm_mul_ps(vfeps,H);
922 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
923 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
924 velec = _mm_mul_ps(qq02,VV);
925 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
926 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
928 /* Update potential sum for this i atom from the interaction with this j atom. */
929 velec = _mm_andnot_ps(dummy_mask,velec);
930 velecsum = _mm_add_ps(velecsum,velec);
934 fscal = _mm_andnot_ps(dummy_mask,fscal);
936 /* Calculate temporary vectorial force */
937 tx = _mm_mul_ps(fscal,dx02);
938 ty = _mm_mul_ps(fscal,dy02);
939 tz = _mm_mul_ps(fscal,dz02);
941 /* Update vectorial force */
942 fix0 = _mm_add_ps(fix0,tx);
943 fiy0 = _mm_add_ps(fiy0,ty);
944 fiz0 = _mm_add_ps(fiz0,tz);
946 fjx2 = _mm_add_ps(fjx2,tx);
947 fjy2 = _mm_add_ps(fjy2,ty);
948 fjz2 = _mm_add_ps(fjz2,tz);
950 /**************************
951 * CALCULATE INTERACTIONS *
952 **************************/
954 r10 = _mm_mul_ps(rsq10,rinv10);
955 r10 = _mm_andnot_ps(dummy_mask,r10);
957 /* Calculate table index by multiplying r with table scale and truncate to integer */
958 rt = _mm_mul_ps(r10,vftabscale);
959 vfitab = _mm_cvttps_epi32(rt);
960 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
961 vfitab = _mm_slli_epi32(vfitab,2);
963 /* CUBIC SPLINE TABLE ELECTROSTATICS */
964 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
965 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
966 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
967 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
968 _MM_TRANSPOSE4_PS(Y,F,G,H);
969 Heps = _mm_mul_ps(vfeps,H);
970 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
971 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
972 velec = _mm_mul_ps(qq10,VV);
973 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
974 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
976 /* Update potential sum for this i atom from the interaction with this j atom. */
977 velec = _mm_andnot_ps(dummy_mask,velec);
978 velecsum = _mm_add_ps(velecsum,velec);
982 fscal = _mm_andnot_ps(dummy_mask,fscal);
984 /* Calculate temporary vectorial force */
985 tx = _mm_mul_ps(fscal,dx10);
986 ty = _mm_mul_ps(fscal,dy10);
987 tz = _mm_mul_ps(fscal,dz10);
989 /* Update vectorial force */
990 fix1 = _mm_add_ps(fix1,tx);
991 fiy1 = _mm_add_ps(fiy1,ty);
992 fiz1 = _mm_add_ps(fiz1,tz);
994 fjx0 = _mm_add_ps(fjx0,tx);
995 fjy0 = _mm_add_ps(fjy0,ty);
996 fjz0 = _mm_add_ps(fjz0,tz);
998 /**************************
999 * CALCULATE INTERACTIONS *
1000 **************************/
1002 r11 = _mm_mul_ps(rsq11,rinv11);
1003 r11 = _mm_andnot_ps(dummy_mask,r11);
1005 /* Calculate table index by multiplying r with table scale and truncate to integer */
1006 rt = _mm_mul_ps(r11,vftabscale);
1007 vfitab = _mm_cvttps_epi32(rt);
1008 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1009 vfitab = _mm_slli_epi32(vfitab,2);
1011 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1012 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1013 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1014 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1015 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1016 _MM_TRANSPOSE4_PS(Y,F,G,H);
1017 Heps = _mm_mul_ps(vfeps,H);
1018 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1019 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1020 velec = _mm_mul_ps(qq11,VV);
1021 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1022 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1024 /* Update potential sum for this i atom from the interaction with this j atom. */
1025 velec = _mm_andnot_ps(dummy_mask,velec);
1026 velecsum = _mm_add_ps(velecsum,velec);
1030 fscal = _mm_andnot_ps(dummy_mask,fscal);
1032 /* Calculate temporary vectorial force */
1033 tx = _mm_mul_ps(fscal,dx11);
1034 ty = _mm_mul_ps(fscal,dy11);
1035 tz = _mm_mul_ps(fscal,dz11);
1037 /* Update vectorial force */
1038 fix1 = _mm_add_ps(fix1,tx);
1039 fiy1 = _mm_add_ps(fiy1,ty);
1040 fiz1 = _mm_add_ps(fiz1,tz);
1042 fjx1 = _mm_add_ps(fjx1,tx);
1043 fjy1 = _mm_add_ps(fjy1,ty);
1044 fjz1 = _mm_add_ps(fjz1,tz);
1046 /**************************
1047 * CALCULATE INTERACTIONS *
1048 **************************/
1050 r12 = _mm_mul_ps(rsq12,rinv12);
1051 r12 = _mm_andnot_ps(dummy_mask,r12);
1053 /* Calculate table index by multiplying r with table scale and truncate to integer */
1054 rt = _mm_mul_ps(r12,vftabscale);
1055 vfitab = _mm_cvttps_epi32(rt);
1056 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1057 vfitab = _mm_slli_epi32(vfitab,2);
1059 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1060 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1061 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1062 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1063 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1064 _MM_TRANSPOSE4_PS(Y,F,G,H);
1065 Heps = _mm_mul_ps(vfeps,H);
1066 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1067 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1068 velec = _mm_mul_ps(qq12,VV);
1069 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1070 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1072 /* Update potential sum for this i atom from the interaction with this j atom. */
1073 velec = _mm_andnot_ps(dummy_mask,velec);
1074 velecsum = _mm_add_ps(velecsum,velec);
1078 fscal = _mm_andnot_ps(dummy_mask,fscal);
1080 /* Calculate temporary vectorial force */
1081 tx = _mm_mul_ps(fscal,dx12);
1082 ty = _mm_mul_ps(fscal,dy12);
1083 tz = _mm_mul_ps(fscal,dz12);
1085 /* Update vectorial force */
1086 fix1 = _mm_add_ps(fix1,tx);
1087 fiy1 = _mm_add_ps(fiy1,ty);
1088 fiz1 = _mm_add_ps(fiz1,tz);
1090 fjx2 = _mm_add_ps(fjx2,tx);
1091 fjy2 = _mm_add_ps(fjy2,ty);
1092 fjz2 = _mm_add_ps(fjz2,tz);
1094 /**************************
1095 * CALCULATE INTERACTIONS *
1096 **************************/
1098 r20 = _mm_mul_ps(rsq20,rinv20);
1099 r20 = _mm_andnot_ps(dummy_mask,r20);
1101 /* Calculate table index by multiplying r with table scale and truncate to integer */
1102 rt = _mm_mul_ps(r20,vftabscale);
1103 vfitab = _mm_cvttps_epi32(rt);
1104 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1105 vfitab = _mm_slli_epi32(vfitab,2);
1107 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1108 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1109 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1110 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1111 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1112 _MM_TRANSPOSE4_PS(Y,F,G,H);
1113 Heps = _mm_mul_ps(vfeps,H);
1114 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1115 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1116 velec = _mm_mul_ps(qq20,VV);
1117 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1118 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1120 /* Update potential sum for this i atom from the interaction with this j atom. */
1121 velec = _mm_andnot_ps(dummy_mask,velec);
1122 velecsum = _mm_add_ps(velecsum,velec);
1126 fscal = _mm_andnot_ps(dummy_mask,fscal);
1128 /* Calculate temporary vectorial force */
1129 tx = _mm_mul_ps(fscal,dx20);
1130 ty = _mm_mul_ps(fscal,dy20);
1131 tz = _mm_mul_ps(fscal,dz20);
1133 /* Update vectorial force */
1134 fix2 = _mm_add_ps(fix2,tx);
1135 fiy2 = _mm_add_ps(fiy2,ty);
1136 fiz2 = _mm_add_ps(fiz2,tz);
1138 fjx0 = _mm_add_ps(fjx0,tx);
1139 fjy0 = _mm_add_ps(fjy0,ty);
1140 fjz0 = _mm_add_ps(fjz0,tz);
1142 /**************************
1143 * CALCULATE INTERACTIONS *
1144 **************************/
1146 r21 = _mm_mul_ps(rsq21,rinv21);
1147 r21 = _mm_andnot_ps(dummy_mask,r21);
1149 /* Calculate table index by multiplying r with table scale and truncate to integer */
1150 rt = _mm_mul_ps(r21,vftabscale);
1151 vfitab = _mm_cvttps_epi32(rt);
1152 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1153 vfitab = _mm_slli_epi32(vfitab,2);
1155 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1156 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1157 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1158 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1159 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1160 _MM_TRANSPOSE4_PS(Y,F,G,H);
1161 Heps = _mm_mul_ps(vfeps,H);
1162 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1163 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1164 velec = _mm_mul_ps(qq21,VV);
1165 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1166 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1168 /* Update potential sum for this i atom from the interaction with this j atom. */
1169 velec = _mm_andnot_ps(dummy_mask,velec);
1170 velecsum = _mm_add_ps(velecsum,velec);
1174 fscal = _mm_andnot_ps(dummy_mask,fscal);
1176 /* Calculate temporary vectorial force */
1177 tx = _mm_mul_ps(fscal,dx21);
1178 ty = _mm_mul_ps(fscal,dy21);
1179 tz = _mm_mul_ps(fscal,dz21);
1181 /* Update vectorial force */
1182 fix2 = _mm_add_ps(fix2,tx);
1183 fiy2 = _mm_add_ps(fiy2,ty);
1184 fiz2 = _mm_add_ps(fiz2,tz);
1186 fjx1 = _mm_add_ps(fjx1,tx);
1187 fjy1 = _mm_add_ps(fjy1,ty);
1188 fjz1 = _mm_add_ps(fjz1,tz);
1190 /**************************
1191 * CALCULATE INTERACTIONS *
1192 **************************/
1194 r22 = _mm_mul_ps(rsq22,rinv22);
1195 r22 = _mm_andnot_ps(dummy_mask,r22);
1197 /* Calculate table index by multiplying r with table scale and truncate to integer */
1198 rt = _mm_mul_ps(r22,vftabscale);
1199 vfitab = _mm_cvttps_epi32(rt);
1200 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1201 vfitab = _mm_slli_epi32(vfitab,2);
1203 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1204 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1205 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1206 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1207 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1208 _MM_TRANSPOSE4_PS(Y,F,G,H);
1209 Heps = _mm_mul_ps(vfeps,H);
1210 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1211 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1212 velec = _mm_mul_ps(qq22,VV);
1213 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1214 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1216 /* Update potential sum for this i atom from the interaction with this j atom. */
1217 velec = _mm_andnot_ps(dummy_mask,velec);
1218 velecsum = _mm_add_ps(velecsum,velec);
1222 fscal = _mm_andnot_ps(dummy_mask,fscal);
1224 /* Calculate temporary vectorial force */
1225 tx = _mm_mul_ps(fscal,dx22);
1226 ty = _mm_mul_ps(fscal,dy22);
1227 tz = _mm_mul_ps(fscal,dz22);
1229 /* Update vectorial force */
1230 fix2 = _mm_add_ps(fix2,tx);
1231 fiy2 = _mm_add_ps(fiy2,ty);
1232 fiz2 = _mm_add_ps(fiz2,tz);
1234 fjx2 = _mm_add_ps(fjx2,tx);
1235 fjy2 = _mm_add_ps(fjy2,ty);
1236 fjz2 = _mm_add_ps(fjz2,tz);
1238 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1239 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1240 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1241 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1243 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1244 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1246 /* Inner loop uses 409 flops */
1249 /* End of innermost loop */
1251 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1252 f+i_coord_offset,fshift+i_shift_offset);
1255 /* Update potential energies */
1256 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1257 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1259 /* Increment number of inner iterations */
1260 inneriter += j_index_end - j_index_start;
1262 /* Outer loop uses 20 flops */
1265 /* Increment number of outer iterations */
1268 /* Update outer/inner flops */
1270 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1273 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single
1274 * Electrostatics interaction: CubicSplineTable
1275 * VdW interaction: LennardJones
1276 * Geometry: Water3-Water3
1277 * Calculate force/pot: Force
1280 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single
1281 (t_nblist * gmx_restrict nlist,
1282 rvec * gmx_restrict xx,
1283 rvec * gmx_restrict ff,
1284 t_forcerec * gmx_restrict fr,
1285 t_mdatoms * gmx_restrict mdatoms,
1286 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1287 t_nrnb * gmx_restrict nrnb)
1289 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1290 * just 0 for non-waters.
1291 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1292 * jnr indices corresponding to data put in the four positions in the SIMD register.
1294 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1295 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1296 int jnrA,jnrB,jnrC,jnrD;
1297 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1298 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1299 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1300 real rcutoff_scalar;
1301 real *shiftvec,*fshift,*x,*f;
1302 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1303 real scratch[4*DIM];
1304 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1306 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1308 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1310 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1311 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1312 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1313 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1314 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1315 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1316 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1317 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1318 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1319 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1320 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1321 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1322 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1323 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1324 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1325 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1326 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1329 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1332 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1333 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1335 __m128i ifour = _mm_set1_epi32(4);
1336 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1338 __m128 dummy_mask,cutoff_mask;
1339 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1340 __m128 one = _mm_set1_ps(1.0);
1341 __m128 two = _mm_set1_ps(2.0);
1347 jindex = nlist->jindex;
1349 shiftidx = nlist->shift;
1351 shiftvec = fr->shift_vec[0];
1352 fshift = fr->fshift[0];
1353 facel = _mm_set1_ps(fr->epsfac);
1354 charge = mdatoms->chargeA;
1355 nvdwtype = fr->ntype;
1356 vdwparam = fr->nbfp;
1357 vdwtype = mdatoms->typeA;
1359 vftab = kernel_data->table_elec->data;
1360 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
1362 /* Setup water-specific parameters */
1363 inr = nlist->iinr[0];
1364 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1365 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1366 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1367 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1369 jq0 = _mm_set1_ps(charge[inr+0]);
1370 jq1 = _mm_set1_ps(charge[inr+1]);
1371 jq2 = _mm_set1_ps(charge[inr+2]);
1372 vdwjidx0A = 2*vdwtype[inr+0];
1373 qq00 = _mm_mul_ps(iq0,jq0);
1374 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1375 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1376 qq01 = _mm_mul_ps(iq0,jq1);
1377 qq02 = _mm_mul_ps(iq0,jq2);
1378 qq10 = _mm_mul_ps(iq1,jq0);
1379 qq11 = _mm_mul_ps(iq1,jq1);
1380 qq12 = _mm_mul_ps(iq1,jq2);
1381 qq20 = _mm_mul_ps(iq2,jq0);
1382 qq21 = _mm_mul_ps(iq2,jq1);
1383 qq22 = _mm_mul_ps(iq2,jq2);
1385 /* Avoid stupid compiler warnings */
1386 jnrA = jnrB = jnrC = jnrD = 0;
1387 j_coord_offsetA = 0;
1388 j_coord_offsetB = 0;
1389 j_coord_offsetC = 0;
1390 j_coord_offsetD = 0;
1395 for(iidx=0;iidx<4*DIM;iidx++)
1397 scratch[iidx] = 0.0;
1400 /* Start outer loop over neighborlists */
1401 for(iidx=0; iidx<nri; iidx++)
1403 /* Load shift vector for this list */
1404 i_shift_offset = DIM*shiftidx[iidx];
1406 /* Load limits for loop over neighbors */
1407 j_index_start = jindex[iidx];
1408 j_index_end = jindex[iidx+1];
1410 /* Get outer coordinate index */
1412 i_coord_offset = DIM*inr;
1414 /* Load i particle coords and add shift vector */
1415 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1416 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1418 fix0 = _mm_setzero_ps();
1419 fiy0 = _mm_setzero_ps();
1420 fiz0 = _mm_setzero_ps();
1421 fix1 = _mm_setzero_ps();
1422 fiy1 = _mm_setzero_ps();
1423 fiz1 = _mm_setzero_ps();
1424 fix2 = _mm_setzero_ps();
1425 fiy2 = _mm_setzero_ps();
1426 fiz2 = _mm_setzero_ps();
1428 /* Start inner kernel loop */
1429 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1432 /* Get j neighbor index, and coordinate index */
1434 jnrB = jjnr[jidx+1];
1435 jnrC = jjnr[jidx+2];
1436 jnrD = jjnr[jidx+3];
1437 j_coord_offsetA = DIM*jnrA;
1438 j_coord_offsetB = DIM*jnrB;
1439 j_coord_offsetC = DIM*jnrC;
1440 j_coord_offsetD = DIM*jnrD;
1442 /* load j atom coordinates */
1443 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1444 x+j_coord_offsetC,x+j_coord_offsetD,
1445 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1447 /* Calculate displacement vector */
1448 dx00 = _mm_sub_ps(ix0,jx0);
1449 dy00 = _mm_sub_ps(iy0,jy0);
1450 dz00 = _mm_sub_ps(iz0,jz0);
1451 dx01 = _mm_sub_ps(ix0,jx1);
1452 dy01 = _mm_sub_ps(iy0,jy1);
1453 dz01 = _mm_sub_ps(iz0,jz1);
1454 dx02 = _mm_sub_ps(ix0,jx2);
1455 dy02 = _mm_sub_ps(iy0,jy2);
1456 dz02 = _mm_sub_ps(iz0,jz2);
1457 dx10 = _mm_sub_ps(ix1,jx0);
1458 dy10 = _mm_sub_ps(iy1,jy0);
1459 dz10 = _mm_sub_ps(iz1,jz0);
1460 dx11 = _mm_sub_ps(ix1,jx1);
1461 dy11 = _mm_sub_ps(iy1,jy1);
1462 dz11 = _mm_sub_ps(iz1,jz1);
1463 dx12 = _mm_sub_ps(ix1,jx2);
1464 dy12 = _mm_sub_ps(iy1,jy2);
1465 dz12 = _mm_sub_ps(iz1,jz2);
1466 dx20 = _mm_sub_ps(ix2,jx0);
1467 dy20 = _mm_sub_ps(iy2,jy0);
1468 dz20 = _mm_sub_ps(iz2,jz0);
1469 dx21 = _mm_sub_ps(ix2,jx1);
1470 dy21 = _mm_sub_ps(iy2,jy1);
1471 dz21 = _mm_sub_ps(iz2,jz1);
1472 dx22 = _mm_sub_ps(ix2,jx2);
1473 dy22 = _mm_sub_ps(iy2,jy2);
1474 dz22 = _mm_sub_ps(iz2,jz2);
1476 /* Calculate squared distance and things based on it */
1477 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1478 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1479 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1480 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1481 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1482 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1483 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1484 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1485 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1487 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1488 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1489 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1490 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1491 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1492 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1493 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1494 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1495 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1497 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1499 fjx0 = _mm_setzero_ps();
1500 fjy0 = _mm_setzero_ps();
1501 fjz0 = _mm_setzero_ps();
1502 fjx1 = _mm_setzero_ps();
1503 fjy1 = _mm_setzero_ps();
1504 fjz1 = _mm_setzero_ps();
1505 fjx2 = _mm_setzero_ps();
1506 fjy2 = _mm_setzero_ps();
1507 fjz2 = _mm_setzero_ps();
1509 /**************************
1510 * CALCULATE INTERACTIONS *
1511 **************************/
1513 r00 = _mm_mul_ps(rsq00,rinv00);
1515 /* Calculate table index by multiplying r with table scale and truncate to integer */
1516 rt = _mm_mul_ps(r00,vftabscale);
1517 vfitab = _mm_cvttps_epi32(rt);
1518 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1519 vfitab = _mm_slli_epi32(vfitab,2);
1521 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1522 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1523 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1524 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1525 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1526 _MM_TRANSPOSE4_PS(Y,F,G,H);
1527 Heps = _mm_mul_ps(vfeps,H);
1528 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1529 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1530 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1532 /* LENNARD-JONES DISPERSION/REPULSION */
1534 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1535 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1537 fscal = _mm_add_ps(felec,fvdw);
1539 /* Calculate temporary vectorial force */
1540 tx = _mm_mul_ps(fscal,dx00);
1541 ty = _mm_mul_ps(fscal,dy00);
1542 tz = _mm_mul_ps(fscal,dz00);
1544 /* Update vectorial force */
1545 fix0 = _mm_add_ps(fix0,tx);
1546 fiy0 = _mm_add_ps(fiy0,ty);
1547 fiz0 = _mm_add_ps(fiz0,tz);
1549 fjx0 = _mm_add_ps(fjx0,tx);
1550 fjy0 = _mm_add_ps(fjy0,ty);
1551 fjz0 = _mm_add_ps(fjz0,tz);
1553 /**************************
1554 * CALCULATE INTERACTIONS *
1555 **************************/
1557 r01 = _mm_mul_ps(rsq01,rinv01);
1559 /* Calculate table index by multiplying r with table scale and truncate to integer */
1560 rt = _mm_mul_ps(r01,vftabscale);
1561 vfitab = _mm_cvttps_epi32(rt);
1562 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1563 vfitab = _mm_slli_epi32(vfitab,2);
1565 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1566 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1567 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1568 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1569 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1570 _MM_TRANSPOSE4_PS(Y,F,G,H);
1571 Heps = _mm_mul_ps(vfeps,H);
1572 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1573 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1574 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1578 /* Calculate temporary vectorial force */
1579 tx = _mm_mul_ps(fscal,dx01);
1580 ty = _mm_mul_ps(fscal,dy01);
1581 tz = _mm_mul_ps(fscal,dz01);
1583 /* Update vectorial force */
1584 fix0 = _mm_add_ps(fix0,tx);
1585 fiy0 = _mm_add_ps(fiy0,ty);
1586 fiz0 = _mm_add_ps(fiz0,tz);
1588 fjx1 = _mm_add_ps(fjx1,tx);
1589 fjy1 = _mm_add_ps(fjy1,ty);
1590 fjz1 = _mm_add_ps(fjz1,tz);
1592 /**************************
1593 * CALCULATE INTERACTIONS *
1594 **************************/
1596 r02 = _mm_mul_ps(rsq02,rinv02);
1598 /* Calculate table index by multiplying r with table scale and truncate to integer */
1599 rt = _mm_mul_ps(r02,vftabscale);
1600 vfitab = _mm_cvttps_epi32(rt);
1601 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1602 vfitab = _mm_slli_epi32(vfitab,2);
1604 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1605 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1606 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1607 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1608 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1609 _MM_TRANSPOSE4_PS(Y,F,G,H);
1610 Heps = _mm_mul_ps(vfeps,H);
1611 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1612 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1613 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1617 /* Calculate temporary vectorial force */
1618 tx = _mm_mul_ps(fscal,dx02);
1619 ty = _mm_mul_ps(fscal,dy02);
1620 tz = _mm_mul_ps(fscal,dz02);
1622 /* Update vectorial force */
1623 fix0 = _mm_add_ps(fix0,tx);
1624 fiy0 = _mm_add_ps(fiy0,ty);
1625 fiz0 = _mm_add_ps(fiz0,tz);
1627 fjx2 = _mm_add_ps(fjx2,tx);
1628 fjy2 = _mm_add_ps(fjy2,ty);
1629 fjz2 = _mm_add_ps(fjz2,tz);
1631 /**************************
1632 * CALCULATE INTERACTIONS *
1633 **************************/
1635 r10 = _mm_mul_ps(rsq10,rinv10);
1637 /* Calculate table index by multiplying r with table scale and truncate to integer */
1638 rt = _mm_mul_ps(r10,vftabscale);
1639 vfitab = _mm_cvttps_epi32(rt);
1640 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1641 vfitab = _mm_slli_epi32(vfitab,2);
1643 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1644 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1645 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1646 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1647 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1648 _MM_TRANSPOSE4_PS(Y,F,G,H);
1649 Heps = _mm_mul_ps(vfeps,H);
1650 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1651 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1652 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1656 /* Calculate temporary vectorial force */
1657 tx = _mm_mul_ps(fscal,dx10);
1658 ty = _mm_mul_ps(fscal,dy10);
1659 tz = _mm_mul_ps(fscal,dz10);
1661 /* Update vectorial force */
1662 fix1 = _mm_add_ps(fix1,tx);
1663 fiy1 = _mm_add_ps(fiy1,ty);
1664 fiz1 = _mm_add_ps(fiz1,tz);
1666 fjx0 = _mm_add_ps(fjx0,tx);
1667 fjy0 = _mm_add_ps(fjy0,ty);
1668 fjz0 = _mm_add_ps(fjz0,tz);
1670 /**************************
1671 * CALCULATE INTERACTIONS *
1672 **************************/
1674 r11 = _mm_mul_ps(rsq11,rinv11);
1676 /* Calculate table index by multiplying r with table scale and truncate to integer */
1677 rt = _mm_mul_ps(r11,vftabscale);
1678 vfitab = _mm_cvttps_epi32(rt);
1679 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1680 vfitab = _mm_slli_epi32(vfitab,2);
1682 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1683 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1684 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1685 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1686 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1687 _MM_TRANSPOSE4_PS(Y,F,G,H);
1688 Heps = _mm_mul_ps(vfeps,H);
1689 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1690 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1691 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1695 /* Calculate temporary vectorial force */
1696 tx = _mm_mul_ps(fscal,dx11);
1697 ty = _mm_mul_ps(fscal,dy11);
1698 tz = _mm_mul_ps(fscal,dz11);
1700 /* Update vectorial force */
1701 fix1 = _mm_add_ps(fix1,tx);
1702 fiy1 = _mm_add_ps(fiy1,ty);
1703 fiz1 = _mm_add_ps(fiz1,tz);
1705 fjx1 = _mm_add_ps(fjx1,tx);
1706 fjy1 = _mm_add_ps(fjy1,ty);
1707 fjz1 = _mm_add_ps(fjz1,tz);
1709 /**************************
1710 * CALCULATE INTERACTIONS *
1711 **************************/
1713 r12 = _mm_mul_ps(rsq12,rinv12);
1715 /* Calculate table index by multiplying r with table scale and truncate to integer */
1716 rt = _mm_mul_ps(r12,vftabscale);
1717 vfitab = _mm_cvttps_epi32(rt);
1718 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1719 vfitab = _mm_slli_epi32(vfitab,2);
1721 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1722 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1723 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1724 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1725 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1726 _MM_TRANSPOSE4_PS(Y,F,G,H);
1727 Heps = _mm_mul_ps(vfeps,H);
1728 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1729 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1730 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1734 /* Calculate temporary vectorial force */
1735 tx = _mm_mul_ps(fscal,dx12);
1736 ty = _mm_mul_ps(fscal,dy12);
1737 tz = _mm_mul_ps(fscal,dz12);
1739 /* Update vectorial force */
1740 fix1 = _mm_add_ps(fix1,tx);
1741 fiy1 = _mm_add_ps(fiy1,ty);
1742 fiz1 = _mm_add_ps(fiz1,tz);
1744 fjx2 = _mm_add_ps(fjx2,tx);
1745 fjy2 = _mm_add_ps(fjy2,ty);
1746 fjz2 = _mm_add_ps(fjz2,tz);
1748 /**************************
1749 * CALCULATE INTERACTIONS *
1750 **************************/
1752 r20 = _mm_mul_ps(rsq20,rinv20);
1754 /* Calculate table index by multiplying r with table scale and truncate to integer */
1755 rt = _mm_mul_ps(r20,vftabscale);
1756 vfitab = _mm_cvttps_epi32(rt);
1757 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1758 vfitab = _mm_slli_epi32(vfitab,2);
1760 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1761 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1762 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1763 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1764 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1765 _MM_TRANSPOSE4_PS(Y,F,G,H);
1766 Heps = _mm_mul_ps(vfeps,H);
1767 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1768 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1769 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1773 /* Calculate temporary vectorial force */
1774 tx = _mm_mul_ps(fscal,dx20);
1775 ty = _mm_mul_ps(fscal,dy20);
1776 tz = _mm_mul_ps(fscal,dz20);
1778 /* Update vectorial force */
1779 fix2 = _mm_add_ps(fix2,tx);
1780 fiy2 = _mm_add_ps(fiy2,ty);
1781 fiz2 = _mm_add_ps(fiz2,tz);
1783 fjx0 = _mm_add_ps(fjx0,tx);
1784 fjy0 = _mm_add_ps(fjy0,ty);
1785 fjz0 = _mm_add_ps(fjz0,tz);
1787 /**************************
1788 * CALCULATE INTERACTIONS *
1789 **************************/
1791 r21 = _mm_mul_ps(rsq21,rinv21);
1793 /* Calculate table index by multiplying r with table scale and truncate to integer */
1794 rt = _mm_mul_ps(r21,vftabscale);
1795 vfitab = _mm_cvttps_epi32(rt);
1796 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1797 vfitab = _mm_slli_epi32(vfitab,2);
1799 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1800 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1801 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1802 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1803 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1804 _MM_TRANSPOSE4_PS(Y,F,G,H);
1805 Heps = _mm_mul_ps(vfeps,H);
1806 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1807 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1808 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1812 /* Calculate temporary vectorial force */
1813 tx = _mm_mul_ps(fscal,dx21);
1814 ty = _mm_mul_ps(fscal,dy21);
1815 tz = _mm_mul_ps(fscal,dz21);
1817 /* Update vectorial force */
1818 fix2 = _mm_add_ps(fix2,tx);
1819 fiy2 = _mm_add_ps(fiy2,ty);
1820 fiz2 = _mm_add_ps(fiz2,tz);
1822 fjx1 = _mm_add_ps(fjx1,tx);
1823 fjy1 = _mm_add_ps(fjy1,ty);
1824 fjz1 = _mm_add_ps(fjz1,tz);
1826 /**************************
1827 * CALCULATE INTERACTIONS *
1828 **************************/
1830 r22 = _mm_mul_ps(rsq22,rinv22);
1832 /* Calculate table index by multiplying r with table scale and truncate to integer */
1833 rt = _mm_mul_ps(r22,vftabscale);
1834 vfitab = _mm_cvttps_epi32(rt);
1835 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1836 vfitab = _mm_slli_epi32(vfitab,2);
1838 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1839 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1840 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1841 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1842 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1843 _MM_TRANSPOSE4_PS(Y,F,G,H);
1844 Heps = _mm_mul_ps(vfeps,H);
1845 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1846 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1847 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1851 /* Calculate temporary vectorial force */
1852 tx = _mm_mul_ps(fscal,dx22);
1853 ty = _mm_mul_ps(fscal,dy22);
1854 tz = _mm_mul_ps(fscal,dz22);
1856 /* Update vectorial force */
1857 fix2 = _mm_add_ps(fix2,tx);
1858 fiy2 = _mm_add_ps(fiy2,ty);
1859 fiz2 = _mm_add_ps(fiz2,tz);
1861 fjx2 = _mm_add_ps(fjx2,tx);
1862 fjy2 = _mm_add_ps(fjy2,ty);
1863 fjz2 = _mm_add_ps(fjz2,tz);
1865 fjptrA = f+j_coord_offsetA;
1866 fjptrB = f+j_coord_offsetB;
1867 fjptrC = f+j_coord_offsetC;
1868 fjptrD = f+j_coord_offsetD;
1870 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1871 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1873 /* Inner loop uses 359 flops */
1876 if(jidx<j_index_end)
1879 /* Get j neighbor index, and coordinate index */
1880 jnrlistA = jjnr[jidx];
1881 jnrlistB = jjnr[jidx+1];
1882 jnrlistC = jjnr[jidx+2];
1883 jnrlistD = jjnr[jidx+3];
1884 /* Sign of each element will be negative for non-real atoms.
1885 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1886 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1888 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1889 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1890 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1891 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1892 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1893 j_coord_offsetA = DIM*jnrA;
1894 j_coord_offsetB = DIM*jnrB;
1895 j_coord_offsetC = DIM*jnrC;
1896 j_coord_offsetD = DIM*jnrD;
1898 /* load j atom coordinates */
1899 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1900 x+j_coord_offsetC,x+j_coord_offsetD,
1901 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1903 /* Calculate displacement vector */
1904 dx00 = _mm_sub_ps(ix0,jx0);
1905 dy00 = _mm_sub_ps(iy0,jy0);
1906 dz00 = _mm_sub_ps(iz0,jz0);
1907 dx01 = _mm_sub_ps(ix0,jx1);
1908 dy01 = _mm_sub_ps(iy0,jy1);
1909 dz01 = _mm_sub_ps(iz0,jz1);
1910 dx02 = _mm_sub_ps(ix0,jx2);
1911 dy02 = _mm_sub_ps(iy0,jy2);
1912 dz02 = _mm_sub_ps(iz0,jz2);
1913 dx10 = _mm_sub_ps(ix1,jx0);
1914 dy10 = _mm_sub_ps(iy1,jy0);
1915 dz10 = _mm_sub_ps(iz1,jz0);
1916 dx11 = _mm_sub_ps(ix1,jx1);
1917 dy11 = _mm_sub_ps(iy1,jy1);
1918 dz11 = _mm_sub_ps(iz1,jz1);
1919 dx12 = _mm_sub_ps(ix1,jx2);
1920 dy12 = _mm_sub_ps(iy1,jy2);
1921 dz12 = _mm_sub_ps(iz1,jz2);
1922 dx20 = _mm_sub_ps(ix2,jx0);
1923 dy20 = _mm_sub_ps(iy2,jy0);
1924 dz20 = _mm_sub_ps(iz2,jz0);
1925 dx21 = _mm_sub_ps(ix2,jx1);
1926 dy21 = _mm_sub_ps(iy2,jy1);
1927 dz21 = _mm_sub_ps(iz2,jz1);
1928 dx22 = _mm_sub_ps(ix2,jx2);
1929 dy22 = _mm_sub_ps(iy2,jy2);
1930 dz22 = _mm_sub_ps(iz2,jz2);
1932 /* Calculate squared distance and things based on it */
1933 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1934 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1935 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1936 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1937 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1938 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1939 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1940 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1941 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1943 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1944 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1945 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1946 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1947 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1948 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1949 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1950 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1951 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1953 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1955 fjx0 = _mm_setzero_ps();
1956 fjy0 = _mm_setzero_ps();
1957 fjz0 = _mm_setzero_ps();
1958 fjx1 = _mm_setzero_ps();
1959 fjy1 = _mm_setzero_ps();
1960 fjz1 = _mm_setzero_ps();
1961 fjx2 = _mm_setzero_ps();
1962 fjy2 = _mm_setzero_ps();
1963 fjz2 = _mm_setzero_ps();
1965 /**************************
1966 * CALCULATE INTERACTIONS *
1967 **************************/
1969 r00 = _mm_mul_ps(rsq00,rinv00);
1970 r00 = _mm_andnot_ps(dummy_mask,r00);
1972 /* Calculate table index by multiplying r with table scale and truncate to integer */
1973 rt = _mm_mul_ps(r00,vftabscale);
1974 vfitab = _mm_cvttps_epi32(rt);
1975 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1976 vfitab = _mm_slli_epi32(vfitab,2);
1978 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1979 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1980 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1981 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1982 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1983 _MM_TRANSPOSE4_PS(Y,F,G,H);
1984 Heps = _mm_mul_ps(vfeps,H);
1985 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1986 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1987 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1989 /* LENNARD-JONES DISPERSION/REPULSION */
1991 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1992 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1994 fscal = _mm_add_ps(felec,fvdw);
1996 fscal = _mm_andnot_ps(dummy_mask,fscal);
1998 /* Calculate temporary vectorial force */
1999 tx = _mm_mul_ps(fscal,dx00);
2000 ty = _mm_mul_ps(fscal,dy00);
2001 tz = _mm_mul_ps(fscal,dz00);
2003 /* Update vectorial force */
2004 fix0 = _mm_add_ps(fix0,tx);
2005 fiy0 = _mm_add_ps(fiy0,ty);
2006 fiz0 = _mm_add_ps(fiz0,tz);
2008 fjx0 = _mm_add_ps(fjx0,tx);
2009 fjy0 = _mm_add_ps(fjy0,ty);
2010 fjz0 = _mm_add_ps(fjz0,tz);
2012 /**************************
2013 * CALCULATE INTERACTIONS *
2014 **************************/
2016 r01 = _mm_mul_ps(rsq01,rinv01);
2017 r01 = _mm_andnot_ps(dummy_mask,r01);
2019 /* Calculate table index by multiplying r with table scale and truncate to integer */
2020 rt = _mm_mul_ps(r01,vftabscale);
2021 vfitab = _mm_cvttps_epi32(rt);
2022 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2023 vfitab = _mm_slli_epi32(vfitab,2);
2025 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2026 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2027 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2028 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2029 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2030 _MM_TRANSPOSE4_PS(Y,F,G,H);
2031 Heps = _mm_mul_ps(vfeps,H);
2032 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2033 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2034 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2038 fscal = _mm_andnot_ps(dummy_mask,fscal);
2040 /* Calculate temporary vectorial force */
2041 tx = _mm_mul_ps(fscal,dx01);
2042 ty = _mm_mul_ps(fscal,dy01);
2043 tz = _mm_mul_ps(fscal,dz01);
2045 /* Update vectorial force */
2046 fix0 = _mm_add_ps(fix0,tx);
2047 fiy0 = _mm_add_ps(fiy0,ty);
2048 fiz0 = _mm_add_ps(fiz0,tz);
2050 fjx1 = _mm_add_ps(fjx1,tx);
2051 fjy1 = _mm_add_ps(fjy1,ty);
2052 fjz1 = _mm_add_ps(fjz1,tz);
2054 /**************************
2055 * CALCULATE INTERACTIONS *
2056 **************************/
2058 r02 = _mm_mul_ps(rsq02,rinv02);
2059 r02 = _mm_andnot_ps(dummy_mask,r02);
2061 /* Calculate table index by multiplying r with table scale and truncate to integer */
2062 rt = _mm_mul_ps(r02,vftabscale);
2063 vfitab = _mm_cvttps_epi32(rt);
2064 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2065 vfitab = _mm_slli_epi32(vfitab,2);
2067 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2068 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2069 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2070 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2071 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2072 _MM_TRANSPOSE4_PS(Y,F,G,H);
2073 Heps = _mm_mul_ps(vfeps,H);
2074 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2075 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2076 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2080 fscal = _mm_andnot_ps(dummy_mask,fscal);
2082 /* Calculate temporary vectorial force */
2083 tx = _mm_mul_ps(fscal,dx02);
2084 ty = _mm_mul_ps(fscal,dy02);
2085 tz = _mm_mul_ps(fscal,dz02);
2087 /* Update vectorial force */
2088 fix0 = _mm_add_ps(fix0,tx);
2089 fiy0 = _mm_add_ps(fiy0,ty);
2090 fiz0 = _mm_add_ps(fiz0,tz);
2092 fjx2 = _mm_add_ps(fjx2,tx);
2093 fjy2 = _mm_add_ps(fjy2,ty);
2094 fjz2 = _mm_add_ps(fjz2,tz);
2096 /**************************
2097 * CALCULATE INTERACTIONS *
2098 **************************/
2100 r10 = _mm_mul_ps(rsq10,rinv10);
2101 r10 = _mm_andnot_ps(dummy_mask,r10);
2103 /* Calculate table index by multiplying r with table scale and truncate to integer */
2104 rt = _mm_mul_ps(r10,vftabscale);
2105 vfitab = _mm_cvttps_epi32(rt);
2106 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2107 vfitab = _mm_slli_epi32(vfitab,2);
2109 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2110 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2111 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2112 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2113 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2114 _MM_TRANSPOSE4_PS(Y,F,G,H);
2115 Heps = _mm_mul_ps(vfeps,H);
2116 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2117 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2118 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2122 fscal = _mm_andnot_ps(dummy_mask,fscal);
2124 /* Calculate temporary vectorial force */
2125 tx = _mm_mul_ps(fscal,dx10);
2126 ty = _mm_mul_ps(fscal,dy10);
2127 tz = _mm_mul_ps(fscal,dz10);
2129 /* Update vectorial force */
2130 fix1 = _mm_add_ps(fix1,tx);
2131 fiy1 = _mm_add_ps(fiy1,ty);
2132 fiz1 = _mm_add_ps(fiz1,tz);
2134 fjx0 = _mm_add_ps(fjx0,tx);
2135 fjy0 = _mm_add_ps(fjy0,ty);
2136 fjz0 = _mm_add_ps(fjz0,tz);
2138 /**************************
2139 * CALCULATE INTERACTIONS *
2140 **************************/
2142 r11 = _mm_mul_ps(rsq11,rinv11);
2143 r11 = _mm_andnot_ps(dummy_mask,r11);
2145 /* Calculate table index by multiplying r with table scale and truncate to integer */
2146 rt = _mm_mul_ps(r11,vftabscale);
2147 vfitab = _mm_cvttps_epi32(rt);
2148 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2149 vfitab = _mm_slli_epi32(vfitab,2);
2151 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2152 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2153 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2154 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2155 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2156 _MM_TRANSPOSE4_PS(Y,F,G,H);
2157 Heps = _mm_mul_ps(vfeps,H);
2158 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2159 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2160 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2164 fscal = _mm_andnot_ps(dummy_mask,fscal);
2166 /* Calculate temporary vectorial force */
2167 tx = _mm_mul_ps(fscal,dx11);
2168 ty = _mm_mul_ps(fscal,dy11);
2169 tz = _mm_mul_ps(fscal,dz11);
2171 /* Update vectorial force */
2172 fix1 = _mm_add_ps(fix1,tx);
2173 fiy1 = _mm_add_ps(fiy1,ty);
2174 fiz1 = _mm_add_ps(fiz1,tz);
2176 fjx1 = _mm_add_ps(fjx1,tx);
2177 fjy1 = _mm_add_ps(fjy1,ty);
2178 fjz1 = _mm_add_ps(fjz1,tz);
2180 /**************************
2181 * CALCULATE INTERACTIONS *
2182 **************************/
2184 r12 = _mm_mul_ps(rsq12,rinv12);
2185 r12 = _mm_andnot_ps(dummy_mask,r12);
2187 /* Calculate table index by multiplying r with table scale and truncate to integer */
2188 rt = _mm_mul_ps(r12,vftabscale);
2189 vfitab = _mm_cvttps_epi32(rt);
2190 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2191 vfitab = _mm_slli_epi32(vfitab,2);
2193 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2194 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2195 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2196 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2197 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2198 _MM_TRANSPOSE4_PS(Y,F,G,H);
2199 Heps = _mm_mul_ps(vfeps,H);
2200 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2201 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2202 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2206 fscal = _mm_andnot_ps(dummy_mask,fscal);
2208 /* Calculate temporary vectorial force */
2209 tx = _mm_mul_ps(fscal,dx12);
2210 ty = _mm_mul_ps(fscal,dy12);
2211 tz = _mm_mul_ps(fscal,dz12);
2213 /* Update vectorial force */
2214 fix1 = _mm_add_ps(fix1,tx);
2215 fiy1 = _mm_add_ps(fiy1,ty);
2216 fiz1 = _mm_add_ps(fiz1,tz);
2218 fjx2 = _mm_add_ps(fjx2,tx);
2219 fjy2 = _mm_add_ps(fjy2,ty);
2220 fjz2 = _mm_add_ps(fjz2,tz);
2222 /**************************
2223 * CALCULATE INTERACTIONS *
2224 **************************/
2226 r20 = _mm_mul_ps(rsq20,rinv20);
2227 r20 = _mm_andnot_ps(dummy_mask,r20);
2229 /* Calculate table index by multiplying r with table scale and truncate to integer */
2230 rt = _mm_mul_ps(r20,vftabscale);
2231 vfitab = _mm_cvttps_epi32(rt);
2232 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2233 vfitab = _mm_slli_epi32(vfitab,2);
2235 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2236 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2237 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2238 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2239 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2240 _MM_TRANSPOSE4_PS(Y,F,G,H);
2241 Heps = _mm_mul_ps(vfeps,H);
2242 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2243 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2244 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2248 fscal = _mm_andnot_ps(dummy_mask,fscal);
2250 /* Calculate temporary vectorial force */
2251 tx = _mm_mul_ps(fscal,dx20);
2252 ty = _mm_mul_ps(fscal,dy20);
2253 tz = _mm_mul_ps(fscal,dz20);
2255 /* Update vectorial force */
2256 fix2 = _mm_add_ps(fix2,tx);
2257 fiy2 = _mm_add_ps(fiy2,ty);
2258 fiz2 = _mm_add_ps(fiz2,tz);
2260 fjx0 = _mm_add_ps(fjx0,tx);
2261 fjy0 = _mm_add_ps(fjy0,ty);
2262 fjz0 = _mm_add_ps(fjz0,tz);
2264 /**************************
2265 * CALCULATE INTERACTIONS *
2266 **************************/
2268 r21 = _mm_mul_ps(rsq21,rinv21);
2269 r21 = _mm_andnot_ps(dummy_mask,r21);
2271 /* Calculate table index by multiplying r with table scale and truncate to integer */
2272 rt = _mm_mul_ps(r21,vftabscale);
2273 vfitab = _mm_cvttps_epi32(rt);
2274 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2275 vfitab = _mm_slli_epi32(vfitab,2);
2277 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2278 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2279 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2280 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2281 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2282 _MM_TRANSPOSE4_PS(Y,F,G,H);
2283 Heps = _mm_mul_ps(vfeps,H);
2284 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2285 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2286 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2290 fscal = _mm_andnot_ps(dummy_mask,fscal);
2292 /* Calculate temporary vectorial force */
2293 tx = _mm_mul_ps(fscal,dx21);
2294 ty = _mm_mul_ps(fscal,dy21);
2295 tz = _mm_mul_ps(fscal,dz21);
2297 /* Update vectorial force */
2298 fix2 = _mm_add_ps(fix2,tx);
2299 fiy2 = _mm_add_ps(fiy2,ty);
2300 fiz2 = _mm_add_ps(fiz2,tz);
2302 fjx1 = _mm_add_ps(fjx1,tx);
2303 fjy1 = _mm_add_ps(fjy1,ty);
2304 fjz1 = _mm_add_ps(fjz1,tz);
2306 /**************************
2307 * CALCULATE INTERACTIONS *
2308 **************************/
2310 r22 = _mm_mul_ps(rsq22,rinv22);
2311 r22 = _mm_andnot_ps(dummy_mask,r22);
2313 /* Calculate table index by multiplying r with table scale and truncate to integer */
2314 rt = _mm_mul_ps(r22,vftabscale);
2315 vfitab = _mm_cvttps_epi32(rt);
2316 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2317 vfitab = _mm_slli_epi32(vfitab,2);
2319 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2320 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2321 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2322 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2323 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2324 _MM_TRANSPOSE4_PS(Y,F,G,H);
2325 Heps = _mm_mul_ps(vfeps,H);
2326 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2327 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2328 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2332 fscal = _mm_andnot_ps(dummy_mask,fscal);
2334 /* Calculate temporary vectorial force */
2335 tx = _mm_mul_ps(fscal,dx22);
2336 ty = _mm_mul_ps(fscal,dy22);
2337 tz = _mm_mul_ps(fscal,dz22);
2339 /* Update vectorial force */
2340 fix2 = _mm_add_ps(fix2,tx);
2341 fiy2 = _mm_add_ps(fiy2,ty);
2342 fiz2 = _mm_add_ps(fiz2,tz);
2344 fjx2 = _mm_add_ps(fjx2,tx);
2345 fjy2 = _mm_add_ps(fjy2,ty);
2346 fjz2 = _mm_add_ps(fjz2,tz);
2348 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2349 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2350 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2351 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2353 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2354 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2356 /* Inner loop uses 368 flops */
2359 /* End of innermost loop */
2361 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2362 f+i_coord_offset,fshift+i_shift_offset);
2364 /* Increment number of inner iterations */
2365 inneriter += j_index_end - j_index_start;
2367 /* Outer loop uses 18 flops */
2370 /* Increment number of outer iterations */
2373 /* Update outer/inner flops */
2375 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);