2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse4_1_double.h"
48 #include "kernelutil_x86_sse4_1_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse4_1_double
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse4_1_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 int vdwjidx0A,vdwjidx0B;
87 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
88 int vdwjidx1A,vdwjidx1B;
89 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90 int vdwjidx2A,vdwjidx2B;
91 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
94 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
95 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
96 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
99 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
104 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
108 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
110 __m128i ifour = _mm_set1_epi32(4);
111 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
113 __m128d dummy_mask,cutoff_mask;
114 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115 __m128d one = _mm_set1_pd(1.0);
116 __m128d two = _mm_set1_pd(2.0);
122 jindex = nlist->jindex;
124 shiftidx = nlist->shift;
126 shiftvec = fr->shift_vec[0];
127 fshift = fr->fshift[0];
128 facel = _mm_set1_pd(fr->epsfac);
129 charge = mdatoms->chargeA;
130 nvdwtype = fr->ntype;
132 vdwtype = mdatoms->typeA;
134 vftab = kernel_data->table_elec_vdw->data;
135 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
137 /* Setup water-specific parameters */
138 inr = nlist->iinr[0];
139 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
140 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
141 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
142 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
144 jq0 = _mm_set1_pd(charge[inr+0]);
145 jq1 = _mm_set1_pd(charge[inr+1]);
146 jq2 = _mm_set1_pd(charge[inr+2]);
147 vdwjidx0A = 2*vdwtype[inr+0];
148 qq00 = _mm_mul_pd(iq0,jq0);
149 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
150 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
151 qq01 = _mm_mul_pd(iq0,jq1);
152 qq02 = _mm_mul_pd(iq0,jq2);
153 qq10 = _mm_mul_pd(iq1,jq0);
154 qq11 = _mm_mul_pd(iq1,jq1);
155 qq12 = _mm_mul_pd(iq1,jq2);
156 qq20 = _mm_mul_pd(iq2,jq0);
157 qq21 = _mm_mul_pd(iq2,jq1);
158 qq22 = _mm_mul_pd(iq2,jq2);
160 /* Avoid stupid compiler warnings */
168 /* Start outer loop over neighborlists */
169 for(iidx=0; iidx<nri; iidx++)
171 /* Load shift vector for this list */
172 i_shift_offset = DIM*shiftidx[iidx];
174 /* Load limits for loop over neighbors */
175 j_index_start = jindex[iidx];
176 j_index_end = jindex[iidx+1];
178 /* Get outer coordinate index */
180 i_coord_offset = DIM*inr;
182 /* Load i particle coords and add shift vector */
183 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
184 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
186 fix0 = _mm_setzero_pd();
187 fiy0 = _mm_setzero_pd();
188 fiz0 = _mm_setzero_pd();
189 fix1 = _mm_setzero_pd();
190 fiy1 = _mm_setzero_pd();
191 fiz1 = _mm_setzero_pd();
192 fix2 = _mm_setzero_pd();
193 fiy2 = _mm_setzero_pd();
194 fiz2 = _mm_setzero_pd();
196 /* Reset potential sums */
197 velecsum = _mm_setzero_pd();
198 vvdwsum = _mm_setzero_pd();
200 /* Start inner kernel loop */
201 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
204 /* Get j neighbor index, and coordinate index */
207 j_coord_offsetA = DIM*jnrA;
208 j_coord_offsetB = DIM*jnrB;
210 /* load j atom coordinates */
211 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
212 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
214 /* Calculate displacement vector */
215 dx00 = _mm_sub_pd(ix0,jx0);
216 dy00 = _mm_sub_pd(iy0,jy0);
217 dz00 = _mm_sub_pd(iz0,jz0);
218 dx01 = _mm_sub_pd(ix0,jx1);
219 dy01 = _mm_sub_pd(iy0,jy1);
220 dz01 = _mm_sub_pd(iz0,jz1);
221 dx02 = _mm_sub_pd(ix0,jx2);
222 dy02 = _mm_sub_pd(iy0,jy2);
223 dz02 = _mm_sub_pd(iz0,jz2);
224 dx10 = _mm_sub_pd(ix1,jx0);
225 dy10 = _mm_sub_pd(iy1,jy0);
226 dz10 = _mm_sub_pd(iz1,jz0);
227 dx11 = _mm_sub_pd(ix1,jx1);
228 dy11 = _mm_sub_pd(iy1,jy1);
229 dz11 = _mm_sub_pd(iz1,jz1);
230 dx12 = _mm_sub_pd(ix1,jx2);
231 dy12 = _mm_sub_pd(iy1,jy2);
232 dz12 = _mm_sub_pd(iz1,jz2);
233 dx20 = _mm_sub_pd(ix2,jx0);
234 dy20 = _mm_sub_pd(iy2,jy0);
235 dz20 = _mm_sub_pd(iz2,jz0);
236 dx21 = _mm_sub_pd(ix2,jx1);
237 dy21 = _mm_sub_pd(iy2,jy1);
238 dz21 = _mm_sub_pd(iz2,jz1);
239 dx22 = _mm_sub_pd(ix2,jx2);
240 dy22 = _mm_sub_pd(iy2,jy2);
241 dz22 = _mm_sub_pd(iz2,jz2);
243 /* Calculate squared distance and things based on it */
244 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
245 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
246 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
247 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
248 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
249 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
250 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
251 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
252 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
254 rinv00 = gmx_mm_invsqrt_pd(rsq00);
255 rinv01 = gmx_mm_invsqrt_pd(rsq01);
256 rinv02 = gmx_mm_invsqrt_pd(rsq02);
257 rinv10 = gmx_mm_invsqrt_pd(rsq10);
258 rinv11 = gmx_mm_invsqrt_pd(rsq11);
259 rinv12 = gmx_mm_invsqrt_pd(rsq12);
260 rinv20 = gmx_mm_invsqrt_pd(rsq20);
261 rinv21 = gmx_mm_invsqrt_pd(rsq21);
262 rinv22 = gmx_mm_invsqrt_pd(rsq22);
264 fjx0 = _mm_setzero_pd();
265 fjy0 = _mm_setzero_pd();
266 fjz0 = _mm_setzero_pd();
267 fjx1 = _mm_setzero_pd();
268 fjy1 = _mm_setzero_pd();
269 fjz1 = _mm_setzero_pd();
270 fjx2 = _mm_setzero_pd();
271 fjy2 = _mm_setzero_pd();
272 fjz2 = _mm_setzero_pd();
274 /**************************
275 * CALCULATE INTERACTIONS *
276 **************************/
278 r00 = _mm_mul_pd(rsq00,rinv00);
280 /* Calculate table index by multiplying r with table scale and truncate to integer */
281 rt = _mm_mul_pd(r00,vftabscale);
282 vfitab = _mm_cvttpd_epi32(rt);
283 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
284 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
286 /* CUBIC SPLINE TABLE ELECTROSTATICS */
287 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
288 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
289 GMX_MM_TRANSPOSE2_PD(Y,F);
290 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
291 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
292 GMX_MM_TRANSPOSE2_PD(G,H);
293 Heps = _mm_mul_pd(vfeps,H);
294 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
295 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
296 velec = _mm_mul_pd(qq00,VV);
297 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
298 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
300 /* CUBIC SPLINE TABLE DISPERSION */
301 vfitab = _mm_add_epi32(vfitab,ifour);
302 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
303 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
304 GMX_MM_TRANSPOSE2_PD(Y,F);
305 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
306 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
307 GMX_MM_TRANSPOSE2_PD(G,H);
308 Heps = _mm_mul_pd(vfeps,H);
309 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
310 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
311 vvdw6 = _mm_mul_pd(c6_00,VV);
312 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
313 fvdw6 = _mm_mul_pd(c6_00,FF);
315 /* CUBIC SPLINE TABLE REPULSION */
316 vfitab = _mm_add_epi32(vfitab,ifour);
317 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
318 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
319 GMX_MM_TRANSPOSE2_PD(Y,F);
320 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
321 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
322 GMX_MM_TRANSPOSE2_PD(G,H);
323 Heps = _mm_mul_pd(vfeps,H);
324 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
325 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
326 vvdw12 = _mm_mul_pd(c12_00,VV);
327 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
328 fvdw12 = _mm_mul_pd(c12_00,FF);
329 vvdw = _mm_add_pd(vvdw12,vvdw6);
330 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
332 /* Update potential sum for this i atom from the interaction with this j atom. */
333 velecsum = _mm_add_pd(velecsum,velec);
334 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
336 fscal = _mm_add_pd(felec,fvdw);
338 /* Calculate temporary vectorial force */
339 tx = _mm_mul_pd(fscal,dx00);
340 ty = _mm_mul_pd(fscal,dy00);
341 tz = _mm_mul_pd(fscal,dz00);
343 /* Update vectorial force */
344 fix0 = _mm_add_pd(fix0,tx);
345 fiy0 = _mm_add_pd(fiy0,ty);
346 fiz0 = _mm_add_pd(fiz0,tz);
348 fjx0 = _mm_add_pd(fjx0,tx);
349 fjy0 = _mm_add_pd(fjy0,ty);
350 fjz0 = _mm_add_pd(fjz0,tz);
352 /**************************
353 * CALCULATE INTERACTIONS *
354 **************************/
356 r01 = _mm_mul_pd(rsq01,rinv01);
358 /* Calculate table index by multiplying r with table scale and truncate to integer */
359 rt = _mm_mul_pd(r01,vftabscale);
360 vfitab = _mm_cvttpd_epi32(rt);
361 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
362 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
364 /* CUBIC SPLINE TABLE ELECTROSTATICS */
365 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
366 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
367 GMX_MM_TRANSPOSE2_PD(Y,F);
368 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
369 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
370 GMX_MM_TRANSPOSE2_PD(G,H);
371 Heps = _mm_mul_pd(vfeps,H);
372 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
373 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
374 velec = _mm_mul_pd(qq01,VV);
375 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
376 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
378 /* Update potential sum for this i atom from the interaction with this j atom. */
379 velecsum = _mm_add_pd(velecsum,velec);
383 /* Calculate temporary vectorial force */
384 tx = _mm_mul_pd(fscal,dx01);
385 ty = _mm_mul_pd(fscal,dy01);
386 tz = _mm_mul_pd(fscal,dz01);
388 /* Update vectorial force */
389 fix0 = _mm_add_pd(fix0,tx);
390 fiy0 = _mm_add_pd(fiy0,ty);
391 fiz0 = _mm_add_pd(fiz0,tz);
393 fjx1 = _mm_add_pd(fjx1,tx);
394 fjy1 = _mm_add_pd(fjy1,ty);
395 fjz1 = _mm_add_pd(fjz1,tz);
397 /**************************
398 * CALCULATE INTERACTIONS *
399 **************************/
401 r02 = _mm_mul_pd(rsq02,rinv02);
403 /* Calculate table index by multiplying r with table scale and truncate to integer */
404 rt = _mm_mul_pd(r02,vftabscale);
405 vfitab = _mm_cvttpd_epi32(rt);
406 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
407 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
409 /* CUBIC SPLINE TABLE ELECTROSTATICS */
410 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
411 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
412 GMX_MM_TRANSPOSE2_PD(Y,F);
413 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
414 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
415 GMX_MM_TRANSPOSE2_PD(G,H);
416 Heps = _mm_mul_pd(vfeps,H);
417 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
418 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
419 velec = _mm_mul_pd(qq02,VV);
420 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
421 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
423 /* Update potential sum for this i atom from the interaction with this j atom. */
424 velecsum = _mm_add_pd(velecsum,velec);
428 /* Calculate temporary vectorial force */
429 tx = _mm_mul_pd(fscal,dx02);
430 ty = _mm_mul_pd(fscal,dy02);
431 tz = _mm_mul_pd(fscal,dz02);
433 /* Update vectorial force */
434 fix0 = _mm_add_pd(fix0,tx);
435 fiy0 = _mm_add_pd(fiy0,ty);
436 fiz0 = _mm_add_pd(fiz0,tz);
438 fjx2 = _mm_add_pd(fjx2,tx);
439 fjy2 = _mm_add_pd(fjy2,ty);
440 fjz2 = _mm_add_pd(fjz2,tz);
442 /**************************
443 * CALCULATE INTERACTIONS *
444 **************************/
446 r10 = _mm_mul_pd(rsq10,rinv10);
448 /* Calculate table index by multiplying r with table scale and truncate to integer */
449 rt = _mm_mul_pd(r10,vftabscale);
450 vfitab = _mm_cvttpd_epi32(rt);
451 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
452 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
454 /* CUBIC SPLINE TABLE ELECTROSTATICS */
455 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
456 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
457 GMX_MM_TRANSPOSE2_PD(Y,F);
458 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
459 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
460 GMX_MM_TRANSPOSE2_PD(G,H);
461 Heps = _mm_mul_pd(vfeps,H);
462 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
463 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
464 velec = _mm_mul_pd(qq10,VV);
465 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
466 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
468 /* Update potential sum for this i atom from the interaction with this j atom. */
469 velecsum = _mm_add_pd(velecsum,velec);
473 /* Calculate temporary vectorial force */
474 tx = _mm_mul_pd(fscal,dx10);
475 ty = _mm_mul_pd(fscal,dy10);
476 tz = _mm_mul_pd(fscal,dz10);
478 /* Update vectorial force */
479 fix1 = _mm_add_pd(fix1,tx);
480 fiy1 = _mm_add_pd(fiy1,ty);
481 fiz1 = _mm_add_pd(fiz1,tz);
483 fjx0 = _mm_add_pd(fjx0,tx);
484 fjy0 = _mm_add_pd(fjy0,ty);
485 fjz0 = _mm_add_pd(fjz0,tz);
487 /**************************
488 * CALCULATE INTERACTIONS *
489 **************************/
491 r11 = _mm_mul_pd(rsq11,rinv11);
493 /* Calculate table index by multiplying r with table scale and truncate to integer */
494 rt = _mm_mul_pd(r11,vftabscale);
495 vfitab = _mm_cvttpd_epi32(rt);
496 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
497 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
499 /* CUBIC SPLINE TABLE ELECTROSTATICS */
500 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
501 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
502 GMX_MM_TRANSPOSE2_PD(Y,F);
503 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
504 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
505 GMX_MM_TRANSPOSE2_PD(G,H);
506 Heps = _mm_mul_pd(vfeps,H);
507 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
508 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
509 velec = _mm_mul_pd(qq11,VV);
510 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
511 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
513 /* Update potential sum for this i atom from the interaction with this j atom. */
514 velecsum = _mm_add_pd(velecsum,velec);
518 /* Calculate temporary vectorial force */
519 tx = _mm_mul_pd(fscal,dx11);
520 ty = _mm_mul_pd(fscal,dy11);
521 tz = _mm_mul_pd(fscal,dz11);
523 /* Update vectorial force */
524 fix1 = _mm_add_pd(fix1,tx);
525 fiy1 = _mm_add_pd(fiy1,ty);
526 fiz1 = _mm_add_pd(fiz1,tz);
528 fjx1 = _mm_add_pd(fjx1,tx);
529 fjy1 = _mm_add_pd(fjy1,ty);
530 fjz1 = _mm_add_pd(fjz1,tz);
532 /**************************
533 * CALCULATE INTERACTIONS *
534 **************************/
536 r12 = _mm_mul_pd(rsq12,rinv12);
538 /* Calculate table index by multiplying r with table scale and truncate to integer */
539 rt = _mm_mul_pd(r12,vftabscale);
540 vfitab = _mm_cvttpd_epi32(rt);
541 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
542 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
544 /* CUBIC SPLINE TABLE ELECTROSTATICS */
545 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
546 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
547 GMX_MM_TRANSPOSE2_PD(Y,F);
548 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
549 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
550 GMX_MM_TRANSPOSE2_PD(G,H);
551 Heps = _mm_mul_pd(vfeps,H);
552 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
553 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
554 velec = _mm_mul_pd(qq12,VV);
555 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
556 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
558 /* Update potential sum for this i atom from the interaction with this j atom. */
559 velecsum = _mm_add_pd(velecsum,velec);
563 /* Calculate temporary vectorial force */
564 tx = _mm_mul_pd(fscal,dx12);
565 ty = _mm_mul_pd(fscal,dy12);
566 tz = _mm_mul_pd(fscal,dz12);
568 /* Update vectorial force */
569 fix1 = _mm_add_pd(fix1,tx);
570 fiy1 = _mm_add_pd(fiy1,ty);
571 fiz1 = _mm_add_pd(fiz1,tz);
573 fjx2 = _mm_add_pd(fjx2,tx);
574 fjy2 = _mm_add_pd(fjy2,ty);
575 fjz2 = _mm_add_pd(fjz2,tz);
577 /**************************
578 * CALCULATE INTERACTIONS *
579 **************************/
581 r20 = _mm_mul_pd(rsq20,rinv20);
583 /* Calculate table index by multiplying r with table scale and truncate to integer */
584 rt = _mm_mul_pd(r20,vftabscale);
585 vfitab = _mm_cvttpd_epi32(rt);
586 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
587 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
589 /* CUBIC SPLINE TABLE ELECTROSTATICS */
590 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
591 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
592 GMX_MM_TRANSPOSE2_PD(Y,F);
593 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
594 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
595 GMX_MM_TRANSPOSE2_PD(G,H);
596 Heps = _mm_mul_pd(vfeps,H);
597 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
598 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
599 velec = _mm_mul_pd(qq20,VV);
600 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
601 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
603 /* Update potential sum for this i atom from the interaction with this j atom. */
604 velecsum = _mm_add_pd(velecsum,velec);
608 /* Calculate temporary vectorial force */
609 tx = _mm_mul_pd(fscal,dx20);
610 ty = _mm_mul_pd(fscal,dy20);
611 tz = _mm_mul_pd(fscal,dz20);
613 /* Update vectorial force */
614 fix2 = _mm_add_pd(fix2,tx);
615 fiy2 = _mm_add_pd(fiy2,ty);
616 fiz2 = _mm_add_pd(fiz2,tz);
618 fjx0 = _mm_add_pd(fjx0,tx);
619 fjy0 = _mm_add_pd(fjy0,ty);
620 fjz0 = _mm_add_pd(fjz0,tz);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 r21 = _mm_mul_pd(rsq21,rinv21);
628 /* Calculate table index by multiplying r with table scale and truncate to integer */
629 rt = _mm_mul_pd(r21,vftabscale);
630 vfitab = _mm_cvttpd_epi32(rt);
631 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
632 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
634 /* CUBIC SPLINE TABLE ELECTROSTATICS */
635 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
636 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
637 GMX_MM_TRANSPOSE2_PD(Y,F);
638 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
639 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
640 GMX_MM_TRANSPOSE2_PD(G,H);
641 Heps = _mm_mul_pd(vfeps,H);
642 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
643 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
644 velec = _mm_mul_pd(qq21,VV);
645 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
646 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
648 /* Update potential sum for this i atom from the interaction with this j atom. */
649 velecsum = _mm_add_pd(velecsum,velec);
653 /* Calculate temporary vectorial force */
654 tx = _mm_mul_pd(fscal,dx21);
655 ty = _mm_mul_pd(fscal,dy21);
656 tz = _mm_mul_pd(fscal,dz21);
658 /* Update vectorial force */
659 fix2 = _mm_add_pd(fix2,tx);
660 fiy2 = _mm_add_pd(fiy2,ty);
661 fiz2 = _mm_add_pd(fiz2,tz);
663 fjx1 = _mm_add_pd(fjx1,tx);
664 fjy1 = _mm_add_pd(fjy1,ty);
665 fjz1 = _mm_add_pd(fjz1,tz);
667 /**************************
668 * CALCULATE INTERACTIONS *
669 **************************/
671 r22 = _mm_mul_pd(rsq22,rinv22);
673 /* Calculate table index by multiplying r with table scale and truncate to integer */
674 rt = _mm_mul_pd(r22,vftabscale);
675 vfitab = _mm_cvttpd_epi32(rt);
676 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
677 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
679 /* CUBIC SPLINE TABLE ELECTROSTATICS */
680 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
681 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
682 GMX_MM_TRANSPOSE2_PD(Y,F);
683 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
684 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
685 GMX_MM_TRANSPOSE2_PD(G,H);
686 Heps = _mm_mul_pd(vfeps,H);
687 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
688 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
689 velec = _mm_mul_pd(qq22,VV);
690 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
691 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velecsum = _mm_add_pd(velecsum,velec);
698 /* Calculate temporary vectorial force */
699 tx = _mm_mul_pd(fscal,dx22);
700 ty = _mm_mul_pd(fscal,dy22);
701 tz = _mm_mul_pd(fscal,dz22);
703 /* Update vectorial force */
704 fix2 = _mm_add_pd(fix2,tx);
705 fiy2 = _mm_add_pd(fiy2,ty);
706 fiz2 = _mm_add_pd(fiz2,tz);
708 fjx2 = _mm_add_pd(fjx2,tx);
709 fjy2 = _mm_add_pd(fjy2,ty);
710 fjz2 = _mm_add_pd(fjz2,tz);
712 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
714 /* Inner loop uses 417 flops */
721 j_coord_offsetA = DIM*jnrA;
723 /* load j atom coordinates */
724 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
725 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
727 /* Calculate displacement vector */
728 dx00 = _mm_sub_pd(ix0,jx0);
729 dy00 = _mm_sub_pd(iy0,jy0);
730 dz00 = _mm_sub_pd(iz0,jz0);
731 dx01 = _mm_sub_pd(ix0,jx1);
732 dy01 = _mm_sub_pd(iy0,jy1);
733 dz01 = _mm_sub_pd(iz0,jz1);
734 dx02 = _mm_sub_pd(ix0,jx2);
735 dy02 = _mm_sub_pd(iy0,jy2);
736 dz02 = _mm_sub_pd(iz0,jz2);
737 dx10 = _mm_sub_pd(ix1,jx0);
738 dy10 = _mm_sub_pd(iy1,jy0);
739 dz10 = _mm_sub_pd(iz1,jz0);
740 dx11 = _mm_sub_pd(ix1,jx1);
741 dy11 = _mm_sub_pd(iy1,jy1);
742 dz11 = _mm_sub_pd(iz1,jz1);
743 dx12 = _mm_sub_pd(ix1,jx2);
744 dy12 = _mm_sub_pd(iy1,jy2);
745 dz12 = _mm_sub_pd(iz1,jz2);
746 dx20 = _mm_sub_pd(ix2,jx0);
747 dy20 = _mm_sub_pd(iy2,jy0);
748 dz20 = _mm_sub_pd(iz2,jz0);
749 dx21 = _mm_sub_pd(ix2,jx1);
750 dy21 = _mm_sub_pd(iy2,jy1);
751 dz21 = _mm_sub_pd(iz2,jz1);
752 dx22 = _mm_sub_pd(ix2,jx2);
753 dy22 = _mm_sub_pd(iy2,jy2);
754 dz22 = _mm_sub_pd(iz2,jz2);
756 /* Calculate squared distance and things based on it */
757 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
758 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
759 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
760 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
761 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
762 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
763 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
764 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
765 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
767 rinv00 = gmx_mm_invsqrt_pd(rsq00);
768 rinv01 = gmx_mm_invsqrt_pd(rsq01);
769 rinv02 = gmx_mm_invsqrt_pd(rsq02);
770 rinv10 = gmx_mm_invsqrt_pd(rsq10);
771 rinv11 = gmx_mm_invsqrt_pd(rsq11);
772 rinv12 = gmx_mm_invsqrt_pd(rsq12);
773 rinv20 = gmx_mm_invsqrt_pd(rsq20);
774 rinv21 = gmx_mm_invsqrt_pd(rsq21);
775 rinv22 = gmx_mm_invsqrt_pd(rsq22);
777 fjx0 = _mm_setzero_pd();
778 fjy0 = _mm_setzero_pd();
779 fjz0 = _mm_setzero_pd();
780 fjx1 = _mm_setzero_pd();
781 fjy1 = _mm_setzero_pd();
782 fjz1 = _mm_setzero_pd();
783 fjx2 = _mm_setzero_pd();
784 fjy2 = _mm_setzero_pd();
785 fjz2 = _mm_setzero_pd();
787 /**************************
788 * CALCULATE INTERACTIONS *
789 **************************/
791 r00 = _mm_mul_pd(rsq00,rinv00);
793 /* Calculate table index by multiplying r with table scale and truncate to integer */
794 rt = _mm_mul_pd(r00,vftabscale);
795 vfitab = _mm_cvttpd_epi32(rt);
796 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
797 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
799 /* CUBIC SPLINE TABLE ELECTROSTATICS */
800 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
801 F = _mm_setzero_pd();
802 GMX_MM_TRANSPOSE2_PD(Y,F);
803 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
804 H = _mm_setzero_pd();
805 GMX_MM_TRANSPOSE2_PD(G,H);
806 Heps = _mm_mul_pd(vfeps,H);
807 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
808 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
809 velec = _mm_mul_pd(qq00,VV);
810 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
811 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
813 /* CUBIC SPLINE TABLE DISPERSION */
814 vfitab = _mm_add_epi32(vfitab,ifour);
815 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
816 F = _mm_setzero_pd();
817 GMX_MM_TRANSPOSE2_PD(Y,F);
818 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
819 H = _mm_setzero_pd();
820 GMX_MM_TRANSPOSE2_PD(G,H);
821 Heps = _mm_mul_pd(vfeps,H);
822 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
823 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
824 vvdw6 = _mm_mul_pd(c6_00,VV);
825 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
826 fvdw6 = _mm_mul_pd(c6_00,FF);
828 /* CUBIC SPLINE TABLE REPULSION */
829 vfitab = _mm_add_epi32(vfitab,ifour);
830 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
831 F = _mm_setzero_pd();
832 GMX_MM_TRANSPOSE2_PD(Y,F);
833 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
834 H = _mm_setzero_pd();
835 GMX_MM_TRANSPOSE2_PD(G,H);
836 Heps = _mm_mul_pd(vfeps,H);
837 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
838 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
839 vvdw12 = _mm_mul_pd(c12_00,VV);
840 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
841 fvdw12 = _mm_mul_pd(c12_00,FF);
842 vvdw = _mm_add_pd(vvdw12,vvdw6);
843 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
845 /* Update potential sum for this i atom from the interaction with this j atom. */
846 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
847 velecsum = _mm_add_pd(velecsum,velec);
848 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
849 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
851 fscal = _mm_add_pd(felec,fvdw);
853 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
855 /* Calculate temporary vectorial force */
856 tx = _mm_mul_pd(fscal,dx00);
857 ty = _mm_mul_pd(fscal,dy00);
858 tz = _mm_mul_pd(fscal,dz00);
860 /* Update vectorial force */
861 fix0 = _mm_add_pd(fix0,tx);
862 fiy0 = _mm_add_pd(fiy0,ty);
863 fiz0 = _mm_add_pd(fiz0,tz);
865 fjx0 = _mm_add_pd(fjx0,tx);
866 fjy0 = _mm_add_pd(fjy0,ty);
867 fjz0 = _mm_add_pd(fjz0,tz);
869 /**************************
870 * CALCULATE INTERACTIONS *
871 **************************/
873 r01 = _mm_mul_pd(rsq01,rinv01);
875 /* Calculate table index by multiplying r with table scale and truncate to integer */
876 rt = _mm_mul_pd(r01,vftabscale);
877 vfitab = _mm_cvttpd_epi32(rt);
878 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
879 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
881 /* CUBIC SPLINE TABLE ELECTROSTATICS */
882 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
883 F = _mm_setzero_pd();
884 GMX_MM_TRANSPOSE2_PD(Y,F);
885 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
886 H = _mm_setzero_pd();
887 GMX_MM_TRANSPOSE2_PD(G,H);
888 Heps = _mm_mul_pd(vfeps,H);
889 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
890 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
891 velec = _mm_mul_pd(qq01,VV);
892 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
893 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
895 /* Update potential sum for this i atom from the interaction with this j atom. */
896 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
897 velecsum = _mm_add_pd(velecsum,velec);
901 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
903 /* Calculate temporary vectorial force */
904 tx = _mm_mul_pd(fscal,dx01);
905 ty = _mm_mul_pd(fscal,dy01);
906 tz = _mm_mul_pd(fscal,dz01);
908 /* Update vectorial force */
909 fix0 = _mm_add_pd(fix0,tx);
910 fiy0 = _mm_add_pd(fiy0,ty);
911 fiz0 = _mm_add_pd(fiz0,tz);
913 fjx1 = _mm_add_pd(fjx1,tx);
914 fjy1 = _mm_add_pd(fjy1,ty);
915 fjz1 = _mm_add_pd(fjz1,tz);
917 /**************************
918 * CALCULATE INTERACTIONS *
919 **************************/
921 r02 = _mm_mul_pd(rsq02,rinv02);
923 /* Calculate table index by multiplying r with table scale and truncate to integer */
924 rt = _mm_mul_pd(r02,vftabscale);
925 vfitab = _mm_cvttpd_epi32(rt);
926 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
927 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
929 /* CUBIC SPLINE TABLE ELECTROSTATICS */
930 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
931 F = _mm_setzero_pd();
932 GMX_MM_TRANSPOSE2_PD(Y,F);
933 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
934 H = _mm_setzero_pd();
935 GMX_MM_TRANSPOSE2_PD(G,H);
936 Heps = _mm_mul_pd(vfeps,H);
937 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
938 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
939 velec = _mm_mul_pd(qq02,VV);
940 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
941 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
943 /* Update potential sum for this i atom from the interaction with this j atom. */
944 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
945 velecsum = _mm_add_pd(velecsum,velec);
949 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
951 /* Calculate temporary vectorial force */
952 tx = _mm_mul_pd(fscal,dx02);
953 ty = _mm_mul_pd(fscal,dy02);
954 tz = _mm_mul_pd(fscal,dz02);
956 /* Update vectorial force */
957 fix0 = _mm_add_pd(fix0,tx);
958 fiy0 = _mm_add_pd(fiy0,ty);
959 fiz0 = _mm_add_pd(fiz0,tz);
961 fjx2 = _mm_add_pd(fjx2,tx);
962 fjy2 = _mm_add_pd(fjy2,ty);
963 fjz2 = _mm_add_pd(fjz2,tz);
965 /**************************
966 * CALCULATE INTERACTIONS *
967 **************************/
969 r10 = _mm_mul_pd(rsq10,rinv10);
971 /* Calculate table index by multiplying r with table scale and truncate to integer */
972 rt = _mm_mul_pd(r10,vftabscale);
973 vfitab = _mm_cvttpd_epi32(rt);
974 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
975 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
977 /* CUBIC SPLINE TABLE ELECTROSTATICS */
978 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
979 F = _mm_setzero_pd();
980 GMX_MM_TRANSPOSE2_PD(Y,F);
981 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
982 H = _mm_setzero_pd();
983 GMX_MM_TRANSPOSE2_PD(G,H);
984 Heps = _mm_mul_pd(vfeps,H);
985 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
986 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
987 velec = _mm_mul_pd(qq10,VV);
988 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
989 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
991 /* Update potential sum for this i atom from the interaction with this j atom. */
992 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
993 velecsum = _mm_add_pd(velecsum,velec);
997 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
999 /* Calculate temporary vectorial force */
1000 tx = _mm_mul_pd(fscal,dx10);
1001 ty = _mm_mul_pd(fscal,dy10);
1002 tz = _mm_mul_pd(fscal,dz10);
1004 /* Update vectorial force */
1005 fix1 = _mm_add_pd(fix1,tx);
1006 fiy1 = _mm_add_pd(fiy1,ty);
1007 fiz1 = _mm_add_pd(fiz1,tz);
1009 fjx0 = _mm_add_pd(fjx0,tx);
1010 fjy0 = _mm_add_pd(fjy0,ty);
1011 fjz0 = _mm_add_pd(fjz0,tz);
1013 /**************************
1014 * CALCULATE INTERACTIONS *
1015 **************************/
1017 r11 = _mm_mul_pd(rsq11,rinv11);
1019 /* Calculate table index by multiplying r with table scale and truncate to integer */
1020 rt = _mm_mul_pd(r11,vftabscale);
1021 vfitab = _mm_cvttpd_epi32(rt);
1022 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1023 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1025 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1026 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1027 F = _mm_setzero_pd();
1028 GMX_MM_TRANSPOSE2_PD(Y,F);
1029 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1030 H = _mm_setzero_pd();
1031 GMX_MM_TRANSPOSE2_PD(G,H);
1032 Heps = _mm_mul_pd(vfeps,H);
1033 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1034 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1035 velec = _mm_mul_pd(qq11,VV);
1036 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1037 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1039 /* Update potential sum for this i atom from the interaction with this j atom. */
1040 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1041 velecsum = _mm_add_pd(velecsum,velec);
1045 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1047 /* Calculate temporary vectorial force */
1048 tx = _mm_mul_pd(fscal,dx11);
1049 ty = _mm_mul_pd(fscal,dy11);
1050 tz = _mm_mul_pd(fscal,dz11);
1052 /* Update vectorial force */
1053 fix1 = _mm_add_pd(fix1,tx);
1054 fiy1 = _mm_add_pd(fiy1,ty);
1055 fiz1 = _mm_add_pd(fiz1,tz);
1057 fjx1 = _mm_add_pd(fjx1,tx);
1058 fjy1 = _mm_add_pd(fjy1,ty);
1059 fjz1 = _mm_add_pd(fjz1,tz);
1061 /**************************
1062 * CALCULATE INTERACTIONS *
1063 **************************/
1065 r12 = _mm_mul_pd(rsq12,rinv12);
1067 /* Calculate table index by multiplying r with table scale and truncate to integer */
1068 rt = _mm_mul_pd(r12,vftabscale);
1069 vfitab = _mm_cvttpd_epi32(rt);
1070 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1071 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1073 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1074 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1075 F = _mm_setzero_pd();
1076 GMX_MM_TRANSPOSE2_PD(Y,F);
1077 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1078 H = _mm_setzero_pd();
1079 GMX_MM_TRANSPOSE2_PD(G,H);
1080 Heps = _mm_mul_pd(vfeps,H);
1081 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1082 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1083 velec = _mm_mul_pd(qq12,VV);
1084 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1085 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1087 /* Update potential sum for this i atom from the interaction with this j atom. */
1088 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1089 velecsum = _mm_add_pd(velecsum,velec);
1093 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1095 /* Calculate temporary vectorial force */
1096 tx = _mm_mul_pd(fscal,dx12);
1097 ty = _mm_mul_pd(fscal,dy12);
1098 tz = _mm_mul_pd(fscal,dz12);
1100 /* Update vectorial force */
1101 fix1 = _mm_add_pd(fix1,tx);
1102 fiy1 = _mm_add_pd(fiy1,ty);
1103 fiz1 = _mm_add_pd(fiz1,tz);
1105 fjx2 = _mm_add_pd(fjx2,tx);
1106 fjy2 = _mm_add_pd(fjy2,ty);
1107 fjz2 = _mm_add_pd(fjz2,tz);
1109 /**************************
1110 * CALCULATE INTERACTIONS *
1111 **************************/
1113 r20 = _mm_mul_pd(rsq20,rinv20);
1115 /* Calculate table index by multiplying r with table scale and truncate to integer */
1116 rt = _mm_mul_pd(r20,vftabscale);
1117 vfitab = _mm_cvttpd_epi32(rt);
1118 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1119 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1121 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1122 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1123 F = _mm_setzero_pd();
1124 GMX_MM_TRANSPOSE2_PD(Y,F);
1125 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1126 H = _mm_setzero_pd();
1127 GMX_MM_TRANSPOSE2_PD(G,H);
1128 Heps = _mm_mul_pd(vfeps,H);
1129 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1130 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1131 velec = _mm_mul_pd(qq20,VV);
1132 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1133 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1135 /* Update potential sum for this i atom from the interaction with this j atom. */
1136 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1137 velecsum = _mm_add_pd(velecsum,velec);
1141 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1143 /* Calculate temporary vectorial force */
1144 tx = _mm_mul_pd(fscal,dx20);
1145 ty = _mm_mul_pd(fscal,dy20);
1146 tz = _mm_mul_pd(fscal,dz20);
1148 /* Update vectorial force */
1149 fix2 = _mm_add_pd(fix2,tx);
1150 fiy2 = _mm_add_pd(fiy2,ty);
1151 fiz2 = _mm_add_pd(fiz2,tz);
1153 fjx0 = _mm_add_pd(fjx0,tx);
1154 fjy0 = _mm_add_pd(fjy0,ty);
1155 fjz0 = _mm_add_pd(fjz0,tz);
1157 /**************************
1158 * CALCULATE INTERACTIONS *
1159 **************************/
1161 r21 = _mm_mul_pd(rsq21,rinv21);
1163 /* Calculate table index by multiplying r with table scale and truncate to integer */
1164 rt = _mm_mul_pd(r21,vftabscale);
1165 vfitab = _mm_cvttpd_epi32(rt);
1166 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1167 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1169 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1170 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1171 F = _mm_setzero_pd();
1172 GMX_MM_TRANSPOSE2_PD(Y,F);
1173 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1174 H = _mm_setzero_pd();
1175 GMX_MM_TRANSPOSE2_PD(G,H);
1176 Heps = _mm_mul_pd(vfeps,H);
1177 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1178 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1179 velec = _mm_mul_pd(qq21,VV);
1180 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1181 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1183 /* Update potential sum for this i atom from the interaction with this j atom. */
1184 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1185 velecsum = _mm_add_pd(velecsum,velec);
1189 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1191 /* Calculate temporary vectorial force */
1192 tx = _mm_mul_pd(fscal,dx21);
1193 ty = _mm_mul_pd(fscal,dy21);
1194 tz = _mm_mul_pd(fscal,dz21);
1196 /* Update vectorial force */
1197 fix2 = _mm_add_pd(fix2,tx);
1198 fiy2 = _mm_add_pd(fiy2,ty);
1199 fiz2 = _mm_add_pd(fiz2,tz);
1201 fjx1 = _mm_add_pd(fjx1,tx);
1202 fjy1 = _mm_add_pd(fjy1,ty);
1203 fjz1 = _mm_add_pd(fjz1,tz);
1205 /**************************
1206 * CALCULATE INTERACTIONS *
1207 **************************/
1209 r22 = _mm_mul_pd(rsq22,rinv22);
1211 /* Calculate table index by multiplying r with table scale and truncate to integer */
1212 rt = _mm_mul_pd(r22,vftabscale);
1213 vfitab = _mm_cvttpd_epi32(rt);
1214 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1215 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1217 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1218 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1219 F = _mm_setzero_pd();
1220 GMX_MM_TRANSPOSE2_PD(Y,F);
1221 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1222 H = _mm_setzero_pd();
1223 GMX_MM_TRANSPOSE2_PD(G,H);
1224 Heps = _mm_mul_pd(vfeps,H);
1225 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1226 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1227 velec = _mm_mul_pd(qq22,VV);
1228 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1229 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1231 /* Update potential sum for this i atom from the interaction with this j atom. */
1232 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1233 velecsum = _mm_add_pd(velecsum,velec);
1237 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1239 /* Calculate temporary vectorial force */
1240 tx = _mm_mul_pd(fscal,dx22);
1241 ty = _mm_mul_pd(fscal,dy22);
1242 tz = _mm_mul_pd(fscal,dz22);
1244 /* Update vectorial force */
1245 fix2 = _mm_add_pd(fix2,tx);
1246 fiy2 = _mm_add_pd(fiy2,ty);
1247 fiz2 = _mm_add_pd(fiz2,tz);
1249 fjx2 = _mm_add_pd(fjx2,tx);
1250 fjy2 = _mm_add_pd(fjy2,ty);
1251 fjz2 = _mm_add_pd(fjz2,tz);
1253 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1255 /* Inner loop uses 417 flops */
1258 /* End of innermost loop */
1260 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1261 f+i_coord_offset,fshift+i_shift_offset);
1264 /* Update potential energies */
1265 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1266 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1268 /* Increment number of inner iterations */
1269 inneriter += j_index_end - j_index_start;
1271 /* Outer loop uses 20 flops */
1274 /* Increment number of outer iterations */
1277 /* Update outer/inner flops */
1279 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*417);
1282 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse4_1_double
1283 * Electrostatics interaction: CubicSplineTable
1284 * VdW interaction: CubicSplineTable
1285 * Geometry: Water3-Water3
1286 * Calculate force/pot: Force
1289 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse4_1_double
1290 (t_nblist * gmx_restrict nlist,
1291 rvec * gmx_restrict xx,
1292 rvec * gmx_restrict ff,
1293 t_forcerec * gmx_restrict fr,
1294 t_mdatoms * gmx_restrict mdatoms,
1295 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1296 t_nrnb * gmx_restrict nrnb)
1298 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1299 * just 0 for non-waters.
1300 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1301 * jnr indices corresponding to data put in the four positions in the SIMD register.
1303 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1304 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1306 int j_coord_offsetA,j_coord_offsetB;
1307 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1308 real rcutoff_scalar;
1309 real *shiftvec,*fshift,*x,*f;
1310 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1312 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1314 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1316 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1317 int vdwjidx0A,vdwjidx0B;
1318 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1319 int vdwjidx1A,vdwjidx1B;
1320 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1321 int vdwjidx2A,vdwjidx2B;
1322 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1323 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1324 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1325 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1326 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1327 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1328 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1329 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1330 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1331 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1332 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1335 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1338 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1339 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1341 __m128i ifour = _mm_set1_epi32(4);
1342 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1344 __m128d dummy_mask,cutoff_mask;
1345 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1346 __m128d one = _mm_set1_pd(1.0);
1347 __m128d two = _mm_set1_pd(2.0);
1353 jindex = nlist->jindex;
1355 shiftidx = nlist->shift;
1357 shiftvec = fr->shift_vec[0];
1358 fshift = fr->fshift[0];
1359 facel = _mm_set1_pd(fr->epsfac);
1360 charge = mdatoms->chargeA;
1361 nvdwtype = fr->ntype;
1362 vdwparam = fr->nbfp;
1363 vdwtype = mdatoms->typeA;
1365 vftab = kernel_data->table_elec_vdw->data;
1366 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1368 /* Setup water-specific parameters */
1369 inr = nlist->iinr[0];
1370 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1371 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1372 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1373 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1375 jq0 = _mm_set1_pd(charge[inr+0]);
1376 jq1 = _mm_set1_pd(charge[inr+1]);
1377 jq2 = _mm_set1_pd(charge[inr+2]);
1378 vdwjidx0A = 2*vdwtype[inr+0];
1379 qq00 = _mm_mul_pd(iq0,jq0);
1380 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1381 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1382 qq01 = _mm_mul_pd(iq0,jq1);
1383 qq02 = _mm_mul_pd(iq0,jq2);
1384 qq10 = _mm_mul_pd(iq1,jq0);
1385 qq11 = _mm_mul_pd(iq1,jq1);
1386 qq12 = _mm_mul_pd(iq1,jq2);
1387 qq20 = _mm_mul_pd(iq2,jq0);
1388 qq21 = _mm_mul_pd(iq2,jq1);
1389 qq22 = _mm_mul_pd(iq2,jq2);
1391 /* Avoid stupid compiler warnings */
1393 j_coord_offsetA = 0;
1394 j_coord_offsetB = 0;
1399 /* Start outer loop over neighborlists */
1400 for(iidx=0; iidx<nri; iidx++)
1402 /* Load shift vector for this list */
1403 i_shift_offset = DIM*shiftidx[iidx];
1405 /* Load limits for loop over neighbors */
1406 j_index_start = jindex[iidx];
1407 j_index_end = jindex[iidx+1];
1409 /* Get outer coordinate index */
1411 i_coord_offset = DIM*inr;
1413 /* Load i particle coords and add shift vector */
1414 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1415 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1417 fix0 = _mm_setzero_pd();
1418 fiy0 = _mm_setzero_pd();
1419 fiz0 = _mm_setzero_pd();
1420 fix1 = _mm_setzero_pd();
1421 fiy1 = _mm_setzero_pd();
1422 fiz1 = _mm_setzero_pd();
1423 fix2 = _mm_setzero_pd();
1424 fiy2 = _mm_setzero_pd();
1425 fiz2 = _mm_setzero_pd();
1427 /* Start inner kernel loop */
1428 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1431 /* Get j neighbor index, and coordinate index */
1433 jnrB = jjnr[jidx+1];
1434 j_coord_offsetA = DIM*jnrA;
1435 j_coord_offsetB = DIM*jnrB;
1437 /* load j atom coordinates */
1438 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1439 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1441 /* Calculate displacement vector */
1442 dx00 = _mm_sub_pd(ix0,jx0);
1443 dy00 = _mm_sub_pd(iy0,jy0);
1444 dz00 = _mm_sub_pd(iz0,jz0);
1445 dx01 = _mm_sub_pd(ix0,jx1);
1446 dy01 = _mm_sub_pd(iy0,jy1);
1447 dz01 = _mm_sub_pd(iz0,jz1);
1448 dx02 = _mm_sub_pd(ix0,jx2);
1449 dy02 = _mm_sub_pd(iy0,jy2);
1450 dz02 = _mm_sub_pd(iz0,jz2);
1451 dx10 = _mm_sub_pd(ix1,jx0);
1452 dy10 = _mm_sub_pd(iy1,jy0);
1453 dz10 = _mm_sub_pd(iz1,jz0);
1454 dx11 = _mm_sub_pd(ix1,jx1);
1455 dy11 = _mm_sub_pd(iy1,jy1);
1456 dz11 = _mm_sub_pd(iz1,jz1);
1457 dx12 = _mm_sub_pd(ix1,jx2);
1458 dy12 = _mm_sub_pd(iy1,jy2);
1459 dz12 = _mm_sub_pd(iz1,jz2);
1460 dx20 = _mm_sub_pd(ix2,jx0);
1461 dy20 = _mm_sub_pd(iy2,jy0);
1462 dz20 = _mm_sub_pd(iz2,jz0);
1463 dx21 = _mm_sub_pd(ix2,jx1);
1464 dy21 = _mm_sub_pd(iy2,jy1);
1465 dz21 = _mm_sub_pd(iz2,jz1);
1466 dx22 = _mm_sub_pd(ix2,jx2);
1467 dy22 = _mm_sub_pd(iy2,jy2);
1468 dz22 = _mm_sub_pd(iz2,jz2);
1470 /* Calculate squared distance and things based on it */
1471 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1472 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1473 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1474 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1475 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1476 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1477 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1478 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1479 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1481 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1482 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1483 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1484 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1485 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1486 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1487 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1488 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1489 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1491 fjx0 = _mm_setzero_pd();
1492 fjy0 = _mm_setzero_pd();
1493 fjz0 = _mm_setzero_pd();
1494 fjx1 = _mm_setzero_pd();
1495 fjy1 = _mm_setzero_pd();
1496 fjz1 = _mm_setzero_pd();
1497 fjx2 = _mm_setzero_pd();
1498 fjy2 = _mm_setzero_pd();
1499 fjz2 = _mm_setzero_pd();
1501 /**************************
1502 * CALCULATE INTERACTIONS *
1503 **************************/
1505 r00 = _mm_mul_pd(rsq00,rinv00);
1507 /* Calculate table index by multiplying r with table scale and truncate to integer */
1508 rt = _mm_mul_pd(r00,vftabscale);
1509 vfitab = _mm_cvttpd_epi32(rt);
1510 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1511 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1513 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1514 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1515 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1516 GMX_MM_TRANSPOSE2_PD(Y,F);
1517 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1518 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1519 GMX_MM_TRANSPOSE2_PD(G,H);
1520 Heps = _mm_mul_pd(vfeps,H);
1521 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1522 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1523 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1525 /* CUBIC SPLINE TABLE DISPERSION */
1526 vfitab = _mm_add_epi32(vfitab,ifour);
1527 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1528 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1529 GMX_MM_TRANSPOSE2_PD(Y,F);
1530 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1531 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1532 GMX_MM_TRANSPOSE2_PD(G,H);
1533 Heps = _mm_mul_pd(vfeps,H);
1534 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1535 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1536 fvdw6 = _mm_mul_pd(c6_00,FF);
1538 /* CUBIC SPLINE TABLE REPULSION */
1539 vfitab = _mm_add_epi32(vfitab,ifour);
1540 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1541 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1542 GMX_MM_TRANSPOSE2_PD(Y,F);
1543 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1544 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1545 GMX_MM_TRANSPOSE2_PD(G,H);
1546 Heps = _mm_mul_pd(vfeps,H);
1547 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1548 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1549 fvdw12 = _mm_mul_pd(c12_00,FF);
1550 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1552 fscal = _mm_add_pd(felec,fvdw);
1554 /* Calculate temporary vectorial force */
1555 tx = _mm_mul_pd(fscal,dx00);
1556 ty = _mm_mul_pd(fscal,dy00);
1557 tz = _mm_mul_pd(fscal,dz00);
1559 /* Update vectorial force */
1560 fix0 = _mm_add_pd(fix0,tx);
1561 fiy0 = _mm_add_pd(fiy0,ty);
1562 fiz0 = _mm_add_pd(fiz0,tz);
1564 fjx0 = _mm_add_pd(fjx0,tx);
1565 fjy0 = _mm_add_pd(fjy0,ty);
1566 fjz0 = _mm_add_pd(fjz0,tz);
1568 /**************************
1569 * CALCULATE INTERACTIONS *
1570 **************************/
1572 r01 = _mm_mul_pd(rsq01,rinv01);
1574 /* Calculate table index by multiplying r with table scale and truncate to integer */
1575 rt = _mm_mul_pd(r01,vftabscale);
1576 vfitab = _mm_cvttpd_epi32(rt);
1577 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1578 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1580 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1581 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1582 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1583 GMX_MM_TRANSPOSE2_PD(Y,F);
1584 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1585 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1586 GMX_MM_TRANSPOSE2_PD(G,H);
1587 Heps = _mm_mul_pd(vfeps,H);
1588 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1589 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1590 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1594 /* Calculate temporary vectorial force */
1595 tx = _mm_mul_pd(fscal,dx01);
1596 ty = _mm_mul_pd(fscal,dy01);
1597 tz = _mm_mul_pd(fscal,dz01);
1599 /* Update vectorial force */
1600 fix0 = _mm_add_pd(fix0,tx);
1601 fiy0 = _mm_add_pd(fiy0,ty);
1602 fiz0 = _mm_add_pd(fiz0,tz);
1604 fjx1 = _mm_add_pd(fjx1,tx);
1605 fjy1 = _mm_add_pd(fjy1,ty);
1606 fjz1 = _mm_add_pd(fjz1,tz);
1608 /**************************
1609 * CALCULATE INTERACTIONS *
1610 **************************/
1612 r02 = _mm_mul_pd(rsq02,rinv02);
1614 /* Calculate table index by multiplying r with table scale and truncate to integer */
1615 rt = _mm_mul_pd(r02,vftabscale);
1616 vfitab = _mm_cvttpd_epi32(rt);
1617 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1618 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1620 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1621 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1622 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1623 GMX_MM_TRANSPOSE2_PD(Y,F);
1624 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1625 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1626 GMX_MM_TRANSPOSE2_PD(G,H);
1627 Heps = _mm_mul_pd(vfeps,H);
1628 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1629 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1630 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1634 /* Calculate temporary vectorial force */
1635 tx = _mm_mul_pd(fscal,dx02);
1636 ty = _mm_mul_pd(fscal,dy02);
1637 tz = _mm_mul_pd(fscal,dz02);
1639 /* Update vectorial force */
1640 fix0 = _mm_add_pd(fix0,tx);
1641 fiy0 = _mm_add_pd(fiy0,ty);
1642 fiz0 = _mm_add_pd(fiz0,tz);
1644 fjx2 = _mm_add_pd(fjx2,tx);
1645 fjy2 = _mm_add_pd(fjy2,ty);
1646 fjz2 = _mm_add_pd(fjz2,tz);
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 r10 = _mm_mul_pd(rsq10,rinv10);
1654 /* Calculate table index by multiplying r with table scale and truncate to integer */
1655 rt = _mm_mul_pd(r10,vftabscale);
1656 vfitab = _mm_cvttpd_epi32(rt);
1657 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1658 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1660 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1661 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1662 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1663 GMX_MM_TRANSPOSE2_PD(Y,F);
1664 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1665 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1666 GMX_MM_TRANSPOSE2_PD(G,H);
1667 Heps = _mm_mul_pd(vfeps,H);
1668 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1669 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1670 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1674 /* Calculate temporary vectorial force */
1675 tx = _mm_mul_pd(fscal,dx10);
1676 ty = _mm_mul_pd(fscal,dy10);
1677 tz = _mm_mul_pd(fscal,dz10);
1679 /* Update vectorial force */
1680 fix1 = _mm_add_pd(fix1,tx);
1681 fiy1 = _mm_add_pd(fiy1,ty);
1682 fiz1 = _mm_add_pd(fiz1,tz);
1684 fjx0 = _mm_add_pd(fjx0,tx);
1685 fjy0 = _mm_add_pd(fjy0,ty);
1686 fjz0 = _mm_add_pd(fjz0,tz);
1688 /**************************
1689 * CALCULATE INTERACTIONS *
1690 **************************/
1692 r11 = _mm_mul_pd(rsq11,rinv11);
1694 /* Calculate table index by multiplying r with table scale and truncate to integer */
1695 rt = _mm_mul_pd(r11,vftabscale);
1696 vfitab = _mm_cvttpd_epi32(rt);
1697 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1698 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1700 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1701 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1702 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1703 GMX_MM_TRANSPOSE2_PD(Y,F);
1704 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1705 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1706 GMX_MM_TRANSPOSE2_PD(G,H);
1707 Heps = _mm_mul_pd(vfeps,H);
1708 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1709 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1710 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1714 /* Calculate temporary vectorial force */
1715 tx = _mm_mul_pd(fscal,dx11);
1716 ty = _mm_mul_pd(fscal,dy11);
1717 tz = _mm_mul_pd(fscal,dz11);
1719 /* Update vectorial force */
1720 fix1 = _mm_add_pd(fix1,tx);
1721 fiy1 = _mm_add_pd(fiy1,ty);
1722 fiz1 = _mm_add_pd(fiz1,tz);
1724 fjx1 = _mm_add_pd(fjx1,tx);
1725 fjy1 = _mm_add_pd(fjy1,ty);
1726 fjz1 = _mm_add_pd(fjz1,tz);
1728 /**************************
1729 * CALCULATE INTERACTIONS *
1730 **************************/
1732 r12 = _mm_mul_pd(rsq12,rinv12);
1734 /* Calculate table index by multiplying r with table scale and truncate to integer */
1735 rt = _mm_mul_pd(r12,vftabscale);
1736 vfitab = _mm_cvttpd_epi32(rt);
1737 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1738 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1740 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1741 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1742 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1743 GMX_MM_TRANSPOSE2_PD(Y,F);
1744 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1745 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1746 GMX_MM_TRANSPOSE2_PD(G,H);
1747 Heps = _mm_mul_pd(vfeps,H);
1748 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1749 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1750 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1754 /* Calculate temporary vectorial force */
1755 tx = _mm_mul_pd(fscal,dx12);
1756 ty = _mm_mul_pd(fscal,dy12);
1757 tz = _mm_mul_pd(fscal,dz12);
1759 /* Update vectorial force */
1760 fix1 = _mm_add_pd(fix1,tx);
1761 fiy1 = _mm_add_pd(fiy1,ty);
1762 fiz1 = _mm_add_pd(fiz1,tz);
1764 fjx2 = _mm_add_pd(fjx2,tx);
1765 fjy2 = _mm_add_pd(fjy2,ty);
1766 fjz2 = _mm_add_pd(fjz2,tz);
1768 /**************************
1769 * CALCULATE INTERACTIONS *
1770 **************************/
1772 r20 = _mm_mul_pd(rsq20,rinv20);
1774 /* Calculate table index by multiplying r with table scale and truncate to integer */
1775 rt = _mm_mul_pd(r20,vftabscale);
1776 vfitab = _mm_cvttpd_epi32(rt);
1777 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1778 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1780 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1781 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1782 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1783 GMX_MM_TRANSPOSE2_PD(Y,F);
1784 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1785 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1786 GMX_MM_TRANSPOSE2_PD(G,H);
1787 Heps = _mm_mul_pd(vfeps,H);
1788 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1789 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1790 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1794 /* Calculate temporary vectorial force */
1795 tx = _mm_mul_pd(fscal,dx20);
1796 ty = _mm_mul_pd(fscal,dy20);
1797 tz = _mm_mul_pd(fscal,dz20);
1799 /* Update vectorial force */
1800 fix2 = _mm_add_pd(fix2,tx);
1801 fiy2 = _mm_add_pd(fiy2,ty);
1802 fiz2 = _mm_add_pd(fiz2,tz);
1804 fjx0 = _mm_add_pd(fjx0,tx);
1805 fjy0 = _mm_add_pd(fjy0,ty);
1806 fjz0 = _mm_add_pd(fjz0,tz);
1808 /**************************
1809 * CALCULATE INTERACTIONS *
1810 **************************/
1812 r21 = _mm_mul_pd(rsq21,rinv21);
1814 /* Calculate table index by multiplying r with table scale and truncate to integer */
1815 rt = _mm_mul_pd(r21,vftabscale);
1816 vfitab = _mm_cvttpd_epi32(rt);
1817 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1818 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1820 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1821 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1822 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1823 GMX_MM_TRANSPOSE2_PD(Y,F);
1824 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1825 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1826 GMX_MM_TRANSPOSE2_PD(G,H);
1827 Heps = _mm_mul_pd(vfeps,H);
1828 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1829 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1830 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1834 /* Calculate temporary vectorial force */
1835 tx = _mm_mul_pd(fscal,dx21);
1836 ty = _mm_mul_pd(fscal,dy21);
1837 tz = _mm_mul_pd(fscal,dz21);
1839 /* Update vectorial force */
1840 fix2 = _mm_add_pd(fix2,tx);
1841 fiy2 = _mm_add_pd(fiy2,ty);
1842 fiz2 = _mm_add_pd(fiz2,tz);
1844 fjx1 = _mm_add_pd(fjx1,tx);
1845 fjy1 = _mm_add_pd(fjy1,ty);
1846 fjz1 = _mm_add_pd(fjz1,tz);
1848 /**************************
1849 * CALCULATE INTERACTIONS *
1850 **************************/
1852 r22 = _mm_mul_pd(rsq22,rinv22);
1854 /* Calculate table index by multiplying r with table scale and truncate to integer */
1855 rt = _mm_mul_pd(r22,vftabscale);
1856 vfitab = _mm_cvttpd_epi32(rt);
1857 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1858 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1860 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1861 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1862 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1863 GMX_MM_TRANSPOSE2_PD(Y,F);
1864 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1865 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1866 GMX_MM_TRANSPOSE2_PD(G,H);
1867 Heps = _mm_mul_pd(vfeps,H);
1868 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1869 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1870 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1874 /* Calculate temporary vectorial force */
1875 tx = _mm_mul_pd(fscal,dx22);
1876 ty = _mm_mul_pd(fscal,dy22);
1877 tz = _mm_mul_pd(fscal,dz22);
1879 /* Update vectorial force */
1880 fix2 = _mm_add_pd(fix2,tx);
1881 fiy2 = _mm_add_pd(fiy2,ty);
1882 fiz2 = _mm_add_pd(fiz2,tz);
1884 fjx2 = _mm_add_pd(fjx2,tx);
1885 fjy2 = _mm_add_pd(fjy2,ty);
1886 fjz2 = _mm_add_pd(fjz2,tz);
1888 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1890 /* Inner loop uses 373 flops */
1893 if(jidx<j_index_end)
1897 j_coord_offsetA = DIM*jnrA;
1899 /* load j atom coordinates */
1900 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1901 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1903 /* Calculate displacement vector */
1904 dx00 = _mm_sub_pd(ix0,jx0);
1905 dy00 = _mm_sub_pd(iy0,jy0);
1906 dz00 = _mm_sub_pd(iz0,jz0);
1907 dx01 = _mm_sub_pd(ix0,jx1);
1908 dy01 = _mm_sub_pd(iy0,jy1);
1909 dz01 = _mm_sub_pd(iz0,jz1);
1910 dx02 = _mm_sub_pd(ix0,jx2);
1911 dy02 = _mm_sub_pd(iy0,jy2);
1912 dz02 = _mm_sub_pd(iz0,jz2);
1913 dx10 = _mm_sub_pd(ix1,jx0);
1914 dy10 = _mm_sub_pd(iy1,jy0);
1915 dz10 = _mm_sub_pd(iz1,jz0);
1916 dx11 = _mm_sub_pd(ix1,jx1);
1917 dy11 = _mm_sub_pd(iy1,jy1);
1918 dz11 = _mm_sub_pd(iz1,jz1);
1919 dx12 = _mm_sub_pd(ix1,jx2);
1920 dy12 = _mm_sub_pd(iy1,jy2);
1921 dz12 = _mm_sub_pd(iz1,jz2);
1922 dx20 = _mm_sub_pd(ix2,jx0);
1923 dy20 = _mm_sub_pd(iy2,jy0);
1924 dz20 = _mm_sub_pd(iz2,jz0);
1925 dx21 = _mm_sub_pd(ix2,jx1);
1926 dy21 = _mm_sub_pd(iy2,jy1);
1927 dz21 = _mm_sub_pd(iz2,jz1);
1928 dx22 = _mm_sub_pd(ix2,jx2);
1929 dy22 = _mm_sub_pd(iy2,jy2);
1930 dz22 = _mm_sub_pd(iz2,jz2);
1932 /* Calculate squared distance and things based on it */
1933 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1934 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1935 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1936 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1937 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1938 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1939 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1940 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1941 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1943 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1944 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1945 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1946 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1947 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1948 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1949 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1950 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1951 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1953 fjx0 = _mm_setzero_pd();
1954 fjy0 = _mm_setzero_pd();
1955 fjz0 = _mm_setzero_pd();
1956 fjx1 = _mm_setzero_pd();
1957 fjy1 = _mm_setzero_pd();
1958 fjz1 = _mm_setzero_pd();
1959 fjx2 = _mm_setzero_pd();
1960 fjy2 = _mm_setzero_pd();
1961 fjz2 = _mm_setzero_pd();
1963 /**************************
1964 * CALCULATE INTERACTIONS *
1965 **************************/
1967 r00 = _mm_mul_pd(rsq00,rinv00);
1969 /* Calculate table index by multiplying r with table scale and truncate to integer */
1970 rt = _mm_mul_pd(r00,vftabscale);
1971 vfitab = _mm_cvttpd_epi32(rt);
1972 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1973 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1975 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1976 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1977 F = _mm_setzero_pd();
1978 GMX_MM_TRANSPOSE2_PD(Y,F);
1979 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1980 H = _mm_setzero_pd();
1981 GMX_MM_TRANSPOSE2_PD(G,H);
1982 Heps = _mm_mul_pd(vfeps,H);
1983 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1984 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1985 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1987 /* CUBIC SPLINE TABLE DISPERSION */
1988 vfitab = _mm_add_epi32(vfitab,ifour);
1989 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1990 F = _mm_setzero_pd();
1991 GMX_MM_TRANSPOSE2_PD(Y,F);
1992 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1993 H = _mm_setzero_pd();
1994 GMX_MM_TRANSPOSE2_PD(G,H);
1995 Heps = _mm_mul_pd(vfeps,H);
1996 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1997 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1998 fvdw6 = _mm_mul_pd(c6_00,FF);
2000 /* CUBIC SPLINE TABLE REPULSION */
2001 vfitab = _mm_add_epi32(vfitab,ifour);
2002 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2003 F = _mm_setzero_pd();
2004 GMX_MM_TRANSPOSE2_PD(Y,F);
2005 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2006 H = _mm_setzero_pd();
2007 GMX_MM_TRANSPOSE2_PD(G,H);
2008 Heps = _mm_mul_pd(vfeps,H);
2009 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2010 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2011 fvdw12 = _mm_mul_pd(c12_00,FF);
2012 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
2014 fscal = _mm_add_pd(felec,fvdw);
2016 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2018 /* Calculate temporary vectorial force */
2019 tx = _mm_mul_pd(fscal,dx00);
2020 ty = _mm_mul_pd(fscal,dy00);
2021 tz = _mm_mul_pd(fscal,dz00);
2023 /* Update vectorial force */
2024 fix0 = _mm_add_pd(fix0,tx);
2025 fiy0 = _mm_add_pd(fiy0,ty);
2026 fiz0 = _mm_add_pd(fiz0,tz);
2028 fjx0 = _mm_add_pd(fjx0,tx);
2029 fjy0 = _mm_add_pd(fjy0,ty);
2030 fjz0 = _mm_add_pd(fjz0,tz);
2032 /**************************
2033 * CALCULATE INTERACTIONS *
2034 **************************/
2036 r01 = _mm_mul_pd(rsq01,rinv01);
2038 /* Calculate table index by multiplying r with table scale and truncate to integer */
2039 rt = _mm_mul_pd(r01,vftabscale);
2040 vfitab = _mm_cvttpd_epi32(rt);
2041 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2042 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2044 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2045 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2046 F = _mm_setzero_pd();
2047 GMX_MM_TRANSPOSE2_PD(Y,F);
2048 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2049 H = _mm_setzero_pd();
2050 GMX_MM_TRANSPOSE2_PD(G,H);
2051 Heps = _mm_mul_pd(vfeps,H);
2052 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2053 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2054 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
2058 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2060 /* Calculate temporary vectorial force */
2061 tx = _mm_mul_pd(fscal,dx01);
2062 ty = _mm_mul_pd(fscal,dy01);
2063 tz = _mm_mul_pd(fscal,dz01);
2065 /* Update vectorial force */
2066 fix0 = _mm_add_pd(fix0,tx);
2067 fiy0 = _mm_add_pd(fiy0,ty);
2068 fiz0 = _mm_add_pd(fiz0,tz);
2070 fjx1 = _mm_add_pd(fjx1,tx);
2071 fjy1 = _mm_add_pd(fjy1,ty);
2072 fjz1 = _mm_add_pd(fjz1,tz);
2074 /**************************
2075 * CALCULATE INTERACTIONS *
2076 **************************/
2078 r02 = _mm_mul_pd(rsq02,rinv02);
2080 /* Calculate table index by multiplying r with table scale and truncate to integer */
2081 rt = _mm_mul_pd(r02,vftabscale);
2082 vfitab = _mm_cvttpd_epi32(rt);
2083 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2084 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2086 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2087 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2088 F = _mm_setzero_pd();
2089 GMX_MM_TRANSPOSE2_PD(Y,F);
2090 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2091 H = _mm_setzero_pd();
2092 GMX_MM_TRANSPOSE2_PD(G,H);
2093 Heps = _mm_mul_pd(vfeps,H);
2094 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2095 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2096 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2100 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2102 /* Calculate temporary vectorial force */
2103 tx = _mm_mul_pd(fscal,dx02);
2104 ty = _mm_mul_pd(fscal,dy02);
2105 tz = _mm_mul_pd(fscal,dz02);
2107 /* Update vectorial force */
2108 fix0 = _mm_add_pd(fix0,tx);
2109 fiy0 = _mm_add_pd(fiy0,ty);
2110 fiz0 = _mm_add_pd(fiz0,tz);
2112 fjx2 = _mm_add_pd(fjx2,tx);
2113 fjy2 = _mm_add_pd(fjy2,ty);
2114 fjz2 = _mm_add_pd(fjz2,tz);
2116 /**************************
2117 * CALCULATE INTERACTIONS *
2118 **************************/
2120 r10 = _mm_mul_pd(rsq10,rinv10);
2122 /* Calculate table index by multiplying r with table scale and truncate to integer */
2123 rt = _mm_mul_pd(r10,vftabscale);
2124 vfitab = _mm_cvttpd_epi32(rt);
2125 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2126 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2128 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2129 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2130 F = _mm_setzero_pd();
2131 GMX_MM_TRANSPOSE2_PD(Y,F);
2132 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2133 H = _mm_setzero_pd();
2134 GMX_MM_TRANSPOSE2_PD(G,H);
2135 Heps = _mm_mul_pd(vfeps,H);
2136 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2137 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2138 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2142 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2144 /* Calculate temporary vectorial force */
2145 tx = _mm_mul_pd(fscal,dx10);
2146 ty = _mm_mul_pd(fscal,dy10);
2147 tz = _mm_mul_pd(fscal,dz10);
2149 /* Update vectorial force */
2150 fix1 = _mm_add_pd(fix1,tx);
2151 fiy1 = _mm_add_pd(fiy1,ty);
2152 fiz1 = _mm_add_pd(fiz1,tz);
2154 fjx0 = _mm_add_pd(fjx0,tx);
2155 fjy0 = _mm_add_pd(fjy0,ty);
2156 fjz0 = _mm_add_pd(fjz0,tz);
2158 /**************************
2159 * CALCULATE INTERACTIONS *
2160 **************************/
2162 r11 = _mm_mul_pd(rsq11,rinv11);
2164 /* Calculate table index by multiplying r with table scale and truncate to integer */
2165 rt = _mm_mul_pd(r11,vftabscale);
2166 vfitab = _mm_cvttpd_epi32(rt);
2167 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2168 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2170 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2171 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2172 F = _mm_setzero_pd();
2173 GMX_MM_TRANSPOSE2_PD(Y,F);
2174 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2175 H = _mm_setzero_pd();
2176 GMX_MM_TRANSPOSE2_PD(G,H);
2177 Heps = _mm_mul_pd(vfeps,H);
2178 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2179 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2180 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2184 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2186 /* Calculate temporary vectorial force */
2187 tx = _mm_mul_pd(fscal,dx11);
2188 ty = _mm_mul_pd(fscal,dy11);
2189 tz = _mm_mul_pd(fscal,dz11);
2191 /* Update vectorial force */
2192 fix1 = _mm_add_pd(fix1,tx);
2193 fiy1 = _mm_add_pd(fiy1,ty);
2194 fiz1 = _mm_add_pd(fiz1,tz);
2196 fjx1 = _mm_add_pd(fjx1,tx);
2197 fjy1 = _mm_add_pd(fjy1,ty);
2198 fjz1 = _mm_add_pd(fjz1,tz);
2200 /**************************
2201 * CALCULATE INTERACTIONS *
2202 **************************/
2204 r12 = _mm_mul_pd(rsq12,rinv12);
2206 /* Calculate table index by multiplying r with table scale and truncate to integer */
2207 rt = _mm_mul_pd(r12,vftabscale);
2208 vfitab = _mm_cvttpd_epi32(rt);
2209 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2210 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2212 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2213 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2214 F = _mm_setzero_pd();
2215 GMX_MM_TRANSPOSE2_PD(Y,F);
2216 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2217 H = _mm_setzero_pd();
2218 GMX_MM_TRANSPOSE2_PD(G,H);
2219 Heps = _mm_mul_pd(vfeps,H);
2220 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2221 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2222 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2226 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2228 /* Calculate temporary vectorial force */
2229 tx = _mm_mul_pd(fscal,dx12);
2230 ty = _mm_mul_pd(fscal,dy12);
2231 tz = _mm_mul_pd(fscal,dz12);
2233 /* Update vectorial force */
2234 fix1 = _mm_add_pd(fix1,tx);
2235 fiy1 = _mm_add_pd(fiy1,ty);
2236 fiz1 = _mm_add_pd(fiz1,tz);
2238 fjx2 = _mm_add_pd(fjx2,tx);
2239 fjy2 = _mm_add_pd(fjy2,ty);
2240 fjz2 = _mm_add_pd(fjz2,tz);
2242 /**************************
2243 * CALCULATE INTERACTIONS *
2244 **************************/
2246 r20 = _mm_mul_pd(rsq20,rinv20);
2248 /* Calculate table index by multiplying r with table scale and truncate to integer */
2249 rt = _mm_mul_pd(r20,vftabscale);
2250 vfitab = _mm_cvttpd_epi32(rt);
2251 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2252 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2254 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2255 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2256 F = _mm_setzero_pd();
2257 GMX_MM_TRANSPOSE2_PD(Y,F);
2258 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2259 H = _mm_setzero_pd();
2260 GMX_MM_TRANSPOSE2_PD(G,H);
2261 Heps = _mm_mul_pd(vfeps,H);
2262 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2263 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2264 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2268 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2270 /* Calculate temporary vectorial force */
2271 tx = _mm_mul_pd(fscal,dx20);
2272 ty = _mm_mul_pd(fscal,dy20);
2273 tz = _mm_mul_pd(fscal,dz20);
2275 /* Update vectorial force */
2276 fix2 = _mm_add_pd(fix2,tx);
2277 fiy2 = _mm_add_pd(fiy2,ty);
2278 fiz2 = _mm_add_pd(fiz2,tz);
2280 fjx0 = _mm_add_pd(fjx0,tx);
2281 fjy0 = _mm_add_pd(fjy0,ty);
2282 fjz0 = _mm_add_pd(fjz0,tz);
2284 /**************************
2285 * CALCULATE INTERACTIONS *
2286 **************************/
2288 r21 = _mm_mul_pd(rsq21,rinv21);
2290 /* Calculate table index by multiplying r with table scale and truncate to integer */
2291 rt = _mm_mul_pd(r21,vftabscale);
2292 vfitab = _mm_cvttpd_epi32(rt);
2293 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2294 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2296 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2297 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2298 F = _mm_setzero_pd();
2299 GMX_MM_TRANSPOSE2_PD(Y,F);
2300 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2301 H = _mm_setzero_pd();
2302 GMX_MM_TRANSPOSE2_PD(G,H);
2303 Heps = _mm_mul_pd(vfeps,H);
2304 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2305 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2306 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2310 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2312 /* Calculate temporary vectorial force */
2313 tx = _mm_mul_pd(fscal,dx21);
2314 ty = _mm_mul_pd(fscal,dy21);
2315 tz = _mm_mul_pd(fscal,dz21);
2317 /* Update vectorial force */
2318 fix2 = _mm_add_pd(fix2,tx);
2319 fiy2 = _mm_add_pd(fiy2,ty);
2320 fiz2 = _mm_add_pd(fiz2,tz);
2322 fjx1 = _mm_add_pd(fjx1,tx);
2323 fjy1 = _mm_add_pd(fjy1,ty);
2324 fjz1 = _mm_add_pd(fjz1,tz);
2326 /**************************
2327 * CALCULATE INTERACTIONS *
2328 **************************/
2330 r22 = _mm_mul_pd(rsq22,rinv22);
2332 /* Calculate table index by multiplying r with table scale and truncate to integer */
2333 rt = _mm_mul_pd(r22,vftabscale);
2334 vfitab = _mm_cvttpd_epi32(rt);
2335 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2336 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2338 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2339 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2340 F = _mm_setzero_pd();
2341 GMX_MM_TRANSPOSE2_PD(Y,F);
2342 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2343 H = _mm_setzero_pd();
2344 GMX_MM_TRANSPOSE2_PD(G,H);
2345 Heps = _mm_mul_pd(vfeps,H);
2346 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2347 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2348 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2352 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2354 /* Calculate temporary vectorial force */
2355 tx = _mm_mul_pd(fscal,dx22);
2356 ty = _mm_mul_pd(fscal,dy22);
2357 tz = _mm_mul_pd(fscal,dz22);
2359 /* Update vectorial force */
2360 fix2 = _mm_add_pd(fix2,tx);
2361 fiy2 = _mm_add_pd(fiy2,ty);
2362 fiz2 = _mm_add_pd(fiz2,tz);
2364 fjx2 = _mm_add_pd(fjx2,tx);
2365 fjy2 = _mm_add_pd(fjy2,ty);
2366 fjz2 = _mm_add_pd(fjz2,tz);
2368 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2370 /* Inner loop uses 373 flops */
2373 /* End of innermost loop */
2375 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2376 f+i_coord_offset,fshift+i_shift_offset);
2378 /* Increment number of inner iterations */
2379 inneriter += j_index_end - j_index_start;
2381 /* Outer loop uses 18 flops */
2384 /* Increment number of outer iterations */
2387 /* Update outer/inner flops */
2389 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*373);