2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse4_1_double
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse4_1_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the two positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87 int vdwjidx0A,vdwjidx0B;
88 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89 int vdwjidx1A,vdwjidx1B;
90 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91 int vdwjidx2A,vdwjidx2B;
92 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93 int vdwjidx3A,vdwjidx3B;
94 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
99 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
102 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
103 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
104 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
105 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
108 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
112 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
114 __m128i ifour = _mm_set1_epi32(4);
115 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
117 __m128d dummy_mask,cutoff_mask;
118 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
119 __m128d one = _mm_set1_pd(1.0);
120 __m128d two = _mm_set1_pd(2.0);
126 jindex = nlist->jindex;
128 shiftidx = nlist->shift;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm_set1_pd(fr->ic->epsfac);
133 charge = mdatoms->chargeA;
134 nvdwtype = fr->ntype;
136 vdwtype = mdatoms->typeA;
138 vftab = kernel_data->table_elec_vdw->data;
139 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
144 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
145 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
146 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
148 jq1 = _mm_set1_pd(charge[inr+1]);
149 jq2 = _mm_set1_pd(charge[inr+2]);
150 jq3 = _mm_set1_pd(charge[inr+3]);
151 vdwjidx0A = 2*vdwtype[inr+0];
152 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq11 = _mm_mul_pd(iq1,jq1);
155 qq12 = _mm_mul_pd(iq1,jq2);
156 qq13 = _mm_mul_pd(iq1,jq3);
157 qq21 = _mm_mul_pd(iq2,jq1);
158 qq22 = _mm_mul_pd(iq2,jq2);
159 qq23 = _mm_mul_pd(iq2,jq3);
160 qq31 = _mm_mul_pd(iq3,jq1);
161 qq32 = _mm_mul_pd(iq3,jq2);
162 qq33 = _mm_mul_pd(iq3,jq3);
164 /* Avoid stupid compiler warnings */
172 /* Start outer loop over neighborlists */
173 for(iidx=0; iidx<nri; iidx++)
175 /* Load shift vector for this list */
176 i_shift_offset = DIM*shiftidx[iidx];
178 /* Load limits for loop over neighbors */
179 j_index_start = jindex[iidx];
180 j_index_end = jindex[iidx+1];
182 /* Get outer coordinate index */
184 i_coord_offset = DIM*inr;
186 /* Load i particle coords and add shift vector */
187 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
188 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
190 fix0 = _mm_setzero_pd();
191 fiy0 = _mm_setzero_pd();
192 fiz0 = _mm_setzero_pd();
193 fix1 = _mm_setzero_pd();
194 fiy1 = _mm_setzero_pd();
195 fiz1 = _mm_setzero_pd();
196 fix2 = _mm_setzero_pd();
197 fiy2 = _mm_setzero_pd();
198 fiz2 = _mm_setzero_pd();
199 fix3 = _mm_setzero_pd();
200 fiy3 = _mm_setzero_pd();
201 fiz3 = _mm_setzero_pd();
203 /* Reset potential sums */
204 velecsum = _mm_setzero_pd();
205 vvdwsum = _mm_setzero_pd();
207 /* Start inner kernel loop */
208 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
211 /* Get j neighbor index, and coordinate index */
214 j_coord_offsetA = DIM*jnrA;
215 j_coord_offsetB = DIM*jnrB;
217 /* load j atom coordinates */
218 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
219 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
220 &jy2,&jz2,&jx3,&jy3,&jz3);
222 /* Calculate displacement vector */
223 dx00 = _mm_sub_pd(ix0,jx0);
224 dy00 = _mm_sub_pd(iy0,jy0);
225 dz00 = _mm_sub_pd(iz0,jz0);
226 dx11 = _mm_sub_pd(ix1,jx1);
227 dy11 = _mm_sub_pd(iy1,jy1);
228 dz11 = _mm_sub_pd(iz1,jz1);
229 dx12 = _mm_sub_pd(ix1,jx2);
230 dy12 = _mm_sub_pd(iy1,jy2);
231 dz12 = _mm_sub_pd(iz1,jz2);
232 dx13 = _mm_sub_pd(ix1,jx3);
233 dy13 = _mm_sub_pd(iy1,jy3);
234 dz13 = _mm_sub_pd(iz1,jz3);
235 dx21 = _mm_sub_pd(ix2,jx1);
236 dy21 = _mm_sub_pd(iy2,jy1);
237 dz21 = _mm_sub_pd(iz2,jz1);
238 dx22 = _mm_sub_pd(ix2,jx2);
239 dy22 = _mm_sub_pd(iy2,jy2);
240 dz22 = _mm_sub_pd(iz2,jz2);
241 dx23 = _mm_sub_pd(ix2,jx3);
242 dy23 = _mm_sub_pd(iy2,jy3);
243 dz23 = _mm_sub_pd(iz2,jz3);
244 dx31 = _mm_sub_pd(ix3,jx1);
245 dy31 = _mm_sub_pd(iy3,jy1);
246 dz31 = _mm_sub_pd(iz3,jz1);
247 dx32 = _mm_sub_pd(ix3,jx2);
248 dy32 = _mm_sub_pd(iy3,jy2);
249 dz32 = _mm_sub_pd(iz3,jz2);
250 dx33 = _mm_sub_pd(ix3,jx3);
251 dy33 = _mm_sub_pd(iy3,jy3);
252 dz33 = _mm_sub_pd(iz3,jz3);
254 /* Calculate squared distance and things based on it */
255 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
256 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
257 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
258 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
259 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
260 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
261 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
262 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
263 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
264 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
266 rinv00 = sse41_invsqrt_d(rsq00);
267 rinv11 = sse41_invsqrt_d(rsq11);
268 rinv12 = sse41_invsqrt_d(rsq12);
269 rinv13 = sse41_invsqrt_d(rsq13);
270 rinv21 = sse41_invsqrt_d(rsq21);
271 rinv22 = sse41_invsqrt_d(rsq22);
272 rinv23 = sse41_invsqrt_d(rsq23);
273 rinv31 = sse41_invsqrt_d(rsq31);
274 rinv32 = sse41_invsqrt_d(rsq32);
275 rinv33 = sse41_invsqrt_d(rsq33);
277 fjx0 = _mm_setzero_pd();
278 fjy0 = _mm_setzero_pd();
279 fjz0 = _mm_setzero_pd();
280 fjx1 = _mm_setzero_pd();
281 fjy1 = _mm_setzero_pd();
282 fjz1 = _mm_setzero_pd();
283 fjx2 = _mm_setzero_pd();
284 fjy2 = _mm_setzero_pd();
285 fjz2 = _mm_setzero_pd();
286 fjx3 = _mm_setzero_pd();
287 fjy3 = _mm_setzero_pd();
288 fjz3 = _mm_setzero_pd();
290 /**************************
291 * CALCULATE INTERACTIONS *
292 **************************/
294 r00 = _mm_mul_pd(rsq00,rinv00);
296 /* Calculate table index by multiplying r with table scale and truncate to integer */
297 rt = _mm_mul_pd(r00,vftabscale);
298 vfitab = _mm_cvttpd_epi32(rt);
299 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
300 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
302 /* CUBIC SPLINE TABLE DISPERSION */
303 vfitab = _mm_add_epi32(vfitab,ifour);
304 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
305 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
306 GMX_MM_TRANSPOSE2_PD(Y,F);
307 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
308 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
309 GMX_MM_TRANSPOSE2_PD(G,H);
310 Heps = _mm_mul_pd(vfeps,H);
311 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
312 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
313 vvdw6 = _mm_mul_pd(c6_00,VV);
314 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
315 fvdw6 = _mm_mul_pd(c6_00,FF);
317 /* CUBIC SPLINE TABLE REPULSION */
318 vfitab = _mm_add_epi32(vfitab,ifour);
319 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
320 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
321 GMX_MM_TRANSPOSE2_PD(Y,F);
322 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
323 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
324 GMX_MM_TRANSPOSE2_PD(G,H);
325 Heps = _mm_mul_pd(vfeps,H);
326 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
327 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
328 vvdw12 = _mm_mul_pd(c12_00,VV);
329 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
330 fvdw12 = _mm_mul_pd(c12_00,FF);
331 vvdw = _mm_add_pd(vvdw12,vvdw6);
332 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
334 /* Update potential sum for this i atom from the interaction with this j atom. */
335 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
339 /* Calculate temporary vectorial force */
340 tx = _mm_mul_pd(fscal,dx00);
341 ty = _mm_mul_pd(fscal,dy00);
342 tz = _mm_mul_pd(fscal,dz00);
344 /* Update vectorial force */
345 fix0 = _mm_add_pd(fix0,tx);
346 fiy0 = _mm_add_pd(fiy0,ty);
347 fiz0 = _mm_add_pd(fiz0,tz);
349 fjx0 = _mm_add_pd(fjx0,tx);
350 fjy0 = _mm_add_pd(fjy0,ty);
351 fjz0 = _mm_add_pd(fjz0,tz);
353 /**************************
354 * CALCULATE INTERACTIONS *
355 **************************/
357 r11 = _mm_mul_pd(rsq11,rinv11);
359 /* Calculate table index by multiplying r with table scale and truncate to integer */
360 rt = _mm_mul_pd(r11,vftabscale);
361 vfitab = _mm_cvttpd_epi32(rt);
362 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
363 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
365 /* CUBIC SPLINE TABLE ELECTROSTATICS */
366 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
367 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
368 GMX_MM_TRANSPOSE2_PD(Y,F);
369 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
370 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
371 GMX_MM_TRANSPOSE2_PD(G,H);
372 Heps = _mm_mul_pd(vfeps,H);
373 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
374 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
375 velec = _mm_mul_pd(qq11,VV);
376 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
377 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
379 /* Update potential sum for this i atom from the interaction with this j atom. */
380 velecsum = _mm_add_pd(velecsum,velec);
384 /* Calculate temporary vectorial force */
385 tx = _mm_mul_pd(fscal,dx11);
386 ty = _mm_mul_pd(fscal,dy11);
387 tz = _mm_mul_pd(fscal,dz11);
389 /* Update vectorial force */
390 fix1 = _mm_add_pd(fix1,tx);
391 fiy1 = _mm_add_pd(fiy1,ty);
392 fiz1 = _mm_add_pd(fiz1,tz);
394 fjx1 = _mm_add_pd(fjx1,tx);
395 fjy1 = _mm_add_pd(fjy1,ty);
396 fjz1 = _mm_add_pd(fjz1,tz);
398 /**************************
399 * CALCULATE INTERACTIONS *
400 **************************/
402 r12 = _mm_mul_pd(rsq12,rinv12);
404 /* Calculate table index by multiplying r with table scale and truncate to integer */
405 rt = _mm_mul_pd(r12,vftabscale);
406 vfitab = _mm_cvttpd_epi32(rt);
407 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
408 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
410 /* CUBIC SPLINE TABLE ELECTROSTATICS */
411 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
412 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
413 GMX_MM_TRANSPOSE2_PD(Y,F);
414 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
415 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
416 GMX_MM_TRANSPOSE2_PD(G,H);
417 Heps = _mm_mul_pd(vfeps,H);
418 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
419 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
420 velec = _mm_mul_pd(qq12,VV);
421 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
422 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
424 /* Update potential sum for this i atom from the interaction with this j atom. */
425 velecsum = _mm_add_pd(velecsum,velec);
429 /* Calculate temporary vectorial force */
430 tx = _mm_mul_pd(fscal,dx12);
431 ty = _mm_mul_pd(fscal,dy12);
432 tz = _mm_mul_pd(fscal,dz12);
434 /* Update vectorial force */
435 fix1 = _mm_add_pd(fix1,tx);
436 fiy1 = _mm_add_pd(fiy1,ty);
437 fiz1 = _mm_add_pd(fiz1,tz);
439 fjx2 = _mm_add_pd(fjx2,tx);
440 fjy2 = _mm_add_pd(fjy2,ty);
441 fjz2 = _mm_add_pd(fjz2,tz);
443 /**************************
444 * CALCULATE INTERACTIONS *
445 **************************/
447 r13 = _mm_mul_pd(rsq13,rinv13);
449 /* Calculate table index by multiplying r with table scale and truncate to integer */
450 rt = _mm_mul_pd(r13,vftabscale);
451 vfitab = _mm_cvttpd_epi32(rt);
452 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
453 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
455 /* CUBIC SPLINE TABLE ELECTROSTATICS */
456 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
457 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
458 GMX_MM_TRANSPOSE2_PD(Y,F);
459 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
460 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
461 GMX_MM_TRANSPOSE2_PD(G,H);
462 Heps = _mm_mul_pd(vfeps,H);
463 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
464 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
465 velec = _mm_mul_pd(qq13,VV);
466 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
467 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
469 /* Update potential sum for this i atom from the interaction with this j atom. */
470 velecsum = _mm_add_pd(velecsum,velec);
474 /* Calculate temporary vectorial force */
475 tx = _mm_mul_pd(fscal,dx13);
476 ty = _mm_mul_pd(fscal,dy13);
477 tz = _mm_mul_pd(fscal,dz13);
479 /* Update vectorial force */
480 fix1 = _mm_add_pd(fix1,tx);
481 fiy1 = _mm_add_pd(fiy1,ty);
482 fiz1 = _mm_add_pd(fiz1,tz);
484 fjx3 = _mm_add_pd(fjx3,tx);
485 fjy3 = _mm_add_pd(fjy3,ty);
486 fjz3 = _mm_add_pd(fjz3,tz);
488 /**************************
489 * CALCULATE INTERACTIONS *
490 **************************/
492 r21 = _mm_mul_pd(rsq21,rinv21);
494 /* Calculate table index by multiplying r with table scale and truncate to integer */
495 rt = _mm_mul_pd(r21,vftabscale);
496 vfitab = _mm_cvttpd_epi32(rt);
497 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
498 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
500 /* CUBIC SPLINE TABLE ELECTROSTATICS */
501 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
502 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
503 GMX_MM_TRANSPOSE2_PD(Y,F);
504 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
505 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
506 GMX_MM_TRANSPOSE2_PD(G,H);
507 Heps = _mm_mul_pd(vfeps,H);
508 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
509 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
510 velec = _mm_mul_pd(qq21,VV);
511 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
512 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
514 /* Update potential sum for this i atom from the interaction with this j atom. */
515 velecsum = _mm_add_pd(velecsum,velec);
519 /* Calculate temporary vectorial force */
520 tx = _mm_mul_pd(fscal,dx21);
521 ty = _mm_mul_pd(fscal,dy21);
522 tz = _mm_mul_pd(fscal,dz21);
524 /* Update vectorial force */
525 fix2 = _mm_add_pd(fix2,tx);
526 fiy2 = _mm_add_pd(fiy2,ty);
527 fiz2 = _mm_add_pd(fiz2,tz);
529 fjx1 = _mm_add_pd(fjx1,tx);
530 fjy1 = _mm_add_pd(fjy1,ty);
531 fjz1 = _mm_add_pd(fjz1,tz);
533 /**************************
534 * CALCULATE INTERACTIONS *
535 **************************/
537 r22 = _mm_mul_pd(rsq22,rinv22);
539 /* Calculate table index by multiplying r with table scale and truncate to integer */
540 rt = _mm_mul_pd(r22,vftabscale);
541 vfitab = _mm_cvttpd_epi32(rt);
542 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
543 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
545 /* CUBIC SPLINE TABLE ELECTROSTATICS */
546 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
547 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
548 GMX_MM_TRANSPOSE2_PD(Y,F);
549 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
550 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
551 GMX_MM_TRANSPOSE2_PD(G,H);
552 Heps = _mm_mul_pd(vfeps,H);
553 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
554 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
555 velec = _mm_mul_pd(qq22,VV);
556 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
557 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
559 /* Update potential sum for this i atom from the interaction with this j atom. */
560 velecsum = _mm_add_pd(velecsum,velec);
564 /* Calculate temporary vectorial force */
565 tx = _mm_mul_pd(fscal,dx22);
566 ty = _mm_mul_pd(fscal,dy22);
567 tz = _mm_mul_pd(fscal,dz22);
569 /* Update vectorial force */
570 fix2 = _mm_add_pd(fix2,tx);
571 fiy2 = _mm_add_pd(fiy2,ty);
572 fiz2 = _mm_add_pd(fiz2,tz);
574 fjx2 = _mm_add_pd(fjx2,tx);
575 fjy2 = _mm_add_pd(fjy2,ty);
576 fjz2 = _mm_add_pd(fjz2,tz);
578 /**************************
579 * CALCULATE INTERACTIONS *
580 **************************/
582 r23 = _mm_mul_pd(rsq23,rinv23);
584 /* Calculate table index by multiplying r with table scale and truncate to integer */
585 rt = _mm_mul_pd(r23,vftabscale);
586 vfitab = _mm_cvttpd_epi32(rt);
587 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
588 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
590 /* CUBIC SPLINE TABLE ELECTROSTATICS */
591 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
592 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
593 GMX_MM_TRANSPOSE2_PD(Y,F);
594 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
595 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
596 GMX_MM_TRANSPOSE2_PD(G,H);
597 Heps = _mm_mul_pd(vfeps,H);
598 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
599 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
600 velec = _mm_mul_pd(qq23,VV);
601 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
602 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
604 /* Update potential sum for this i atom from the interaction with this j atom. */
605 velecsum = _mm_add_pd(velecsum,velec);
609 /* Calculate temporary vectorial force */
610 tx = _mm_mul_pd(fscal,dx23);
611 ty = _mm_mul_pd(fscal,dy23);
612 tz = _mm_mul_pd(fscal,dz23);
614 /* Update vectorial force */
615 fix2 = _mm_add_pd(fix2,tx);
616 fiy2 = _mm_add_pd(fiy2,ty);
617 fiz2 = _mm_add_pd(fiz2,tz);
619 fjx3 = _mm_add_pd(fjx3,tx);
620 fjy3 = _mm_add_pd(fjy3,ty);
621 fjz3 = _mm_add_pd(fjz3,tz);
623 /**************************
624 * CALCULATE INTERACTIONS *
625 **************************/
627 r31 = _mm_mul_pd(rsq31,rinv31);
629 /* Calculate table index by multiplying r with table scale and truncate to integer */
630 rt = _mm_mul_pd(r31,vftabscale);
631 vfitab = _mm_cvttpd_epi32(rt);
632 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
633 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
635 /* CUBIC SPLINE TABLE ELECTROSTATICS */
636 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
637 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
638 GMX_MM_TRANSPOSE2_PD(Y,F);
639 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
640 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
641 GMX_MM_TRANSPOSE2_PD(G,H);
642 Heps = _mm_mul_pd(vfeps,H);
643 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
644 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
645 velec = _mm_mul_pd(qq31,VV);
646 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
647 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
649 /* Update potential sum for this i atom from the interaction with this j atom. */
650 velecsum = _mm_add_pd(velecsum,velec);
654 /* Calculate temporary vectorial force */
655 tx = _mm_mul_pd(fscal,dx31);
656 ty = _mm_mul_pd(fscal,dy31);
657 tz = _mm_mul_pd(fscal,dz31);
659 /* Update vectorial force */
660 fix3 = _mm_add_pd(fix3,tx);
661 fiy3 = _mm_add_pd(fiy3,ty);
662 fiz3 = _mm_add_pd(fiz3,tz);
664 fjx1 = _mm_add_pd(fjx1,tx);
665 fjy1 = _mm_add_pd(fjy1,ty);
666 fjz1 = _mm_add_pd(fjz1,tz);
668 /**************************
669 * CALCULATE INTERACTIONS *
670 **************************/
672 r32 = _mm_mul_pd(rsq32,rinv32);
674 /* Calculate table index by multiplying r with table scale and truncate to integer */
675 rt = _mm_mul_pd(r32,vftabscale);
676 vfitab = _mm_cvttpd_epi32(rt);
677 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
678 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
680 /* CUBIC SPLINE TABLE ELECTROSTATICS */
681 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
682 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
683 GMX_MM_TRANSPOSE2_PD(Y,F);
684 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
685 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
686 GMX_MM_TRANSPOSE2_PD(G,H);
687 Heps = _mm_mul_pd(vfeps,H);
688 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
689 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
690 velec = _mm_mul_pd(qq32,VV);
691 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
692 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
694 /* Update potential sum for this i atom from the interaction with this j atom. */
695 velecsum = _mm_add_pd(velecsum,velec);
699 /* Calculate temporary vectorial force */
700 tx = _mm_mul_pd(fscal,dx32);
701 ty = _mm_mul_pd(fscal,dy32);
702 tz = _mm_mul_pd(fscal,dz32);
704 /* Update vectorial force */
705 fix3 = _mm_add_pd(fix3,tx);
706 fiy3 = _mm_add_pd(fiy3,ty);
707 fiz3 = _mm_add_pd(fiz3,tz);
709 fjx2 = _mm_add_pd(fjx2,tx);
710 fjy2 = _mm_add_pd(fjy2,ty);
711 fjz2 = _mm_add_pd(fjz2,tz);
713 /**************************
714 * CALCULATE INTERACTIONS *
715 **************************/
717 r33 = _mm_mul_pd(rsq33,rinv33);
719 /* Calculate table index by multiplying r with table scale and truncate to integer */
720 rt = _mm_mul_pd(r33,vftabscale);
721 vfitab = _mm_cvttpd_epi32(rt);
722 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
723 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
725 /* CUBIC SPLINE TABLE ELECTROSTATICS */
726 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
727 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
728 GMX_MM_TRANSPOSE2_PD(Y,F);
729 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
730 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
731 GMX_MM_TRANSPOSE2_PD(G,H);
732 Heps = _mm_mul_pd(vfeps,H);
733 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
734 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
735 velec = _mm_mul_pd(qq33,VV);
736 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
737 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
739 /* Update potential sum for this i atom from the interaction with this j atom. */
740 velecsum = _mm_add_pd(velecsum,velec);
744 /* Calculate temporary vectorial force */
745 tx = _mm_mul_pd(fscal,dx33);
746 ty = _mm_mul_pd(fscal,dy33);
747 tz = _mm_mul_pd(fscal,dz33);
749 /* Update vectorial force */
750 fix3 = _mm_add_pd(fix3,tx);
751 fiy3 = _mm_add_pd(fiy3,ty);
752 fiz3 = _mm_add_pd(fiz3,tz);
754 fjx3 = _mm_add_pd(fjx3,tx);
755 fjy3 = _mm_add_pd(fjy3,ty);
756 fjz3 = _mm_add_pd(fjz3,tz);
758 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
760 /* Inner loop uses 446 flops */
767 j_coord_offsetA = DIM*jnrA;
769 /* load j atom coordinates */
770 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
771 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
772 &jy2,&jz2,&jx3,&jy3,&jz3);
774 /* Calculate displacement vector */
775 dx00 = _mm_sub_pd(ix0,jx0);
776 dy00 = _mm_sub_pd(iy0,jy0);
777 dz00 = _mm_sub_pd(iz0,jz0);
778 dx11 = _mm_sub_pd(ix1,jx1);
779 dy11 = _mm_sub_pd(iy1,jy1);
780 dz11 = _mm_sub_pd(iz1,jz1);
781 dx12 = _mm_sub_pd(ix1,jx2);
782 dy12 = _mm_sub_pd(iy1,jy2);
783 dz12 = _mm_sub_pd(iz1,jz2);
784 dx13 = _mm_sub_pd(ix1,jx3);
785 dy13 = _mm_sub_pd(iy1,jy3);
786 dz13 = _mm_sub_pd(iz1,jz3);
787 dx21 = _mm_sub_pd(ix2,jx1);
788 dy21 = _mm_sub_pd(iy2,jy1);
789 dz21 = _mm_sub_pd(iz2,jz1);
790 dx22 = _mm_sub_pd(ix2,jx2);
791 dy22 = _mm_sub_pd(iy2,jy2);
792 dz22 = _mm_sub_pd(iz2,jz2);
793 dx23 = _mm_sub_pd(ix2,jx3);
794 dy23 = _mm_sub_pd(iy2,jy3);
795 dz23 = _mm_sub_pd(iz2,jz3);
796 dx31 = _mm_sub_pd(ix3,jx1);
797 dy31 = _mm_sub_pd(iy3,jy1);
798 dz31 = _mm_sub_pd(iz3,jz1);
799 dx32 = _mm_sub_pd(ix3,jx2);
800 dy32 = _mm_sub_pd(iy3,jy2);
801 dz32 = _mm_sub_pd(iz3,jz2);
802 dx33 = _mm_sub_pd(ix3,jx3);
803 dy33 = _mm_sub_pd(iy3,jy3);
804 dz33 = _mm_sub_pd(iz3,jz3);
806 /* Calculate squared distance and things based on it */
807 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
808 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
809 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
810 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
811 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
812 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
813 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
814 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
815 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
816 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
818 rinv00 = sse41_invsqrt_d(rsq00);
819 rinv11 = sse41_invsqrt_d(rsq11);
820 rinv12 = sse41_invsqrt_d(rsq12);
821 rinv13 = sse41_invsqrt_d(rsq13);
822 rinv21 = sse41_invsqrt_d(rsq21);
823 rinv22 = sse41_invsqrt_d(rsq22);
824 rinv23 = sse41_invsqrt_d(rsq23);
825 rinv31 = sse41_invsqrt_d(rsq31);
826 rinv32 = sse41_invsqrt_d(rsq32);
827 rinv33 = sse41_invsqrt_d(rsq33);
829 fjx0 = _mm_setzero_pd();
830 fjy0 = _mm_setzero_pd();
831 fjz0 = _mm_setzero_pd();
832 fjx1 = _mm_setzero_pd();
833 fjy1 = _mm_setzero_pd();
834 fjz1 = _mm_setzero_pd();
835 fjx2 = _mm_setzero_pd();
836 fjy2 = _mm_setzero_pd();
837 fjz2 = _mm_setzero_pd();
838 fjx3 = _mm_setzero_pd();
839 fjy3 = _mm_setzero_pd();
840 fjz3 = _mm_setzero_pd();
842 /**************************
843 * CALCULATE INTERACTIONS *
844 **************************/
846 r00 = _mm_mul_pd(rsq00,rinv00);
848 /* Calculate table index by multiplying r with table scale and truncate to integer */
849 rt = _mm_mul_pd(r00,vftabscale);
850 vfitab = _mm_cvttpd_epi32(rt);
851 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
852 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
854 /* CUBIC SPLINE TABLE DISPERSION */
855 vfitab = _mm_add_epi32(vfitab,ifour);
856 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
857 F = _mm_setzero_pd();
858 GMX_MM_TRANSPOSE2_PD(Y,F);
859 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
860 H = _mm_setzero_pd();
861 GMX_MM_TRANSPOSE2_PD(G,H);
862 Heps = _mm_mul_pd(vfeps,H);
863 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
864 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
865 vvdw6 = _mm_mul_pd(c6_00,VV);
866 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
867 fvdw6 = _mm_mul_pd(c6_00,FF);
869 /* CUBIC SPLINE TABLE REPULSION */
870 vfitab = _mm_add_epi32(vfitab,ifour);
871 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
872 F = _mm_setzero_pd();
873 GMX_MM_TRANSPOSE2_PD(Y,F);
874 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
875 H = _mm_setzero_pd();
876 GMX_MM_TRANSPOSE2_PD(G,H);
877 Heps = _mm_mul_pd(vfeps,H);
878 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
879 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
880 vvdw12 = _mm_mul_pd(c12_00,VV);
881 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
882 fvdw12 = _mm_mul_pd(c12_00,FF);
883 vvdw = _mm_add_pd(vvdw12,vvdw6);
884 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
886 /* Update potential sum for this i atom from the interaction with this j atom. */
887 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
888 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
892 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
894 /* Calculate temporary vectorial force */
895 tx = _mm_mul_pd(fscal,dx00);
896 ty = _mm_mul_pd(fscal,dy00);
897 tz = _mm_mul_pd(fscal,dz00);
899 /* Update vectorial force */
900 fix0 = _mm_add_pd(fix0,tx);
901 fiy0 = _mm_add_pd(fiy0,ty);
902 fiz0 = _mm_add_pd(fiz0,tz);
904 fjx0 = _mm_add_pd(fjx0,tx);
905 fjy0 = _mm_add_pd(fjy0,ty);
906 fjz0 = _mm_add_pd(fjz0,tz);
908 /**************************
909 * CALCULATE INTERACTIONS *
910 **************************/
912 r11 = _mm_mul_pd(rsq11,rinv11);
914 /* Calculate table index by multiplying r with table scale and truncate to integer */
915 rt = _mm_mul_pd(r11,vftabscale);
916 vfitab = _mm_cvttpd_epi32(rt);
917 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
918 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
920 /* CUBIC SPLINE TABLE ELECTROSTATICS */
921 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
922 F = _mm_setzero_pd();
923 GMX_MM_TRANSPOSE2_PD(Y,F);
924 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
925 H = _mm_setzero_pd();
926 GMX_MM_TRANSPOSE2_PD(G,H);
927 Heps = _mm_mul_pd(vfeps,H);
928 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
929 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
930 velec = _mm_mul_pd(qq11,VV);
931 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
932 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
934 /* Update potential sum for this i atom from the interaction with this j atom. */
935 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
936 velecsum = _mm_add_pd(velecsum,velec);
940 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
942 /* Calculate temporary vectorial force */
943 tx = _mm_mul_pd(fscal,dx11);
944 ty = _mm_mul_pd(fscal,dy11);
945 tz = _mm_mul_pd(fscal,dz11);
947 /* Update vectorial force */
948 fix1 = _mm_add_pd(fix1,tx);
949 fiy1 = _mm_add_pd(fiy1,ty);
950 fiz1 = _mm_add_pd(fiz1,tz);
952 fjx1 = _mm_add_pd(fjx1,tx);
953 fjy1 = _mm_add_pd(fjy1,ty);
954 fjz1 = _mm_add_pd(fjz1,tz);
956 /**************************
957 * CALCULATE INTERACTIONS *
958 **************************/
960 r12 = _mm_mul_pd(rsq12,rinv12);
962 /* Calculate table index by multiplying r with table scale and truncate to integer */
963 rt = _mm_mul_pd(r12,vftabscale);
964 vfitab = _mm_cvttpd_epi32(rt);
965 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
966 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
968 /* CUBIC SPLINE TABLE ELECTROSTATICS */
969 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
970 F = _mm_setzero_pd();
971 GMX_MM_TRANSPOSE2_PD(Y,F);
972 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
973 H = _mm_setzero_pd();
974 GMX_MM_TRANSPOSE2_PD(G,H);
975 Heps = _mm_mul_pd(vfeps,H);
976 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
977 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
978 velec = _mm_mul_pd(qq12,VV);
979 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
980 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
982 /* Update potential sum for this i atom from the interaction with this j atom. */
983 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
984 velecsum = _mm_add_pd(velecsum,velec);
988 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
990 /* Calculate temporary vectorial force */
991 tx = _mm_mul_pd(fscal,dx12);
992 ty = _mm_mul_pd(fscal,dy12);
993 tz = _mm_mul_pd(fscal,dz12);
995 /* Update vectorial force */
996 fix1 = _mm_add_pd(fix1,tx);
997 fiy1 = _mm_add_pd(fiy1,ty);
998 fiz1 = _mm_add_pd(fiz1,tz);
1000 fjx2 = _mm_add_pd(fjx2,tx);
1001 fjy2 = _mm_add_pd(fjy2,ty);
1002 fjz2 = _mm_add_pd(fjz2,tz);
1004 /**************************
1005 * CALCULATE INTERACTIONS *
1006 **************************/
1008 r13 = _mm_mul_pd(rsq13,rinv13);
1010 /* Calculate table index by multiplying r with table scale and truncate to integer */
1011 rt = _mm_mul_pd(r13,vftabscale);
1012 vfitab = _mm_cvttpd_epi32(rt);
1013 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1014 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1016 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1017 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1018 F = _mm_setzero_pd();
1019 GMX_MM_TRANSPOSE2_PD(Y,F);
1020 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1021 H = _mm_setzero_pd();
1022 GMX_MM_TRANSPOSE2_PD(G,H);
1023 Heps = _mm_mul_pd(vfeps,H);
1024 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1025 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1026 velec = _mm_mul_pd(qq13,VV);
1027 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1028 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1030 /* Update potential sum for this i atom from the interaction with this j atom. */
1031 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1032 velecsum = _mm_add_pd(velecsum,velec);
1036 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1038 /* Calculate temporary vectorial force */
1039 tx = _mm_mul_pd(fscal,dx13);
1040 ty = _mm_mul_pd(fscal,dy13);
1041 tz = _mm_mul_pd(fscal,dz13);
1043 /* Update vectorial force */
1044 fix1 = _mm_add_pd(fix1,tx);
1045 fiy1 = _mm_add_pd(fiy1,ty);
1046 fiz1 = _mm_add_pd(fiz1,tz);
1048 fjx3 = _mm_add_pd(fjx3,tx);
1049 fjy3 = _mm_add_pd(fjy3,ty);
1050 fjz3 = _mm_add_pd(fjz3,tz);
1052 /**************************
1053 * CALCULATE INTERACTIONS *
1054 **************************/
1056 r21 = _mm_mul_pd(rsq21,rinv21);
1058 /* Calculate table index by multiplying r with table scale and truncate to integer */
1059 rt = _mm_mul_pd(r21,vftabscale);
1060 vfitab = _mm_cvttpd_epi32(rt);
1061 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1062 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1064 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1065 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1066 F = _mm_setzero_pd();
1067 GMX_MM_TRANSPOSE2_PD(Y,F);
1068 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1069 H = _mm_setzero_pd();
1070 GMX_MM_TRANSPOSE2_PD(G,H);
1071 Heps = _mm_mul_pd(vfeps,H);
1072 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1073 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1074 velec = _mm_mul_pd(qq21,VV);
1075 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1076 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1078 /* Update potential sum for this i atom from the interaction with this j atom. */
1079 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1080 velecsum = _mm_add_pd(velecsum,velec);
1084 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1086 /* Calculate temporary vectorial force */
1087 tx = _mm_mul_pd(fscal,dx21);
1088 ty = _mm_mul_pd(fscal,dy21);
1089 tz = _mm_mul_pd(fscal,dz21);
1091 /* Update vectorial force */
1092 fix2 = _mm_add_pd(fix2,tx);
1093 fiy2 = _mm_add_pd(fiy2,ty);
1094 fiz2 = _mm_add_pd(fiz2,tz);
1096 fjx1 = _mm_add_pd(fjx1,tx);
1097 fjy1 = _mm_add_pd(fjy1,ty);
1098 fjz1 = _mm_add_pd(fjz1,tz);
1100 /**************************
1101 * CALCULATE INTERACTIONS *
1102 **************************/
1104 r22 = _mm_mul_pd(rsq22,rinv22);
1106 /* Calculate table index by multiplying r with table scale and truncate to integer */
1107 rt = _mm_mul_pd(r22,vftabscale);
1108 vfitab = _mm_cvttpd_epi32(rt);
1109 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1110 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1112 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1113 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1114 F = _mm_setzero_pd();
1115 GMX_MM_TRANSPOSE2_PD(Y,F);
1116 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1117 H = _mm_setzero_pd();
1118 GMX_MM_TRANSPOSE2_PD(G,H);
1119 Heps = _mm_mul_pd(vfeps,H);
1120 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1121 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1122 velec = _mm_mul_pd(qq22,VV);
1123 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1124 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1126 /* Update potential sum for this i atom from the interaction with this j atom. */
1127 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1128 velecsum = _mm_add_pd(velecsum,velec);
1132 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1134 /* Calculate temporary vectorial force */
1135 tx = _mm_mul_pd(fscal,dx22);
1136 ty = _mm_mul_pd(fscal,dy22);
1137 tz = _mm_mul_pd(fscal,dz22);
1139 /* Update vectorial force */
1140 fix2 = _mm_add_pd(fix2,tx);
1141 fiy2 = _mm_add_pd(fiy2,ty);
1142 fiz2 = _mm_add_pd(fiz2,tz);
1144 fjx2 = _mm_add_pd(fjx2,tx);
1145 fjy2 = _mm_add_pd(fjy2,ty);
1146 fjz2 = _mm_add_pd(fjz2,tz);
1148 /**************************
1149 * CALCULATE INTERACTIONS *
1150 **************************/
1152 r23 = _mm_mul_pd(rsq23,rinv23);
1154 /* Calculate table index by multiplying r with table scale and truncate to integer */
1155 rt = _mm_mul_pd(r23,vftabscale);
1156 vfitab = _mm_cvttpd_epi32(rt);
1157 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1158 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1160 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1161 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1162 F = _mm_setzero_pd();
1163 GMX_MM_TRANSPOSE2_PD(Y,F);
1164 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1165 H = _mm_setzero_pd();
1166 GMX_MM_TRANSPOSE2_PD(G,H);
1167 Heps = _mm_mul_pd(vfeps,H);
1168 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1169 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1170 velec = _mm_mul_pd(qq23,VV);
1171 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1172 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1174 /* Update potential sum for this i atom from the interaction with this j atom. */
1175 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1176 velecsum = _mm_add_pd(velecsum,velec);
1180 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1182 /* Calculate temporary vectorial force */
1183 tx = _mm_mul_pd(fscal,dx23);
1184 ty = _mm_mul_pd(fscal,dy23);
1185 tz = _mm_mul_pd(fscal,dz23);
1187 /* Update vectorial force */
1188 fix2 = _mm_add_pd(fix2,tx);
1189 fiy2 = _mm_add_pd(fiy2,ty);
1190 fiz2 = _mm_add_pd(fiz2,tz);
1192 fjx3 = _mm_add_pd(fjx3,tx);
1193 fjy3 = _mm_add_pd(fjy3,ty);
1194 fjz3 = _mm_add_pd(fjz3,tz);
1196 /**************************
1197 * CALCULATE INTERACTIONS *
1198 **************************/
1200 r31 = _mm_mul_pd(rsq31,rinv31);
1202 /* Calculate table index by multiplying r with table scale and truncate to integer */
1203 rt = _mm_mul_pd(r31,vftabscale);
1204 vfitab = _mm_cvttpd_epi32(rt);
1205 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1206 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1208 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1209 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1210 F = _mm_setzero_pd();
1211 GMX_MM_TRANSPOSE2_PD(Y,F);
1212 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1213 H = _mm_setzero_pd();
1214 GMX_MM_TRANSPOSE2_PD(G,H);
1215 Heps = _mm_mul_pd(vfeps,H);
1216 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1217 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1218 velec = _mm_mul_pd(qq31,VV);
1219 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1220 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1222 /* Update potential sum for this i atom from the interaction with this j atom. */
1223 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1224 velecsum = _mm_add_pd(velecsum,velec);
1228 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1230 /* Calculate temporary vectorial force */
1231 tx = _mm_mul_pd(fscal,dx31);
1232 ty = _mm_mul_pd(fscal,dy31);
1233 tz = _mm_mul_pd(fscal,dz31);
1235 /* Update vectorial force */
1236 fix3 = _mm_add_pd(fix3,tx);
1237 fiy3 = _mm_add_pd(fiy3,ty);
1238 fiz3 = _mm_add_pd(fiz3,tz);
1240 fjx1 = _mm_add_pd(fjx1,tx);
1241 fjy1 = _mm_add_pd(fjy1,ty);
1242 fjz1 = _mm_add_pd(fjz1,tz);
1244 /**************************
1245 * CALCULATE INTERACTIONS *
1246 **************************/
1248 r32 = _mm_mul_pd(rsq32,rinv32);
1250 /* Calculate table index by multiplying r with table scale and truncate to integer */
1251 rt = _mm_mul_pd(r32,vftabscale);
1252 vfitab = _mm_cvttpd_epi32(rt);
1253 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1254 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1256 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1257 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1258 F = _mm_setzero_pd();
1259 GMX_MM_TRANSPOSE2_PD(Y,F);
1260 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1261 H = _mm_setzero_pd();
1262 GMX_MM_TRANSPOSE2_PD(G,H);
1263 Heps = _mm_mul_pd(vfeps,H);
1264 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1265 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1266 velec = _mm_mul_pd(qq32,VV);
1267 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1268 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1270 /* Update potential sum for this i atom from the interaction with this j atom. */
1271 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1272 velecsum = _mm_add_pd(velecsum,velec);
1276 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1278 /* Calculate temporary vectorial force */
1279 tx = _mm_mul_pd(fscal,dx32);
1280 ty = _mm_mul_pd(fscal,dy32);
1281 tz = _mm_mul_pd(fscal,dz32);
1283 /* Update vectorial force */
1284 fix3 = _mm_add_pd(fix3,tx);
1285 fiy3 = _mm_add_pd(fiy3,ty);
1286 fiz3 = _mm_add_pd(fiz3,tz);
1288 fjx2 = _mm_add_pd(fjx2,tx);
1289 fjy2 = _mm_add_pd(fjy2,ty);
1290 fjz2 = _mm_add_pd(fjz2,tz);
1292 /**************************
1293 * CALCULATE INTERACTIONS *
1294 **************************/
1296 r33 = _mm_mul_pd(rsq33,rinv33);
1298 /* Calculate table index by multiplying r with table scale and truncate to integer */
1299 rt = _mm_mul_pd(r33,vftabscale);
1300 vfitab = _mm_cvttpd_epi32(rt);
1301 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1302 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1304 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1305 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1306 F = _mm_setzero_pd();
1307 GMX_MM_TRANSPOSE2_PD(Y,F);
1308 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1309 H = _mm_setzero_pd();
1310 GMX_MM_TRANSPOSE2_PD(G,H);
1311 Heps = _mm_mul_pd(vfeps,H);
1312 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1313 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1314 velec = _mm_mul_pd(qq33,VV);
1315 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1316 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1318 /* Update potential sum for this i atom from the interaction with this j atom. */
1319 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1320 velecsum = _mm_add_pd(velecsum,velec);
1324 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1326 /* Calculate temporary vectorial force */
1327 tx = _mm_mul_pd(fscal,dx33);
1328 ty = _mm_mul_pd(fscal,dy33);
1329 tz = _mm_mul_pd(fscal,dz33);
1331 /* Update vectorial force */
1332 fix3 = _mm_add_pd(fix3,tx);
1333 fiy3 = _mm_add_pd(fiy3,ty);
1334 fiz3 = _mm_add_pd(fiz3,tz);
1336 fjx3 = _mm_add_pd(fjx3,tx);
1337 fjy3 = _mm_add_pd(fjy3,ty);
1338 fjz3 = _mm_add_pd(fjz3,tz);
1340 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1342 /* Inner loop uses 446 flops */
1345 /* End of innermost loop */
1347 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1348 f+i_coord_offset,fshift+i_shift_offset);
1351 /* Update potential energies */
1352 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1353 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1355 /* Increment number of inner iterations */
1356 inneriter += j_index_end - j_index_start;
1358 /* Outer loop uses 26 flops */
1361 /* Increment number of outer iterations */
1364 /* Update outer/inner flops */
1366 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*446);
1369 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_double
1370 * Electrostatics interaction: CubicSplineTable
1371 * VdW interaction: CubicSplineTable
1372 * Geometry: Water4-Water4
1373 * Calculate force/pot: Force
1376 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_double
1377 (t_nblist * gmx_restrict nlist,
1378 rvec * gmx_restrict xx,
1379 rvec * gmx_restrict ff,
1380 struct t_forcerec * gmx_restrict fr,
1381 t_mdatoms * gmx_restrict mdatoms,
1382 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1383 t_nrnb * gmx_restrict nrnb)
1385 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1386 * just 0 for non-waters.
1387 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1388 * jnr indices corresponding to data put in the two positions in the SIMD register.
1390 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1391 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1393 int j_coord_offsetA,j_coord_offsetB;
1394 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1395 real rcutoff_scalar;
1396 real *shiftvec,*fshift,*x,*f;
1397 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1399 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1401 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1403 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1405 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1406 int vdwjidx0A,vdwjidx0B;
1407 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1408 int vdwjidx1A,vdwjidx1B;
1409 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1410 int vdwjidx2A,vdwjidx2B;
1411 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1412 int vdwjidx3A,vdwjidx3B;
1413 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1414 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1415 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1416 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1417 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1418 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1419 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1420 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1421 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1422 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1423 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1424 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1427 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1430 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1431 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1433 __m128i ifour = _mm_set1_epi32(4);
1434 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1436 __m128d dummy_mask,cutoff_mask;
1437 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1438 __m128d one = _mm_set1_pd(1.0);
1439 __m128d two = _mm_set1_pd(2.0);
1445 jindex = nlist->jindex;
1447 shiftidx = nlist->shift;
1449 shiftvec = fr->shift_vec[0];
1450 fshift = fr->fshift[0];
1451 facel = _mm_set1_pd(fr->ic->epsfac);
1452 charge = mdatoms->chargeA;
1453 nvdwtype = fr->ntype;
1454 vdwparam = fr->nbfp;
1455 vdwtype = mdatoms->typeA;
1457 vftab = kernel_data->table_elec_vdw->data;
1458 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1460 /* Setup water-specific parameters */
1461 inr = nlist->iinr[0];
1462 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1463 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1464 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1465 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1467 jq1 = _mm_set1_pd(charge[inr+1]);
1468 jq2 = _mm_set1_pd(charge[inr+2]);
1469 jq3 = _mm_set1_pd(charge[inr+3]);
1470 vdwjidx0A = 2*vdwtype[inr+0];
1471 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1472 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1473 qq11 = _mm_mul_pd(iq1,jq1);
1474 qq12 = _mm_mul_pd(iq1,jq2);
1475 qq13 = _mm_mul_pd(iq1,jq3);
1476 qq21 = _mm_mul_pd(iq2,jq1);
1477 qq22 = _mm_mul_pd(iq2,jq2);
1478 qq23 = _mm_mul_pd(iq2,jq3);
1479 qq31 = _mm_mul_pd(iq3,jq1);
1480 qq32 = _mm_mul_pd(iq3,jq2);
1481 qq33 = _mm_mul_pd(iq3,jq3);
1483 /* Avoid stupid compiler warnings */
1485 j_coord_offsetA = 0;
1486 j_coord_offsetB = 0;
1491 /* Start outer loop over neighborlists */
1492 for(iidx=0; iidx<nri; iidx++)
1494 /* Load shift vector for this list */
1495 i_shift_offset = DIM*shiftidx[iidx];
1497 /* Load limits for loop over neighbors */
1498 j_index_start = jindex[iidx];
1499 j_index_end = jindex[iidx+1];
1501 /* Get outer coordinate index */
1503 i_coord_offset = DIM*inr;
1505 /* Load i particle coords and add shift vector */
1506 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1507 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1509 fix0 = _mm_setzero_pd();
1510 fiy0 = _mm_setzero_pd();
1511 fiz0 = _mm_setzero_pd();
1512 fix1 = _mm_setzero_pd();
1513 fiy1 = _mm_setzero_pd();
1514 fiz1 = _mm_setzero_pd();
1515 fix2 = _mm_setzero_pd();
1516 fiy2 = _mm_setzero_pd();
1517 fiz2 = _mm_setzero_pd();
1518 fix3 = _mm_setzero_pd();
1519 fiy3 = _mm_setzero_pd();
1520 fiz3 = _mm_setzero_pd();
1522 /* Start inner kernel loop */
1523 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1526 /* Get j neighbor index, and coordinate index */
1528 jnrB = jjnr[jidx+1];
1529 j_coord_offsetA = DIM*jnrA;
1530 j_coord_offsetB = DIM*jnrB;
1532 /* load j atom coordinates */
1533 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1534 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1535 &jy2,&jz2,&jx3,&jy3,&jz3);
1537 /* Calculate displacement vector */
1538 dx00 = _mm_sub_pd(ix0,jx0);
1539 dy00 = _mm_sub_pd(iy0,jy0);
1540 dz00 = _mm_sub_pd(iz0,jz0);
1541 dx11 = _mm_sub_pd(ix1,jx1);
1542 dy11 = _mm_sub_pd(iy1,jy1);
1543 dz11 = _mm_sub_pd(iz1,jz1);
1544 dx12 = _mm_sub_pd(ix1,jx2);
1545 dy12 = _mm_sub_pd(iy1,jy2);
1546 dz12 = _mm_sub_pd(iz1,jz2);
1547 dx13 = _mm_sub_pd(ix1,jx3);
1548 dy13 = _mm_sub_pd(iy1,jy3);
1549 dz13 = _mm_sub_pd(iz1,jz3);
1550 dx21 = _mm_sub_pd(ix2,jx1);
1551 dy21 = _mm_sub_pd(iy2,jy1);
1552 dz21 = _mm_sub_pd(iz2,jz1);
1553 dx22 = _mm_sub_pd(ix2,jx2);
1554 dy22 = _mm_sub_pd(iy2,jy2);
1555 dz22 = _mm_sub_pd(iz2,jz2);
1556 dx23 = _mm_sub_pd(ix2,jx3);
1557 dy23 = _mm_sub_pd(iy2,jy3);
1558 dz23 = _mm_sub_pd(iz2,jz3);
1559 dx31 = _mm_sub_pd(ix3,jx1);
1560 dy31 = _mm_sub_pd(iy3,jy1);
1561 dz31 = _mm_sub_pd(iz3,jz1);
1562 dx32 = _mm_sub_pd(ix3,jx2);
1563 dy32 = _mm_sub_pd(iy3,jy2);
1564 dz32 = _mm_sub_pd(iz3,jz2);
1565 dx33 = _mm_sub_pd(ix3,jx3);
1566 dy33 = _mm_sub_pd(iy3,jy3);
1567 dz33 = _mm_sub_pd(iz3,jz3);
1569 /* Calculate squared distance and things based on it */
1570 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1571 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1572 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1573 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1574 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1575 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1576 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1577 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1578 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1579 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1581 rinv00 = sse41_invsqrt_d(rsq00);
1582 rinv11 = sse41_invsqrt_d(rsq11);
1583 rinv12 = sse41_invsqrt_d(rsq12);
1584 rinv13 = sse41_invsqrt_d(rsq13);
1585 rinv21 = sse41_invsqrt_d(rsq21);
1586 rinv22 = sse41_invsqrt_d(rsq22);
1587 rinv23 = sse41_invsqrt_d(rsq23);
1588 rinv31 = sse41_invsqrt_d(rsq31);
1589 rinv32 = sse41_invsqrt_d(rsq32);
1590 rinv33 = sse41_invsqrt_d(rsq33);
1592 fjx0 = _mm_setzero_pd();
1593 fjy0 = _mm_setzero_pd();
1594 fjz0 = _mm_setzero_pd();
1595 fjx1 = _mm_setzero_pd();
1596 fjy1 = _mm_setzero_pd();
1597 fjz1 = _mm_setzero_pd();
1598 fjx2 = _mm_setzero_pd();
1599 fjy2 = _mm_setzero_pd();
1600 fjz2 = _mm_setzero_pd();
1601 fjx3 = _mm_setzero_pd();
1602 fjy3 = _mm_setzero_pd();
1603 fjz3 = _mm_setzero_pd();
1605 /**************************
1606 * CALCULATE INTERACTIONS *
1607 **************************/
1609 r00 = _mm_mul_pd(rsq00,rinv00);
1611 /* Calculate table index by multiplying r with table scale and truncate to integer */
1612 rt = _mm_mul_pd(r00,vftabscale);
1613 vfitab = _mm_cvttpd_epi32(rt);
1614 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1615 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1617 /* CUBIC SPLINE TABLE DISPERSION */
1618 vfitab = _mm_add_epi32(vfitab,ifour);
1619 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1620 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1621 GMX_MM_TRANSPOSE2_PD(Y,F);
1622 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1623 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1624 GMX_MM_TRANSPOSE2_PD(G,H);
1625 Heps = _mm_mul_pd(vfeps,H);
1626 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1627 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1628 fvdw6 = _mm_mul_pd(c6_00,FF);
1630 /* CUBIC SPLINE TABLE REPULSION */
1631 vfitab = _mm_add_epi32(vfitab,ifour);
1632 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1633 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1634 GMX_MM_TRANSPOSE2_PD(Y,F);
1635 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1636 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1637 GMX_MM_TRANSPOSE2_PD(G,H);
1638 Heps = _mm_mul_pd(vfeps,H);
1639 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1640 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1641 fvdw12 = _mm_mul_pd(c12_00,FF);
1642 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1646 /* Calculate temporary vectorial force */
1647 tx = _mm_mul_pd(fscal,dx00);
1648 ty = _mm_mul_pd(fscal,dy00);
1649 tz = _mm_mul_pd(fscal,dz00);
1651 /* Update vectorial force */
1652 fix0 = _mm_add_pd(fix0,tx);
1653 fiy0 = _mm_add_pd(fiy0,ty);
1654 fiz0 = _mm_add_pd(fiz0,tz);
1656 fjx0 = _mm_add_pd(fjx0,tx);
1657 fjy0 = _mm_add_pd(fjy0,ty);
1658 fjz0 = _mm_add_pd(fjz0,tz);
1660 /**************************
1661 * CALCULATE INTERACTIONS *
1662 **************************/
1664 r11 = _mm_mul_pd(rsq11,rinv11);
1666 /* Calculate table index by multiplying r with table scale and truncate to integer */
1667 rt = _mm_mul_pd(r11,vftabscale);
1668 vfitab = _mm_cvttpd_epi32(rt);
1669 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1670 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1672 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1673 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1674 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1675 GMX_MM_TRANSPOSE2_PD(Y,F);
1676 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1677 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1678 GMX_MM_TRANSPOSE2_PD(G,H);
1679 Heps = _mm_mul_pd(vfeps,H);
1680 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1681 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1682 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1686 /* Calculate temporary vectorial force */
1687 tx = _mm_mul_pd(fscal,dx11);
1688 ty = _mm_mul_pd(fscal,dy11);
1689 tz = _mm_mul_pd(fscal,dz11);
1691 /* Update vectorial force */
1692 fix1 = _mm_add_pd(fix1,tx);
1693 fiy1 = _mm_add_pd(fiy1,ty);
1694 fiz1 = _mm_add_pd(fiz1,tz);
1696 fjx1 = _mm_add_pd(fjx1,tx);
1697 fjy1 = _mm_add_pd(fjy1,ty);
1698 fjz1 = _mm_add_pd(fjz1,tz);
1700 /**************************
1701 * CALCULATE INTERACTIONS *
1702 **************************/
1704 r12 = _mm_mul_pd(rsq12,rinv12);
1706 /* Calculate table index by multiplying r with table scale and truncate to integer */
1707 rt = _mm_mul_pd(r12,vftabscale);
1708 vfitab = _mm_cvttpd_epi32(rt);
1709 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1710 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1712 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1713 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1714 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1715 GMX_MM_TRANSPOSE2_PD(Y,F);
1716 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1717 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1718 GMX_MM_TRANSPOSE2_PD(G,H);
1719 Heps = _mm_mul_pd(vfeps,H);
1720 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1721 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1722 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1726 /* Calculate temporary vectorial force */
1727 tx = _mm_mul_pd(fscal,dx12);
1728 ty = _mm_mul_pd(fscal,dy12);
1729 tz = _mm_mul_pd(fscal,dz12);
1731 /* Update vectorial force */
1732 fix1 = _mm_add_pd(fix1,tx);
1733 fiy1 = _mm_add_pd(fiy1,ty);
1734 fiz1 = _mm_add_pd(fiz1,tz);
1736 fjx2 = _mm_add_pd(fjx2,tx);
1737 fjy2 = _mm_add_pd(fjy2,ty);
1738 fjz2 = _mm_add_pd(fjz2,tz);
1740 /**************************
1741 * CALCULATE INTERACTIONS *
1742 **************************/
1744 r13 = _mm_mul_pd(rsq13,rinv13);
1746 /* Calculate table index by multiplying r with table scale and truncate to integer */
1747 rt = _mm_mul_pd(r13,vftabscale);
1748 vfitab = _mm_cvttpd_epi32(rt);
1749 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1750 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1752 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1753 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1754 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1755 GMX_MM_TRANSPOSE2_PD(Y,F);
1756 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1757 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1758 GMX_MM_TRANSPOSE2_PD(G,H);
1759 Heps = _mm_mul_pd(vfeps,H);
1760 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1761 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1762 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1766 /* Calculate temporary vectorial force */
1767 tx = _mm_mul_pd(fscal,dx13);
1768 ty = _mm_mul_pd(fscal,dy13);
1769 tz = _mm_mul_pd(fscal,dz13);
1771 /* Update vectorial force */
1772 fix1 = _mm_add_pd(fix1,tx);
1773 fiy1 = _mm_add_pd(fiy1,ty);
1774 fiz1 = _mm_add_pd(fiz1,tz);
1776 fjx3 = _mm_add_pd(fjx3,tx);
1777 fjy3 = _mm_add_pd(fjy3,ty);
1778 fjz3 = _mm_add_pd(fjz3,tz);
1780 /**************************
1781 * CALCULATE INTERACTIONS *
1782 **************************/
1784 r21 = _mm_mul_pd(rsq21,rinv21);
1786 /* Calculate table index by multiplying r with table scale and truncate to integer */
1787 rt = _mm_mul_pd(r21,vftabscale);
1788 vfitab = _mm_cvttpd_epi32(rt);
1789 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1790 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1792 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1793 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1794 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1795 GMX_MM_TRANSPOSE2_PD(Y,F);
1796 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1797 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1798 GMX_MM_TRANSPOSE2_PD(G,H);
1799 Heps = _mm_mul_pd(vfeps,H);
1800 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1801 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1802 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1806 /* Calculate temporary vectorial force */
1807 tx = _mm_mul_pd(fscal,dx21);
1808 ty = _mm_mul_pd(fscal,dy21);
1809 tz = _mm_mul_pd(fscal,dz21);
1811 /* Update vectorial force */
1812 fix2 = _mm_add_pd(fix2,tx);
1813 fiy2 = _mm_add_pd(fiy2,ty);
1814 fiz2 = _mm_add_pd(fiz2,tz);
1816 fjx1 = _mm_add_pd(fjx1,tx);
1817 fjy1 = _mm_add_pd(fjy1,ty);
1818 fjz1 = _mm_add_pd(fjz1,tz);
1820 /**************************
1821 * CALCULATE INTERACTIONS *
1822 **************************/
1824 r22 = _mm_mul_pd(rsq22,rinv22);
1826 /* Calculate table index by multiplying r with table scale and truncate to integer */
1827 rt = _mm_mul_pd(r22,vftabscale);
1828 vfitab = _mm_cvttpd_epi32(rt);
1829 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1830 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1832 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1833 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1834 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1835 GMX_MM_TRANSPOSE2_PD(Y,F);
1836 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1837 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1838 GMX_MM_TRANSPOSE2_PD(G,H);
1839 Heps = _mm_mul_pd(vfeps,H);
1840 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1841 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1842 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1846 /* Calculate temporary vectorial force */
1847 tx = _mm_mul_pd(fscal,dx22);
1848 ty = _mm_mul_pd(fscal,dy22);
1849 tz = _mm_mul_pd(fscal,dz22);
1851 /* Update vectorial force */
1852 fix2 = _mm_add_pd(fix2,tx);
1853 fiy2 = _mm_add_pd(fiy2,ty);
1854 fiz2 = _mm_add_pd(fiz2,tz);
1856 fjx2 = _mm_add_pd(fjx2,tx);
1857 fjy2 = _mm_add_pd(fjy2,ty);
1858 fjz2 = _mm_add_pd(fjz2,tz);
1860 /**************************
1861 * CALCULATE INTERACTIONS *
1862 **************************/
1864 r23 = _mm_mul_pd(rsq23,rinv23);
1866 /* Calculate table index by multiplying r with table scale and truncate to integer */
1867 rt = _mm_mul_pd(r23,vftabscale);
1868 vfitab = _mm_cvttpd_epi32(rt);
1869 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1870 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1872 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1873 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1874 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1875 GMX_MM_TRANSPOSE2_PD(Y,F);
1876 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1877 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1878 GMX_MM_TRANSPOSE2_PD(G,H);
1879 Heps = _mm_mul_pd(vfeps,H);
1880 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1881 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1882 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1886 /* Calculate temporary vectorial force */
1887 tx = _mm_mul_pd(fscal,dx23);
1888 ty = _mm_mul_pd(fscal,dy23);
1889 tz = _mm_mul_pd(fscal,dz23);
1891 /* Update vectorial force */
1892 fix2 = _mm_add_pd(fix2,tx);
1893 fiy2 = _mm_add_pd(fiy2,ty);
1894 fiz2 = _mm_add_pd(fiz2,tz);
1896 fjx3 = _mm_add_pd(fjx3,tx);
1897 fjy3 = _mm_add_pd(fjy3,ty);
1898 fjz3 = _mm_add_pd(fjz3,tz);
1900 /**************************
1901 * CALCULATE INTERACTIONS *
1902 **************************/
1904 r31 = _mm_mul_pd(rsq31,rinv31);
1906 /* Calculate table index by multiplying r with table scale and truncate to integer */
1907 rt = _mm_mul_pd(r31,vftabscale);
1908 vfitab = _mm_cvttpd_epi32(rt);
1909 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1910 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1912 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1913 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1914 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1915 GMX_MM_TRANSPOSE2_PD(Y,F);
1916 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1917 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1918 GMX_MM_TRANSPOSE2_PD(G,H);
1919 Heps = _mm_mul_pd(vfeps,H);
1920 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1921 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1922 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1926 /* Calculate temporary vectorial force */
1927 tx = _mm_mul_pd(fscal,dx31);
1928 ty = _mm_mul_pd(fscal,dy31);
1929 tz = _mm_mul_pd(fscal,dz31);
1931 /* Update vectorial force */
1932 fix3 = _mm_add_pd(fix3,tx);
1933 fiy3 = _mm_add_pd(fiy3,ty);
1934 fiz3 = _mm_add_pd(fiz3,tz);
1936 fjx1 = _mm_add_pd(fjx1,tx);
1937 fjy1 = _mm_add_pd(fjy1,ty);
1938 fjz1 = _mm_add_pd(fjz1,tz);
1940 /**************************
1941 * CALCULATE INTERACTIONS *
1942 **************************/
1944 r32 = _mm_mul_pd(rsq32,rinv32);
1946 /* Calculate table index by multiplying r with table scale and truncate to integer */
1947 rt = _mm_mul_pd(r32,vftabscale);
1948 vfitab = _mm_cvttpd_epi32(rt);
1949 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1950 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1952 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1953 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1954 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1955 GMX_MM_TRANSPOSE2_PD(Y,F);
1956 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1957 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1958 GMX_MM_TRANSPOSE2_PD(G,H);
1959 Heps = _mm_mul_pd(vfeps,H);
1960 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1961 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1962 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1966 /* Calculate temporary vectorial force */
1967 tx = _mm_mul_pd(fscal,dx32);
1968 ty = _mm_mul_pd(fscal,dy32);
1969 tz = _mm_mul_pd(fscal,dz32);
1971 /* Update vectorial force */
1972 fix3 = _mm_add_pd(fix3,tx);
1973 fiy3 = _mm_add_pd(fiy3,ty);
1974 fiz3 = _mm_add_pd(fiz3,tz);
1976 fjx2 = _mm_add_pd(fjx2,tx);
1977 fjy2 = _mm_add_pd(fjy2,ty);
1978 fjz2 = _mm_add_pd(fjz2,tz);
1980 /**************************
1981 * CALCULATE INTERACTIONS *
1982 **************************/
1984 r33 = _mm_mul_pd(rsq33,rinv33);
1986 /* Calculate table index by multiplying r with table scale and truncate to integer */
1987 rt = _mm_mul_pd(r33,vftabscale);
1988 vfitab = _mm_cvttpd_epi32(rt);
1989 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1990 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1992 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1993 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1994 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1995 GMX_MM_TRANSPOSE2_PD(Y,F);
1996 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1997 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1998 GMX_MM_TRANSPOSE2_PD(G,H);
1999 Heps = _mm_mul_pd(vfeps,H);
2000 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2001 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2002 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
2006 /* Calculate temporary vectorial force */
2007 tx = _mm_mul_pd(fscal,dx33);
2008 ty = _mm_mul_pd(fscal,dy33);
2009 tz = _mm_mul_pd(fscal,dz33);
2011 /* Update vectorial force */
2012 fix3 = _mm_add_pd(fix3,tx);
2013 fiy3 = _mm_add_pd(fiy3,ty);
2014 fiz3 = _mm_add_pd(fiz3,tz);
2016 fjx3 = _mm_add_pd(fjx3,tx);
2017 fjy3 = _mm_add_pd(fjy3,ty);
2018 fjz3 = _mm_add_pd(fjz3,tz);
2020 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2022 /* Inner loop uses 402 flops */
2025 if(jidx<j_index_end)
2029 j_coord_offsetA = DIM*jnrA;
2031 /* load j atom coordinates */
2032 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2033 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2034 &jy2,&jz2,&jx3,&jy3,&jz3);
2036 /* Calculate displacement vector */
2037 dx00 = _mm_sub_pd(ix0,jx0);
2038 dy00 = _mm_sub_pd(iy0,jy0);
2039 dz00 = _mm_sub_pd(iz0,jz0);
2040 dx11 = _mm_sub_pd(ix1,jx1);
2041 dy11 = _mm_sub_pd(iy1,jy1);
2042 dz11 = _mm_sub_pd(iz1,jz1);
2043 dx12 = _mm_sub_pd(ix1,jx2);
2044 dy12 = _mm_sub_pd(iy1,jy2);
2045 dz12 = _mm_sub_pd(iz1,jz2);
2046 dx13 = _mm_sub_pd(ix1,jx3);
2047 dy13 = _mm_sub_pd(iy1,jy3);
2048 dz13 = _mm_sub_pd(iz1,jz3);
2049 dx21 = _mm_sub_pd(ix2,jx1);
2050 dy21 = _mm_sub_pd(iy2,jy1);
2051 dz21 = _mm_sub_pd(iz2,jz1);
2052 dx22 = _mm_sub_pd(ix2,jx2);
2053 dy22 = _mm_sub_pd(iy2,jy2);
2054 dz22 = _mm_sub_pd(iz2,jz2);
2055 dx23 = _mm_sub_pd(ix2,jx3);
2056 dy23 = _mm_sub_pd(iy2,jy3);
2057 dz23 = _mm_sub_pd(iz2,jz3);
2058 dx31 = _mm_sub_pd(ix3,jx1);
2059 dy31 = _mm_sub_pd(iy3,jy1);
2060 dz31 = _mm_sub_pd(iz3,jz1);
2061 dx32 = _mm_sub_pd(ix3,jx2);
2062 dy32 = _mm_sub_pd(iy3,jy2);
2063 dz32 = _mm_sub_pd(iz3,jz2);
2064 dx33 = _mm_sub_pd(ix3,jx3);
2065 dy33 = _mm_sub_pd(iy3,jy3);
2066 dz33 = _mm_sub_pd(iz3,jz3);
2068 /* Calculate squared distance and things based on it */
2069 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2070 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2071 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2072 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2073 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2074 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2075 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2076 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2077 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2078 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2080 rinv00 = sse41_invsqrt_d(rsq00);
2081 rinv11 = sse41_invsqrt_d(rsq11);
2082 rinv12 = sse41_invsqrt_d(rsq12);
2083 rinv13 = sse41_invsqrt_d(rsq13);
2084 rinv21 = sse41_invsqrt_d(rsq21);
2085 rinv22 = sse41_invsqrt_d(rsq22);
2086 rinv23 = sse41_invsqrt_d(rsq23);
2087 rinv31 = sse41_invsqrt_d(rsq31);
2088 rinv32 = sse41_invsqrt_d(rsq32);
2089 rinv33 = sse41_invsqrt_d(rsq33);
2091 fjx0 = _mm_setzero_pd();
2092 fjy0 = _mm_setzero_pd();
2093 fjz0 = _mm_setzero_pd();
2094 fjx1 = _mm_setzero_pd();
2095 fjy1 = _mm_setzero_pd();
2096 fjz1 = _mm_setzero_pd();
2097 fjx2 = _mm_setzero_pd();
2098 fjy2 = _mm_setzero_pd();
2099 fjz2 = _mm_setzero_pd();
2100 fjx3 = _mm_setzero_pd();
2101 fjy3 = _mm_setzero_pd();
2102 fjz3 = _mm_setzero_pd();
2104 /**************************
2105 * CALCULATE INTERACTIONS *
2106 **************************/
2108 r00 = _mm_mul_pd(rsq00,rinv00);
2110 /* Calculate table index by multiplying r with table scale and truncate to integer */
2111 rt = _mm_mul_pd(r00,vftabscale);
2112 vfitab = _mm_cvttpd_epi32(rt);
2113 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2114 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2116 /* CUBIC SPLINE TABLE DISPERSION */
2117 vfitab = _mm_add_epi32(vfitab,ifour);
2118 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2119 F = _mm_setzero_pd();
2120 GMX_MM_TRANSPOSE2_PD(Y,F);
2121 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2122 H = _mm_setzero_pd();
2123 GMX_MM_TRANSPOSE2_PD(G,H);
2124 Heps = _mm_mul_pd(vfeps,H);
2125 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2126 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2127 fvdw6 = _mm_mul_pd(c6_00,FF);
2129 /* CUBIC SPLINE TABLE REPULSION */
2130 vfitab = _mm_add_epi32(vfitab,ifour);
2131 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2132 F = _mm_setzero_pd();
2133 GMX_MM_TRANSPOSE2_PD(Y,F);
2134 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2135 H = _mm_setzero_pd();
2136 GMX_MM_TRANSPOSE2_PD(G,H);
2137 Heps = _mm_mul_pd(vfeps,H);
2138 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2139 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2140 fvdw12 = _mm_mul_pd(c12_00,FF);
2141 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
2145 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2147 /* Calculate temporary vectorial force */
2148 tx = _mm_mul_pd(fscal,dx00);
2149 ty = _mm_mul_pd(fscal,dy00);
2150 tz = _mm_mul_pd(fscal,dz00);
2152 /* Update vectorial force */
2153 fix0 = _mm_add_pd(fix0,tx);
2154 fiy0 = _mm_add_pd(fiy0,ty);
2155 fiz0 = _mm_add_pd(fiz0,tz);
2157 fjx0 = _mm_add_pd(fjx0,tx);
2158 fjy0 = _mm_add_pd(fjy0,ty);
2159 fjz0 = _mm_add_pd(fjz0,tz);
2161 /**************************
2162 * CALCULATE INTERACTIONS *
2163 **************************/
2165 r11 = _mm_mul_pd(rsq11,rinv11);
2167 /* Calculate table index by multiplying r with table scale and truncate to integer */
2168 rt = _mm_mul_pd(r11,vftabscale);
2169 vfitab = _mm_cvttpd_epi32(rt);
2170 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2171 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2173 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2174 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2175 F = _mm_setzero_pd();
2176 GMX_MM_TRANSPOSE2_PD(Y,F);
2177 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2178 H = _mm_setzero_pd();
2179 GMX_MM_TRANSPOSE2_PD(G,H);
2180 Heps = _mm_mul_pd(vfeps,H);
2181 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2182 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2183 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2187 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2189 /* Calculate temporary vectorial force */
2190 tx = _mm_mul_pd(fscal,dx11);
2191 ty = _mm_mul_pd(fscal,dy11);
2192 tz = _mm_mul_pd(fscal,dz11);
2194 /* Update vectorial force */
2195 fix1 = _mm_add_pd(fix1,tx);
2196 fiy1 = _mm_add_pd(fiy1,ty);
2197 fiz1 = _mm_add_pd(fiz1,tz);
2199 fjx1 = _mm_add_pd(fjx1,tx);
2200 fjy1 = _mm_add_pd(fjy1,ty);
2201 fjz1 = _mm_add_pd(fjz1,tz);
2203 /**************************
2204 * CALCULATE INTERACTIONS *
2205 **************************/
2207 r12 = _mm_mul_pd(rsq12,rinv12);
2209 /* Calculate table index by multiplying r with table scale and truncate to integer */
2210 rt = _mm_mul_pd(r12,vftabscale);
2211 vfitab = _mm_cvttpd_epi32(rt);
2212 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2213 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2215 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2216 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2217 F = _mm_setzero_pd();
2218 GMX_MM_TRANSPOSE2_PD(Y,F);
2219 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2220 H = _mm_setzero_pd();
2221 GMX_MM_TRANSPOSE2_PD(G,H);
2222 Heps = _mm_mul_pd(vfeps,H);
2223 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2224 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2225 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2229 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2231 /* Calculate temporary vectorial force */
2232 tx = _mm_mul_pd(fscal,dx12);
2233 ty = _mm_mul_pd(fscal,dy12);
2234 tz = _mm_mul_pd(fscal,dz12);
2236 /* Update vectorial force */
2237 fix1 = _mm_add_pd(fix1,tx);
2238 fiy1 = _mm_add_pd(fiy1,ty);
2239 fiz1 = _mm_add_pd(fiz1,tz);
2241 fjx2 = _mm_add_pd(fjx2,tx);
2242 fjy2 = _mm_add_pd(fjy2,ty);
2243 fjz2 = _mm_add_pd(fjz2,tz);
2245 /**************************
2246 * CALCULATE INTERACTIONS *
2247 **************************/
2249 r13 = _mm_mul_pd(rsq13,rinv13);
2251 /* Calculate table index by multiplying r with table scale and truncate to integer */
2252 rt = _mm_mul_pd(r13,vftabscale);
2253 vfitab = _mm_cvttpd_epi32(rt);
2254 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2255 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2257 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2258 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2259 F = _mm_setzero_pd();
2260 GMX_MM_TRANSPOSE2_PD(Y,F);
2261 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2262 H = _mm_setzero_pd();
2263 GMX_MM_TRANSPOSE2_PD(G,H);
2264 Heps = _mm_mul_pd(vfeps,H);
2265 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2266 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2267 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
2271 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2273 /* Calculate temporary vectorial force */
2274 tx = _mm_mul_pd(fscal,dx13);
2275 ty = _mm_mul_pd(fscal,dy13);
2276 tz = _mm_mul_pd(fscal,dz13);
2278 /* Update vectorial force */
2279 fix1 = _mm_add_pd(fix1,tx);
2280 fiy1 = _mm_add_pd(fiy1,ty);
2281 fiz1 = _mm_add_pd(fiz1,tz);
2283 fjx3 = _mm_add_pd(fjx3,tx);
2284 fjy3 = _mm_add_pd(fjy3,ty);
2285 fjz3 = _mm_add_pd(fjz3,tz);
2287 /**************************
2288 * CALCULATE INTERACTIONS *
2289 **************************/
2291 r21 = _mm_mul_pd(rsq21,rinv21);
2293 /* Calculate table index by multiplying r with table scale and truncate to integer */
2294 rt = _mm_mul_pd(r21,vftabscale);
2295 vfitab = _mm_cvttpd_epi32(rt);
2296 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2297 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2299 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2300 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2301 F = _mm_setzero_pd();
2302 GMX_MM_TRANSPOSE2_PD(Y,F);
2303 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2304 H = _mm_setzero_pd();
2305 GMX_MM_TRANSPOSE2_PD(G,H);
2306 Heps = _mm_mul_pd(vfeps,H);
2307 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2308 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2309 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2313 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2315 /* Calculate temporary vectorial force */
2316 tx = _mm_mul_pd(fscal,dx21);
2317 ty = _mm_mul_pd(fscal,dy21);
2318 tz = _mm_mul_pd(fscal,dz21);
2320 /* Update vectorial force */
2321 fix2 = _mm_add_pd(fix2,tx);
2322 fiy2 = _mm_add_pd(fiy2,ty);
2323 fiz2 = _mm_add_pd(fiz2,tz);
2325 fjx1 = _mm_add_pd(fjx1,tx);
2326 fjy1 = _mm_add_pd(fjy1,ty);
2327 fjz1 = _mm_add_pd(fjz1,tz);
2329 /**************************
2330 * CALCULATE INTERACTIONS *
2331 **************************/
2333 r22 = _mm_mul_pd(rsq22,rinv22);
2335 /* Calculate table index by multiplying r with table scale and truncate to integer */
2336 rt = _mm_mul_pd(r22,vftabscale);
2337 vfitab = _mm_cvttpd_epi32(rt);
2338 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2339 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2341 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2342 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2343 F = _mm_setzero_pd();
2344 GMX_MM_TRANSPOSE2_PD(Y,F);
2345 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2346 H = _mm_setzero_pd();
2347 GMX_MM_TRANSPOSE2_PD(G,H);
2348 Heps = _mm_mul_pd(vfeps,H);
2349 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2350 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2351 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2355 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2357 /* Calculate temporary vectorial force */
2358 tx = _mm_mul_pd(fscal,dx22);
2359 ty = _mm_mul_pd(fscal,dy22);
2360 tz = _mm_mul_pd(fscal,dz22);
2362 /* Update vectorial force */
2363 fix2 = _mm_add_pd(fix2,tx);
2364 fiy2 = _mm_add_pd(fiy2,ty);
2365 fiz2 = _mm_add_pd(fiz2,tz);
2367 fjx2 = _mm_add_pd(fjx2,tx);
2368 fjy2 = _mm_add_pd(fjy2,ty);
2369 fjz2 = _mm_add_pd(fjz2,tz);
2371 /**************************
2372 * CALCULATE INTERACTIONS *
2373 **************************/
2375 r23 = _mm_mul_pd(rsq23,rinv23);
2377 /* Calculate table index by multiplying r with table scale and truncate to integer */
2378 rt = _mm_mul_pd(r23,vftabscale);
2379 vfitab = _mm_cvttpd_epi32(rt);
2380 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2381 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2383 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2384 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2385 F = _mm_setzero_pd();
2386 GMX_MM_TRANSPOSE2_PD(Y,F);
2387 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2388 H = _mm_setzero_pd();
2389 GMX_MM_TRANSPOSE2_PD(G,H);
2390 Heps = _mm_mul_pd(vfeps,H);
2391 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2392 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2393 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
2397 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2399 /* Calculate temporary vectorial force */
2400 tx = _mm_mul_pd(fscal,dx23);
2401 ty = _mm_mul_pd(fscal,dy23);
2402 tz = _mm_mul_pd(fscal,dz23);
2404 /* Update vectorial force */
2405 fix2 = _mm_add_pd(fix2,tx);
2406 fiy2 = _mm_add_pd(fiy2,ty);
2407 fiz2 = _mm_add_pd(fiz2,tz);
2409 fjx3 = _mm_add_pd(fjx3,tx);
2410 fjy3 = _mm_add_pd(fjy3,ty);
2411 fjz3 = _mm_add_pd(fjz3,tz);
2413 /**************************
2414 * CALCULATE INTERACTIONS *
2415 **************************/
2417 r31 = _mm_mul_pd(rsq31,rinv31);
2419 /* Calculate table index by multiplying r with table scale and truncate to integer */
2420 rt = _mm_mul_pd(r31,vftabscale);
2421 vfitab = _mm_cvttpd_epi32(rt);
2422 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2423 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2425 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2426 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2427 F = _mm_setzero_pd();
2428 GMX_MM_TRANSPOSE2_PD(Y,F);
2429 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2430 H = _mm_setzero_pd();
2431 GMX_MM_TRANSPOSE2_PD(G,H);
2432 Heps = _mm_mul_pd(vfeps,H);
2433 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2434 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2435 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
2439 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2441 /* Calculate temporary vectorial force */
2442 tx = _mm_mul_pd(fscal,dx31);
2443 ty = _mm_mul_pd(fscal,dy31);
2444 tz = _mm_mul_pd(fscal,dz31);
2446 /* Update vectorial force */
2447 fix3 = _mm_add_pd(fix3,tx);
2448 fiy3 = _mm_add_pd(fiy3,ty);
2449 fiz3 = _mm_add_pd(fiz3,tz);
2451 fjx1 = _mm_add_pd(fjx1,tx);
2452 fjy1 = _mm_add_pd(fjy1,ty);
2453 fjz1 = _mm_add_pd(fjz1,tz);
2455 /**************************
2456 * CALCULATE INTERACTIONS *
2457 **************************/
2459 r32 = _mm_mul_pd(rsq32,rinv32);
2461 /* Calculate table index by multiplying r with table scale and truncate to integer */
2462 rt = _mm_mul_pd(r32,vftabscale);
2463 vfitab = _mm_cvttpd_epi32(rt);
2464 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2465 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2467 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2468 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2469 F = _mm_setzero_pd();
2470 GMX_MM_TRANSPOSE2_PD(Y,F);
2471 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2472 H = _mm_setzero_pd();
2473 GMX_MM_TRANSPOSE2_PD(G,H);
2474 Heps = _mm_mul_pd(vfeps,H);
2475 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2476 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2477 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
2481 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2483 /* Calculate temporary vectorial force */
2484 tx = _mm_mul_pd(fscal,dx32);
2485 ty = _mm_mul_pd(fscal,dy32);
2486 tz = _mm_mul_pd(fscal,dz32);
2488 /* Update vectorial force */
2489 fix3 = _mm_add_pd(fix3,tx);
2490 fiy3 = _mm_add_pd(fiy3,ty);
2491 fiz3 = _mm_add_pd(fiz3,tz);
2493 fjx2 = _mm_add_pd(fjx2,tx);
2494 fjy2 = _mm_add_pd(fjy2,ty);
2495 fjz2 = _mm_add_pd(fjz2,tz);
2497 /**************************
2498 * CALCULATE INTERACTIONS *
2499 **************************/
2501 r33 = _mm_mul_pd(rsq33,rinv33);
2503 /* Calculate table index by multiplying r with table scale and truncate to integer */
2504 rt = _mm_mul_pd(r33,vftabscale);
2505 vfitab = _mm_cvttpd_epi32(rt);
2506 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2507 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2509 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2510 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2511 F = _mm_setzero_pd();
2512 GMX_MM_TRANSPOSE2_PD(Y,F);
2513 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2514 H = _mm_setzero_pd();
2515 GMX_MM_TRANSPOSE2_PD(G,H);
2516 Heps = _mm_mul_pd(vfeps,H);
2517 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2518 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2519 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
2523 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2525 /* Calculate temporary vectorial force */
2526 tx = _mm_mul_pd(fscal,dx33);
2527 ty = _mm_mul_pd(fscal,dy33);
2528 tz = _mm_mul_pd(fscal,dz33);
2530 /* Update vectorial force */
2531 fix3 = _mm_add_pd(fix3,tx);
2532 fiy3 = _mm_add_pd(fiy3,ty);
2533 fiz3 = _mm_add_pd(fiz3,tz);
2535 fjx3 = _mm_add_pd(fjx3,tx);
2536 fjy3 = _mm_add_pd(fjy3,ty);
2537 fjz3 = _mm_add_pd(fjz3,tz);
2539 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2541 /* Inner loop uses 402 flops */
2544 /* End of innermost loop */
2546 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2547 f+i_coord_offset,fshift+i_shift_offset);
2549 /* Increment number of inner iterations */
2550 inneriter += j_index_end - j_index_start;
2552 /* Outer loop uses 24 flops */
2555 /* Increment number of outer iterations */
2558 /* Update outer/inner flops */
2560 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*402);