2 * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_double.h"
34 #include "kernelutil_x86_sse4_1_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse4_1_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse4_1_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. to the two different
56 * jnr indices corresponding to data put in the two positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B;
77 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B;
79 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B;
81 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
101 __m128i ifour = _mm_set1_epi32(4);
102 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
104 __m128d dummy_mask,cutoff_mask;
105 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
106 __m128d one = _mm_set1_pd(1.0);
107 __m128d two = _mm_set1_pd(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm_set1_pd(fr->epsfac);
120 charge = mdatoms->chargeA;
121 nvdwtype = fr->ntype;
123 vdwtype = mdatoms->typeA;
125 vftab = kernel_data->table_elec_vdw->data;
126 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
128 /* Setup water-specific parameters */
129 inr = nlist->iinr[0];
130 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
131 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
132 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
133 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
135 jq1 = _mm_set1_pd(charge[inr+1]);
136 jq2 = _mm_set1_pd(charge[inr+2]);
137 jq3 = _mm_set1_pd(charge[inr+3]);
138 vdwjidx0A = 2*vdwtype[inr+0];
139 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
140 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
141 qq11 = _mm_mul_pd(iq1,jq1);
142 qq12 = _mm_mul_pd(iq1,jq2);
143 qq13 = _mm_mul_pd(iq1,jq3);
144 qq21 = _mm_mul_pd(iq2,jq1);
145 qq22 = _mm_mul_pd(iq2,jq2);
146 qq23 = _mm_mul_pd(iq2,jq3);
147 qq31 = _mm_mul_pd(iq3,jq1);
148 qq32 = _mm_mul_pd(iq3,jq2);
149 qq33 = _mm_mul_pd(iq3,jq3);
151 /* Avoid stupid compiler warnings */
159 /* Start outer loop over neighborlists */
160 for(iidx=0; iidx<nri; iidx++)
162 /* Load shift vector for this list */
163 i_shift_offset = DIM*shiftidx[iidx];
165 /* Load limits for loop over neighbors */
166 j_index_start = jindex[iidx];
167 j_index_end = jindex[iidx+1];
169 /* Get outer coordinate index */
171 i_coord_offset = DIM*inr;
173 /* Load i particle coords and add shift vector */
174 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
175 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
177 fix0 = _mm_setzero_pd();
178 fiy0 = _mm_setzero_pd();
179 fiz0 = _mm_setzero_pd();
180 fix1 = _mm_setzero_pd();
181 fiy1 = _mm_setzero_pd();
182 fiz1 = _mm_setzero_pd();
183 fix2 = _mm_setzero_pd();
184 fiy2 = _mm_setzero_pd();
185 fiz2 = _mm_setzero_pd();
186 fix3 = _mm_setzero_pd();
187 fiy3 = _mm_setzero_pd();
188 fiz3 = _mm_setzero_pd();
190 /* Reset potential sums */
191 velecsum = _mm_setzero_pd();
192 vvdwsum = _mm_setzero_pd();
194 /* Start inner kernel loop */
195 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
198 /* Get j neighbor index, and coordinate index */
201 j_coord_offsetA = DIM*jnrA;
202 j_coord_offsetB = DIM*jnrB;
204 /* load j atom coordinates */
205 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
206 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
207 &jy2,&jz2,&jx3,&jy3,&jz3);
209 /* Calculate displacement vector */
210 dx00 = _mm_sub_pd(ix0,jx0);
211 dy00 = _mm_sub_pd(iy0,jy0);
212 dz00 = _mm_sub_pd(iz0,jz0);
213 dx11 = _mm_sub_pd(ix1,jx1);
214 dy11 = _mm_sub_pd(iy1,jy1);
215 dz11 = _mm_sub_pd(iz1,jz1);
216 dx12 = _mm_sub_pd(ix1,jx2);
217 dy12 = _mm_sub_pd(iy1,jy2);
218 dz12 = _mm_sub_pd(iz1,jz2);
219 dx13 = _mm_sub_pd(ix1,jx3);
220 dy13 = _mm_sub_pd(iy1,jy3);
221 dz13 = _mm_sub_pd(iz1,jz3);
222 dx21 = _mm_sub_pd(ix2,jx1);
223 dy21 = _mm_sub_pd(iy2,jy1);
224 dz21 = _mm_sub_pd(iz2,jz1);
225 dx22 = _mm_sub_pd(ix2,jx2);
226 dy22 = _mm_sub_pd(iy2,jy2);
227 dz22 = _mm_sub_pd(iz2,jz2);
228 dx23 = _mm_sub_pd(ix2,jx3);
229 dy23 = _mm_sub_pd(iy2,jy3);
230 dz23 = _mm_sub_pd(iz2,jz3);
231 dx31 = _mm_sub_pd(ix3,jx1);
232 dy31 = _mm_sub_pd(iy3,jy1);
233 dz31 = _mm_sub_pd(iz3,jz1);
234 dx32 = _mm_sub_pd(ix3,jx2);
235 dy32 = _mm_sub_pd(iy3,jy2);
236 dz32 = _mm_sub_pd(iz3,jz2);
237 dx33 = _mm_sub_pd(ix3,jx3);
238 dy33 = _mm_sub_pd(iy3,jy3);
239 dz33 = _mm_sub_pd(iz3,jz3);
241 /* Calculate squared distance and things based on it */
242 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
243 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
244 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
245 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
246 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
247 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
248 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
249 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
250 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
251 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
253 rinv00 = gmx_mm_invsqrt_pd(rsq00);
254 rinv11 = gmx_mm_invsqrt_pd(rsq11);
255 rinv12 = gmx_mm_invsqrt_pd(rsq12);
256 rinv13 = gmx_mm_invsqrt_pd(rsq13);
257 rinv21 = gmx_mm_invsqrt_pd(rsq21);
258 rinv22 = gmx_mm_invsqrt_pd(rsq22);
259 rinv23 = gmx_mm_invsqrt_pd(rsq23);
260 rinv31 = gmx_mm_invsqrt_pd(rsq31);
261 rinv32 = gmx_mm_invsqrt_pd(rsq32);
262 rinv33 = gmx_mm_invsqrt_pd(rsq33);
264 fjx0 = _mm_setzero_pd();
265 fjy0 = _mm_setzero_pd();
266 fjz0 = _mm_setzero_pd();
267 fjx1 = _mm_setzero_pd();
268 fjy1 = _mm_setzero_pd();
269 fjz1 = _mm_setzero_pd();
270 fjx2 = _mm_setzero_pd();
271 fjy2 = _mm_setzero_pd();
272 fjz2 = _mm_setzero_pd();
273 fjx3 = _mm_setzero_pd();
274 fjy3 = _mm_setzero_pd();
275 fjz3 = _mm_setzero_pd();
277 /**************************
278 * CALCULATE INTERACTIONS *
279 **************************/
281 r00 = _mm_mul_pd(rsq00,rinv00);
283 /* Calculate table index by multiplying r with table scale and truncate to integer */
284 rt = _mm_mul_pd(r00,vftabscale);
285 vfitab = _mm_cvttpd_epi32(rt);
286 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
287 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
289 /* CUBIC SPLINE TABLE DISPERSION */
290 vfitab = _mm_add_epi32(vfitab,ifour);
291 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
292 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
293 GMX_MM_TRANSPOSE2_PD(Y,F);
294 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
295 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
296 GMX_MM_TRANSPOSE2_PD(G,H);
297 Heps = _mm_mul_pd(vfeps,H);
298 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
299 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
300 vvdw6 = _mm_mul_pd(c6_00,VV);
301 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
302 fvdw6 = _mm_mul_pd(c6_00,FF);
304 /* CUBIC SPLINE TABLE REPULSION */
305 vfitab = _mm_add_epi32(vfitab,ifour);
306 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
307 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
308 GMX_MM_TRANSPOSE2_PD(Y,F);
309 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
310 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
311 GMX_MM_TRANSPOSE2_PD(G,H);
312 Heps = _mm_mul_pd(vfeps,H);
313 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
314 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
315 vvdw12 = _mm_mul_pd(c12_00,VV);
316 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
317 fvdw12 = _mm_mul_pd(c12_00,FF);
318 vvdw = _mm_add_pd(vvdw12,vvdw6);
319 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
321 /* Update potential sum for this i atom from the interaction with this j atom. */
322 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
326 /* Calculate temporary vectorial force */
327 tx = _mm_mul_pd(fscal,dx00);
328 ty = _mm_mul_pd(fscal,dy00);
329 tz = _mm_mul_pd(fscal,dz00);
331 /* Update vectorial force */
332 fix0 = _mm_add_pd(fix0,tx);
333 fiy0 = _mm_add_pd(fiy0,ty);
334 fiz0 = _mm_add_pd(fiz0,tz);
336 fjx0 = _mm_add_pd(fjx0,tx);
337 fjy0 = _mm_add_pd(fjy0,ty);
338 fjz0 = _mm_add_pd(fjz0,tz);
340 /**************************
341 * CALCULATE INTERACTIONS *
342 **************************/
344 r11 = _mm_mul_pd(rsq11,rinv11);
346 /* Calculate table index by multiplying r with table scale and truncate to integer */
347 rt = _mm_mul_pd(r11,vftabscale);
348 vfitab = _mm_cvttpd_epi32(rt);
349 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
350 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
352 /* CUBIC SPLINE TABLE ELECTROSTATICS */
353 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
354 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
355 GMX_MM_TRANSPOSE2_PD(Y,F);
356 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
357 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
358 GMX_MM_TRANSPOSE2_PD(G,H);
359 Heps = _mm_mul_pd(vfeps,H);
360 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
361 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
362 velec = _mm_mul_pd(qq11,VV);
363 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
364 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
366 /* Update potential sum for this i atom from the interaction with this j atom. */
367 velecsum = _mm_add_pd(velecsum,velec);
371 /* Calculate temporary vectorial force */
372 tx = _mm_mul_pd(fscal,dx11);
373 ty = _mm_mul_pd(fscal,dy11);
374 tz = _mm_mul_pd(fscal,dz11);
376 /* Update vectorial force */
377 fix1 = _mm_add_pd(fix1,tx);
378 fiy1 = _mm_add_pd(fiy1,ty);
379 fiz1 = _mm_add_pd(fiz1,tz);
381 fjx1 = _mm_add_pd(fjx1,tx);
382 fjy1 = _mm_add_pd(fjy1,ty);
383 fjz1 = _mm_add_pd(fjz1,tz);
385 /**************************
386 * CALCULATE INTERACTIONS *
387 **************************/
389 r12 = _mm_mul_pd(rsq12,rinv12);
391 /* Calculate table index by multiplying r with table scale and truncate to integer */
392 rt = _mm_mul_pd(r12,vftabscale);
393 vfitab = _mm_cvttpd_epi32(rt);
394 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
395 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
397 /* CUBIC SPLINE TABLE ELECTROSTATICS */
398 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
399 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
400 GMX_MM_TRANSPOSE2_PD(Y,F);
401 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
402 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
403 GMX_MM_TRANSPOSE2_PD(G,H);
404 Heps = _mm_mul_pd(vfeps,H);
405 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
406 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
407 velec = _mm_mul_pd(qq12,VV);
408 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
409 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
411 /* Update potential sum for this i atom from the interaction with this j atom. */
412 velecsum = _mm_add_pd(velecsum,velec);
416 /* Calculate temporary vectorial force */
417 tx = _mm_mul_pd(fscal,dx12);
418 ty = _mm_mul_pd(fscal,dy12);
419 tz = _mm_mul_pd(fscal,dz12);
421 /* Update vectorial force */
422 fix1 = _mm_add_pd(fix1,tx);
423 fiy1 = _mm_add_pd(fiy1,ty);
424 fiz1 = _mm_add_pd(fiz1,tz);
426 fjx2 = _mm_add_pd(fjx2,tx);
427 fjy2 = _mm_add_pd(fjy2,ty);
428 fjz2 = _mm_add_pd(fjz2,tz);
430 /**************************
431 * CALCULATE INTERACTIONS *
432 **************************/
434 r13 = _mm_mul_pd(rsq13,rinv13);
436 /* Calculate table index by multiplying r with table scale and truncate to integer */
437 rt = _mm_mul_pd(r13,vftabscale);
438 vfitab = _mm_cvttpd_epi32(rt);
439 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
440 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
442 /* CUBIC SPLINE TABLE ELECTROSTATICS */
443 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
444 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
445 GMX_MM_TRANSPOSE2_PD(Y,F);
446 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
447 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
448 GMX_MM_TRANSPOSE2_PD(G,H);
449 Heps = _mm_mul_pd(vfeps,H);
450 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
451 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
452 velec = _mm_mul_pd(qq13,VV);
453 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
454 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
456 /* Update potential sum for this i atom from the interaction with this j atom. */
457 velecsum = _mm_add_pd(velecsum,velec);
461 /* Calculate temporary vectorial force */
462 tx = _mm_mul_pd(fscal,dx13);
463 ty = _mm_mul_pd(fscal,dy13);
464 tz = _mm_mul_pd(fscal,dz13);
466 /* Update vectorial force */
467 fix1 = _mm_add_pd(fix1,tx);
468 fiy1 = _mm_add_pd(fiy1,ty);
469 fiz1 = _mm_add_pd(fiz1,tz);
471 fjx3 = _mm_add_pd(fjx3,tx);
472 fjy3 = _mm_add_pd(fjy3,ty);
473 fjz3 = _mm_add_pd(fjz3,tz);
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
479 r21 = _mm_mul_pd(rsq21,rinv21);
481 /* Calculate table index by multiplying r with table scale and truncate to integer */
482 rt = _mm_mul_pd(r21,vftabscale);
483 vfitab = _mm_cvttpd_epi32(rt);
484 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
485 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
487 /* CUBIC SPLINE TABLE ELECTROSTATICS */
488 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
489 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
490 GMX_MM_TRANSPOSE2_PD(Y,F);
491 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
492 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
493 GMX_MM_TRANSPOSE2_PD(G,H);
494 Heps = _mm_mul_pd(vfeps,H);
495 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
496 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
497 velec = _mm_mul_pd(qq21,VV);
498 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
499 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
501 /* Update potential sum for this i atom from the interaction with this j atom. */
502 velecsum = _mm_add_pd(velecsum,velec);
506 /* Calculate temporary vectorial force */
507 tx = _mm_mul_pd(fscal,dx21);
508 ty = _mm_mul_pd(fscal,dy21);
509 tz = _mm_mul_pd(fscal,dz21);
511 /* Update vectorial force */
512 fix2 = _mm_add_pd(fix2,tx);
513 fiy2 = _mm_add_pd(fiy2,ty);
514 fiz2 = _mm_add_pd(fiz2,tz);
516 fjx1 = _mm_add_pd(fjx1,tx);
517 fjy1 = _mm_add_pd(fjy1,ty);
518 fjz1 = _mm_add_pd(fjz1,tz);
520 /**************************
521 * CALCULATE INTERACTIONS *
522 **************************/
524 r22 = _mm_mul_pd(rsq22,rinv22);
526 /* Calculate table index by multiplying r with table scale and truncate to integer */
527 rt = _mm_mul_pd(r22,vftabscale);
528 vfitab = _mm_cvttpd_epi32(rt);
529 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
530 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
532 /* CUBIC SPLINE TABLE ELECTROSTATICS */
533 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
534 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
535 GMX_MM_TRANSPOSE2_PD(Y,F);
536 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
537 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
538 GMX_MM_TRANSPOSE2_PD(G,H);
539 Heps = _mm_mul_pd(vfeps,H);
540 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
541 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
542 velec = _mm_mul_pd(qq22,VV);
543 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
544 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
546 /* Update potential sum for this i atom from the interaction with this j atom. */
547 velecsum = _mm_add_pd(velecsum,velec);
551 /* Calculate temporary vectorial force */
552 tx = _mm_mul_pd(fscal,dx22);
553 ty = _mm_mul_pd(fscal,dy22);
554 tz = _mm_mul_pd(fscal,dz22);
556 /* Update vectorial force */
557 fix2 = _mm_add_pd(fix2,tx);
558 fiy2 = _mm_add_pd(fiy2,ty);
559 fiz2 = _mm_add_pd(fiz2,tz);
561 fjx2 = _mm_add_pd(fjx2,tx);
562 fjy2 = _mm_add_pd(fjy2,ty);
563 fjz2 = _mm_add_pd(fjz2,tz);
565 /**************************
566 * CALCULATE INTERACTIONS *
567 **************************/
569 r23 = _mm_mul_pd(rsq23,rinv23);
571 /* Calculate table index by multiplying r with table scale and truncate to integer */
572 rt = _mm_mul_pd(r23,vftabscale);
573 vfitab = _mm_cvttpd_epi32(rt);
574 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
575 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
577 /* CUBIC SPLINE TABLE ELECTROSTATICS */
578 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
579 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
580 GMX_MM_TRANSPOSE2_PD(Y,F);
581 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
582 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
583 GMX_MM_TRANSPOSE2_PD(G,H);
584 Heps = _mm_mul_pd(vfeps,H);
585 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
586 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
587 velec = _mm_mul_pd(qq23,VV);
588 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
589 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
591 /* Update potential sum for this i atom from the interaction with this j atom. */
592 velecsum = _mm_add_pd(velecsum,velec);
596 /* Calculate temporary vectorial force */
597 tx = _mm_mul_pd(fscal,dx23);
598 ty = _mm_mul_pd(fscal,dy23);
599 tz = _mm_mul_pd(fscal,dz23);
601 /* Update vectorial force */
602 fix2 = _mm_add_pd(fix2,tx);
603 fiy2 = _mm_add_pd(fiy2,ty);
604 fiz2 = _mm_add_pd(fiz2,tz);
606 fjx3 = _mm_add_pd(fjx3,tx);
607 fjy3 = _mm_add_pd(fjy3,ty);
608 fjz3 = _mm_add_pd(fjz3,tz);
610 /**************************
611 * CALCULATE INTERACTIONS *
612 **************************/
614 r31 = _mm_mul_pd(rsq31,rinv31);
616 /* Calculate table index by multiplying r with table scale and truncate to integer */
617 rt = _mm_mul_pd(r31,vftabscale);
618 vfitab = _mm_cvttpd_epi32(rt);
619 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
620 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
622 /* CUBIC SPLINE TABLE ELECTROSTATICS */
623 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
624 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
625 GMX_MM_TRANSPOSE2_PD(Y,F);
626 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
627 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
628 GMX_MM_TRANSPOSE2_PD(G,H);
629 Heps = _mm_mul_pd(vfeps,H);
630 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
631 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
632 velec = _mm_mul_pd(qq31,VV);
633 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
634 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
636 /* Update potential sum for this i atom from the interaction with this j atom. */
637 velecsum = _mm_add_pd(velecsum,velec);
641 /* Calculate temporary vectorial force */
642 tx = _mm_mul_pd(fscal,dx31);
643 ty = _mm_mul_pd(fscal,dy31);
644 tz = _mm_mul_pd(fscal,dz31);
646 /* Update vectorial force */
647 fix3 = _mm_add_pd(fix3,tx);
648 fiy3 = _mm_add_pd(fiy3,ty);
649 fiz3 = _mm_add_pd(fiz3,tz);
651 fjx1 = _mm_add_pd(fjx1,tx);
652 fjy1 = _mm_add_pd(fjy1,ty);
653 fjz1 = _mm_add_pd(fjz1,tz);
655 /**************************
656 * CALCULATE INTERACTIONS *
657 **************************/
659 r32 = _mm_mul_pd(rsq32,rinv32);
661 /* Calculate table index by multiplying r with table scale and truncate to integer */
662 rt = _mm_mul_pd(r32,vftabscale);
663 vfitab = _mm_cvttpd_epi32(rt);
664 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
665 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
667 /* CUBIC SPLINE TABLE ELECTROSTATICS */
668 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
669 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
670 GMX_MM_TRANSPOSE2_PD(Y,F);
671 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
672 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
673 GMX_MM_TRANSPOSE2_PD(G,H);
674 Heps = _mm_mul_pd(vfeps,H);
675 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
676 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
677 velec = _mm_mul_pd(qq32,VV);
678 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
679 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
681 /* Update potential sum for this i atom from the interaction with this j atom. */
682 velecsum = _mm_add_pd(velecsum,velec);
686 /* Calculate temporary vectorial force */
687 tx = _mm_mul_pd(fscal,dx32);
688 ty = _mm_mul_pd(fscal,dy32);
689 tz = _mm_mul_pd(fscal,dz32);
691 /* Update vectorial force */
692 fix3 = _mm_add_pd(fix3,tx);
693 fiy3 = _mm_add_pd(fiy3,ty);
694 fiz3 = _mm_add_pd(fiz3,tz);
696 fjx2 = _mm_add_pd(fjx2,tx);
697 fjy2 = _mm_add_pd(fjy2,ty);
698 fjz2 = _mm_add_pd(fjz2,tz);
700 /**************************
701 * CALCULATE INTERACTIONS *
702 **************************/
704 r33 = _mm_mul_pd(rsq33,rinv33);
706 /* Calculate table index by multiplying r with table scale and truncate to integer */
707 rt = _mm_mul_pd(r33,vftabscale);
708 vfitab = _mm_cvttpd_epi32(rt);
709 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
710 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
712 /* CUBIC SPLINE TABLE ELECTROSTATICS */
713 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
714 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
715 GMX_MM_TRANSPOSE2_PD(Y,F);
716 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
717 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
718 GMX_MM_TRANSPOSE2_PD(G,H);
719 Heps = _mm_mul_pd(vfeps,H);
720 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
721 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
722 velec = _mm_mul_pd(qq33,VV);
723 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
724 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
726 /* Update potential sum for this i atom from the interaction with this j atom. */
727 velecsum = _mm_add_pd(velecsum,velec);
731 /* Calculate temporary vectorial force */
732 tx = _mm_mul_pd(fscal,dx33);
733 ty = _mm_mul_pd(fscal,dy33);
734 tz = _mm_mul_pd(fscal,dz33);
736 /* Update vectorial force */
737 fix3 = _mm_add_pd(fix3,tx);
738 fiy3 = _mm_add_pd(fiy3,ty);
739 fiz3 = _mm_add_pd(fiz3,tz);
741 fjx3 = _mm_add_pd(fjx3,tx);
742 fjy3 = _mm_add_pd(fjy3,ty);
743 fjz3 = _mm_add_pd(fjz3,tz);
745 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
747 /* Inner loop uses 446 flops */
754 j_coord_offsetA = DIM*jnrA;
756 /* load j atom coordinates */
757 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
758 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
759 &jy2,&jz2,&jx3,&jy3,&jz3);
761 /* Calculate displacement vector */
762 dx00 = _mm_sub_pd(ix0,jx0);
763 dy00 = _mm_sub_pd(iy0,jy0);
764 dz00 = _mm_sub_pd(iz0,jz0);
765 dx11 = _mm_sub_pd(ix1,jx1);
766 dy11 = _mm_sub_pd(iy1,jy1);
767 dz11 = _mm_sub_pd(iz1,jz1);
768 dx12 = _mm_sub_pd(ix1,jx2);
769 dy12 = _mm_sub_pd(iy1,jy2);
770 dz12 = _mm_sub_pd(iz1,jz2);
771 dx13 = _mm_sub_pd(ix1,jx3);
772 dy13 = _mm_sub_pd(iy1,jy3);
773 dz13 = _mm_sub_pd(iz1,jz3);
774 dx21 = _mm_sub_pd(ix2,jx1);
775 dy21 = _mm_sub_pd(iy2,jy1);
776 dz21 = _mm_sub_pd(iz2,jz1);
777 dx22 = _mm_sub_pd(ix2,jx2);
778 dy22 = _mm_sub_pd(iy2,jy2);
779 dz22 = _mm_sub_pd(iz2,jz2);
780 dx23 = _mm_sub_pd(ix2,jx3);
781 dy23 = _mm_sub_pd(iy2,jy3);
782 dz23 = _mm_sub_pd(iz2,jz3);
783 dx31 = _mm_sub_pd(ix3,jx1);
784 dy31 = _mm_sub_pd(iy3,jy1);
785 dz31 = _mm_sub_pd(iz3,jz1);
786 dx32 = _mm_sub_pd(ix3,jx2);
787 dy32 = _mm_sub_pd(iy3,jy2);
788 dz32 = _mm_sub_pd(iz3,jz2);
789 dx33 = _mm_sub_pd(ix3,jx3);
790 dy33 = _mm_sub_pd(iy3,jy3);
791 dz33 = _mm_sub_pd(iz3,jz3);
793 /* Calculate squared distance and things based on it */
794 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
795 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
796 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
797 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
798 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
799 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
800 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
801 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
802 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
803 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
805 rinv00 = gmx_mm_invsqrt_pd(rsq00);
806 rinv11 = gmx_mm_invsqrt_pd(rsq11);
807 rinv12 = gmx_mm_invsqrt_pd(rsq12);
808 rinv13 = gmx_mm_invsqrt_pd(rsq13);
809 rinv21 = gmx_mm_invsqrt_pd(rsq21);
810 rinv22 = gmx_mm_invsqrt_pd(rsq22);
811 rinv23 = gmx_mm_invsqrt_pd(rsq23);
812 rinv31 = gmx_mm_invsqrt_pd(rsq31);
813 rinv32 = gmx_mm_invsqrt_pd(rsq32);
814 rinv33 = gmx_mm_invsqrt_pd(rsq33);
816 fjx0 = _mm_setzero_pd();
817 fjy0 = _mm_setzero_pd();
818 fjz0 = _mm_setzero_pd();
819 fjx1 = _mm_setzero_pd();
820 fjy1 = _mm_setzero_pd();
821 fjz1 = _mm_setzero_pd();
822 fjx2 = _mm_setzero_pd();
823 fjy2 = _mm_setzero_pd();
824 fjz2 = _mm_setzero_pd();
825 fjx3 = _mm_setzero_pd();
826 fjy3 = _mm_setzero_pd();
827 fjz3 = _mm_setzero_pd();
829 /**************************
830 * CALCULATE INTERACTIONS *
831 **************************/
833 r00 = _mm_mul_pd(rsq00,rinv00);
835 /* Calculate table index by multiplying r with table scale and truncate to integer */
836 rt = _mm_mul_pd(r00,vftabscale);
837 vfitab = _mm_cvttpd_epi32(rt);
838 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
839 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
841 /* CUBIC SPLINE TABLE DISPERSION */
842 vfitab = _mm_add_epi32(vfitab,ifour);
843 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
844 F = _mm_setzero_pd();
845 GMX_MM_TRANSPOSE2_PD(Y,F);
846 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
847 H = _mm_setzero_pd();
848 GMX_MM_TRANSPOSE2_PD(G,H);
849 Heps = _mm_mul_pd(vfeps,H);
850 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
851 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
852 vvdw6 = _mm_mul_pd(c6_00,VV);
853 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
854 fvdw6 = _mm_mul_pd(c6_00,FF);
856 /* CUBIC SPLINE TABLE REPULSION */
857 vfitab = _mm_add_epi32(vfitab,ifour);
858 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
859 F = _mm_setzero_pd();
860 GMX_MM_TRANSPOSE2_PD(Y,F);
861 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
862 H = _mm_setzero_pd();
863 GMX_MM_TRANSPOSE2_PD(G,H);
864 Heps = _mm_mul_pd(vfeps,H);
865 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
866 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
867 vvdw12 = _mm_mul_pd(c12_00,VV);
868 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
869 fvdw12 = _mm_mul_pd(c12_00,FF);
870 vvdw = _mm_add_pd(vvdw12,vvdw6);
871 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
875 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
879 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
881 /* Calculate temporary vectorial force */
882 tx = _mm_mul_pd(fscal,dx00);
883 ty = _mm_mul_pd(fscal,dy00);
884 tz = _mm_mul_pd(fscal,dz00);
886 /* Update vectorial force */
887 fix0 = _mm_add_pd(fix0,tx);
888 fiy0 = _mm_add_pd(fiy0,ty);
889 fiz0 = _mm_add_pd(fiz0,tz);
891 fjx0 = _mm_add_pd(fjx0,tx);
892 fjy0 = _mm_add_pd(fjy0,ty);
893 fjz0 = _mm_add_pd(fjz0,tz);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 r11 = _mm_mul_pd(rsq11,rinv11);
901 /* Calculate table index by multiplying r with table scale and truncate to integer */
902 rt = _mm_mul_pd(r11,vftabscale);
903 vfitab = _mm_cvttpd_epi32(rt);
904 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
905 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
907 /* CUBIC SPLINE TABLE ELECTROSTATICS */
908 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
909 F = _mm_setzero_pd();
910 GMX_MM_TRANSPOSE2_PD(Y,F);
911 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
912 H = _mm_setzero_pd();
913 GMX_MM_TRANSPOSE2_PD(G,H);
914 Heps = _mm_mul_pd(vfeps,H);
915 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
916 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
917 velec = _mm_mul_pd(qq11,VV);
918 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
919 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
921 /* Update potential sum for this i atom from the interaction with this j atom. */
922 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
923 velecsum = _mm_add_pd(velecsum,velec);
927 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
929 /* Calculate temporary vectorial force */
930 tx = _mm_mul_pd(fscal,dx11);
931 ty = _mm_mul_pd(fscal,dy11);
932 tz = _mm_mul_pd(fscal,dz11);
934 /* Update vectorial force */
935 fix1 = _mm_add_pd(fix1,tx);
936 fiy1 = _mm_add_pd(fiy1,ty);
937 fiz1 = _mm_add_pd(fiz1,tz);
939 fjx1 = _mm_add_pd(fjx1,tx);
940 fjy1 = _mm_add_pd(fjy1,ty);
941 fjz1 = _mm_add_pd(fjz1,tz);
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 r12 = _mm_mul_pd(rsq12,rinv12);
949 /* Calculate table index by multiplying r with table scale and truncate to integer */
950 rt = _mm_mul_pd(r12,vftabscale);
951 vfitab = _mm_cvttpd_epi32(rt);
952 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
953 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
955 /* CUBIC SPLINE TABLE ELECTROSTATICS */
956 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
957 F = _mm_setzero_pd();
958 GMX_MM_TRANSPOSE2_PD(Y,F);
959 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
960 H = _mm_setzero_pd();
961 GMX_MM_TRANSPOSE2_PD(G,H);
962 Heps = _mm_mul_pd(vfeps,H);
963 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
964 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
965 velec = _mm_mul_pd(qq12,VV);
966 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
967 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
969 /* Update potential sum for this i atom from the interaction with this j atom. */
970 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
971 velecsum = _mm_add_pd(velecsum,velec);
975 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
977 /* Calculate temporary vectorial force */
978 tx = _mm_mul_pd(fscal,dx12);
979 ty = _mm_mul_pd(fscal,dy12);
980 tz = _mm_mul_pd(fscal,dz12);
982 /* Update vectorial force */
983 fix1 = _mm_add_pd(fix1,tx);
984 fiy1 = _mm_add_pd(fiy1,ty);
985 fiz1 = _mm_add_pd(fiz1,tz);
987 fjx2 = _mm_add_pd(fjx2,tx);
988 fjy2 = _mm_add_pd(fjy2,ty);
989 fjz2 = _mm_add_pd(fjz2,tz);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 r13 = _mm_mul_pd(rsq13,rinv13);
997 /* Calculate table index by multiplying r with table scale and truncate to integer */
998 rt = _mm_mul_pd(r13,vftabscale);
999 vfitab = _mm_cvttpd_epi32(rt);
1000 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1001 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1003 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1004 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1005 F = _mm_setzero_pd();
1006 GMX_MM_TRANSPOSE2_PD(Y,F);
1007 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1008 H = _mm_setzero_pd();
1009 GMX_MM_TRANSPOSE2_PD(G,H);
1010 Heps = _mm_mul_pd(vfeps,H);
1011 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1012 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1013 velec = _mm_mul_pd(qq13,VV);
1014 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1015 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1017 /* Update potential sum for this i atom from the interaction with this j atom. */
1018 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1019 velecsum = _mm_add_pd(velecsum,velec);
1023 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1025 /* Calculate temporary vectorial force */
1026 tx = _mm_mul_pd(fscal,dx13);
1027 ty = _mm_mul_pd(fscal,dy13);
1028 tz = _mm_mul_pd(fscal,dz13);
1030 /* Update vectorial force */
1031 fix1 = _mm_add_pd(fix1,tx);
1032 fiy1 = _mm_add_pd(fiy1,ty);
1033 fiz1 = _mm_add_pd(fiz1,tz);
1035 fjx3 = _mm_add_pd(fjx3,tx);
1036 fjy3 = _mm_add_pd(fjy3,ty);
1037 fjz3 = _mm_add_pd(fjz3,tz);
1039 /**************************
1040 * CALCULATE INTERACTIONS *
1041 **************************/
1043 r21 = _mm_mul_pd(rsq21,rinv21);
1045 /* Calculate table index by multiplying r with table scale and truncate to integer */
1046 rt = _mm_mul_pd(r21,vftabscale);
1047 vfitab = _mm_cvttpd_epi32(rt);
1048 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1049 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1051 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1052 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1053 F = _mm_setzero_pd();
1054 GMX_MM_TRANSPOSE2_PD(Y,F);
1055 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1056 H = _mm_setzero_pd();
1057 GMX_MM_TRANSPOSE2_PD(G,H);
1058 Heps = _mm_mul_pd(vfeps,H);
1059 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1060 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1061 velec = _mm_mul_pd(qq21,VV);
1062 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1063 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1065 /* Update potential sum for this i atom from the interaction with this j atom. */
1066 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1067 velecsum = _mm_add_pd(velecsum,velec);
1071 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1073 /* Calculate temporary vectorial force */
1074 tx = _mm_mul_pd(fscal,dx21);
1075 ty = _mm_mul_pd(fscal,dy21);
1076 tz = _mm_mul_pd(fscal,dz21);
1078 /* Update vectorial force */
1079 fix2 = _mm_add_pd(fix2,tx);
1080 fiy2 = _mm_add_pd(fiy2,ty);
1081 fiz2 = _mm_add_pd(fiz2,tz);
1083 fjx1 = _mm_add_pd(fjx1,tx);
1084 fjy1 = _mm_add_pd(fjy1,ty);
1085 fjz1 = _mm_add_pd(fjz1,tz);
1087 /**************************
1088 * CALCULATE INTERACTIONS *
1089 **************************/
1091 r22 = _mm_mul_pd(rsq22,rinv22);
1093 /* Calculate table index by multiplying r with table scale and truncate to integer */
1094 rt = _mm_mul_pd(r22,vftabscale);
1095 vfitab = _mm_cvttpd_epi32(rt);
1096 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1097 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1099 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1100 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1101 F = _mm_setzero_pd();
1102 GMX_MM_TRANSPOSE2_PD(Y,F);
1103 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1104 H = _mm_setzero_pd();
1105 GMX_MM_TRANSPOSE2_PD(G,H);
1106 Heps = _mm_mul_pd(vfeps,H);
1107 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1108 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1109 velec = _mm_mul_pd(qq22,VV);
1110 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1111 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1113 /* Update potential sum for this i atom from the interaction with this j atom. */
1114 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1115 velecsum = _mm_add_pd(velecsum,velec);
1119 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1121 /* Calculate temporary vectorial force */
1122 tx = _mm_mul_pd(fscal,dx22);
1123 ty = _mm_mul_pd(fscal,dy22);
1124 tz = _mm_mul_pd(fscal,dz22);
1126 /* Update vectorial force */
1127 fix2 = _mm_add_pd(fix2,tx);
1128 fiy2 = _mm_add_pd(fiy2,ty);
1129 fiz2 = _mm_add_pd(fiz2,tz);
1131 fjx2 = _mm_add_pd(fjx2,tx);
1132 fjy2 = _mm_add_pd(fjy2,ty);
1133 fjz2 = _mm_add_pd(fjz2,tz);
1135 /**************************
1136 * CALCULATE INTERACTIONS *
1137 **************************/
1139 r23 = _mm_mul_pd(rsq23,rinv23);
1141 /* Calculate table index by multiplying r with table scale and truncate to integer */
1142 rt = _mm_mul_pd(r23,vftabscale);
1143 vfitab = _mm_cvttpd_epi32(rt);
1144 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1145 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1147 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1148 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1149 F = _mm_setzero_pd();
1150 GMX_MM_TRANSPOSE2_PD(Y,F);
1151 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1152 H = _mm_setzero_pd();
1153 GMX_MM_TRANSPOSE2_PD(G,H);
1154 Heps = _mm_mul_pd(vfeps,H);
1155 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1156 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1157 velec = _mm_mul_pd(qq23,VV);
1158 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1159 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1161 /* Update potential sum for this i atom from the interaction with this j atom. */
1162 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1163 velecsum = _mm_add_pd(velecsum,velec);
1167 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1169 /* Calculate temporary vectorial force */
1170 tx = _mm_mul_pd(fscal,dx23);
1171 ty = _mm_mul_pd(fscal,dy23);
1172 tz = _mm_mul_pd(fscal,dz23);
1174 /* Update vectorial force */
1175 fix2 = _mm_add_pd(fix2,tx);
1176 fiy2 = _mm_add_pd(fiy2,ty);
1177 fiz2 = _mm_add_pd(fiz2,tz);
1179 fjx3 = _mm_add_pd(fjx3,tx);
1180 fjy3 = _mm_add_pd(fjy3,ty);
1181 fjz3 = _mm_add_pd(fjz3,tz);
1183 /**************************
1184 * CALCULATE INTERACTIONS *
1185 **************************/
1187 r31 = _mm_mul_pd(rsq31,rinv31);
1189 /* Calculate table index by multiplying r with table scale and truncate to integer */
1190 rt = _mm_mul_pd(r31,vftabscale);
1191 vfitab = _mm_cvttpd_epi32(rt);
1192 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1193 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1195 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1196 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1197 F = _mm_setzero_pd();
1198 GMX_MM_TRANSPOSE2_PD(Y,F);
1199 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1200 H = _mm_setzero_pd();
1201 GMX_MM_TRANSPOSE2_PD(G,H);
1202 Heps = _mm_mul_pd(vfeps,H);
1203 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1204 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1205 velec = _mm_mul_pd(qq31,VV);
1206 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1207 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1209 /* Update potential sum for this i atom from the interaction with this j atom. */
1210 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1211 velecsum = _mm_add_pd(velecsum,velec);
1215 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1217 /* Calculate temporary vectorial force */
1218 tx = _mm_mul_pd(fscal,dx31);
1219 ty = _mm_mul_pd(fscal,dy31);
1220 tz = _mm_mul_pd(fscal,dz31);
1222 /* Update vectorial force */
1223 fix3 = _mm_add_pd(fix3,tx);
1224 fiy3 = _mm_add_pd(fiy3,ty);
1225 fiz3 = _mm_add_pd(fiz3,tz);
1227 fjx1 = _mm_add_pd(fjx1,tx);
1228 fjy1 = _mm_add_pd(fjy1,ty);
1229 fjz1 = _mm_add_pd(fjz1,tz);
1231 /**************************
1232 * CALCULATE INTERACTIONS *
1233 **************************/
1235 r32 = _mm_mul_pd(rsq32,rinv32);
1237 /* Calculate table index by multiplying r with table scale and truncate to integer */
1238 rt = _mm_mul_pd(r32,vftabscale);
1239 vfitab = _mm_cvttpd_epi32(rt);
1240 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1241 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1243 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1244 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1245 F = _mm_setzero_pd();
1246 GMX_MM_TRANSPOSE2_PD(Y,F);
1247 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1248 H = _mm_setzero_pd();
1249 GMX_MM_TRANSPOSE2_PD(G,H);
1250 Heps = _mm_mul_pd(vfeps,H);
1251 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1252 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1253 velec = _mm_mul_pd(qq32,VV);
1254 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1255 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1257 /* Update potential sum for this i atom from the interaction with this j atom. */
1258 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1259 velecsum = _mm_add_pd(velecsum,velec);
1263 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1265 /* Calculate temporary vectorial force */
1266 tx = _mm_mul_pd(fscal,dx32);
1267 ty = _mm_mul_pd(fscal,dy32);
1268 tz = _mm_mul_pd(fscal,dz32);
1270 /* Update vectorial force */
1271 fix3 = _mm_add_pd(fix3,tx);
1272 fiy3 = _mm_add_pd(fiy3,ty);
1273 fiz3 = _mm_add_pd(fiz3,tz);
1275 fjx2 = _mm_add_pd(fjx2,tx);
1276 fjy2 = _mm_add_pd(fjy2,ty);
1277 fjz2 = _mm_add_pd(fjz2,tz);
1279 /**************************
1280 * CALCULATE INTERACTIONS *
1281 **************************/
1283 r33 = _mm_mul_pd(rsq33,rinv33);
1285 /* Calculate table index by multiplying r with table scale and truncate to integer */
1286 rt = _mm_mul_pd(r33,vftabscale);
1287 vfitab = _mm_cvttpd_epi32(rt);
1288 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1289 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1291 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1292 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1293 F = _mm_setzero_pd();
1294 GMX_MM_TRANSPOSE2_PD(Y,F);
1295 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1296 H = _mm_setzero_pd();
1297 GMX_MM_TRANSPOSE2_PD(G,H);
1298 Heps = _mm_mul_pd(vfeps,H);
1299 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1300 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1301 velec = _mm_mul_pd(qq33,VV);
1302 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1303 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1305 /* Update potential sum for this i atom from the interaction with this j atom. */
1306 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1307 velecsum = _mm_add_pd(velecsum,velec);
1311 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1313 /* Calculate temporary vectorial force */
1314 tx = _mm_mul_pd(fscal,dx33);
1315 ty = _mm_mul_pd(fscal,dy33);
1316 tz = _mm_mul_pd(fscal,dz33);
1318 /* Update vectorial force */
1319 fix3 = _mm_add_pd(fix3,tx);
1320 fiy3 = _mm_add_pd(fiy3,ty);
1321 fiz3 = _mm_add_pd(fiz3,tz);
1323 fjx3 = _mm_add_pd(fjx3,tx);
1324 fjy3 = _mm_add_pd(fjy3,ty);
1325 fjz3 = _mm_add_pd(fjz3,tz);
1327 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1329 /* Inner loop uses 446 flops */
1332 /* End of innermost loop */
1334 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1335 f+i_coord_offset,fshift+i_shift_offset);
1338 /* Update potential energies */
1339 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1340 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1342 /* Increment number of inner iterations */
1343 inneriter += j_index_end - j_index_start;
1345 /* Outer loop uses 26 flops */
1348 /* Increment number of outer iterations */
1351 /* Update outer/inner flops */
1353 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*446);
1356 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_double
1357 * Electrostatics interaction: CubicSplineTable
1358 * VdW interaction: CubicSplineTable
1359 * Geometry: Water4-Water4
1360 * Calculate force/pot: Force
1363 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse4_1_double
1364 (t_nblist * gmx_restrict nlist,
1365 rvec * gmx_restrict xx,
1366 rvec * gmx_restrict ff,
1367 t_forcerec * gmx_restrict fr,
1368 t_mdatoms * gmx_restrict mdatoms,
1369 nb_kernel_data_t * gmx_restrict kernel_data,
1370 t_nrnb * gmx_restrict nrnb)
1372 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1373 * just 0 for non-waters.
1374 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1375 * jnr indices corresponding to data put in the two positions in the SIMD register.
1377 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1378 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1380 int j_coord_offsetA,j_coord_offsetB;
1381 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1382 real rcutoff_scalar;
1383 real *shiftvec,*fshift,*x,*f;
1384 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1386 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1388 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1390 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1392 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1393 int vdwjidx0A,vdwjidx0B;
1394 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1395 int vdwjidx1A,vdwjidx1B;
1396 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1397 int vdwjidx2A,vdwjidx2B;
1398 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1399 int vdwjidx3A,vdwjidx3B;
1400 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1401 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1402 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1403 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1404 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1405 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1406 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1407 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1408 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1409 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1410 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1411 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1414 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1417 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1418 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1420 __m128i ifour = _mm_set1_epi32(4);
1421 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1423 __m128d dummy_mask,cutoff_mask;
1424 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1425 __m128d one = _mm_set1_pd(1.0);
1426 __m128d two = _mm_set1_pd(2.0);
1432 jindex = nlist->jindex;
1434 shiftidx = nlist->shift;
1436 shiftvec = fr->shift_vec[0];
1437 fshift = fr->fshift[0];
1438 facel = _mm_set1_pd(fr->epsfac);
1439 charge = mdatoms->chargeA;
1440 nvdwtype = fr->ntype;
1441 vdwparam = fr->nbfp;
1442 vdwtype = mdatoms->typeA;
1444 vftab = kernel_data->table_elec_vdw->data;
1445 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1447 /* Setup water-specific parameters */
1448 inr = nlist->iinr[0];
1449 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1450 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1451 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1452 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1454 jq1 = _mm_set1_pd(charge[inr+1]);
1455 jq2 = _mm_set1_pd(charge[inr+2]);
1456 jq3 = _mm_set1_pd(charge[inr+3]);
1457 vdwjidx0A = 2*vdwtype[inr+0];
1458 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1459 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1460 qq11 = _mm_mul_pd(iq1,jq1);
1461 qq12 = _mm_mul_pd(iq1,jq2);
1462 qq13 = _mm_mul_pd(iq1,jq3);
1463 qq21 = _mm_mul_pd(iq2,jq1);
1464 qq22 = _mm_mul_pd(iq2,jq2);
1465 qq23 = _mm_mul_pd(iq2,jq3);
1466 qq31 = _mm_mul_pd(iq3,jq1);
1467 qq32 = _mm_mul_pd(iq3,jq2);
1468 qq33 = _mm_mul_pd(iq3,jq3);
1470 /* Avoid stupid compiler warnings */
1472 j_coord_offsetA = 0;
1473 j_coord_offsetB = 0;
1478 /* Start outer loop over neighborlists */
1479 for(iidx=0; iidx<nri; iidx++)
1481 /* Load shift vector for this list */
1482 i_shift_offset = DIM*shiftidx[iidx];
1484 /* Load limits for loop over neighbors */
1485 j_index_start = jindex[iidx];
1486 j_index_end = jindex[iidx+1];
1488 /* Get outer coordinate index */
1490 i_coord_offset = DIM*inr;
1492 /* Load i particle coords and add shift vector */
1493 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1494 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1496 fix0 = _mm_setzero_pd();
1497 fiy0 = _mm_setzero_pd();
1498 fiz0 = _mm_setzero_pd();
1499 fix1 = _mm_setzero_pd();
1500 fiy1 = _mm_setzero_pd();
1501 fiz1 = _mm_setzero_pd();
1502 fix2 = _mm_setzero_pd();
1503 fiy2 = _mm_setzero_pd();
1504 fiz2 = _mm_setzero_pd();
1505 fix3 = _mm_setzero_pd();
1506 fiy3 = _mm_setzero_pd();
1507 fiz3 = _mm_setzero_pd();
1509 /* Start inner kernel loop */
1510 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1513 /* Get j neighbor index, and coordinate index */
1515 jnrB = jjnr[jidx+1];
1516 j_coord_offsetA = DIM*jnrA;
1517 j_coord_offsetB = DIM*jnrB;
1519 /* load j atom coordinates */
1520 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1521 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1522 &jy2,&jz2,&jx3,&jy3,&jz3);
1524 /* Calculate displacement vector */
1525 dx00 = _mm_sub_pd(ix0,jx0);
1526 dy00 = _mm_sub_pd(iy0,jy0);
1527 dz00 = _mm_sub_pd(iz0,jz0);
1528 dx11 = _mm_sub_pd(ix1,jx1);
1529 dy11 = _mm_sub_pd(iy1,jy1);
1530 dz11 = _mm_sub_pd(iz1,jz1);
1531 dx12 = _mm_sub_pd(ix1,jx2);
1532 dy12 = _mm_sub_pd(iy1,jy2);
1533 dz12 = _mm_sub_pd(iz1,jz2);
1534 dx13 = _mm_sub_pd(ix1,jx3);
1535 dy13 = _mm_sub_pd(iy1,jy3);
1536 dz13 = _mm_sub_pd(iz1,jz3);
1537 dx21 = _mm_sub_pd(ix2,jx1);
1538 dy21 = _mm_sub_pd(iy2,jy1);
1539 dz21 = _mm_sub_pd(iz2,jz1);
1540 dx22 = _mm_sub_pd(ix2,jx2);
1541 dy22 = _mm_sub_pd(iy2,jy2);
1542 dz22 = _mm_sub_pd(iz2,jz2);
1543 dx23 = _mm_sub_pd(ix2,jx3);
1544 dy23 = _mm_sub_pd(iy2,jy3);
1545 dz23 = _mm_sub_pd(iz2,jz3);
1546 dx31 = _mm_sub_pd(ix3,jx1);
1547 dy31 = _mm_sub_pd(iy3,jy1);
1548 dz31 = _mm_sub_pd(iz3,jz1);
1549 dx32 = _mm_sub_pd(ix3,jx2);
1550 dy32 = _mm_sub_pd(iy3,jy2);
1551 dz32 = _mm_sub_pd(iz3,jz2);
1552 dx33 = _mm_sub_pd(ix3,jx3);
1553 dy33 = _mm_sub_pd(iy3,jy3);
1554 dz33 = _mm_sub_pd(iz3,jz3);
1556 /* Calculate squared distance and things based on it */
1557 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1558 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1559 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1560 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1561 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1562 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1563 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1564 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1565 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1566 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1568 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1569 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1570 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1571 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1572 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1573 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1574 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1575 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1576 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1577 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1579 fjx0 = _mm_setzero_pd();
1580 fjy0 = _mm_setzero_pd();
1581 fjz0 = _mm_setzero_pd();
1582 fjx1 = _mm_setzero_pd();
1583 fjy1 = _mm_setzero_pd();
1584 fjz1 = _mm_setzero_pd();
1585 fjx2 = _mm_setzero_pd();
1586 fjy2 = _mm_setzero_pd();
1587 fjz2 = _mm_setzero_pd();
1588 fjx3 = _mm_setzero_pd();
1589 fjy3 = _mm_setzero_pd();
1590 fjz3 = _mm_setzero_pd();
1592 /**************************
1593 * CALCULATE INTERACTIONS *
1594 **************************/
1596 r00 = _mm_mul_pd(rsq00,rinv00);
1598 /* Calculate table index by multiplying r with table scale and truncate to integer */
1599 rt = _mm_mul_pd(r00,vftabscale);
1600 vfitab = _mm_cvttpd_epi32(rt);
1601 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1602 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1604 /* CUBIC SPLINE TABLE DISPERSION */
1605 vfitab = _mm_add_epi32(vfitab,ifour);
1606 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1607 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1608 GMX_MM_TRANSPOSE2_PD(Y,F);
1609 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1610 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1611 GMX_MM_TRANSPOSE2_PD(G,H);
1612 Heps = _mm_mul_pd(vfeps,H);
1613 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1614 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1615 fvdw6 = _mm_mul_pd(c6_00,FF);
1617 /* CUBIC SPLINE TABLE REPULSION */
1618 vfitab = _mm_add_epi32(vfitab,ifour);
1619 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1620 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1621 GMX_MM_TRANSPOSE2_PD(Y,F);
1622 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1623 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1624 GMX_MM_TRANSPOSE2_PD(G,H);
1625 Heps = _mm_mul_pd(vfeps,H);
1626 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1627 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1628 fvdw12 = _mm_mul_pd(c12_00,FF);
1629 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1633 /* Calculate temporary vectorial force */
1634 tx = _mm_mul_pd(fscal,dx00);
1635 ty = _mm_mul_pd(fscal,dy00);
1636 tz = _mm_mul_pd(fscal,dz00);
1638 /* Update vectorial force */
1639 fix0 = _mm_add_pd(fix0,tx);
1640 fiy0 = _mm_add_pd(fiy0,ty);
1641 fiz0 = _mm_add_pd(fiz0,tz);
1643 fjx0 = _mm_add_pd(fjx0,tx);
1644 fjy0 = _mm_add_pd(fjy0,ty);
1645 fjz0 = _mm_add_pd(fjz0,tz);
1647 /**************************
1648 * CALCULATE INTERACTIONS *
1649 **************************/
1651 r11 = _mm_mul_pd(rsq11,rinv11);
1653 /* Calculate table index by multiplying r with table scale and truncate to integer */
1654 rt = _mm_mul_pd(r11,vftabscale);
1655 vfitab = _mm_cvttpd_epi32(rt);
1656 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1657 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1659 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1660 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1661 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1662 GMX_MM_TRANSPOSE2_PD(Y,F);
1663 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1664 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1665 GMX_MM_TRANSPOSE2_PD(G,H);
1666 Heps = _mm_mul_pd(vfeps,H);
1667 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1668 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1669 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1673 /* Calculate temporary vectorial force */
1674 tx = _mm_mul_pd(fscal,dx11);
1675 ty = _mm_mul_pd(fscal,dy11);
1676 tz = _mm_mul_pd(fscal,dz11);
1678 /* Update vectorial force */
1679 fix1 = _mm_add_pd(fix1,tx);
1680 fiy1 = _mm_add_pd(fiy1,ty);
1681 fiz1 = _mm_add_pd(fiz1,tz);
1683 fjx1 = _mm_add_pd(fjx1,tx);
1684 fjy1 = _mm_add_pd(fjy1,ty);
1685 fjz1 = _mm_add_pd(fjz1,tz);
1687 /**************************
1688 * CALCULATE INTERACTIONS *
1689 **************************/
1691 r12 = _mm_mul_pd(rsq12,rinv12);
1693 /* Calculate table index by multiplying r with table scale and truncate to integer */
1694 rt = _mm_mul_pd(r12,vftabscale);
1695 vfitab = _mm_cvttpd_epi32(rt);
1696 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1697 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1699 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1700 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1701 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1702 GMX_MM_TRANSPOSE2_PD(Y,F);
1703 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1704 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1705 GMX_MM_TRANSPOSE2_PD(G,H);
1706 Heps = _mm_mul_pd(vfeps,H);
1707 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1708 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1709 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1713 /* Calculate temporary vectorial force */
1714 tx = _mm_mul_pd(fscal,dx12);
1715 ty = _mm_mul_pd(fscal,dy12);
1716 tz = _mm_mul_pd(fscal,dz12);
1718 /* Update vectorial force */
1719 fix1 = _mm_add_pd(fix1,tx);
1720 fiy1 = _mm_add_pd(fiy1,ty);
1721 fiz1 = _mm_add_pd(fiz1,tz);
1723 fjx2 = _mm_add_pd(fjx2,tx);
1724 fjy2 = _mm_add_pd(fjy2,ty);
1725 fjz2 = _mm_add_pd(fjz2,tz);
1727 /**************************
1728 * CALCULATE INTERACTIONS *
1729 **************************/
1731 r13 = _mm_mul_pd(rsq13,rinv13);
1733 /* Calculate table index by multiplying r with table scale and truncate to integer */
1734 rt = _mm_mul_pd(r13,vftabscale);
1735 vfitab = _mm_cvttpd_epi32(rt);
1736 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1737 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1739 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1740 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1741 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1742 GMX_MM_TRANSPOSE2_PD(Y,F);
1743 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1744 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1745 GMX_MM_TRANSPOSE2_PD(G,H);
1746 Heps = _mm_mul_pd(vfeps,H);
1747 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1748 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1749 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1753 /* Calculate temporary vectorial force */
1754 tx = _mm_mul_pd(fscal,dx13);
1755 ty = _mm_mul_pd(fscal,dy13);
1756 tz = _mm_mul_pd(fscal,dz13);
1758 /* Update vectorial force */
1759 fix1 = _mm_add_pd(fix1,tx);
1760 fiy1 = _mm_add_pd(fiy1,ty);
1761 fiz1 = _mm_add_pd(fiz1,tz);
1763 fjx3 = _mm_add_pd(fjx3,tx);
1764 fjy3 = _mm_add_pd(fjy3,ty);
1765 fjz3 = _mm_add_pd(fjz3,tz);
1767 /**************************
1768 * CALCULATE INTERACTIONS *
1769 **************************/
1771 r21 = _mm_mul_pd(rsq21,rinv21);
1773 /* Calculate table index by multiplying r with table scale and truncate to integer */
1774 rt = _mm_mul_pd(r21,vftabscale);
1775 vfitab = _mm_cvttpd_epi32(rt);
1776 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1777 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1779 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1780 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1781 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1782 GMX_MM_TRANSPOSE2_PD(Y,F);
1783 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1784 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1785 GMX_MM_TRANSPOSE2_PD(G,H);
1786 Heps = _mm_mul_pd(vfeps,H);
1787 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1788 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1789 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1793 /* Calculate temporary vectorial force */
1794 tx = _mm_mul_pd(fscal,dx21);
1795 ty = _mm_mul_pd(fscal,dy21);
1796 tz = _mm_mul_pd(fscal,dz21);
1798 /* Update vectorial force */
1799 fix2 = _mm_add_pd(fix2,tx);
1800 fiy2 = _mm_add_pd(fiy2,ty);
1801 fiz2 = _mm_add_pd(fiz2,tz);
1803 fjx1 = _mm_add_pd(fjx1,tx);
1804 fjy1 = _mm_add_pd(fjy1,ty);
1805 fjz1 = _mm_add_pd(fjz1,tz);
1807 /**************************
1808 * CALCULATE INTERACTIONS *
1809 **************************/
1811 r22 = _mm_mul_pd(rsq22,rinv22);
1813 /* Calculate table index by multiplying r with table scale and truncate to integer */
1814 rt = _mm_mul_pd(r22,vftabscale);
1815 vfitab = _mm_cvttpd_epi32(rt);
1816 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1817 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1819 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1820 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1821 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1822 GMX_MM_TRANSPOSE2_PD(Y,F);
1823 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1824 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1825 GMX_MM_TRANSPOSE2_PD(G,H);
1826 Heps = _mm_mul_pd(vfeps,H);
1827 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1828 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1829 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1833 /* Calculate temporary vectorial force */
1834 tx = _mm_mul_pd(fscal,dx22);
1835 ty = _mm_mul_pd(fscal,dy22);
1836 tz = _mm_mul_pd(fscal,dz22);
1838 /* Update vectorial force */
1839 fix2 = _mm_add_pd(fix2,tx);
1840 fiy2 = _mm_add_pd(fiy2,ty);
1841 fiz2 = _mm_add_pd(fiz2,tz);
1843 fjx2 = _mm_add_pd(fjx2,tx);
1844 fjy2 = _mm_add_pd(fjy2,ty);
1845 fjz2 = _mm_add_pd(fjz2,tz);
1847 /**************************
1848 * CALCULATE INTERACTIONS *
1849 **************************/
1851 r23 = _mm_mul_pd(rsq23,rinv23);
1853 /* Calculate table index by multiplying r with table scale and truncate to integer */
1854 rt = _mm_mul_pd(r23,vftabscale);
1855 vfitab = _mm_cvttpd_epi32(rt);
1856 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1857 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1859 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1860 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1861 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1862 GMX_MM_TRANSPOSE2_PD(Y,F);
1863 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1864 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1865 GMX_MM_TRANSPOSE2_PD(G,H);
1866 Heps = _mm_mul_pd(vfeps,H);
1867 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1868 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1869 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1873 /* Calculate temporary vectorial force */
1874 tx = _mm_mul_pd(fscal,dx23);
1875 ty = _mm_mul_pd(fscal,dy23);
1876 tz = _mm_mul_pd(fscal,dz23);
1878 /* Update vectorial force */
1879 fix2 = _mm_add_pd(fix2,tx);
1880 fiy2 = _mm_add_pd(fiy2,ty);
1881 fiz2 = _mm_add_pd(fiz2,tz);
1883 fjx3 = _mm_add_pd(fjx3,tx);
1884 fjy3 = _mm_add_pd(fjy3,ty);
1885 fjz3 = _mm_add_pd(fjz3,tz);
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 r31 = _mm_mul_pd(rsq31,rinv31);
1893 /* Calculate table index by multiplying r with table scale and truncate to integer */
1894 rt = _mm_mul_pd(r31,vftabscale);
1895 vfitab = _mm_cvttpd_epi32(rt);
1896 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1897 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1899 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1900 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1901 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1902 GMX_MM_TRANSPOSE2_PD(Y,F);
1903 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1904 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1905 GMX_MM_TRANSPOSE2_PD(G,H);
1906 Heps = _mm_mul_pd(vfeps,H);
1907 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1908 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1909 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1913 /* Calculate temporary vectorial force */
1914 tx = _mm_mul_pd(fscal,dx31);
1915 ty = _mm_mul_pd(fscal,dy31);
1916 tz = _mm_mul_pd(fscal,dz31);
1918 /* Update vectorial force */
1919 fix3 = _mm_add_pd(fix3,tx);
1920 fiy3 = _mm_add_pd(fiy3,ty);
1921 fiz3 = _mm_add_pd(fiz3,tz);
1923 fjx1 = _mm_add_pd(fjx1,tx);
1924 fjy1 = _mm_add_pd(fjy1,ty);
1925 fjz1 = _mm_add_pd(fjz1,tz);
1927 /**************************
1928 * CALCULATE INTERACTIONS *
1929 **************************/
1931 r32 = _mm_mul_pd(rsq32,rinv32);
1933 /* Calculate table index by multiplying r with table scale and truncate to integer */
1934 rt = _mm_mul_pd(r32,vftabscale);
1935 vfitab = _mm_cvttpd_epi32(rt);
1936 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1937 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1939 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1940 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1941 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1942 GMX_MM_TRANSPOSE2_PD(Y,F);
1943 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1944 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1945 GMX_MM_TRANSPOSE2_PD(G,H);
1946 Heps = _mm_mul_pd(vfeps,H);
1947 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1948 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1949 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1953 /* Calculate temporary vectorial force */
1954 tx = _mm_mul_pd(fscal,dx32);
1955 ty = _mm_mul_pd(fscal,dy32);
1956 tz = _mm_mul_pd(fscal,dz32);
1958 /* Update vectorial force */
1959 fix3 = _mm_add_pd(fix3,tx);
1960 fiy3 = _mm_add_pd(fiy3,ty);
1961 fiz3 = _mm_add_pd(fiz3,tz);
1963 fjx2 = _mm_add_pd(fjx2,tx);
1964 fjy2 = _mm_add_pd(fjy2,ty);
1965 fjz2 = _mm_add_pd(fjz2,tz);
1967 /**************************
1968 * CALCULATE INTERACTIONS *
1969 **************************/
1971 r33 = _mm_mul_pd(rsq33,rinv33);
1973 /* Calculate table index by multiplying r with table scale and truncate to integer */
1974 rt = _mm_mul_pd(r33,vftabscale);
1975 vfitab = _mm_cvttpd_epi32(rt);
1976 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1977 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1979 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1980 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1981 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1982 GMX_MM_TRANSPOSE2_PD(Y,F);
1983 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1984 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1985 GMX_MM_TRANSPOSE2_PD(G,H);
1986 Heps = _mm_mul_pd(vfeps,H);
1987 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1988 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1989 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1993 /* Calculate temporary vectorial force */
1994 tx = _mm_mul_pd(fscal,dx33);
1995 ty = _mm_mul_pd(fscal,dy33);
1996 tz = _mm_mul_pd(fscal,dz33);
1998 /* Update vectorial force */
1999 fix3 = _mm_add_pd(fix3,tx);
2000 fiy3 = _mm_add_pd(fiy3,ty);
2001 fiz3 = _mm_add_pd(fiz3,tz);
2003 fjx3 = _mm_add_pd(fjx3,tx);
2004 fjy3 = _mm_add_pd(fjy3,ty);
2005 fjz3 = _mm_add_pd(fjz3,tz);
2007 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2009 /* Inner loop uses 402 flops */
2012 if(jidx<j_index_end)
2016 j_coord_offsetA = DIM*jnrA;
2018 /* load j atom coordinates */
2019 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2020 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2021 &jy2,&jz2,&jx3,&jy3,&jz3);
2023 /* Calculate displacement vector */
2024 dx00 = _mm_sub_pd(ix0,jx0);
2025 dy00 = _mm_sub_pd(iy0,jy0);
2026 dz00 = _mm_sub_pd(iz0,jz0);
2027 dx11 = _mm_sub_pd(ix1,jx1);
2028 dy11 = _mm_sub_pd(iy1,jy1);
2029 dz11 = _mm_sub_pd(iz1,jz1);
2030 dx12 = _mm_sub_pd(ix1,jx2);
2031 dy12 = _mm_sub_pd(iy1,jy2);
2032 dz12 = _mm_sub_pd(iz1,jz2);
2033 dx13 = _mm_sub_pd(ix1,jx3);
2034 dy13 = _mm_sub_pd(iy1,jy3);
2035 dz13 = _mm_sub_pd(iz1,jz3);
2036 dx21 = _mm_sub_pd(ix2,jx1);
2037 dy21 = _mm_sub_pd(iy2,jy1);
2038 dz21 = _mm_sub_pd(iz2,jz1);
2039 dx22 = _mm_sub_pd(ix2,jx2);
2040 dy22 = _mm_sub_pd(iy2,jy2);
2041 dz22 = _mm_sub_pd(iz2,jz2);
2042 dx23 = _mm_sub_pd(ix2,jx3);
2043 dy23 = _mm_sub_pd(iy2,jy3);
2044 dz23 = _mm_sub_pd(iz2,jz3);
2045 dx31 = _mm_sub_pd(ix3,jx1);
2046 dy31 = _mm_sub_pd(iy3,jy1);
2047 dz31 = _mm_sub_pd(iz3,jz1);
2048 dx32 = _mm_sub_pd(ix3,jx2);
2049 dy32 = _mm_sub_pd(iy3,jy2);
2050 dz32 = _mm_sub_pd(iz3,jz2);
2051 dx33 = _mm_sub_pd(ix3,jx3);
2052 dy33 = _mm_sub_pd(iy3,jy3);
2053 dz33 = _mm_sub_pd(iz3,jz3);
2055 /* Calculate squared distance and things based on it */
2056 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2057 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2058 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2059 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2060 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2061 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2062 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2063 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2064 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2065 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2067 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2068 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2069 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2070 rinv13 = gmx_mm_invsqrt_pd(rsq13);
2071 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2072 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2073 rinv23 = gmx_mm_invsqrt_pd(rsq23);
2074 rinv31 = gmx_mm_invsqrt_pd(rsq31);
2075 rinv32 = gmx_mm_invsqrt_pd(rsq32);
2076 rinv33 = gmx_mm_invsqrt_pd(rsq33);
2078 fjx0 = _mm_setzero_pd();
2079 fjy0 = _mm_setzero_pd();
2080 fjz0 = _mm_setzero_pd();
2081 fjx1 = _mm_setzero_pd();
2082 fjy1 = _mm_setzero_pd();
2083 fjz1 = _mm_setzero_pd();
2084 fjx2 = _mm_setzero_pd();
2085 fjy2 = _mm_setzero_pd();
2086 fjz2 = _mm_setzero_pd();
2087 fjx3 = _mm_setzero_pd();
2088 fjy3 = _mm_setzero_pd();
2089 fjz3 = _mm_setzero_pd();
2091 /**************************
2092 * CALCULATE INTERACTIONS *
2093 **************************/
2095 r00 = _mm_mul_pd(rsq00,rinv00);
2097 /* Calculate table index by multiplying r with table scale and truncate to integer */
2098 rt = _mm_mul_pd(r00,vftabscale);
2099 vfitab = _mm_cvttpd_epi32(rt);
2100 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2101 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2103 /* CUBIC SPLINE TABLE DISPERSION */
2104 vfitab = _mm_add_epi32(vfitab,ifour);
2105 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2106 F = _mm_setzero_pd();
2107 GMX_MM_TRANSPOSE2_PD(Y,F);
2108 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2109 H = _mm_setzero_pd();
2110 GMX_MM_TRANSPOSE2_PD(G,H);
2111 Heps = _mm_mul_pd(vfeps,H);
2112 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2113 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2114 fvdw6 = _mm_mul_pd(c6_00,FF);
2116 /* CUBIC SPLINE TABLE REPULSION */
2117 vfitab = _mm_add_epi32(vfitab,ifour);
2118 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2119 F = _mm_setzero_pd();
2120 GMX_MM_TRANSPOSE2_PD(Y,F);
2121 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2122 H = _mm_setzero_pd();
2123 GMX_MM_TRANSPOSE2_PD(G,H);
2124 Heps = _mm_mul_pd(vfeps,H);
2125 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2126 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2127 fvdw12 = _mm_mul_pd(c12_00,FF);
2128 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
2132 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2134 /* Calculate temporary vectorial force */
2135 tx = _mm_mul_pd(fscal,dx00);
2136 ty = _mm_mul_pd(fscal,dy00);
2137 tz = _mm_mul_pd(fscal,dz00);
2139 /* Update vectorial force */
2140 fix0 = _mm_add_pd(fix0,tx);
2141 fiy0 = _mm_add_pd(fiy0,ty);
2142 fiz0 = _mm_add_pd(fiz0,tz);
2144 fjx0 = _mm_add_pd(fjx0,tx);
2145 fjy0 = _mm_add_pd(fjy0,ty);
2146 fjz0 = _mm_add_pd(fjz0,tz);
2148 /**************************
2149 * CALCULATE INTERACTIONS *
2150 **************************/
2152 r11 = _mm_mul_pd(rsq11,rinv11);
2154 /* Calculate table index by multiplying r with table scale and truncate to integer */
2155 rt = _mm_mul_pd(r11,vftabscale);
2156 vfitab = _mm_cvttpd_epi32(rt);
2157 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2158 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2160 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2161 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2162 F = _mm_setzero_pd();
2163 GMX_MM_TRANSPOSE2_PD(Y,F);
2164 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2165 H = _mm_setzero_pd();
2166 GMX_MM_TRANSPOSE2_PD(G,H);
2167 Heps = _mm_mul_pd(vfeps,H);
2168 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2169 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2170 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2174 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2176 /* Calculate temporary vectorial force */
2177 tx = _mm_mul_pd(fscal,dx11);
2178 ty = _mm_mul_pd(fscal,dy11);
2179 tz = _mm_mul_pd(fscal,dz11);
2181 /* Update vectorial force */
2182 fix1 = _mm_add_pd(fix1,tx);
2183 fiy1 = _mm_add_pd(fiy1,ty);
2184 fiz1 = _mm_add_pd(fiz1,tz);
2186 fjx1 = _mm_add_pd(fjx1,tx);
2187 fjy1 = _mm_add_pd(fjy1,ty);
2188 fjz1 = _mm_add_pd(fjz1,tz);
2190 /**************************
2191 * CALCULATE INTERACTIONS *
2192 **************************/
2194 r12 = _mm_mul_pd(rsq12,rinv12);
2196 /* Calculate table index by multiplying r with table scale and truncate to integer */
2197 rt = _mm_mul_pd(r12,vftabscale);
2198 vfitab = _mm_cvttpd_epi32(rt);
2199 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2200 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2202 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2203 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2204 F = _mm_setzero_pd();
2205 GMX_MM_TRANSPOSE2_PD(Y,F);
2206 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2207 H = _mm_setzero_pd();
2208 GMX_MM_TRANSPOSE2_PD(G,H);
2209 Heps = _mm_mul_pd(vfeps,H);
2210 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2211 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2212 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2216 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2218 /* Calculate temporary vectorial force */
2219 tx = _mm_mul_pd(fscal,dx12);
2220 ty = _mm_mul_pd(fscal,dy12);
2221 tz = _mm_mul_pd(fscal,dz12);
2223 /* Update vectorial force */
2224 fix1 = _mm_add_pd(fix1,tx);
2225 fiy1 = _mm_add_pd(fiy1,ty);
2226 fiz1 = _mm_add_pd(fiz1,tz);
2228 fjx2 = _mm_add_pd(fjx2,tx);
2229 fjy2 = _mm_add_pd(fjy2,ty);
2230 fjz2 = _mm_add_pd(fjz2,tz);
2232 /**************************
2233 * CALCULATE INTERACTIONS *
2234 **************************/
2236 r13 = _mm_mul_pd(rsq13,rinv13);
2238 /* Calculate table index by multiplying r with table scale and truncate to integer */
2239 rt = _mm_mul_pd(r13,vftabscale);
2240 vfitab = _mm_cvttpd_epi32(rt);
2241 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2242 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2244 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2245 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2246 F = _mm_setzero_pd();
2247 GMX_MM_TRANSPOSE2_PD(Y,F);
2248 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2249 H = _mm_setzero_pd();
2250 GMX_MM_TRANSPOSE2_PD(G,H);
2251 Heps = _mm_mul_pd(vfeps,H);
2252 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2253 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2254 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
2258 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2260 /* Calculate temporary vectorial force */
2261 tx = _mm_mul_pd(fscal,dx13);
2262 ty = _mm_mul_pd(fscal,dy13);
2263 tz = _mm_mul_pd(fscal,dz13);
2265 /* Update vectorial force */
2266 fix1 = _mm_add_pd(fix1,tx);
2267 fiy1 = _mm_add_pd(fiy1,ty);
2268 fiz1 = _mm_add_pd(fiz1,tz);
2270 fjx3 = _mm_add_pd(fjx3,tx);
2271 fjy3 = _mm_add_pd(fjy3,ty);
2272 fjz3 = _mm_add_pd(fjz3,tz);
2274 /**************************
2275 * CALCULATE INTERACTIONS *
2276 **************************/
2278 r21 = _mm_mul_pd(rsq21,rinv21);
2280 /* Calculate table index by multiplying r with table scale and truncate to integer */
2281 rt = _mm_mul_pd(r21,vftabscale);
2282 vfitab = _mm_cvttpd_epi32(rt);
2283 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2284 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2286 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2287 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2288 F = _mm_setzero_pd();
2289 GMX_MM_TRANSPOSE2_PD(Y,F);
2290 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2291 H = _mm_setzero_pd();
2292 GMX_MM_TRANSPOSE2_PD(G,H);
2293 Heps = _mm_mul_pd(vfeps,H);
2294 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2295 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2296 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2300 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2302 /* Calculate temporary vectorial force */
2303 tx = _mm_mul_pd(fscal,dx21);
2304 ty = _mm_mul_pd(fscal,dy21);
2305 tz = _mm_mul_pd(fscal,dz21);
2307 /* Update vectorial force */
2308 fix2 = _mm_add_pd(fix2,tx);
2309 fiy2 = _mm_add_pd(fiy2,ty);
2310 fiz2 = _mm_add_pd(fiz2,tz);
2312 fjx1 = _mm_add_pd(fjx1,tx);
2313 fjy1 = _mm_add_pd(fjy1,ty);
2314 fjz1 = _mm_add_pd(fjz1,tz);
2316 /**************************
2317 * CALCULATE INTERACTIONS *
2318 **************************/
2320 r22 = _mm_mul_pd(rsq22,rinv22);
2322 /* Calculate table index by multiplying r with table scale and truncate to integer */
2323 rt = _mm_mul_pd(r22,vftabscale);
2324 vfitab = _mm_cvttpd_epi32(rt);
2325 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2326 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2328 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2329 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2330 F = _mm_setzero_pd();
2331 GMX_MM_TRANSPOSE2_PD(Y,F);
2332 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2333 H = _mm_setzero_pd();
2334 GMX_MM_TRANSPOSE2_PD(G,H);
2335 Heps = _mm_mul_pd(vfeps,H);
2336 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2337 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2338 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2342 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2344 /* Calculate temporary vectorial force */
2345 tx = _mm_mul_pd(fscal,dx22);
2346 ty = _mm_mul_pd(fscal,dy22);
2347 tz = _mm_mul_pd(fscal,dz22);
2349 /* Update vectorial force */
2350 fix2 = _mm_add_pd(fix2,tx);
2351 fiy2 = _mm_add_pd(fiy2,ty);
2352 fiz2 = _mm_add_pd(fiz2,tz);
2354 fjx2 = _mm_add_pd(fjx2,tx);
2355 fjy2 = _mm_add_pd(fjy2,ty);
2356 fjz2 = _mm_add_pd(fjz2,tz);
2358 /**************************
2359 * CALCULATE INTERACTIONS *
2360 **************************/
2362 r23 = _mm_mul_pd(rsq23,rinv23);
2364 /* Calculate table index by multiplying r with table scale and truncate to integer */
2365 rt = _mm_mul_pd(r23,vftabscale);
2366 vfitab = _mm_cvttpd_epi32(rt);
2367 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2368 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2370 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2371 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2372 F = _mm_setzero_pd();
2373 GMX_MM_TRANSPOSE2_PD(Y,F);
2374 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2375 H = _mm_setzero_pd();
2376 GMX_MM_TRANSPOSE2_PD(G,H);
2377 Heps = _mm_mul_pd(vfeps,H);
2378 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2379 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2380 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
2384 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2386 /* Calculate temporary vectorial force */
2387 tx = _mm_mul_pd(fscal,dx23);
2388 ty = _mm_mul_pd(fscal,dy23);
2389 tz = _mm_mul_pd(fscal,dz23);
2391 /* Update vectorial force */
2392 fix2 = _mm_add_pd(fix2,tx);
2393 fiy2 = _mm_add_pd(fiy2,ty);
2394 fiz2 = _mm_add_pd(fiz2,tz);
2396 fjx3 = _mm_add_pd(fjx3,tx);
2397 fjy3 = _mm_add_pd(fjy3,ty);
2398 fjz3 = _mm_add_pd(fjz3,tz);
2400 /**************************
2401 * CALCULATE INTERACTIONS *
2402 **************************/
2404 r31 = _mm_mul_pd(rsq31,rinv31);
2406 /* Calculate table index by multiplying r with table scale and truncate to integer */
2407 rt = _mm_mul_pd(r31,vftabscale);
2408 vfitab = _mm_cvttpd_epi32(rt);
2409 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2410 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2412 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2413 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2414 F = _mm_setzero_pd();
2415 GMX_MM_TRANSPOSE2_PD(Y,F);
2416 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2417 H = _mm_setzero_pd();
2418 GMX_MM_TRANSPOSE2_PD(G,H);
2419 Heps = _mm_mul_pd(vfeps,H);
2420 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2421 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2422 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
2426 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2428 /* Calculate temporary vectorial force */
2429 tx = _mm_mul_pd(fscal,dx31);
2430 ty = _mm_mul_pd(fscal,dy31);
2431 tz = _mm_mul_pd(fscal,dz31);
2433 /* Update vectorial force */
2434 fix3 = _mm_add_pd(fix3,tx);
2435 fiy3 = _mm_add_pd(fiy3,ty);
2436 fiz3 = _mm_add_pd(fiz3,tz);
2438 fjx1 = _mm_add_pd(fjx1,tx);
2439 fjy1 = _mm_add_pd(fjy1,ty);
2440 fjz1 = _mm_add_pd(fjz1,tz);
2442 /**************************
2443 * CALCULATE INTERACTIONS *
2444 **************************/
2446 r32 = _mm_mul_pd(rsq32,rinv32);
2448 /* Calculate table index by multiplying r with table scale and truncate to integer */
2449 rt = _mm_mul_pd(r32,vftabscale);
2450 vfitab = _mm_cvttpd_epi32(rt);
2451 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2452 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2454 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2455 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2456 F = _mm_setzero_pd();
2457 GMX_MM_TRANSPOSE2_PD(Y,F);
2458 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2459 H = _mm_setzero_pd();
2460 GMX_MM_TRANSPOSE2_PD(G,H);
2461 Heps = _mm_mul_pd(vfeps,H);
2462 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2463 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2464 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
2468 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2470 /* Calculate temporary vectorial force */
2471 tx = _mm_mul_pd(fscal,dx32);
2472 ty = _mm_mul_pd(fscal,dy32);
2473 tz = _mm_mul_pd(fscal,dz32);
2475 /* Update vectorial force */
2476 fix3 = _mm_add_pd(fix3,tx);
2477 fiy3 = _mm_add_pd(fiy3,ty);
2478 fiz3 = _mm_add_pd(fiz3,tz);
2480 fjx2 = _mm_add_pd(fjx2,tx);
2481 fjy2 = _mm_add_pd(fjy2,ty);
2482 fjz2 = _mm_add_pd(fjz2,tz);
2484 /**************************
2485 * CALCULATE INTERACTIONS *
2486 **************************/
2488 r33 = _mm_mul_pd(rsq33,rinv33);
2490 /* Calculate table index by multiplying r with table scale and truncate to integer */
2491 rt = _mm_mul_pd(r33,vftabscale);
2492 vfitab = _mm_cvttpd_epi32(rt);
2493 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2494 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2496 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2497 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2498 F = _mm_setzero_pd();
2499 GMX_MM_TRANSPOSE2_PD(Y,F);
2500 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2501 H = _mm_setzero_pd();
2502 GMX_MM_TRANSPOSE2_PD(G,H);
2503 Heps = _mm_mul_pd(vfeps,H);
2504 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2505 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2506 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
2510 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2512 /* Calculate temporary vectorial force */
2513 tx = _mm_mul_pd(fscal,dx33);
2514 ty = _mm_mul_pd(fscal,dy33);
2515 tz = _mm_mul_pd(fscal,dz33);
2517 /* Update vectorial force */
2518 fix3 = _mm_add_pd(fix3,tx);
2519 fiy3 = _mm_add_pd(fiy3,ty);
2520 fiz3 = _mm_add_pd(fiz3,tz);
2522 fjx3 = _mm_add_pd(fjx3,tx);
2523 fjy3 = _mm_add_pd(fjy3,ty);
2524 fjz3 = _mm_add_pd(fjz3,tz);
2526 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2528 /* Inner loop uses 402 flops */
2531 /* End of innermost loop */
2533 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2534 f+i_coord_offset,fshift+i_shift_offset);
2536 /* Increment number of inner iterations */
2537 inneriter += j_index_end - j_index_start;
2539 /* Outer loop uses 24 flops */
2542 /* Increment number of outer iterations */
2545 /* Update outer/inner flops */
2547 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*402);