2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: None
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the two positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
69 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
71 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
72 int vdwjidx1A,vdwjidx1B;
73 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
74 int vdwjidx2A,vdwjidx2B;
75 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
76 int vdwjidx3A,vdwjidx3B;
77 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
78 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
79 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
80 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
81 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
82 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
83 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
84 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
85 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
86 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128i ifour = _mm_set1_epi32(4);
91 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
93 __m128d dummy_mask,cutoff_mask;
94 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
95 __m128d one = _mm_set1_pd(1.0);
96 __m128d two = _mm_set1_pd(2.0);
102 jindex = nlist->jindex;
104 shiftidx = nlist->shift;
106 shiftvec = fr->shift_vec[0];
107 fshift = fr->fshift[0];
108 facel = _mm_set1_pd(fr->epsfac);
109 charge = mdatoms->chargeA;
111 vftab = kernel_data->table_elec->data;
112 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
114 /* Setup water-specific parameters */
115 inr = nlist->iinr[0];
116 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
117 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
118 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
120 jq1 = _mm_set1_pd(charge[inr+1]);
121 jq2 = _mm_set1_pd(charge[inr+2]);
122 jq3 = _mm_set1_pd(charge[inr+3]);
123 qq11 = _mm_mul_pd(iq1,jq1);
124 qq12 = _mm_mul_pd(iq1,jq2);
125 qq13 = _mm_mul_pd(iq1,jq3);
126 qq21 = _mm_mul_pd(iq2,jq1);
127 qq22 = _mm_mul_pd(iq2,jq2);
128 qq23 = _mm_mul_pd(iq2,jq3);
129 qq31 = _mm_mul_pd(iq3,jq1);
130 qq32 = _mm_mul_pd(iq3,jq2);
131 qq33 = _mm_mul_pd(iq3,jq3);
133 /* Avoid stupid compiler warnings */
141 /* Start outer loop over neighborlists */
142 for(iidx=0; iidx<nri; iidx++)
144 /* Load shift vector for this list */
145 i_shift_offset = DIM*shiftidx[iidx];
147 /* Load limits for loop over neighbors */
148 j_index_start = jindex[iidx];
149 j_index_end = jindex[iidx+1];
151 /* Get outer coordinate index */
153 i_coord_offset = DIM*inr;
155 /* Load i particle coords and add shift vector */
156 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
157 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
159 fix1 = _mm_setzero_pd();
160 fiy1 = _mm_setzero_pd();
161 fiz1 = _mm_setzero_pd();
162 fix2 = _mm_setzero_pd();
163 fiy2 = _mm_setzero_pd();
164 fiz2 = _mm_setzero_pd();
165 fix3 = _mm_setzero_pd();
166 fiy3 = _mm_setzero_pd();
167 fiz3 = _mm_setzero_pd();
169 /* Reset potential sums */
170 velecsum = _mm_setzero_pd();
172 /* Start inner kernel loop */
173 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
176 /* Get j neighbor index, and coordinate index */
179 j_coord_offsetA = DIM*jnrA;
180 j_coord_offsetB = DIM*jnrB;
182 /* load j atom coordinates */
183 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
184 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
186 /* Calculate displacement vector */
187 dx11 = _mm_sub_pd(ix1,jx1);
188 dy11 = _mm_sub_pd(iy1,jy1);
189 dz11 = _mm_sub_pd(iz1,jz1);
190 dx12 = _mm_sub_pd(ix1,jx2);
191 dy12 = _mm_sub_pd(iy1,jy2);
192 dz12 = _mm_sub_pd(iz1,jz2);
193 dx13 = _mm_sub_pd(ix1,jx3);
194 dy13 = _mm_sub_pd(iy1,jy3);
195 dz13 = _mm_sub_pd(iz1,jz3);
196 dx21 = _mm_sub_pd(ix2,jx1);
197 dy21 = _mm_sub_pd(iy2,jy1);
198 dz21 = _mm_sub_pd(iz2,jz1);
199 dx22 = _mm_sub_pd(ix2,jx2);
200 dy22 = _mm_sub_pd(iy2,jy2);
201 dz22 = _mm_sub_pd(iz2,jz2);
202 dx23 = _mm_sub_pd(ix2,jx3);
203 dy23 = _mm_sub_pd(iy2,jy3);
204 dz23 = _mm_sub_pd(iz2,jz3);
205 dx31 = _mm_sub_pd(ix3,jx1);
206 dy31 = _mm_sub_pd(iy3,jy1);
207 dz31 = _mm_sub_pd(iz3,jz1);
208 dx32 = _mm_sub_pd(ix3,jx2);
209 dy32 = _mm_sub_pd(iy3,jy2);
210 dz32 = _mm_sub_pd(iz3,jz2);
211 dx33 = _mm_sub_pd(ix3,jx3);
212 dy33 = _mm_sub_pd(iy3,jy3);
213 dz33 = _mm_sub_pd(iz3,jz3);
215 /* Calculate squared distance and things based on it */
216 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
217 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
218 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
219 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
220 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
221 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
222 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
223 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
224 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
226 rinv11 = gmx_mm_invsqrt_pd(rsq11);
227 rinv12 = gmx_mm_invsqrt_pd(rsq12);
228 rinv13 = gmx_mm_invsqrt_pd(rsq13);
229 rinv21 = gmx_mm_invsqrt_pd(rsq21);
230 rinv22 = gmx_mm_invsqrt_pd(rsq22);
231 rinv23 = gmx_mm_invsqrt_pd(rsq23);
232 rinv31 = gmx_mm_invsqrt_pd(rsq31);
233 rinv32 = gmx_mm_invsqrt_pd(rsq32);
234 rinv33 = gmx_mm_invsqrt_pd(rsq33);
236 fjx1 = _mm_setzero_pd();
237 fjy1 = _mm_setzero_pd();
238 fjz1 = _mm_setzero_pd();
239 fjx2 = _mm_setzero_pd();
240 fjy2 = _mm_setzero_pd();
241 fjz2 = _mm_setzero_pd();
242 fjx3 = _mm_setzero_pd();
243 fjy3 = _mm_setzero_pd();
244 fjz3 = _mm_setzero_pd();
246 /**************************
247 * CALCULATE INTERACTIONS *
248 **************************/
250 r11 = _mm_mul_pd(rsq11,rinv11);
252 /* Calculate table index by multiplying r with table scale and truncate to integer */
253 rt = _mm_mul_pd(r11,vftabscale);
254 vfitab = _mm_cvttpd_epi32(rt);
255 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
256 vfitab = _mm_slli_epi32(vfitab,2);
258 /* CUBIC SPLINE TABLE ELECTROSTATICS */
259 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
260 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
261 GMX_MM_TRANSPOSE2_PD(Y,F);
262 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
263 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
264 GMX_MM_TRANSPOSE2_PD(G,H);
265 Heps = _mm_mul_pd(vfeps,H);
266 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
267 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
268 velec = _mm_mul_pd(qq11,VV);
269 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
270 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
272 /* Update potential sum for this i atom from the interaction with this j atom. */
273 velecsum = _mm_add_pd(velecsum,velec);
277 /* Calculate temporary vectorial force */
278 tx = _mm_mul_pd(fscal,dx11);
279 ty = _mm_mul_pd(fscal,dy11);
280 tz = _mm_mul_pd(fscal,dz11);
282 /* Update vectorial force */
283 fix1 = _mm_add_pd(fix1,tx);
284 fiy1 = _mm_add_pd(fiy1,ty);
285 fiz1 = _mm_add_pd(fiz1,tz);
287 fjx1 = _mm_add_pd(fjx1,tx);
288 fjy1 = _mm_add_pd(fjy1,ty);
289 fjz1 = _mm_add_pd(fjz1,tz);
291 /**************************
292 * CALCULATE INTERACTIONS *
293 **************************/
295 r12 = _mm_mul_pd(rsq12,rinv12);
297 /* Calculate table index by multiplying r with table scale and truncate to integer */
298 rt = _mm_mul_pd(r12,vftabscale);
299 vfitab = _mm_cvttpd_epi32(rt);
300 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
301 vfitab = _mm_slli_epi32(vfitab,2);
303 /* CUBIC SPLINE TABLE ELECTROSTATICS */
304 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
305 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
306 GMX_MM_TRANSPOSE2_PD(Y,F);
307 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
308 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
309 GMX_MM_TRANSPOSE2_PD(G,H);
310 Heps = _mm_mul_pd(vfeps,H);
311 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
312 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
313 velec = _mm_mul_pd(qq12,VV);
314 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
315 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
317 /* Update potential sum for this i atom from the interaction with this j atom. */
318 velecsum = _mm_add_pd(velecsum,velec);
322 /* Calculate temporary vectorial force */
323 tx = _mm_mul_pd(fscal,dx12);
324 ty = _mm_mul_pd(fscal,dy12);
325 tz = _mm_mul_pd(fscal,dz12);
327 /* Update vectorial force */
328 fix1 = _mm_add_pd(fix1,tx);
329 fiy1 = _mm_add_pd(fiy1,ty);
330 fiz1 = _mm_add_pd(fiz1,tz);
332 fjx2 = _mm_add_pd(fjx2,tx);
333 fjy2 = _mm_add_pd(fjy2,ty);
334 fjz2 = _mm_add_pd(fjz2,tz);
336 /**************************
337 * CALCULATE INTERACTIONS *
338 **************************/
340 r13 = _mm_mul_pd(rsq13,rinv13);
342 /* Calculate table index by multiplying r with table scale and truncate to integer */
343 rt = _mm_mul_pd(r13,vftabscale);
344 vfitab = _mm_cvttpd_epi32(rt);
345 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
346 vfitab = _mm_slli_epi32(vfitab,2);
348 /* CUBIC SPLINE TABLE ELECTROSTATICS */
349 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
350 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
351 GMX_MM_TRANSPOSE2_PD(Y,F);
352 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
353 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
354 GMX_MM_TRANSPOSE2_PD(G,H);
355 Heps = _mm_mul_pd(vfeps,H);
356 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
357 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
358 velec = _mm_mul_pd(qq13,VV);
359 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
360 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velecsum = _mm_add_pd(velecsum,velec);
367 /* Calculate temporary vectorial force */
368 tx = _mm_mul_pd(fscal,dx13);
369 ty = _mm_mul_pd(fscal,dy13);
370 tz = _mm_mul_pd(fscal,dz13);
372 /* Update vectorial force */
373 fix1 = _mm_add_pd(fix1,tx);
374 fiy1 = _mm_add_pd(fiy1,ty);
375 fiz1 = _mm_add_pd(fiz1,tz);
377 fjx3 = _mm_add_pd(fjx3,tx);
378 fjy3 = _mm_add_pd(fjy3,ty);
379 fjz3 = _mm_add_pd(fjz3,tz);
381 /**************************
382 * CALCULATE INTERACTIONS *
383 **************************/
385 r21 = _mm_mul_pd(rsq21,rinv21);
387 /* Calculate table index by multiplying r with table scale and truncate to integer */
388 rt = _mm_mul_pd(r21,vftabscale);
389 vfitab = _mm_cvttpd_epi32(rt);
390 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
391 vfitab = _mm_slli_epi32(vfitab,2);
393 /* CUBIC SPLINE TABLE ELECTROSTATICS */
394 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
395 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
396 GMX_MM_TRANSPOSE2_PD(Y,F);
397 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
398 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
399 GMX_MM_TRANSPOSE2_PD(G,H);
400 Heps = _mm_mul_pd(vfeps,H);
401 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
402 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
403 velec = _mm_mul_pd(qq21,VV);
404 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
405 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
407 /* Update potential sum for this i atom from the interaction with this j atom. */
408 velecsum = _mm_add_pd(velecsum,velec);
412 /* Calculate temporary vectorial force */
413 tx = _mm_mul_pd(fscal,dx21);
414 ty = _mm_mul_pd(fscal,dy21);
415 tz = _mm_mul_pd(fscal,dz21);
417 /* Update vectorial force */
418 fix2 = _mm_add_pd(fix2,tx);
419 fiy2 = _mm_add_pd(fiy2,ty);
420 fiz2 = _mm_add_pd(fiz2,tz);
422 fjx1 = _mm_add_pd(fjx1,tx);
423 fjy1 = _mm_add_pd(fjy1,ty);
424 fjz1 = _mm_add_pd(fjz1,tz);
426 /**************************
427 * CALCULATE INTERACTIONS *
428 **************************/
430 r22 = _mm_mul_pd(rsq22,rinv22);
432 /* Calculate table index by multiplying r with table scale and truncate to integer */
433 rt = _mm_mul_pd(r22,vftabscale);
434 vfitab = _mm_cvttpd_epi32(rt);
435 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
436 vfitab = _mm_slli_epi32(vfitab,2);
438 /* CUBIC SPLINE TABLE ELECTROSTATICS */
439 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
440 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
441 GMX_MM_TRANSPOSE2_PD(Y,F);
442 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
443 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
444 GMX_MM_TRANSPOSE2_PD(G,H);
445 Heps = _mm_mul_pd(vfeps,H);
446 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
447 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
448 velec = _mm_mul_pd(qq22,VV);
449 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
450 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
452 /* Update potential sum for this i atom from the interaction with this j atom. */
453 velecsum = _mm_add_pd(velecsum,velec);
457 /* Calculate temporary vectorial force */
458 tx = _mm_mul_pd(fscal,dx22);
459 ty = _mm_mul_pd(fscal,dy22);
460 tz = _mm_mul_pd(fscal,dz22);
462 /* Update vectorial force */
463 fix2 = _mm_add_pd(fix2,tx);
464 fiy2 = _mm_add_pd(fiy2,ty);
465 fiz2 = _mm_add_pd(fiz2,tz);
467 fjx2 = _mm_add_pd(fjx2,tx);
468 fjy2 = _mm_add_pd(fjy2,ty);
469 fjz2 = _mm_add_pd(fjz2,tz);
471 /**************************
472 * CALCULATE INTERACTIONS *
473 **************************/
475 r23 = _mm_mul_pd(rsq23,rinv23);
477 /* Calculate table index by multiplying r with table scale and truncate to integer */
478 rt = _mm_mul_pd(r23,vftabscale);
479 vfitab = _mm_cvttpd_epi32(rt);
480 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
481 vfitab = _mm_slli_epi32(vfitab,2);
483 /* CUBIC SPLINE TABLE ELECTROSTATICS */
484 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
485 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
486 GMX_MM_TRANSPOSE2_PD(Y,F);
487 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
488 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
489 GMX_MM_TRANSPOSE2_PD(G,H);
490 Heps = _mm_mul_pd(vfeps,H);
491 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
492 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
493 velec = _mm_mul_pd(qq23,VV);
494 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
495 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
497 /* Update potential sum for this i atom from the interaction with this j atom. */
498 velecsum = _mm_add_pd(velecsum,velec);
502 /* Calculate temporary vectorial force */
503 tx = _mm_mul_pd(fscal,dx23);
504 ty = _mm_mul_pd(fscal,dy23);
505 tz = _mm_mul_pd(fscal,dz23);
507 /* Update vectorial force */
508 fix2 = _mm_add_pd(fix2,tx);
509 fiy2 = _mm_add_pd(fiy2,ty);
510 fiz2 = _mm_add_pd(fiz2,tz);
512 fjx3 = _mm_add_pd(fjx3,tx);
513 fjy3 = _mm_add_pd(fjy3,ty);
514 fjz3 = _mm_add_pd(fjz3,tz);
516 /**************************
517 * CALCULATE INTERACTIONS *
518 **************************/
520 r31 = _mm_mul_pd(rsq31,rinv31);
522 /* Calculate table index by multiplying r with table scale and truncate to integer */
523 rt = _mm_mul_pd(r31,vftabscale);
524 vfitab = _mm_cvttpd_epi32(rt);
525 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
526 vfitab = _mm_slli_epi32(vfitab,2);
528 /* CUBIC SPLINE TABLE ELECTROSTATICS */
529 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
530 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
531 GMX_MM_TRANSPOSE2_PD(Y,F);
532 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
533 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
534 GMX_MM_TRANSPOSE2_PD(G,H);
535 Heps = _mm_mul_pd(vfeps,H);
536 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
537 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
538 velec = _mm_mul_pd(qq31,VV);
539 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
540 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 velecsum = _mm_add_pd(velecsum,velec);
547 /* Calculate temporary vectorial force */
548 tx = _mm_mul_pd(fscal,dx31);
549 ty = _mm_mul_pd(fscal,dy31);
550 tz = _mm_mul_pd(fscal,dz31);
552 /* Update vectorial force */
553 fix3 = _mm_add_pd(fix3,tx);
554 fiy3 = _mm_add_pd(fiy3,ty);
555 fiz3 = _mm_add_pd(fiz3,tz);
557 fjx1 = _mm_add_pd(fjx1,tx);
558 fjy1 = _mm_add_pd(fjy1,ty);
559 fjz1 = _mm_add_pd(fjz1,tz);
561 /**************************
562 * CALCULATE INTERACTIONS *
563 **************************/
565 r32 = _mm_mul_pd(rsq32,rinv32);
567 /* Calculate table index by multiplying r with table scale and truncate to integer */
568 rt = _mm_mul_pd(r32,vftabscale);
569 vfitab = _mm_cvttpd_epi32(rt);
570 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
571 vfitab = _mm_slli_epi32(vfitab,2);
573 /* CUBIC SPLINE TABLE ELECTROSTATICS */
574 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
575 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
576 GMX_MM_TRANSPOSE2_PD(Y,F);
577 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
578 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
579 GMX_MM_TRANSPOSE2_PD(G,H);
580 Heps = _mm_mul_pd(vfeps,H);
581 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
582 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
583 velec = _mm_mul_pd(qq32,VV);
584 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
585 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
587 /* Update potential sum for this i atom from the interaction with this j atom. */
588 velecsum = _mm_add_pd(velecsum,velec);
592 /* Calculate temporary vectorial force */
593 tx = _mm_mul_pd(fscal,dx32);
594 ty = _mm_mul_pd(fscal,dy32);
595 tz = _mm_mul_pd(fscal,dz32);
597 /* Update vectorial force */
598 fix3 = _mm_add_pd(fix3,tx);
599 fiy3 = _mm_add_pd(fiy3,ty);
600 fiz3 = _mm_add_pd(fiz3,tz);
602 fjx2 = _mm_add_pd(fjx2,tx);
603 fjy2 = _mm_add_pd(fjy2,ty);
604 fjz2 = _mm_add_pd(fjz2,tz);
606 /**************************
607 * CALCULATE INTERACTIONS *
608 **************************/
610 r33 = _mm_mul_pd(rsq33,rinv33);
612 /* Calculate table index by multiplying r with table scale and truncate to integer */
613 rt = _mm_mul_pd(r33,vftabscale);
614 vfitab = _mm_cvttpd_epi32(rt);
615 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
616 vfitab = _mm_slli_epi32(vfitab,2);
618 /* CUBIC SPLINE TABLE ELECTROSTATICS */
619 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
620 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
621 GMX_MM_TRANSPOSE2_PD(Y,F);
622 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
623 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
624 GMX_MM_TRANSPOSE2_PD(G,H);
625 Heps = _mm_mul_pd(vfeps,H);
626 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
627 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
628 velec = _mm_mul_pd(qq33,VV);
629 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
630 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
632 /* Update potential sum for this i atom from the interaction with this j atom. */
633 velecsum = _mm_add_pd(velecsum,velec);
637 /* Calculate temporary vectorial force */
638 tx = _mm_mul_pd(fscal,dx33);
639 ty = _mm_mul_pd(fscal,dy33);
640 tz = _mm_mul_pd(fscal,dz33);
642 /* Update vectorial force */
643 fix3 = _mm_add_pd(fix3,tx);
644 fiy3 = _mm_add_pd(fiy3,ty);
645 fiz3 = _mm_add_pd(fiz3,tz);
647 fjx3 = _mm_add_pd(fjx3,tx);
648 fjy3 = _mm_add_pd(fjy3,ty);
649 fjz3 = _mm_add_pd(fjz3,tz);
651 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
653 /* Inner loop uses 387 flops */
660 j_coord_offsetA = DIM*jnrA;
662 /* load j atom coordinates */
663 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
664 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
666 /* Calculate displacement vector */
667 dx11 = _mm_sub_pd(ix1,jx1);
668 dy11 = _mm_sub_pd(iy1,jy1);
669 dz11 = _mm_sub_pd(iz1,jz1);
670 dx12 = _mm_sub_pd(ix1,jx2);
671 dy12 = _mm_sub_pd(iy1,jy2);
672 dz12 = _mm_sub_pd(iz1,jz2);
673 dx13 = _mm_sub_pd(ix1,jx3);
674 dy13 = _mm_sub_pd(iy1,jy3);
675 dz13 = _mm_sub_pd(iz1,jz3);
676 dx21 = _mm_sub_pd(ix2,jx1);
677 dy21 = _mm_sub_pd(iy2,jy1);
678 dz21 = _mm_sub_pd(iz2,jz1);
679 dx22 = _mm_sub_pd(ix2,jx2);
680 dy22 = _mm_sub_pd(iy2,jy2);
681 dz22 = _mm_sub_pd(iz2,jz2);
682 dx23 = _mm_sub_pd(ix2,jx3);
683 dy23 = _mm_sub_pd(iy2,jy3);
684 dz23 = _mm_sub_pd(iz2,jz3);
685 dx31 = _mm_sub_pd(ix3,jx1);
686 dy31 = _mm_sub_pd(iy3,jy1);
687 dz31 = _mm_sub_pd(iz3,jz1);
688 dx32 = _mm_sub_pd(ix3,jx2);
689 dy32 = _mm_sub_pd(iy3,jy2);
690 dz32 = _mm_sub_pd(iz3,jz2);
691 dx33 = _mm_sub_pd(ix3,jx3);
692 dy33 = _mm_sub_pd(iy3,jy3);
693 dz33 = _mm_sub_pd(iz3,jz3);
695 /* Calculate squared distance and things based on it */
696 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
697 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
698 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
699 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
700 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
701 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
702 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
703 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
704 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
706 rinv11 = gmx_mm_invsqrt_pd(rsq11);
707 rinv12 = gmx_mm_invsqrt_pd(rsq12);
708 rinv13 = gmx_mm_invsqrt_pd(rsq13);
709 rinv21 = gmx_mm_invsqrt_pd(rsq21);
710 rinv22 = gmx_mm_invsqrt_pd(rsq22);
711 rinv23 = gmx_mm_invsqrt_pd(rsq23);
712 rinv31 = gmx_mm_invsqrt_pd(rsq31);
713 rinv32 = gmx_mm_invsqrt_pd(rsq32);
714 rinv33 = gmx_mm_invsqrt_pd(rsq33);
716 fjx1 = _mm_setzero_pd();
717 fjy1 = _mm_setzero_pd();
718 fjz1 = _mm_setzero_pd();
719 fjx2 = _mm_setzero_pd();
720 fjy2 = _mm_setzero_pd();
721 fjz2 = _mm_setzero_pd();
722 fjx3 = _mm_setzero_pd();
723 fjy3 = _mm_setzero_pd();
724 fjz3 = _mm_setzero_pd();
726 /**************************
727 * CALCULATE INTERACTIONS *
728 **************************/
730 r11 = _mm_mul_pd(rsq11,rinv11);
732 /* Calculate table index by multiplying r with table scale and truncate to integer */
733 rt = _mm_mul_pd(r11,vftabscale);
734 vfitab = _mm_cvttpd_epi32(rt);
735 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
736 vfitab = _mm_slli_epi32(vfitab,2);
738 /* CUBIC SPLINE TABLE ELECTROSTATICS */
739 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
740 F = _mm_setzero_pd();
741 GMX_MM_TRANSPOSE2_PD(Y,F);
742 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
743 H = _mm_setzero_pd();
744 GMX_MM_TRANSPOSE2_PD(G,H);
745 Heps = _mm_mul_pd(vfeps,H);
746 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
747 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
748 velec = _mm_mul_pd(qq11,VV);
749 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
750 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
752 /* Update potential sum for this i atom from the interaction with this j atom. */
753 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
754 velecsum = _mm_add_pd(velecsum,velec);
758 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
760 /* Calculate temporary vectorial force */
761 tx = _mm_mul_pd(fscal,dx11);
762 ty = _mm_mul_pd(fscal,dy11);
763 tz = _mm_mul_pd(fscal,dz11);
765 /* Update vectorial force */
766 fix1 = _mm_add_pd(fix1,tx);
767 fiy1 = _mm_add_pd(fiy1,ty);
768 fiz1 = _mm_add_pd(fiz1,tz);
770 fjx1 = _mm_add_pd(fjx1,tx);
771 fjy1 = _mm_add_pd(fjy1,ty);
772 fjz1 = _mm_add_pd(fjz1,tz);
774 /**************************
775 * CALCULATE INTERACTIONS *
776 **************************/
778 r12 = _mm_mul_pd(rsq12,rinv12);
780 /* Calculate table index by multiplying r with table scale and truncate to integer */
781 rt = _mm_mul_pd(r12,vftabscale);
782 vfitab = _mm_cvttpd_epi32(rt);
783 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
784 vfitab = _mm_slli_epi32(vfitab,2);
786 /* CUBIC SPLINE TABLE ELECTROSTATICS */
787 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
788 F = _mm_setzero_pd();
789 GMX_MM_TRANSPOSE2_PD(Y,F);
790 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
791 H = _mm_setzero_pd();
792 GMX_MM_TRANSPOSE2_PD(G,H);
793 Heps = _mm_mul_pd(vfeps,H);
794 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
795 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
796 velec = _mm_mul_pd(qq12,VV);
797 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
798 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
800 /* Update potential sum for this i atom from the interaction with this j atom. */
801 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
802 velecsum = _mm_add_pd(velecsum,velec);
806 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
808 /* Calculate temporary vectorial force */
809 tx = _mm_mul_pd(fscal,dx12);
810 ty = _mm_mul_pd(fscal,dy12);
811 tz = _mm_mul_pd(fscal,dz12);
813 /* Update vectorial force */
814 fix1 = _mm_add_pd(fix1,tx);
815 fiy1 = _mm_add_pd(fiy1,ty);
816 fiz1 = _mm_add_pd(fiz1,tz);
818 fjx2 = _mm_add_pd(fjx2,tx);
819 fjy2 = _mm_add_pd(fjy2,ty);
820 fjz2 = _mm_add_pd(fjz2,tz);
822 /**************************
823 * CALCULATE INTERACTIONS *
824 **************************/
826 r13 = _mm_mul_pd(rsq13,rinv13);
828 /* Calculate table index by multiplying r with table scale and truncate to integer */
829 rt = _mm_mul_pd(r13,vftabscale);
830 vfitab = _mm_cvttpd_epi32(rt);
831 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
832 vfitab = _mm_slli_epi32(vfitab,2);
834 /* CUBIC SPLINE TABLE ELECTROSTATICS */
835 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
836 F = _mm_setzero_pd();
837 GMX_MM_TRANSPOSE2_PD(Y,F);
838 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
839 H = _mm_setzero_pd();
840 GMX_MM_TRANSPOSE2_PD(G,H);
841 Heps = _mm_mul_pd(vfeps,H);
842 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
843 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
844 velec = _mm_mul_pd(qq13,VV);
845 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
846 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
850 velecsum = _mm_add_pd(velecsum,velec);
854 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
856 /* Calculate temporary vectorial force */
857 tx = _mm_mul_pd(fscal,dx13);
858 ty = _mm_mul_pd(fscal,dy13);
859 tz = _mm_mul_pd(fscal,dz13);
861 /* Update vectorial force */
862 fix1 = _mm_add_pd(fix1,tx);
863 fiy1 = _mm_add_pd(fiy1,ty);
864 fiz1 = _mm_add_pd(fiz1,tz);
866 fjx3 = _mm_add_pd(fjx3,tx);
867 fjy3 = _mm_add_pd(fjy3,ty);
868 fjz3 = _mm_add_pd(fjz3,tz);
870 /**************************
871 * CALCULATE INTERACTIONS *
872 **************************/
874 r21 = _mm_mul_pd(rsq21,rinv21);
876 /* Calculate table index by multiplying r with table scale and truncate to integer */
877 rt = _mm_mul_pd(r21,vftabscale);
878 vfitab = _mm_cvttpd_epi32(rt);
879 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
880 vfitab = _mm_slli_epi32(vfitab,2);
882 /* CUBIC SPLINE TABLE ELECTROSTATICS */
883 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
884 F = _mm_setzero_pd();
885 GMX_MM_TRANSPOSE2_PD(Y,F);
886 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
887 H = _mm_setzero_pd();
888 GMX_MM_TRANSPOSE2_PD(G,H);
889 Heps = _mm_mul_pd(vfeps,H);
890 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
891 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
892 velec = _mm_mul_pd(qq21,VV);
893 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
894 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
896 /* Update potential sum for this i atom from the interaction with this j atom. */
897 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
898 velecsum = _mm_add_pd(velecsum,velec);
902 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
904 /* Calculate temporary vectorial force */
905 tx = _mm_mul_pd(fscal,dx21);
906 ty = _mm_mul_pd(fscal,dy21);
907 tz = _mm_mul_pd(fscal,dz21);
909 /* Update vectorial force */
910 fix2 = _mm_add_pd(fix2,tx);
911 fiy2 = _mm_add_pd(fiy2,ty);
912 fiz2 = _mm_add_pd(fiz2,tz);
914 fjx1 = _mm_add_pd(fjx1,tx);
915 fjy1 = _mm_add_pd(fjy1,ty);
916 fjz1 = _mm_add_pd(fjz1,tz);
918 /**************************
919 * CALCULATE INTERACTIONS *
920 **************************/
922 r22 = _mm_mul_pd(rsq22,rinv22);
924 /* Calculate table index by multiplying r with table scale and truncate to integer */
925 rt = _mm_mul_pd(r22,vftabscale);
926 vfitab = _mm_cvttpd_epi32(rt);
927 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
928 vfitab = _mm_slli_epi32(vfitab,2);
930 /* CUBIC SPLINE TABLE ELECTROSTATICS */
931 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
932 F = _mm_setzero_pd();
933 GMX_MM_TRANSPOSE2_PD(Y,F);
934 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
935 H = _mm_setzero_pd();
936 GMX_MM_TRANSPOSE2_PD(G,H);
937 Heps = _mm_mul_pd(vfeps,H);
938 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
939 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
940 velec = _mm_mul_pd(qq22,VV);
941 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
942 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
944 /* Update potential sum for this i atom from the interaction with this j atom. */
945 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
946 velecsum = _mm_add_pd(velecsum,velec);
950 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
952 /* Calculate temporary vectorial force */
953 tx = _mm_mul_pd(fscal,dx22);
954 ty = _mm_mul_pd(fscal,dy22);
955 tz = _mm_mul_pd(fscal,dz22);
957 /* Update vectorial force */
958 fix2 = _mm_add_pd(fix2,tx);
959 fiy2 = _mm_add_pd(fiy2,ty);
960 fiz2 = _mm_add_pd(fiz2,tz);
962 fjx2 = _mm_add_pd(fjx2,tx);
963 fjy2 = _mm_add_pd(fjy2,ty);
964 fjz2 = _mm_add_pd(fjz2,tz);
966 /**************************
967 * CALCULATE INTERACTIONS *
968 **************************/
970 r23 = _mm_mul_pd(rsq23,rinv23);
972 /* Calculate table index by multiplying r with table scale and truncate to integer */
973 rt = _mm_mul_pd(r23,vftabscale);
974 vfitab = _mm_cvttpd_epi32(rt);
975 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
976 vfitab = _mm_slli_epi32(vfitab,2);
978 /* CUBIC SPLINE TABLE ELECTROSTATICS */
979 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
980 F = _mm_setzero_pd();
981 GMX_MM_TRANSPOSE2_PD(Y,F);
982 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
983 H = _mm_setzero_pd();
984 GMX_MM_TRANSPOSE2_PD(G,H);
985 Heps = _mm_mul_pd(vfeps,H);
986 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
987 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
988 velec = _mm_mul_pd(qq23,VV);
989 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
990 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
992 /* Update potential sum for this i atom from the interaction with this j atom. */
993 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
994 velecsum = _mm_add_pd(velecsum,velec);
998 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1000 /* Calculate temporary vectorial force */
1001 tx = _mm_mul_pd(fscal,dx23);
1002 ty = _mm_mul_pd(fscal,dy23);
1003 tz = _mm_mul_pd(fscal,dz23);
1005 /* Update vectorial force */
1006 fix2 = _mm_add_pd(fix2,tx);
1007 fiy2 = _mm_add_pd(fiy2,ty);
1008 fiz2 = _mm_add_pd(fiz2,tz);
1010 fjx3 = _mm_add_pd(fjx3,tx);
1011 fjy3 = _mm_add_pd(fjy3,ty);
1012 fjz3 = _mm_add_pd(fjz3,tz);
1014 /**************************
1015 * CALCULATE INTERACTIONS *
1016 **************************/
1018 r31 = _mm_mul_pd(rsq31,rinv31);
1020 /* Calculate table index by multiplying r with table scale and truncate to integer */
1021 rt = _mm_mul_pd(r31,vftabscale);
1022 vfitab = _mm_cvttpd_epi32(rt);
1023 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1024 vfitab = _mm_slli_epi32(vfitab,2);
1026 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1027 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1028 F = _mm_setzero_pd();
1029 GMX_MM_TRANSPOSE2_PD(Y,F);
1030 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1031 H = _mm_setzero_pd();
1032 GMX_MM_TRANSPOSE2_PD(G,H);
1033 Heps = _mm_mul_pd(vfeps,H);
1034 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1035 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1036 velec = _mm_mul_pd(qq31,VV);
1037 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1038 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1040 /* Update potential sum for this i atom from the interaction with this j atom. */
1041 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1042 velecsum = _mm_add_pd(velecsum,velec);
1046 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1048 /* Calculate temporary vectorial force */
1049 tx = _mm_mul_pd(fscal,dx31);
1050 ty = _mm_mul_pd(fscal,dy31);
1051 tz = _mm_mul_pd(fscal,dz31);
1053 /* Update vectorial force */
1054 fix3 = _mm_add_pd(fix3,tx);
1055 fiy3 = _mm_add_pd(fiy3,ty);
1056 fiz3 = _mm_add_pd(fiz3,tz);
1058 fjx1 = _mm_add_pd(fjx1,tx);
1059 fjy1 = _mm_add_pd(fjy1,ty);
1060 fjz1 = _mm_add_pd(fjz1,tz);
1062 /**************************
1063 * CALCULATE INTERACTIONS *
1064 **************************/
1066 r32 = _mm_mul_pd(rsq32,rinv32);
1068 /* Calculate table index by multiplying r with table scale and truncate to integer */
1069 rt = _mm_mul_pd(r32,vftabscale);
1070 vfitab = _mm_cvttpd_epi32(rt);
1071 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1072 vfitab = _mm_slli_epi32(vfitab,2);
1074 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1075 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1076 F = _mm_setzero_pd();
1077 GMX_MM_TRANSPOSE2_PD(Y,F);
1078 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1079 H = _mm_setzero_pd();
1080 GMX_MM_TRANSPOSE2_PD(G,H);
1081 Heps = _mm_mul_pd(vfeps,H);
1082 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1083 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1084 velec = _mm_mul_pd(qq32,VV);
1085 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1086 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1088 /* Update potential sum for this i atom from the interaction with this j atom. */
1089 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1090 velecsum = _mm_add_pd(velecsum,velec);
1094 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1096 /* Calculate temporary vectorial force */
1097 tx = _mm_mul_pd(fscal,dx32);
1098 ty = _mm_mul_pd(fscal,dy32);
1099 tz = _mm_mul_pd(fscal,dz32);
1101 /* Update vectorial force */
1102 fix3 = _mm_add_pd(fix3,tx);
1103 fiy3 = _mm_add_pd(fiy3,ty);
1104 fiz3 = _mm_add_pd(fiz3,tz);
1106 fjx2 = _mm_add_pd(fjx2,tx);
1107 fjy2 = _mm_add_pd(fjy2,ty);
1108 fjz2 = _mm_add_pd(fjz2,tz);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 r33 = _mm_mul_pd(rsq33,rinv33);
1116 /* Calculate table index by multiplying r with table scale and truncate to integer */
1117 rt = _mm_mul_pd(r33,vftabscale);
1118 vfitab = _mm_cvttpd_epi32(rt);
1119 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1120 vfitab = _mm_slli_epi32(vfitab,2);
1122 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1123 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1124 F = _mm_setzero_pd();
1125 GMX_MM_TRANSPOSE2_PD(Y,F);
1126 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1127 H = _mm_setzero_pd();
1128 GMX_MM_TRANSPOSE2_PD(G,H);
1129 Heps = _mm_mul_pd(vfeps,H);
1130 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1131 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1132 velec = _mm_mul_pd(qq33,VV);
1133 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1134 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1136 /* Update potential sum for this i atom from the interaction with this j atom. */
1137 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1138 velecsum = _mm_add_pd(velecsum,velec);
1142 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1144 /* Calculate temporary vectorial force */
1145 tx = _mm_mul_pd(fscal,dx33);
1146 ty = _mm_mul_pd(fscal,dy33);
1147 tz = _mm_mul_pd(fscal,dz33);
1149 /* Update vectorial force */
1150 fix3 = _mm_add_pd(fix3,tx);
1151 fiy3 = _mm_add_pd(fiy3,ty);
1152 fiz3 = _mm_add_pd(fiz3,tz);
1154 fjx3 = _mm_add_pd(fjx3,tx);
1155 fjy3 = _mm_add_pd(fjy3,ty);
1156 fjz3 = _mm_add_pd(fjz3,tz);
1158 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1160 /* Inner loop uses 387 flops */
1163 /* End of innermost loop */
1165 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1166 f+i_coord_offset+DIM,fshift+i_shift_offset);
1169 /* Update potential energies */
1170 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1172 /* Increment number of inner iterations */
1173 inneriter += j_index_end - j_index_start;
1175 /* Outer loop uses 19 flops */
1178 /* Increment number of outer iterations */
1181 /* Update outer/inner flops */
1183 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*387);
1186 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_double
1187 * Electrostatics interaction: CubicSplineTable
1188 * VdW interaction: None
1189 * Geometry: Water4-Water4
1190 * Calculate force/pot: Force
1193 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_double
1194 (t_nblist * gmx_restrict nlist,
1195 rvec * gmx_restrict xx,
1196 rvec * gmx_restrict ff,
1197 t_forcerec * gmx_restrict fr,
1198 t_mdatoms * gmx_restrict mdatoms,
1199 nb_kernel_data_t * gmx_restrict kernel_data,
1200 t_nrnb * gmx_restrict nrnb)
1202 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1203 * just 0 for non-waters.
1204 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1205 * jnr indices corresponding to data put in the two positions in the SIMD register.
1207 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1208 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1210 int j_coord_offsetA,j_coord_offsetB;
1211 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1212 real rcutoff_scalar;
1213 real *shiftvec,*fshift,*x,*f;
1214 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1216 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1218 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1220 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1221 int vdwjidx1A,vdwjidx1B;
1222 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1223 int vdwjidx2A,vdwjidx2B;
1224 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1225 int vdwjidx3A,vdwjidx3B;
1226 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1227 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1228 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1229 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1230 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1231 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1232 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1233 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1234 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1235 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1236 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1239 __m128i ifour = _mm_set1_epi32(4);
1240 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1242 __m128d dummy_mask,cutoff_mask;
1243 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1244 __m128d one = _mm_set1_pd(1.0);
1245 __m128d two = _mm_set1_pd(2.0);
1251 jindex = nlist->jindex;
1253 shiftidx = nlist->shift;
1255 shiftvec = fr->shift_vec[0];
1256 fshift = fr->fshift[0];
1257 facel = _mm_set1_pd(fr->epsfac);
1258 charge = mdatoms->chargeA;
1260 vftab = kernel_data->table_elec->data;
1261 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1263 /* Setup water-specific parameters */
1264 inr = nlist->iinr[0];
1265 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1266 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1267 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1269 jq1 = _mm_set1_pd(charge[inr+1]);
1270 jq2 = _mm_set1_pd(charge[inr+2]);
1271 jq3 = _mm_set1_pd(charge[inr+3]);
1272 qq11 = _mm_mul_pd(iq1,jq1);
1273 qq12 = _mm_mul_pd(iq1,jq2);
1274 qq13 = _mm_mul_pd(iq1,jq3);
1275 qq21 = _mm_mul_pd(iq2,jq1);
1276 qq22 = _mm_mul_pd(iq2,jq2);
1277 qq23 = _mm_mul_pd(iq2,jq3);
1278 qq31 = _mm_mul_pd(iq3,jq1);
1279 qq32 = _mm_mul_pd(iq3,jq2);
1280 qq33 = _mm_mul_pd(iq3,jq3);
1282 /* Avoid stupid compiler warnings */
1284 j_coord_offsetA = 0;
1285 j_coord_offsetB = 0;
1290 /* Start outer loop over neighborlists */
1291 for(iidx=0; iidx<nri; iidx++)
1293 /* Load shift vector for this list */
1294 i_shift_offset = DIM*shiftidx[iidx];
1296 /* Load limits for loop over neighbors */
1297 j_index_start = jindex[iidx];
1298 j_index_end = jindex[iidx+1];
1300 /* Get outer coordinate index */
1302 i_coord_offset = DIM*inr;
1304 /* Load i particle coords and add shift vector */
1305 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1306 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1308 fix1 = _mm_setzero_pd();
1309 fiy1 = _mm_setzero_pd();
1310 fiz1 = _mm_setzero_pd();
1311 fix2 = _mm_setzero_pd();
1312 fiy2 = _mm_setzero_pd();
1313 fiz2 = _mm_setzero_pd();
1314 fix3 = _mm_setzero_pd();
1315 fiy3 = _mm_setzero_pd();
1316 fiz3 = _mm_setzero_pd();
1318 /* Start inner kernel loop */
1319 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1322 /* Get j neighbor index, and coordinate index */
1324 jnrB = jjnr[jidx+1];
1325 j_coord_offsetA = DIM*jnrA;
1326 j_coord_offsetB = DIM*jnrB;
1328 /* load j atom coordinates */
1329 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1330 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1332 /* Calculate displacement vector */
1333 dx11 = _mm_sub_pd(ix1,jx1);
1334 dy11 = _mm_sub_pd(iy1,jy1);
1335 dz11 = _mm_sub_pd(iz1,jz1);
1336 dx12 = _mm_sub_pd(ix1,jx2);
1337 dy12 = _mm_sub_pd(iy1,jy2);
1338 dz12 = _mm_sub_pd(iz1,jz2);
1339 dx13 = _mm_sub_pd(ix1,jx3);
1340 dy13 = _mm_sub_pd(iy1,jy3);
1341 dz13 = _mm_sub_pd(iz1,jz3);
1342 dx21 = _mm_sub_pd(ix2,jx1);
1343 dy21 = _mm_sub_pd(iy2,jy1);
1344 dz21 = _mm_sub_pd(iz2,jz1);
1345 dx22 = _mm_sub_pd(ix2,jx2);
1346 dy22 = _mm_sub_pd(iy2,jy2);
1347 dz22 = _mm_sub_pd(iz2,jz2);
1348 dx23 = _mm_sub_pd(ix2,jx3);
1349 dy23 = _mm_sub_pd(iy2,jy3);
1350 dz23 = _mm_sub_pd(iz2,jz3);
1351 dx31 = _mm_sub_pd(ix3,jx1);
1352 dy31 = _mm_sub_pd(iy3,jy1);
1353 dz31 = _mm_sub_pd(iz3,jz1);
1354 dx32 = _mm_sub_pd(ix3,jx2);
1355 dy32 = _mm_sub_pd(iy3,jy2);
1356 dz32 = _mm_sub_pd(iz3,jz2);
1357 dx33 = _mm_sub_pd(ix3,jx3);
1358 dy33 = _mm_sub_pd(iy3,jy3);
1359 dz33 = _mm_sub_pd(iz3,jz3);
1361 /* Calculate squared distance and things based on it */
1362 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1363 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1364 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1365 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1366 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1367 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1368 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1369 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1370 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1372 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1373 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1374 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1375 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1376 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1377 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1378 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1379 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1380 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1382 fjx1 = _mm_setzero_pd();
1383 fjy1 = _mm_setzero_pd();
1384 fjz1 = _mm_setzero_pd();
1385 fjx2 = _mm_setzero_pd();
1386 fjy2 = _mm_setzero_pd();
1387 fjz2 = _mm_setzero_pd();
1388 fjx3 = _mm_setzero_pd();
1389 fjy3 = _mm_setzero_pd();
1390 fjz3 = _mm_setzero_pd();
1392 /**************************
1393 * CALCULATE INTERACTIONS *
1394 **************************/
1396 r11 = _mm_mul_pd(rsq11,rinv11);
1398 /* Calculate table index by multiplying r with table scale and truncate to integer */
1399 rt = _mm_mul_pd(r11,vftabscale);
1400 vfitab = _mm_cvttpd_epi32(rt);
1401 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1402 vfitab = _mm_slli_epi32(vfitab,2);
1404 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1405 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1406 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1407 GMX_MM_TRANSPOSE2_PD(Y,F);
1408 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1409 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1410 GMX_MM_TRANSPOSE2_PD(G,H);
1411 Heps = _mm_mul_pd(vfeps,H);
1412 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1413 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1414 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1418 /* Calculate temporary vectorial force */
1419 tx = _mm_mul_pd(fscal,dx11);
1420 ty = _mm_mul_pd(fscal,dy11);
1421 tz = _mm_mul_pd(fscal,dz11);
1423 /* Update vectorial force */
1424 fix1 = _mm_add_pd(fix1,tx);
1425 fiy1 = _mm_add_pd(fiy1,ty);
1426 fiz1 = _mm_add_pd(fiz1,tz);
1428 fjx1 = _mm_add_pd(fjx1,tx);
1429 fjy1 = _mm_add_pd(fjy1,ty);
1430 fjz1 = _mm_add_pd(fjz1,tz);
1432 /**************************
1433 * CALCULATE INTERACTIONS *
1434 **************************/
1436 r12 = _mm_mul_pd(rsq12,rinv12);
1438 /* Calculate table index by multiplying r with table scale and truncate to integer */
1439 rt = _mm_mul_pd(r12,vftabscale);
1440 vfitab = _mm_cvttpd_epi32(rt);
1441 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1442 vfitab = _mm_slli_epi32(vfitab,2);
1444 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1445 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1446 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1447 GMX_MM_TRANSPOSE2_PD(Y,F);
1448 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1449 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1450 GMX_MM_TRANSPOSE2_PD(G,H);
1451 Heps = _mm_mul_pd(vfeps,H);
1452 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1453 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1454 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1458 /* Calculate temporary vectorial force */
1459 tx = _mm_mul_pd(fscal,dx12);
1460 ty = _mm_mul_pd(fscal,dy12);
1461 tz = _mm_mul_pd(fscal,dz12);
1463 /* Update vectorial force */
1464 fix1 = _mm_add_pd(fix1,tx);
1465 fiy1 = _mm_add_pd(fiy1,ty);
1466 fiz1 = _mm_add_pd(fiz1,tz);
1468 fjx2 = _mm_add_pd(fjx2,tx);
1469 fjy2 = _mm_add_pd(fjy2,ty);
1470 fjz2 = _mm_add_pd(fjz2,tz);
1472 /**************************
1473 * CALCULATE INTERACTIONS *
1474 **************************/
1476 r13 = _mm_mul_pd(rsq13,rinv13);
1478 /* Calculate table index by multiplying r with table scale and truncate to integer */
1479 rt = _mm_mul_pd(r13,vftabscale);
1480 vfitab = _mm_cvttpd_epi32(rt);
1481 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1482 vfitab = _mm_slli_epi32(vfitab,2);
1484 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1485 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1486 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1487 GMX_MM_TRANSPOSE2_PD(Y,F);
1488 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1489 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1490 GMX_MM_TRANSPOSE2_PD(G,H);
1491 Heps = _mm_mul_pd(vfeps,H);
1492 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1493 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1494 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1498 /* Calculate temporary vectorial force */
1499 tx = _mm_mul_pd(fscal,dx13);
1500 ty = _mm_mul_pd(fscal,dy13);
1501 tz = _mm_mul_pd(fscal,dz13);
1503 /* Update vectorial force */
1504 fix1 = _mm_add_pd(fix1,tx);
1505 fiy1 = _mm_add_pd(fiy1,ty);
1506 fiz1 = _mm_add_pd(fiz1,tz);
1508 fjx3 = _mm_add_pd(fjx3,tx);
1509 fjy3 = _mm_add_pd(fjy3,ty);
1510 fjz3 = _mm_add_pd(fjz3,tz);
1512 /**************************
1513 * CALCULATE INTERACTIONS *
1514 **************************/
1516 r21 = _mm_mul_pd(rsq21,rinv21);
1518 /* Calculate table index by multiplying r with table scale and truncate to integer */
1519 rt = _mm_mul_pd(r21,vftabscale);
1520 vfitab = _mm_cvttpd_epi32(rt);
1521 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1522 vfitab = _mm_slli_epi32(vfitab,2);
1524 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1525 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1526 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1527 GMX_MM_TRANSPOSE2_PD(Y,F);
1528 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1529 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1530 GMX_MM_TRANSPOSE2_PD(G,H);
1531 Heps = _mm_mul_pd(vfeps,H);
1532 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1533 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1534 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1538 /* Calculate temporary vectorial force */
1539 tx = _mm_mul_pd(fscal,dx21);
1540 ty = _mm_mul_pd(fscal,dy21);
1541 tz = _mm_mul_pd(fscal,dz21);
1543 /* Update vectorial force */
1544 fix2 = _mm_add_pd(fix2,tx);
1545 fiy2 = _mm_add_pd(fiy2,ty);
1546 fiz2 = _mm_add_pd(fiz2,tz);
1548 fjx1 = _mm_add_pd(fjx1,tx);
1549 fjy1 = _mm_add_pd(fjy1,ty);
1550 fjz1 = _mm_add_pd(fjz1,tz);
1552 /**************************
1553 * CALCULATE INTERACTIONS *
1554 **************************/
1556 r22 = _mm_mul_pd(rsq22,rinv22);
1558 /* Calculate table index by multiplying r with table scale and truncate to integer */
1559 rt = _mm_mul_pd(r22,vftabscale);
1560 vfitab = _mm_cvttpd_epi32(rt);
1561 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1562 vfitab = _mm_slli_epi32(vfitab,2);
1564 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1565 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1566 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1567 GMX_MM_TRANSPOSE2_PD(Y,F);
1568 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1569 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1570 GMX_MM_TRANSPOSE2_PD(G,H);
1571 Heps = _mm_mul_pd(vfeps,H);
1572 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1573 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1574 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1578 /* Calculate temporary vectorial force */
1579 tx = _mm_mul_pd(fscal,dx22);
1580 ty = _mm_mul_pd(fscal,dy22);
1581 tz = _mm_mul_pd(fscal,dz22);
1583 /* Update vectorial force */
1584 fix2 = _mm_add_pd(fix2,tx);
1585 fiy2 = _mm_add_pd(fiy2,ty);
1586 fiz2 = _mm_add_pd(fiz2,tz);
1588 fjx2 = _mm_add_pd(fjx2,tx);
1589 fjy2 = _mm_add_pd(fjy2,ty);
1590 fjz2 = _mm_add_pd(fjz2,tz);
1592 /**************************
1593 * CALCULATE INTERACTIONS *
1594 **************************/
1596 r23 = _mm_mul_pd(rsq23,rinv23);
1598 /* Calculate table index by multiplying r with table scale and truncate to integer */
1599 rt = _mm_mul_pd(r23,vftabscale);
1600 vfitab = _mm_cvttpd_epi32(rt);
1601 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1602 vfitab = _mm_slli_epi32(vfitab,2);
1604 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1605 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1606 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1607 GMX_MM_TRANSPOSE2_PD(Y,F);
1608 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1609 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1610 GMX_MM_TRANSPOSE2_PD(G,H);
1611 Heps = _mm_mul_pd(vfeps,H);
1612 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1613 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1614 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1618 /* Calculate temporary vectorial force */
1619 tx = _mm_mul_pd(fscal,dx23);
1620 ty = _mm_mul_pd(fscal,dy23);
1621 tz = _mm_mul_pd(fscal,dz23);
1623 /* Update vectorial force */
1624 fix2 = _mm_add_pd(fix2,tx);
1625 fiy2 = _mm_add_pd(fiy2,ty);
1626 fiz2 = _mm_add_pd(fiz2,tz);
1628 fjx3 = _mm_add_pd(fjx3,tx);
1629 fjy3 = _mm_add_pd(fjy3,ty);
1630 fjz3 = _mm_add_pd(fjz3,tz);
1632 /**************************
1633 * CALCULATE INTERACTIONS *
1634 **************************/
1636 r31 = _mm_mul_pd(rsq31,rinv31);
1638 /* Calculate table index by multiplying r with table scale and truncate to integer */
1639 rt = _mm_mul_pd(r31,vftabscale);
1640 vfitab = _mm_cvttpd_epi32(rt);
1641 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1642 vfitab = _mm_slli_epi32(vfitab,2);
1644 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1645 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1646 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1647 GMX_MM_TRANSPOSE2_PD(Y,F);
1648 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1649 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1650 GMX_MM_TRANSPOSE2_PD(G,H);
1651 Heps = _mm_mul_pd(vfeps,H);
1652 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1653 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1654 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1658 /* Calculate temporary vectorial force */
1659 tx = _mm_mul_pd(fscal,dx31);
1660 ty = _mm_mul_pd(fscal,dy31);
1661 tz = _mm_mul_pd(fscal,dz31);
1663 /* Update vectorial force */
1664 fix3 = _mm_add_pd(fix3,tx);
1665 fiy3 = _mm_add_pd(fiy3,ty);
1666 fiz3 = _mm_add_pd(fiz3,tz);
1668 fjx1 = _mm_add_pd(fjx1,tx);
1669 fjy1 = _mm_add_pd(fjy1,ty);
1670 fjz1 = _mm_add_pd(fjz1,tz);
1672 /**************************
1673 * CALCULATE INTERACTIONS *
1674 **************************/
1676 r32 = _mm_mul_pd(rsq32,rinv32);
1678 /* Calculate table index by multiplying r with table scale and truncate to integer */
1679 rt = _mm_mul_pd(r32,vftabscale);
1680 vfitab = _mm_cvttpd_epi32(rt);
1681 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1682 vfitab = _mm_slli_epi32(vfitab,2);
1684 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1685 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1686 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1687 GMX_MM_TRANSPOSE2_PD(Y,F);
1688 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1689 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1690 GMX_MM_TRANSPOSE2_PD(G,H);
1691 Heps = _mm_mul_pd(vfeps,H);
1692 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1693 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1694 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1698 /* Calculate temporary vectorial force */
1699 tx = _mm_mul_pd(fscal,dx32);
1700 ty = _mm_mul_pd(fscal,dy32);
1701 tz = _mm_mul_pd(fscal,dz32);
1703 /* Update vectorial force */
1704 fix3 = _mm_add_pd(fix3,tx);
1705 fiy3 = _mm_add_pd(fiy3,ty);
1706 fiz3 = _mm_add_pd(fiz3,tz);
1708 fjx2 = _mm_add_pd(fjx2,tx);
1709 fjy2 = _mm_add_pd(fjy2,ty);
1710 fjz2 = _mm_add_pd(fjz2,tz);
1712 /**************************
1713 * CALCULATE INTERACTIONS *
1714 **************************/
1716 r33 = _mm_mul_pd(rsq33,rinv33);
1718 /* Calculate table index by multiplying r with table scale and truncate to integer */
1719 rt = _mm_mul_pd(r33,vftabscale);
1720 vfitab = _mm_cvttpd_epi32(rt);
1721 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1722 vfitab = _mm_slli_epi32(vfitab,2);
1724 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1725 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1726 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1727 GMX_MM_TRANSPOSE2_PD(Y,F);
1728 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1729 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1730 GMX_MM_TRANSPOSE2_PD(G,H);
1731 Heps = _mm_mul_pd(vfeps,H);
1732 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1733 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1734 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1738 /* Calculate temporary vectorial force */
1739 tx = _mm_mul_pd(fscal,dx33);
1740 ty = _mm_mul_pd(fscal,dy33);
1741 tz = _mm_mul_pd(fscal,dz33);
1743 /* Update vectorial force */
1744 fix3 = _mm_add_pd(fix3,tx);
1745 fiy3 = _mm_add_pd(fiy3,ty);
1746 fiz3 = _mm_add_pd(fiz3,tz);
1748 fjx3 = _mm_add_pd(fjx3,tx);
1749 fjy3 = _mm_add_pd(fjy3,ty);
1750 fjz3 = _mm_add_pd(fjz3,tz);
1752 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1754 /* Inner loop uses 351 flops */
1757 if(jidx<j_index_end)
1761 j_coord_offsetA = DIM*jnrA;
1763 /* load j atom coordinates */
1764 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
1765 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1767 /* Calculate displacement vector */
1768 dx11 = _mm_sub_pd(ix1,jx1);
1769 dy11 = _mm_sub_pd(iy1,jy1);
1770 dz11 = _mm_sub_pd(iz1,jz1);
1771 dx12 = _mm_sub_pd(ix1,jx2);
1772 dy12 = _mm_sub_pd(iy1,jy2);
1773 dz12 = _mm_sub_pd(iz1,jz2);
1774 dx13 = _mm_sub_pd(ix1,jx3);
1775 dy13 = _mm_sub_pd(iy1,jy3);
1776 dz13 = _mm_sub_pd(iz1,jz3);
1777 dx21 = _mm_sub_pd(ix2,jx1);
1778 dy21 = _mm_sub_pd(iy2,jy1);
1779 dz21 = _mm_sub_pd(iz2,jz1);
1780 dx22 = _mm_sub_pd(ix2,jx2);
1781 dy22 = _mm_sub_pd(iy2,jy2);
1782 dz22 = _mm_sub_pd(iz2,jz2);
1783 dx23 = _mm_sub_pd(ix2,jx3);
1784 dy23 = _mm_sub_pd(iy2,jy3);
1785 dz23 = _mm_sub_pd(iz2,jz3);
1786 dx31 = _mm_sub_pd(ix3,jx1);
1787 dy31 = _mm_sub_pd(iy3,jy1);
1788 dz31 = _mm_sub_pd(iz3,jz1);
1789 dx32 = _mm_sub_pd(ix3,jx2);
1790 dy32 = _mm_sub_pd(iy3,jy2);
1791 dz32 = _mm_sub_pd(iz3,jz2);
1792 dx33 = _mm_sub_pd(ix3,jx3);
1793 dy33 = _mm_sub_pd(iy3,jy3);
1794 dz33 = _mm_sub_pd(iz3,jz3);
1796 /* Calculate squared distance and things based on it */
1797 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1798 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1799 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1800 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1801 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1802 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1803 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1804 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1805 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1807 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1808 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1809 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1810 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1811 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1812 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1813 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1814 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1815 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1817 fjx1 = _mm_setzero_pd();
1818 fjy1 = _mm_setzero_pd();
1819 fjz1 = _mm_setzero_pd();
1820 fjx2 = _mm_setzero_pd();
1821 fjy2 = _mm_setzero_pd();
1822 fjz2 = _mm_setzero_pd();
1823 fjx3 = _mm_setzero_pd();
1824 fjy3 = _mm_setzero_pd();
1825 fjz3 = _mm_setzero_pd();
1827 /**************************
1828 * CALCULATE INTERACTIONS *
1829 **************************/
1831 r11 = _mm_mul_pd(rsq11,rinv11);
1833 /* Calculate table index by multiplying r with table scale and truncate to integer */
1834 rt = _mm_mul_pd(r11,vftabscale);
1835 vfitab = _mm_cvttpd_epi32(rt);
1836 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1837 vfitab = _mm_slli_epi32(vfitab,2);
1839 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1840 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1841 F = _mm_setzero_pd();
1842 GMX_MM_TRANSPOSE2_PD(Y,F);
1843 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1844 H = _mm_setzero_pd();
1845 GMX_MM_TRANSPOSE2_PD(G,H);
1846 Heps = _mm_mul_pd(vfeps,H);
1847 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1848 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1849 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1853 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1855 /* Calculate temporary vectorial force */
1856 tx = _mm_mul_pd(fscal,dx11);
1857 ty = _mm_mul_pd(fscal,dy11);
1858 tz = _mm_mul_pd(fscal,dz11);
1860 /* Update vectorial force */
1861 fix1 = _mm_add_pd(fix1,tx);
1862 fiy1 = _mm_add_pd(fiy1,ty);
1863 fiz1 = _mm_add_pd(fiz1,tz);
1865 fjx1 = _mm_add_pd(fjx1,tx);
1866 fjy1 = _mm_add_pd(fjy1,ty);
1867 fjz1 = _mm_add_pd(fjz1,tz);
1869 /**************************
1870 * CALCULATE INTERACTIONS *
1871 **************************/
1873 r12 = _mm_mul_pd(rsq12,rinv12);
1875 /* Calculate table index by multiplying r with table scale and truncate to integer */
1876 rt = _mm_mul_pd(r12,vftabscale);
1877 vfitab = _mm_cvttpd_epi32(rt);
1878 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1879 vfitab = _mm_slli_epi32(vfitab,2);
1881 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1882 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1883 F = _mm_setzero_pd();
1884 GMX_MM_TRANSPOSE2_PD(Y,F);
1885 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1886 H = _mm_setzero_pd();
1887 GMX_MM_TRANSPOSE2_PD(G,H);
1888 Heps = _mm_mul_pd(vfeps,H);
1889 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1890 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1891 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1895 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1897 /* Calculate temporary vectorial force */
1898 tx = _mm_mul_pd(fscal,dx12);
1899 ty = _mm_mul_pd(fscal,dy12);
1900 tz = _mm_mul_pd(fscal,dz12);
1902 /* Update vectorial force */
1903 fix1 = _mm_add_pd(fix1,tx);
1904 fiy1 = _mm_add_pd(fiy1,ty);
1905 fiz1 = _mm_add_pd(fiz1,tz);
1907 fjx2 = _mm_add_pd(fjx2,tx);
1908 fjy2 = _mm_add_pd(fjy2,ty);
1909 fjz2 = _mm_add_pd(fjz2,tz);
1911 /**************************
1912 * CALCULATE INTERACTIONS *
1913 **************************/
1915 r13 = _mm_mul_pd(rsq13,rinv13);
1917 /* Calculate table index by multiplying r with table scale and truncate to integer */
1918 rt = _mm_mul_pd(r13,vftabscale);
1919 vfitab = _mm_cvttpd_epi32(rt);
1920 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1921 vfitab = _mm_slli_epi32(vfitab,2);
1923 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1924 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1925 F = _mm_setzero_pd();
1926 GMX_MM_TRANSPOSE2_PD(Y,F);
1927 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1928 H = _mm_setzero_pd();
1929 GMX_MM_TRANSPOSE2_PD(G,H);
1930 Heps = _mm_mul_pd(vfeps,H);
1931 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1932 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1933 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1937 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1939 /* Calculate temporary vectorial force */
1940 tx = _mm_mul_pd(fscal,dx13);
1941 ty = _mm_mul_pd(fscal,dy13);
1942 tz = _mm_mul_pd(fscal,dz13);
1944 /* Update vectorial force */
1945 fix1 = _mm_add_pd(fix1,tx);
1946 fiy1 = _mm_add_pd(fiy1,ty);
1947 fiz1 = _mm_add_pd(fiz1,tz);
1949 fjx3 = _mm_add_pd(fjx3,tx);
1950 fjy3 = _mm_add_pd(fjy3,ty);
1951 fjz3 = _mm_add_pd(fjz3,tz);
1953 /**************************
1954 * CALCULATE INTERACTIONS *
1955 **************************/
1957 r21 = _mm_mul_pd(rsq21,rinv21);
1959 /* Calculate table index by multiplying r with table scale and truncate to integer */
1960 rt = _mm_mul_pd(r21,vftabscale);
1961 vfitab = _mm_cvttpd_epi32(rt);
1962 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1963 vfitab = _mm_slli_epi32(vfitab,2);
1965 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1966 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1967 F = _mm_setzero_pd();
1968 GMX_MM_TRANSPOSE2_PD(Y,F);
1969 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1970 H = _mm_setzero_pd();
1971 GMX_MM_TRANSPOSE2_PD(G,H);
1972 Heps = _mm_mul_pd(vfeps,H);
1973 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1974 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1975 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1979 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1981 /* Calculate temporary vectorial force */
1982 tx = _mm_mul_pd(fscal,dx21);
1983 ty = _mm_mul_pd(fscal,dy21);
1984 tz = _mm_mul_pd(fscal,dz21);
1986 /* Update vectorial force */
1987 fix2 = _mm_add_pd(fix2,tx);
1988 fiy2 = _mm_add_pd(fiy2,ty);
1989 fiz2 = _mm_add_pd(fiz2,tz);
1991 fjx1 = _mm_add_pd(fjx1,tx);
1992 fjy1 = _mm_add_pd(fjy1,ty);
1993 fjz1 = _mm_add_pd(fjz1,tz);
1995 /**************************
1996 * CALCULATE INTERACTIONS *
1997 **************************/
1999 r22 = _mm_mul_pd(rsq22,rinv22);
2001 /* Calculate table index by multiplying r with table scale and truncate to integer */
2002 rt = _mm_mul_pd(r22,vftabscale);
2003 vfitab = _mm_cvttpd_epi32(rt);
2004 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2005 vfitab = _mm_slli_epi32(vfitab,2);
2007 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2008 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2009 F = _mm_setzero_pd();
2010 GMX_MM_TRANSPOSE2_PD(Y,F);
2011 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2012 H = _mm_setzero_pd();
2013 GMX_MM_TRANSPOSE2_PD(G,H);
2014 Heps = _mm_mul_pd(vfeps,H);
2015 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2016 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2017 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2021 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2023 /* Calculate temporary vectorial force */
2024 tx = _mm_mul_pd(fscal,dx22);
2025 ty = _mm_mul_pd(fscal,dy22);
2026 tz = _mm_mul_pd(fscal,dz22);
2028 /* Update vectorial force */
2029 fix2 = _mm_add_pd(fix2,tx);
2030 fiy2 = _mm_add_pd(fiy2,ty);
2031 fiz2 = _mm_add_pd(fiz2,tz);
2033 fjx2 = _mm_add_pd(fjx2,tx);
2034 fjy2 = _mm_add_pd(fjy2,ty);
2035 fjz2 = _mm_add_pd(fjz2,tz);
2037 /**************************
2038 * CALCULATE INTERACTIONS *
2039 **************************/
2041 r23 = _mm_mul_pd(rsq23,rinv23);
2043 /* Calculate table index by multiplying r with table scale and truncate to integer */
2044 rt = _mm_mul_pd(r23,vftabscale);
2045 vfitab = _mm_cvttpd_epi32(rt);
2046 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2047 vfitab = _mm_slli_epi32(vfitab,2);
2049 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2050 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2051 F = _mm_setzero_pd();
2052 GMX_MM_TRANSPOSE2_PD(Y,F);
2053 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2054 H = _mm_setzero_pd();
2055 GMX_MM_TRANSPOSE2_PD(G,H);
2056 Heps = _mm_mul_pd(vfeps,H);
2057 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2058 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2059 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
2063 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2065 /* Calculate temporary vectorial force */
2066 tx = _mm_mul_pd(fscal,dx23);
2067 ty = _mm_mul_pd(fscal,dy23);
2068 tz = _mm_mul_pd(fscal,dz23);
2070 /* Update vectorial force */
2071 fix2 = _mm_add_pd(fix2,tx);
2072 fiy2 = _mm_add_pd(fiy2,ty);
2073 fiz2 = _mm_add_pd(fiz2,tz);
2075 fjx3 = _mm_add_pd(fjx3,tx);
2076 fjy3 = _mm_add_pd(fjy3,ty);
2077 fjz3 = _mm_add_pd(fjz3,tz);
2079 /**************************
2080 * CALCULATE INTERACTIONS *
2081 **************************/
2083 r31 = _mm_mul_pd(rsq31,rinv31);
2085 /* Calculate table index by multiplying r with table scale and truncate to integer */
2086 rt = _mm_mul_pd(r31,vftabscale);
2087 vfitab = _mm_cvttpd_epi32(rt);
2088 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2089 vfitab = _mm_slli_epi32(vfitab,2);
2091 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2092 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2093 F = _mm_setzero_pd();
2094 GMX_MM_TRANSPOSE2_PD(Y,F);
2095 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2096 H = _mm_setzero_pd();
2097 GMX_MM_TRANSPOSE2_PD(G,H);
2098 Heps = _mm_mul_pd(vfeps,H);
2099 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2100 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2101 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
2105 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2107 /* Calculate temporary vectorial force */
2108 tx = _mm_mul_pd(fscal,dx31);
2109 ty = _mm_mul_pd(fscal,dy31);
2110 tz = _mm_mul_pd(fscal,dz31);
2112 /* Update vectorial force */
2113 fix3 = _mm_add_pd(fix3,tx);
2114 fiy3 = _mm_add_pd(fiy3,ty);
2115 fiz3 = _mm_add_pd(fiz3,tz);
2117 fjx1 = _mm_add_pd(fjx1,tx);
2118 fjy1 = _mm_add_pd(fjy1,ty);
2119 fjz1 = _mm_add_pd(fjz1,tz);
2121 /**************************
2122 * CALCULATE INTERACTIONS *
2123 **************************/
2125 r32 = _mm_mul_pd(rsq32,rinv32);
2127 /* Calculate table index by multiplying r with table scale and truncate to integer */
2128 rt = _mm_mul_pd(r32,vftabscale);
2129 vfitab = _mm_cvttpd_epi32(rt);
2130 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2131 vfitab = _mm_slli_epi32(vfitab,2);
2133 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2134 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2135 F = _mm_setzero_pd();
2136 GMX_MM_TRANSPOSE2_PD(Y,F);
2137 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2138 H = _mm_setzero_pd();
2139 GMX_MM_TRANSPOSE2_PD(G,H);
2140 Heps = _mm_mul_pd(vfeps,H);
2141 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2142 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2143 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
2147 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2149 /* Calculate temporary vectorial force */
2150 tx = _mm_mul_pd(fscal,dx32);
2151 ty = _mm_mul_pd(fscal,dy32);
2152 tz = _mm_mul_pd(fscal,dz32);
2154 /* Update vectorial force */
2155 fix3 = _mm_add_pd(fix3,tx);
2156 fiy3 = _mm_add_pd(fiy3,ty);
2157 fiz3 = _mm_add_pd(fiz3,tz);
2159 fjx2 = _mm_add_pd(fjx2,tx);
2160 fjy2 = _mm_add_pd(fjy2,ty);
2161 fjz2 = _mm_add_pd(fjz2,tz);
2163 /**************************
2164 * CALCULATE INTERACTIONS *
2165 **************************/
2167 r33 = _mm_mul_pd(rsq33,rinv33);
2169 /* Calculate table index by multiplying r with table scale and truncate to integer */
2170 rt = _mm_mul_pd(r33,vftabscale);
2171 vfitab = _mm_cvttpd_epi32(rt);
2172 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2173 vfitab = _mm_slli_epi32(vfitab,2);
2175 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2176 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2177 F = _mm_setzero_pd();
2178 GMX_MM_TRANSPOSE2_PD(Y,F);
2179 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2180 H = _mm_setzero_pd();
2181 GMX_MM_TRANSPOSE2_PD(G,H);
2182 Heps = _mm_mul_pd(vfeps,H);
2183 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2184 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2185 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
2189 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2191 /* Calculate temporary vectorial force */
2192 tx = _mm_mul_pd(fscal,dx33);
2193 ty = _mm_mul_pd(fscal,dy33);
2194 tz = _mm_mul_pd(fscal,dz33);
2196 /* Update vectorial force */
2197 fix3 = _mm_add_pd(fix3,tx);
2198 fiy3 = _mm_add_pd(fiy3,ty);
2199 fiz3 = _mm_add_pd(fiz3,tz);
2201 fjx3 = _mm_add_pd(fjx3,tx);
2202 fjy3 = _mm_add_pd(fjy3,ty);
2203 fjz3 = _mm_add_pd(fjz3,tz);
2205 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2207 /* Inner loop uses 351 flops */
2210 /* End of innermost loop */
2212 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2213 f+i_coord_offset+DIM,fshift+i_shift_offset);
2215 /* Increment number of inner iterations */
2216 inneriter += j_index_end - j_index_start;
2218 /* Outer loop uses 18 flops */
2221 /* Increment number of outer iterations */
2224 /* Update outer/inner flops */
2226 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*351);