2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: None
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128i ifour = _mm_set1_epi32(4);
91 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
93 __m128d dummy_mask,cutoff_mask;
94 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
95 __m128d one = _mm_set1_pd(1.0);
96 __m128d two = _mm_set1_pd(2.0);
102 jindex = nlist->jindex;
104 shiftidx = nlist->shift;
106 shiftvec = fr->shift_vec[0];
107 fshift = fr->fshift[0];
108 facel = _mm_set1_pd(fr->epsfac);
109 charge = mdatoms->chargeA;
111 vftab = kernel_data->table_elec->data;
112 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
114 /* Setup water-specific parameters */
115 inr = nlist->iinr[0];
116 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
117 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
118 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
120 jq0 = _mm_set1_pd(charge[inr+0]);
121 jq1 = _mm_set1_pd(charge[inr+1]);
122 jq2 = _mm_set1_pd(charge[inr+2]);
123 qq00 = _mm_mul_pd(iq0,jq0);
124 qq01 = _mm_mul_pd(iq0,jq1);
125 qq02 = _mm_mul_pd(iq0,jq2);
126 qq10 = _mm_mul_pd(iq1,jq0);
127 qq11 = _mm_mul_pd(iq1,jq1);
128 qq12 = _mm_mul_pd(iq1,jq2);
129 qq20 = _mm_mul_pd(iq2,jq0);
130 qq21 = _mm_mul_pd(iq2,jq1);
131 qq22 = _mm_mul_pd(iq2,jq2);
133 /* Avoid stupid compiler warnings */
141 /* Start outer loop over neighborlists */
142 for(iidx=0; iidx<nri; iidx++)
144 /* Load shift vector for this list */
145 i_shift_offset = DIM*shiftidx[iidx];
147 /* Load limits for loop over neighbors */
148 j_index_start = jindex[iidx];
149 j_index_end = jindex[iidx+1];
151 /* Get outer coordinate index */
153 i_coord_offset = DIM*inr;
155 /* Load i particle coords and add shift vector */
156 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
157 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
159 fix0 = _mm_setzero_pd();
160 fiy0 = _mm_setzero_pd();
161 fiz0 = _mm_setzero_pd();
162 fix1 = _mm_setzero_pd();
163 fiy1 = _mm_setzero_pd();
164 fiz1 = _mm_setzero_pd();
165 fix2 = _mm_setzero_pd();
166 fiy2 = _mm_setzero_pd();
167 fiz2 = _mm_setzero_pd();
169 /* Reset potential sums */
170 velecsum = _mm_setzero_pd();
172 /* Start inner kernel loop */
173 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
176 /* Get j neighbor index, and coordinate index */
179 j_coord_offsetA = DIM*jnrA;
180 j_coord_offsetB = DIM*jnrB;
182 /* load j atom coordinates */
183 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
184 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
186 /* Calculate displacement vector */
187 dx00 = _mm_sub_pd(ix0,jx0);
188 dy00 = _mm_sub_pd(iy0,jy0);
189 dz00 = _mm_sub_pd(iz0,jz0);
190 dx01 = _mm_sub_pd(ix0,jx1);
191 dy01 = _mm_sub_pd(iy0,jy1);
192 dz01 = _mm_sub_pd(iz0,jz1);
193 dx02 = _mm_sub_pd(ix0,jx2);
194 dy02 = _mm_sub_pd(iy0,jy2);
195 dz02 = _mm_sub_pd(iz0,jz2);
196 dx10 = _mm_sub_pd(ix1,jx0);
197 dy10 = _mm_sub_pd(iy1,jy0);
198 dz10 = _mm_sub_pd(iz1,jz0);
199 dx11 = _mm_sub_pd(ix1,jx1);
200 dy11 = _mm_sub_pd(iy1,jy1);
201 dz11 = _mm_sub_pd(iz1,jz1);
202 dx12 = _mm_sub_pd(ix1,jx2);
203 dy12 = _mm_sub_pd(iy1,jy2);
204 dz12 = _mm_sub_pd(iz1,jz2);
205 dx20 = _mm_sub_pd(ix2,jx0);
206 dy20 = _mm_sub_pd(iy2,jy0);
207 dz20 = _mm_sub_pd(iz2,jz0);
208 dx21 = _mm_sub_pd(ix2,jx1);
209 dy21 = _mm_sub_pd(iy2,jy1);
210 dz21 = _mm_sub_pd(iz2,jz1);
211 dx22 = _mm_sub_pd(ix2,jx2);
212 dy22 = _mm_sub_pd(iy2,jy2);
213 dz22 = _mm_sub_pd(iz2,jz2);
215 /* Calculate squared distance and things based on it */
216 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
217 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
218 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
219 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
220 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
221 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
222 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
223 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
224 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
226 rinv00 = gmx_mm_invsqrt_pd(rsq00);
227 rinv01 = gmx_mm_invsqrt_pd(rsq01);
228 rinv02 = gmx_mm_invsqrt_pd(rsq02);
229 rinv10 = gmx_mm_invsqrt_pd(rsq10);
230 rinv11 = gmx_mm_invsqrt_pd(rsq11);
231 rinv12 = gmx_mm_invsqrt_pd(rsq12);
232 rinv20 = gmx_mm_invsqrt_pd(rsq20);
233 rinv21 = gmx_mm_invsqrt_pd(rsq21);
234 rinv22 = gmx_mm_invsqrt_pd(rsq22);
236 fjx0 = _mm_setzero_pd();
237 fjy0 = _mm_setzero_pd();
238 fjz0 = _mm_setzero_pd();
239 fjx1 = _mm_setzero_pd();
240 fjy1 = _mm_setzero_pd();
241 fjz1 = _mm_setzero_pd();
242 fjx2 = _mm_setzero_pd();
243 fjy2 = _mm_setzero_pd();
244 fjz2 = _mm_setzero_pd();
246 /**************************
247 * CALCULATE INTERACTIONS *
248 **************************/
250 r00 = _mm_mul_pd(rsq00,rinv00);
252 /* Calculate table index by multiplying r with table scale and truncate to integer */
253 rt = _mm_mul_pd(r00,vftabscale);
254 vfitab = _mm_cvttpd_epi32(rt);
255 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
256 vfitab = _mm_slli_epi32(vfitab,2);
258 /* CUBIC SPLINE TABLE ELECTROSTATICS */
259 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
260 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
261 GMX_MM_TRANSPOSE2_PD(Y,F);
262 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
263 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
264 GMX_MM_TRANSPOSE2_PD(G,H);
265 Heps = _mm_mul_pd(vfeps,H);
266 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
267 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
268 velec = _mm_mul_pd(qq00,VV);
269 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
270 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
272 /* Update potential sum for this i atom from the interaction with this j atom. */
273 velecsum = _mm_add_pd(velecsum,velec);
277 /* Calculate temporary vectorial force */
278 tx = _mm_mul_pd(fscal,dx00);
279 ty = _mm_mul_pd(fscal,dy00);
280 tz = _mm_mul_pd(fscal,dz00);
282 /* Update vectorial force */
283 fix0 = _mm_add_pd(fix0,tx);
284 fiy0 = _mm_add_pd(fiy0,ty);
285 fiz0 = _mm_add_pd(fiz0,tz);
287 fjx0 = _mm_add_pd(fjx0,tx);
288 fjy0 = _mm_add_pd(fjy0,ty);
289 fjz0 = _mm_add_pd(fjz0,tz);
291 /**************************
292 * CALCULATE INTERACTIONS *
293 **************************/
295 r01 = _mm_mul_pd(rsq01,rinv01);
297 /* Calculate table index by multiplying r with table scale and truncate to integer */
298 rt = _mm_mul_pd(r01,vftabscale);
299 vfitab = _mm_cvttpd_epi32(rt);
300 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
301 vfitab = _mm_slli_epi32(vfitab,2);
303 /* CUBIC SPLINE TABLE ELECTROSTATICS */
304 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
305 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
306 GMX_MM_TRANSPOSE2_PD(Y,F);
307 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
308 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
309 GMX_MM_TRANSPOSE2_PD(G,H);
310 Heps = _mm_mul_pd(vfeps,H);
311 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
312 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
313 velec = _mm_mul_pd(qq01,VV);
314 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
315 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
317 /* Update potential sum for this i atom from the interaction with this j atom. */
318 velecsum = _mm_add_pd(velecsum,velec);
322 /* Calculate temporary vectorial force */
323 tx = _mm_mul_pd(fscal,dx01);
324 ty = _mm_mul_pd(fscal,dy01);
325 tz = _mm_mul_pd(fscal,dz01);
327 /* Update vectorial force */
328 fix0 = _mm_add_pd(fix0,tx);
329 fiy0 = _mm_add_pd(fiy0,ty);
330 fiz0 = _mm_add_pd(fiz0,tz);
332 fjx1 = _mm_add_pd(fjx1,tx);
333 fjy1 = _mm_add_pd(fjy1,ty);
334 fjz1 = _mm_add_pd(fjz1,tz);
336 /**************************
337 * CALCULATE INTERACTIONS *
338 **************************/
340 r02 = _mm_mul_pd(rsq02,rinv02);
342 /* Calculate table index by multiplying r with table scale and truncate to integer */
343 rt = _mm_mul_pd(r02,vftabscale);
344 vfitab = _mm_cvttpd_epi32(rt);
345 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
346 vfitab = _mm_slli_epi32(vfitab,2);
348 /* CUBIC SPLINE TABLE ELECTROSTATICS */
349 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
350 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
351 GMX_MM_TRANSPOSE2_PD(Y,F);
352 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
353 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
354 GMX_MM_TRANSPOSE2_PD(G,H);
355 Heps = _mm_mul_pd(vfeps,H);
356 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
357 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
358 velec = _mm_mul_pd(qq02,VV);
359 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
360 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velecsum = _mm_add_pd(velecsum,velec);
367 /* Calculate temporary vectorial force */
368 tx = _mm_mul_pd(fscal,dx02);
369 ty = _mm_mul_pd(fscal,dy02);
370 tz = _mm_mul_pd(fscal,dz02);
372 /* Update vectorial force */
373 fix0 = _mm_add_pd(fix0,tx);
374 fiy0 = _mm_add_pd(fiy0,ty);
375 fiz0 = _mm_add_pd(fiz0,tz);
377 fjx2 = _mm_add_pd(fjx2,tx);
378 fjy2 = _mm_add_pd(fjy2,ty);
379 fjz2 = _mm_add_pd(fjz2,tz);
381 /**************************
382 * CALCULATE INTERACTIONS *
383 **************************/
385 r10 = _mm_mul_pd(rsq10,rinv10);
387 /* Calculate table index by multiplying r with table scale and truncate to integer */
388 rt = _mm_mul_pd(r10,vftabscale);
389 vfitab = _mm_cvttpd_epi32(rt);
390 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
391 vfitab = _mm_slli_epi32(vfitab,2);
393 /* CUBIC SPLINE TABLE ELECTROSTATICS */
394 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
395 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
396 GMX_MM_TRANSPOSE2_PD(Y,F);
397 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
398 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
399 GMX_MM_TRANSPOSE2_PD(G,H);
400 Heps = _mm_mul_pd(vfeps,H);
401 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
402 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
403 velec = _mm_mul_pd(qq10,VV);
404 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
405 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
407 /* Update potential sum for this i atom from the interaction with this j atom. */
408 velecsum = _mm_add_pd(velecsum,velec);
412 /* Calculate temporary vectorial force */
413 tx = _mm_mul_pd(fscal,dx10);
414 ty = _mm_mul_pd(fscal,dy10);
415 tz = _mm_mul_pd(fscal,dz10);
417 /* Update vectorial force */
418 fix1 = _mm_add_pd(fix1,tx);
419 fiy1 = _mm_add_pd(fiy1,ty);
420 fiz1 = _mm_add_pd(fiz1,tz);
422 fjx0 = _mm_add_pd(fjx0,tx);
423 fjy0 = _mm_add_pd(fjy0,ty);
424 fjz0 = _mm_add_pd(fjz0,tz);
426 /**************************
427 * CALCULATE INTERACTIONS *
428 **************************/
430 r11 = _mm_mul_pd(rsq11,rinv11);
432 /* Calculate table index by multiplying r with table scale and truncate to integer */
433 rt = _mm_mul_pd(r11,vftabscale);
434 vfitab = _mm_cvttpd_epi32(rt);
435 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
436 vfitab = _mm_slli_epi32(vfitab,2);
438 /* CUBIC SPLINE TABLE ELECTROSTATICS */
439 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
440 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
441 GMX_MM_TRANSPOSE2_PD(Y,F);
442 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
443 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
444 GMX_MM_TRANSPOSE2_PD(G,H);
445 Heps = _mm_mul_pd(vfeps,H);
446 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
447 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
448 velec = _mm_mul_pd(qq11,VV);
449 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
450 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
452 /* Update potential sum for this i atom from the interaction with this j atom. */
453 velecsum = _mm_add_pd(velecsum,velec);
457 /* Calculate temporary vectorial force */
458 tx = _mm_mul_pd(fscal,dx11);
459 ty = _mm_mul_pd(fscal,dy11);
460 tz = _mm_mul_pd(fscal,dz11);
462 /* Update vectorial force */
463 fix1 = _mm_add_pd(fix1,tx);
464 fiy1 = _mm_add_pd(fiy1,ty);
465 fiz1 = _mm_add_pd(fiz1,tz);
467 fjx1 = _mm_add_pd(fjx1,tx);
468 fjy1 = _mm_add_pd(fjy1,ty);
469 fjz1 = _mm_add_pd(fjz1,tz);
471 /**************************
472 * CALCULATE INTERACTIONS *
473 **************************/
475 r12 = _mm_mul_pd(rsq12,rinv12);
477 /* Calculate table index by multiplying r with table scale and truncate to integer */
478 rt = _mm_mul_pd(r12,vftabscale);
479 vfitab = _mm_cvttpd_epi32(rt);
480 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
481 vfitab = _mm_slli_epi32(vfitab,2);
483 /* CUBIC SPLINE TABLE ELECTROSTATICS */
484 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
485 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
486 GMX_MM_TRANSPOSE2_PD(Y,F);
487 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
488 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
489 GMX_MM_TRANSPOSE2_PD(G,H);
490 Heps = _mm_mul_pd(vfeps,H);
491 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
492 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
493 velec = _mm_mul_pd(qq12,VV);
494 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
495 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
497 /* Update potential sum for this i atom from the interaction with this j atom. */
498 velecsum = _mm_add_pd(velecsum,velec);
502 /* Calculate temporary vectorial force */
503 tx = _mm_mul_pd(fscal,dx12);
504 ty = _mm_mul_pd(fscal,dy12);
505 tz = _mm_mul_pd(fscal,dz12);
507 /* Update vectorial force */
508 fix1 = _mm_add_pd(fix1,tx);
509 fiy1 = _mm_add_pd(fiy1,ty);
510 fiz1 = _mm_add_pd(fiz1,tz);
512 fjx2 = _mm_add_pd(fjx2,tx);
513 fjy2 = _mm_add_pd(fjy2,ty);
514 fjz2 = _mm_add_pd(fjz2,tz);
516 /**************************
517 * CALCULATE INTERACTIONS *
518 **************************/
520 r20 = _mm_mul_pd(rsq20,rinv20);
522 /* Calculate table index by multiplying r with table scale and truncate to integer */
523 rt = _mm_mul_pd(r20,vftabscale);
524 vfitab = _mm_cvttpd_epi32(rt);
525 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
526 vfitab = _mm_slli_epi32(vfitab,2);
528 /* CUBIC SPLINE TABLE ELECTROSTATICS */
529 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
530 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
531 GMX_MM_TRANSPOSE2_PD(Y,F);
532 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
533 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
534 GMX_MM_TRANSPOSE2_PD(G,H);
535 Heps = _mm_mul_pd(vfeps,H);
536 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
537 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
538 velec = _mm_mul_pd(qq20,VV);
539 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
540 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 velecsum = _mm_add_pd(velecsum,velec);
547 /* Calculate temporary vectorial force */
548 tx = _mm_mul_pd(fscal,dx20);
549 ty = _mm_mul_pd(fscal,dy20);
550 tz = _mm_mul_pd(fscal,dz20);
552 /* Update vectorial force */
553 fix2 = _mm_add_pd(fix2,tx);
554 fiy2 = _mm_add_pd(fiy2,ty);
555 fiz2 = _mm_add_pd(fiz2,tz);
557 fjx0 = _mm_add_pd(fjx0,tx);
558 fjy0 = _mm_add_pd(fjy0,ty);
559 fjz0 = _mm_add_pd(fjz0,tz);
561 /**************************
562 * CALCULATE INTERACTIONS *
563 **************************/
565 r21 = _mm_mul_pd(rsq21,rinv21);
567 /* Calculate table index by multiplying r with table scale and truncate to integer */
568 rt = _mm_mul_pd(r21,vftabscale);
569 vfitab = _mm_cvttpd_epi32(rt);
570 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
571 vfitab = _mm_slli_epi32(vfitab,2);
573 /* CUBIC SPLINE TABLE ELECTROSTATICS */
574 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
575 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
576 GMX_MM_TRANSPOSE2_PD(Y,F);
577 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
578 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
579 GMX_MM_TRANSPOSE2_PD(G,H);
580 Heps = _mm_mul_pd(vfeps,H);
581 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
582 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
583 velec = _mm_mul_pd(qq21,VV);
584 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
585 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
587 /* Update potential sum for this i atom from the interaction with this j atom. */
588 velecsum = _mm_add_pd(velecsum,velec);
592 /* Calculate temporary vectorial force */
593 tx = _mm_mul_pd(fscal,dx21);
594 ty = _mm_mul_pd(fscal,dy21);
595 tz = _mm_mul_pd(fscal,dz21);
597 /* Update vectorial force */
598 fix2 = _mm_add_pd(fix2,tx);
599 fiy2 = _mm_add_pd(fiy2,ty);
600 fiz2 = _mm_add_pd(fiz2,tz);
602 fjx1 = _mm_add_pd(fjx1,tx);
603 fjy1 = _mm_add_pd(fjy1,ty);
604 fjz1 = _mm_add_pd(fjz1,tz);
606 /**************************
607 * CALCULATE INTERACTIONS *
608 **************************/
610 r22 = _mm_mul_pd(rsq22,rinv22);
612 /* Calculate table index by multiplying r with table scale and truncate to integer */
613 rt = _mm_mul_pd(r22,vftabscale);
614 vfitab = _mm_cvttpd_epi32(rt);
615 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
616 vfitab = _mm_slli_epi32(vfitab,2);
618 /* CUBIC SPLINE TABLE ELECTROSTATICS */
619 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
620 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
621 GMX_MM_TRANSPOSE2_PD(Y,F);
622 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
623 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
624 GMX_MM_TRANSPOSE2_PD(G,H);
625 Heps = _mm_mul_pd(vfeps,H);
626 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
627 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
628 velec = _mm_mul_pd(qq22,VV);
629 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
630 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
632 /* Update potential sum for this i atom from the interaction with this j atom. */
633 velecsum = _mm_add_pd(velecsum,velec);
637 /* Calculate temporary vectorial force */
638 tx = _mm_mul_pd(fscal,dx22);
639 ty = _mm_mul_pd(fscal,dy22);
640 tz = _mm_mul_pd(fscal,dz22);
642 /* Update vectorial force */
643 fix2 = _mm_add_pd(fix2,tx);
644 fiy2 = _mm_add_pd(fiy2,ty);
645 fiz2 = _mm_add_pd(fiz2,tz);
647 fjx2 = _mm_add_pd(fjx2,tx);
648 fjy2 = _mm_add_pd(fjy2,ty);
649 fjz2 = _mm_add_pd(fjz2,tz);
651 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
653 /* Inner loop uses 387 flops */
660 j_coord_offsetA = DIM*jnrA;
662 /* load j atom coordinates */
663 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
664 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
666 /* Calculate displacement vector */
667 dx00 = _mm_sub_pd(ix0,jx0);
668 dy00 = _mm_sub_pd(iy0,jy0);
669 dz00 = _mm_sub_pd(iz0,jz0);
670 dx01 = _mm_sub_pd(ix0,jx1);
671 dy01 = _mm_sub_pd(iy0,jy1);
672 dz01 = _mm_sub_pd(iz0,jz1);
673 dx02 = _mm_sub_pd(ix0,jx2);
674 dy02 = _mm_sub_pd(iy0,jy2);
675 dz02 = _mm_sub_pd(iz0,jz2);
676 dx10 = _mm_sub_pd(ix1,jx0);
677 dy10 = _mm_sub_pd(iy1,jy0);
678 dz10 = _mm_sub_pd(iz1,jz0);
679 dx11 = _mm_sub_pd(ix1,jx1);
680 dy11 = _mm_sub_pd(iy1,jy1);
681 dz11 = _mm_sub_pd(iz1,jz1);
682 dx12 = _mm_sub_pd(ix1,jx2);
683 dy12 = _mm_sub_pd(iy1,jy2);
684 dz12 = _mm_sub_pd(iz1,jz2);
685 dx20 = _mm_sub_pd(ix2,jx0);
686 dy20 = _mm_sub_pd(iy2,jy0);
687 dz20 = _mm_sub_pd(iz2,jz0);
688 dx21 = _mm_sub_pd(ix2,jx1);
689 dy21 = _mm_sub_pd(iy2,jy1);
690 dz21 = _mm_sub_pd(iz2,jz1);
691 dx22 = _mm_sub_pd(ix2,jx2);
692 dy22 = _mm_sub_pd(iy2,jy2);
693 dz22 = _mm_sub_pd(iz2,jz2);
695 /* Calculate squared distance and things based on it */
696 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
697 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
698 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
699 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
700 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
701 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
702 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
703 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
704 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
706 rinv00 = gmx_mm_invsqrt_pd(rsq00);
707 rinv01 = gmx_mm_invsqrt_pd(rsq01);
708 rinv02 = gmx_mm_invsqrt_pd(rsq02);
709 rinv10 = gmx_mm_invsqrt_pd(rsq10);
710 rinv11 = gmx_mm_invsqrt_pd(rsq11);
711 rinv12 = gmx_mm_invsqrt_pd(rsq12);
712 rinv20 = gmx_mm_invsqrt_pd(rsq20);
713 rinv21 = gmx_mm_invsqrt_pd(rsq21);
714 rinv22 = gmx_mm_invsqrt_pd(rsq22);
716 fjx0 = _mm_setzero_pd();
717 fjy0 = _mm_setzero_pd();
718 fjz0 = _mm_setzero_pd();
719 fjx1 = _mm_setzero_pd();
720 fjy1 = _mm_setzero_pd();
721 fjz1 = _mm_setzero_pd();
722 fjx2 = _mm_setzero_pd();
723 fjy2 = _mm_setzero_pd();
724 fjz2 = _mm_setzero_pd();
726 /**************************
727 * CALCULATE INTERACTIONS *
728 **************************/
730 r00 = _mm_mul_pd(rsq00,rinv00);
732 /* Calculate table index by multiplying r with table scale and truncate to integer */
733 rt = _mm_mul_pd(r00,vftabscale);
734 vfitab = _mm_cvttpd_epi32(rt);
735 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
736 vfitab = _mm_slli_epi32(vfitab,2);
738 /* CUBIC SPLINE TABLE ELECTROSTATICS */
739 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
740 F = _mm_setzero_pd();
741 GMX_MM_TRANSPOSE2_PD(Y,F);
742 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
743 H = _mm_setzero_pd();
744 GMX_MM_TRANSPOSE2_PD(G,H);
745 Heps = _mm_mul_pd(vfeps,H);
746 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
747 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
748 velec = _mm_mul_pd(qq00,VV);
749 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
750 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
752 /* Update potential sum for this i atom from the interaction with this j atom. */
753 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
754 velecsum = _mm_add_pd(velecsum,velec);
758 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
760 /* Calculate temporary vectorial force */
761 tx = _mm_mul_pd(fscal,dx00);
762 ty = _mm_mul_pd(fscal,dy00);
763 tz = _mm_mul_pd(fscal,dz00);
765 /* Update vectorial force */
766 fix0 = _mm_add_pd(fix0,tx);
767 fiy0 = _mm_add_pd(fiy0,ty);
768 fiz0 = _mm_add_pd(fiz0,tz);
770 fjx0 = _mm_add_pd(fjx0,tx);
771 fjy0 = _mm_add_pd(fjy0,ty);
772 fjz0 = _mm_add_pd(fjz0,tz);
774 /**************************
775 * CALCULATE INTERACTIONS *
776 **************************/
778 r01 = _mm_mul_pd(rsq01,rinv01);
780 /* Calculate table index by multiplying r with table scale and truncate to integer */
781 rt = _mm_mul_pd(r01,vftabscale);
782 vfitab = _mm_cvttpd_epi32(rt);
783 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
784 vfitab = _mm_slli_epi32(vfitab,2);
786 /* CUBIC SPLINE TABLE ELECTROSTATICS */
787 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
788 F = _mm_setzero_pd();
789 GMX_MM_TRANSPOSE2_PD(Y,F);
790 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
791 H = _mm_setzero_pd();
792 GMX_MM_TRANSPOSE2_PD(G,H);
793 Heps = _mm_mul_pd(vfeps,H);
794 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
795 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
796 velec = _mm_mul_pd(qq01,VV);
797 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
798 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
800 /* Update potential sum for this i atom from the interaction with this j atom. */
801 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
802 velecsum = _mm_add_pd(velecsum,velec);
806 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
808 /* Calculate temporary vectorial force */
809 tx = _mm_mul_pd(fscal,dx01);
810 ty = _mm_mul_pd(fscal,dy01);
811 tz = _mm_mul_pd(fscal,dz01);
813 /* Update vectorial force */
814 fix0 = _mm_add_pd(fix0,tx);
815 fiy0 = _mm_add_pd(fiy0,ty);
816 fiz0 = _mm_add_pd(fiz0,tz);
818 fjx1 = _mm_add_pd(fjx1,tx);
819 fjy1 = _mm_add_pd(fjy1,ty);
820 fjz1 = _mm_add_pd(fjz1,tz);
822 /**************************
823 * CALCULATE INTERACTIONS *
824 **************************/
826 r02 = _mm_mul_pd(rsq02,rinv02);
828 /* Calculate table index by multiplying r with table scale and truncate to integer */
829 rt = _mm_mul_pd(r02,vftabscale);
830 vfitab = _mm_cvttpd_epi32(rt);
831 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
832 vfitab = _mm_slli_epi32(vfitab,2);
834 /* CUBIC SPLINE TABLE ELECTROSTATICS */
835 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
836 F = _mm_setzero_pd();
837 GMX_MM_TRANSPOSE2_PD(Y,F);
838 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
839 H = _mm_setzero_pd();
840 GMX_MM_TRANSPOSE2_PD(G,H);
841 Heps = _mm_mul_pd(vfeps,H);
842 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
843 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
844 velec = _mm_mul_pd(qq02,VV);
845 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
846 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
850 velecsum = _mm_add_pd(velecsum,velec);
854 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
856 /* Calculate temporary vectorial force */
857 tx = _mm_mul_pd(fscal,dx02);
858 ty = _mm_mul_pd(fscal,dy02);
859 tz = _mm_mul_pd(fscal,dz02);
861 /* Update vectorial force */
862 fix0 = _mm_add_pd(fix0,tx);
863 fiy0 = _mm_add_pd(fiy0,ty);
864 fiz0 = _mm_add_pd(fiz0,tz);
866 fjx2 = _mm_add_pd(fjx2,tx);
867 fjy2 = _mm_add_pd(fjy2,ty);
868 fjz2 = _mm_add_pd(fjz2,tz);
870 /**************************
871 * CALCULATE INTERACTIONS *
872 **************************/
874 r10 = _mm_mul_pd(rsq10,rinv10);
876 /* Calculate table index by multiplying r with table scale and truncate to integer */
877 rt = _mm_mul_pd(r10,vftabscale);
878 vfitab = _mm_cvttpd_epi32(rt);
879 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
880 vfitab = _mm_slli_epi32(vfitab,2);
882 /* CUBIC SPLINE TABLE ELECTROSTATICS */
883 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
884 F = _mm_setzero_pd();
885 GMX_MM_TRANSPOSE2_PD(Y,F);
886 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
887 H = _mm_setzero_pd();
888 GMX_MM_TRANSPOSE2_PD(G,H);
889 Heps = _mm_mul_pd(vfeps,H);
890 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
891 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
892 velec = _mm_mul_pd(qq10,VV);
893 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
894 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
896 /* Update potential sum for this i atom from the interaction with this j atom. */
897 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
898 velecsum = _mm_add_pd(velecsum,velec);
902 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
904 /* Calculate temporary vectorial force */
905 tx = _mm_mul_pd(fscal,dx10);
906 ty = _mm_mul_pd(fscal,dy10);
907 tz = _mm_mul_pd(fscal,dz10);
909 /* Update vectorial force */
910 fix1 = _mm_add_pd(fix1,tx);
911 fiy1 = _mm_add_pd(fiy1,ty);
912 fiz1 = _mm_add_pd(fiz1,tz);
914 fjx0 = _mm_add_pd(fjx0,tx);
915 fjy0 = _mm_add_pd(fjy0,ty);
916 fjz0 = _mm_add_pd(fjz0,tz);
918 /**************************
919 * CALCULATE INTERACTIONS *
920 **************************/
922 r11 = _mm_mul_pd(rsq11,rinv11);
924 /* Calculate table index by multiplying r with table scale and truncate to integer */
925 rt = _mm_mul_pd(r11,vftabscale);
926 vfitab = _mm_cvttpd_epi32(rt);
927 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
928 vfitab = _mm_slli_epi32(vfitab,2);
930 /* CUBIC SPLINE TABLE ELECTROSTATICS */
931 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
932 F = _mm_setzero_pd();
933 GMX_MM_TRANSPOSE2_PD(Y,F);
934 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
935 H = _mm_setzero_pd();
936 GMX_MM_TRANSPOSE2_PD(G,H);
937 Heps = _mm_mul_pd(vfeps,H);
938 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
939 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
940 velec = _mm_mul_pd(qq11,VV);
941 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
942 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
944 /* Update potential sum for this i atom from the interaction with this j atom. */
945 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
946 velecsum = _mm_add_pd(velecsum,velec);
950 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
952 /* Calculate temporary vectorial force */
953 tx = _mm_mul_pd(fscal,dx11);
954 ty = _mm_mul_pd(fscal,dy11);
955 tz = _mm_mul_pd(fscal,dz11);
957 /* Update vectorial force */
958 fix1 = _mm_add_pd(fix1,tx);
959 fiy1 = _mm_add_pd(fiy1,ty);
960 fiz1 = _mm_add_pd(fiz1,tz);
962 fjx1 = _mm_add_pd(fjx1,tx);
963 fjy1 = _mm_add_pd(fjy1,ty);
964 fjz1 = _mm_add_pd(fjz1,tz);
966 /**************************
967 * CALCULATE INTERACTIONS *
968 **************************/
970 r12 = _mm_mul_pd(rsq12,rinv12);
972 /* Calculate table index by multiplying r with table scale and truncate to integer */
973 rt = _mm_mul_pd(r12,vftabscale);
974 vfitab = _mm_cvttpd_epi32(rt);
975 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
976 vfitab = _mm_slli_epi32(vfitab,2);
978 /* CUBIC SPLINE TABLE ELECTROSTATICS */
979 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
980 F = _mm_setzero_pd();
981 GMX_MM_TRANSPOSE2_PD(Y,F);
982 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
983 H = _mm_setzero_pd();
984 GMX_MM_TRANSPOSE2_PD(G,H);
985 Heps = _mm_mul_pd(vfeps,H);
986 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
987 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
988 velec = _mm_mul_pd(qq12,VV);
989 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
990 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
992 /* Update potential sum for this i atom from the interaction with this j atom. */
993 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
994 velecsum = _mm_add_pd(velecsum,velec);
998 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1000 /* Calculate temporary vectorial force */
1001 tx = _mm_mul_pd(fscal,dx12);
1002 ty = _mm_mul_pd(fscal,dy12);
1003 tz = _mm_mul_pd(fscal,dz12);
1005 /* Update vectorial force */
1006 fix1 = _mm_add_pd(fix1,tx);
1007 fiy1 = _mm_add_pd(fiy1,ty);
1008 fiz1 = _mm_add_pd(fiz1,tz);
1010 fjx2 = _mm_add_pd(fjx2,tx);
1011 fjy2 = _mm_add_pd(fjy2,ty);
1012 fjz2 = _mm_add_pd(fjz2,tz);
1014 /**************************
1015 * CALCULATE INTERACTIONS *
1016 **************************/
1018 r20 = _mm_mul_pd(rsq20,rinv20);
1020 /* Calculate table index by multiplying r with table scale and truncate to integer */
1021 rt = _mm_mul_pd(r20,vftabscale);
1022 vfitab = _mm_cvttpd_epi32(rt);
1023 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1024 vfitab = _mm_slli_epi32(vfitab,2);
1026 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1027 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1028 F = _mm_setzero_pd();
1029 GMX_MM_TRANSPOSE2_PD(Y,F);
1030 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1031 H = _mm_setzero_pd();
1032 GMX_MM_TRANSPOSE2_PD(G,H);
1033 Heps = _mm_mul_pd(vfeps,H);
1034 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1035 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1036 velec = _mm_mul_pd(qq20,VV);
1037 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1038 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1040 /* Update potential sum for this i atom from the interaction with this j atom. */
1041 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1042 velecsum = _mm_add_pd(velecsum,velec);
1046 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1048 /* Calculate temporary vectorial force */
1049 tx = _mm_mul_pd(fscal,dx20);
1050 ty = _mm_mul_pd(fscal,dy20);
1051 tz = _mm_mul_pd(fscal,dz20);
1053 /* Update vectorial force */
1054 fix2 = _mm_add_pd(fix2,tx);
1055 fiy2 = _mm_add_pd(fiy2,ty);
1056 fiz2 = _mm_add_pd(fiz2,tz);
1058 fjx0 = _mm_add_pd(fjx0,tx);
1059 fjy0 = _mm_add_pd(fjy0,ty);
1060 fjz0 = _mm_add_pd(fjz0,tz);
1062 /**************************
1063 * CALCULATE INTERACTIONS *
1064 **************************/
1066 r21 = _mm_mul_pd(rsq21,rinv21);
1068 /* Calculate table index by multiplying r with table scale and truncate to integer */
1069 rt = _mm_mul_pd(r21,vftabscale);
1070 vfitab = _mm_cvttpd_epi32(rt);
1071 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1072 vfitab = _mm_slli_epi32(vfitab,2);
1074 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1075 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1076 F = _mm_setzero_pd();
1077 GMX_MM_TRANSPOSE2_PD(Y,F);
1078 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1079 H = _mm_setzero_pd();
1080 GMX_MM_TRANSPOSE2_PD(G,H);
1081 Heps = _mm_mul_pd(vfeps,H);
1082 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1083 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1084 velec = _mm_mul_pd(qq21,VV);
1085 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1086 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1088 /* Update potential sum for this i atom from the interaction with this j atom. */
1089 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1090 velecsum = _mm_add_pd(velecsum,velec);
1094 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1096 /* Calculate temporary vectorial force */
1097 tx = _mm_mul_pd(fscal,dx21);
1098 ty = _mm_mul_pd(fscal,dy21);
1099 tz = _mm_mul_pd(fscal,dz21);
1101 /* Update vectorial force */
1102 fix2 = _mm_add_pd(fix2,tx);
1103 fiy2 = _mm_add_pd(fiy2,ty);
1104 fiz2 = _mm_add_pd(fiz2,tz);
1106 fjx1 = _mm_add_pd(fjx1,tx);
1107 fjy1 = _mm_add_pd(fjy1,ty);
1108 fjz1 = _mm_add_pd(fjz1,tz);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 r22 = _mm_mul_pd(rsq22,rinv22);
1116 /* Calculate table index by multiplying r with table scale and truncate to integer */
1117 rt = _mm_mul_pd(r22,vftabscale);
1118 vfitab = _mm_cvttpd_epi32(rt);
1119 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1120 vfitab = _mm_slli_epi32(vfitab,2);
1122 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1123 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1124 F = _mm_setzero_pd();
1125 GMX_MM_TRANSPOSE2_PD(Y,F);
1126 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1127 H = _mm_setzero_pd();
1128 GMX_MM_TRANSPOSE2_PD(G,H);
1129 Heps = _mm_mul_pd(vfeps,H);
1130 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1131 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1132 velec = _mm_mul_pd(qq22,VV);
1133 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1134 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1136 /* Update potential sum for this i atom from the interaction with this j atom. */
1137 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1138 velecsum = _mm_add_pd(velecsum,velec);
1142 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1144 /* Calculate temporary vectorial force */
1145 tx = _mm_mul_pd(fscal,dx22);
1146 ty = _mm_mul_pd(fscal,dy22);
1147 tz = _mm_mul_pd(fscal,dz22);
1149 /* Update vectorial force */
1150 fix2 = _mm_add_pd(fix2,tx);
1151 fiy2 = _mm_add_pd(fiy2,ty);
1152 fiz2 = _mm_add_pd(fiz2,tz);
1154 fjx2 = _mm_add_pd(fjx2,tx);
1155 fjy2 = _mm_add_pd(fjy2,ty);
1156 fjz2 = _mm_add_pd(fjz2,tz);
1158 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1160 /* Inner loop uses 387 flops */
1163 /* End of innermost loop */
1165 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1166 f+i_coord_offset,fshift+i_shift_offset);
1169 /* Update potential energies */
1170 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1172 /* Increment number of inner iterations */
1173 inneriter += j_index_end - j_index_start;
1175 /* Outer loop uses 19 flops */
1178 /* Increment number of outer iterations */
1181 /* Update outer/inner flops */
1183 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*387);
1186 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_double
1187 * Electrostatics interaction: CubicSplineTable
1188 * VdW interaction: None
1189 * Geometry: Water3-Water3
1190 * Calculate force/pot: Force
1193 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_double
1194 (t_nblist * gmx_restrict nlist,
1195 rvec * gmx_restrict xx,
1196 rvec * gmx_restrict ff,
1197 t_forcerec * gmx_restrict fr,
1198 t_mdatoms * gmx_restrict mdatoms,
1199 nb_kernel_data_t * gmx_restrict kernel_data,
1200 t_nrnb * gmx_restrict nrnb)
1202 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1203 * just 0 for non-waters.
1204 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1205 * jnr indices corresponding to data put in the four positions in the SIMD register.
1207 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1208 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1210 int j_coord_offsetA,j_coord_offsetB;
1211 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1212 real rcutoff_scalar;
1213 real *shiftvec,*fshift,*x,*f;
1214 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1216 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1218 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1220 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1221 int vdwjidx0A,vdwjidx0B;
1222 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1223 int vdwjidx1A,vdwjidx1B;
1224 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1225 int vdwjidx2A,vdwjidx2B;
1226 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1227 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1228 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1229 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1230 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1231 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1232 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1233 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1234 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1235 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1236 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1239 __m128i ifour = _mm_set1_epi32(4);
1240 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1242 __m128d dummy_mask,cutoff_mask;
1243 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1244 __m128d one = _mm_set1_pd(1.0);
1245 __m128d two = _mm_set1_pd(2.0);
1251 jindex = nlist->jindex;
1253 shiftidx = nlist->shift;
1255 shiftvec = fr->shift_vec[0];
1256 fshift = fr->fshift[0];
1257 facel = _mm_set1_pd(fr->epsfac);
1258 charge = mdatoms->chargeA;
1260 vftab = kernel_data->table_elec->data;
1261 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1263 /* Setup water-specific parameters */
1264 inr = nlist->iinr[0];
1265 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1266 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1267 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1269 jq0 = _mm_set1_pd(charge[inr+0]);
1270 jq1 = _mm_set1_pd(charge[inr+1]);
1271 jq2 = _mm_set1_pd(charge[inr+2]);
1272 qq00 = _mm_mul_pd(iq0,jq0);
1273 qq01 = _mm_mul_pd(iq0,jq1);
1274 qq02 = _mm_mul_pd(iq0,jq2);
1275 qq10 = _mm_mul_pd(iq1,jq0);
1276 qq11 = _mm_mul_pd(iq1,jq1);
1277 qq12 = _mm_mul_pd(iq1,jq2);
1278 qq20 = _mm_mul_pd(iq2,jq0);
1279 qq21 = _mm_mul_pd(iq2,jq1);
1280 qq22 = _mm_mul_pd(iq2,jq2);
1282 /* Avoid stupid compiler warnings */
1284 j_coord_offsetA = 0;
1285 j_coord_offsetB = 0;
1290 /* Start outer loop over neighborlists */
1291 for(iidx=0; iidx<nri; iidx++)
1293 /* Load shift vector for this list */
1294 i_shift_offset = DIM*shiftidx[iidx];
1296 /* Load limits for loop over neighbors */
1297 j_index_start = jindex[iidx];
1298 j_index_end = jindex[iidx+1];
1300 /* Get outer coordinate index */
1302 i_coord_offset = DIM*inr;
1304 /* Load i particle coords and add shift vector */
1305 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1306 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1308 fix0 = _mm_setzero_pd();
1309 fiy0 = _mm_setzero_pd();
1310 fiz0 = _mm_setzero_pd();
1311 fix1 = _mm_setzero_pd();
1312 fiy1 = _mm_setzero_pd();
1313 fiz1 = _mm_setzero_pd();
1314 fix2 = _mm_setzero_pd();
1315 fiy2 = _mm_setzero_pd();
1316 fiz2 = _mm_setzero_pd();
1318 /* Start inner kernel loop */
1319 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1322 /* Get j neighbor index, and coordinate index */
1324 jnrB = jjnr[jidx+1];
1325 j_coord_offsetA = DIM*jnrA;
1326 j_coord_offsetB = DIM*jnrB;
1328 /* load j atom coordinates */
1329 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1330 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1332 /* Calculate displacement vector */
1333 dx00 = _mm_sub_pd(ix0,jx0);
1334 dy00 = _mm_sub_pd(iy0,jy0);
1335 dz00 = _mm_sub_pd(iz0,jz0);
1336 dx01 = _mm_sub_pd(ix0,jx1);
1337 dy01 = _mm_sub_pd(iy0,jy1);
1338 dz01 = _mm_sub_pd(iz0,jz1);
1339 dx02 = _mm_sub_pd(ix0,jx2);
1340 dy02 = _mm_sub_pd(iy0,jy2);
1341 dz02 = _mm_sub_pd(iz0,jz2);
1342 dx10 = _mm_sub_pd(ix1,jx0);
1343 dy10 = _mm_sub_pd(iy1,jy0);
1344 dz10 = _mm_sub_pd(iz1,jz0);
1345 dx11 = _mm_sub_pd(ix1,jx1);
1346 dy11 = _mm_sub_pd(iy1,jy1);
1347 dz11 = _mm_sub_pd(iz1,jz1);
1348 dx12 = _mm_sub_pd(ix1,jx2);
1349 dy12 = _mm_sub_pd(iy1,jy2);
1350 dz12 = _mm_sub_pd(iz1,jz2);
1351 dx20 = _mm_sub_pd(ix2,jx0);
1352 dy20 = _mm_sub_pd(iy2,jy0);
1353 dz20 = _mm_sub_pd(iz2,jz0);
1354 dx21 = _mm_sub_pd(ix2,jx1);
1355 dy21 = _mm_sub_pd(iy2,jy1);
1356 dz21 = _mm_sub_pd(iz2,jz1);
1357 dx22 = _mm_sub_pd(ix2,jx2);
1358 dy22 = _mm_sub_pd(iy2,jy2);
1359 dz22 = _mm_sub_pd(iz2,jz2);
1361 /* Calculate squared distance and things based on it */
1362 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1363 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1364 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1365 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1366 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1367 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1368 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1369 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1370 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1372 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1373 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1374 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1375 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1376 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1377 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1378 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1379 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1380 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1382 fjx0 = _mm_setzero_pd();
1383 fjy0 = _mm_setzero_pd();
1384 fjz0 = _mm_setzero_pd();
1385 fjx1 = _mm_setzero_pd();
1386 fjy1 = _mm_setzero_pd();
1387 fjz1 = _mm_setzero_pd();
1388 fjx2 = _mm_setzero_pd();
1389 fjy2 = _mm_setzero_pd();
1390 fjz2 = _mm_setzero_pd();
1392 /**************************
1393 * CALCULATE INTERACTIONS *
1394 **************************/
1396 r00 = _mm_mul_pd(rsq00,rinv00);
1398 /* Calculate table index by multiplying r with table scale and truncate to integer */
1399 rt = _mm_mul_pd(r00,vftabscale);
1400 vfitab = _mm_cvttpd_epi32(rt);
1401 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1402 vfitab = _mm_slli_epi32(vfitab,2);
1404 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1405 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1406 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1407 GMX_MM_TRANSPOSE2_PD(Y,F);
1408 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1409 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1410 GMX_MM_TRANSPOSE2_PD(G,H);
1411 Heps = _mm_mul_pd(vfeps,H);
1412 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1413 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1414 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1418 /* Calculate temporary vectorial force */
1419 tx = _mm_mul_pd(fscal,dx00);
1420 ty = _mm_mul_pd(fscal,dy00);
1421 tz = _mm_mul_pd(fscal,dz00);
1423 /* Update vectorial force */
1424 fix0 = _mm_add_pd(fix0,tx);
1425 fiy0 = _mm_add_pd(fiy0,ty);
1426 fiz0 = _mm_add_pd(fiz0,tz);
1428 fjx0 = _mm_add_pd(fjx0,tx);
1429 fjy0 = _mm_add_pd(fjy0,ty);
1430 fjz0 = _mm_add_pd(fjz0,tz);
1432 /**************************
1433 * CALCULATE INTERACTIONS *
1434 **************************/
1436 r01 = _mm_mul_pd(rsq01,rinv01);
1438 /* Calculate table index by multiplying r with table scale and truncate to integer */
1439 rt = _mm_mul_pd(r01,vftabscale);
1440 vfitab = _mm_cvttpd_epi32(rt);
1441 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1442 vfitab = _mm_slli_epi32(vfitab,2);
1444 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1445 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1446 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1447 GMX_MM_TRANSPOSE2_PD(Y,F);
1448 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1449 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1450 GMX_MM_TRANSPOSE2_PD(G,H);
1451 Heps = _mm_mul_pd(vfeps,H);
1452 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1453 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1454 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1458 /* Calculate temporary vectorial force */
1459 tx = _mm_mul_pd(fscal,dx01);
1460 ty = _mm_mul_pd(fscal,dy01);
1461 tz = _mm_mul_pd(fscal,dz01);
1463 /* Update vectorial force */
1464 fix0 = _mm_add_pd(fix0,tx);
1465 fiy0 = _mm_add_pd(fiy0,ty);
1466 fiz0 = _mm_add_pd(fiz0,tz);
1468 fjx1 = _mm_add_pd(fjx1,tx);
1469 fjy1 = _mm_add_pd(fjy1,ty);
1470 fjz1 = _mm_add_pd(fjz1,tz);
1472 /**************************
1473 * CALCULATE INTERACTIONS *
1474 **************************/
1476 r02 = _mm_mul_pd(rsq02,rinv02);
1478 /* Calculate table index by multiplying r with table scale and truncate to integer */
1479 rt = _mm_mul_pd(r02,vftabscale);
1480 vfitab = _mm_cvttpd_epi32(rt);
1481 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1482 vfitab = _mm_slli_epi32(vfitab,2);
1484 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1485 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1486 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1487 GMX_MM_TRANSPOSE2_PD(Y,F);
1488 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1489 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1490 GMX_MM_TRANSPOSE2_PD(G,H);
1491 Heps = _mm_mul_pd(vfeps,H);
1492 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1493 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1494 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1498 /* Calculate temporary vectorial force */
1499 tx = _mm_mul_pd(fscal,dx02);
1500 ty = _mm_mul_pd(fscal,dy02);
1501 tz = _mm_mul_pd(fscal,dz02);
1503 /* Update vectorial force */
1504 fix0 = _mm_add_pd(fix0,tx);
1505 fiy0 = _mm_add_pd(fiy0,ty);
1506 fiz0 = _mm_add_pd(fiz0,tz);
1508 fjx2 = _mm_add_pd(fjx2,tx);
1509 fjy2 = _mm_add_pd(fjy2,ty);
1510 fjz2 = _mm_add_pd(fjz2,tz);
1512 /**************************
1513 * CALCULATE INTERACTIONS *
1514 **************************/
1516 r10 = _mm_mul_pd(rsq10,rinv10);
1518 /* Calculate table index by multiplying r with table scale and truncate to integer */
1519 rt = _mm_mul_pd(r10,vftabscale);
1520 vfitab = _mm_cvttpd_epi32(rt);
1521 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1522 vfitab = _mm_slli_epi32(vfitab,2);
1524 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1525 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1526 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1527 GMX_MM_TRANSPOSE2_PD(Y,F);
1528 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1529 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1530 GMX_MM_TRANSPOSE2_PD(G,H);
1531 Heps = _mm_mul_pd(vfeps,H);
1532 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1533 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1534 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1538 /* Calculate temporary vectorial force */
1539 tx = _mm_mul_pd(fscal,dx10);
1540 ty = _mm_mul_pd(fscal,dy10);
1541 tz = _mm_mul_pd(fscal,dz10);
1543 /* Update vectorial force */
1544 fix1 = _mm_add_pd(fix1,tx);
1545 fiy1 = _mm_add_pd(fiy1,ty);
1546 fiz1 = _mm_add_pd(fiz1,tz);
1548 fjx0 = _mm_add_pd(fjx0,tx);
1549 fjy0 = _mm_add_pd(fjy0,ty);
1550 fjz0 = _mm_add_pd(fjz0,tz);
1552 /**************************
1553 * CALCULATE INTERACTIONS *
1554 **************************/
1556 r11 = _mm_mul_pd(rsq11,rinv11);
1558 /* Calculate table index by multiplying r with table scale and truncate to integer */
1559 rt = _mm_mul_pd(r11,vftabscale);
1560 vfitab = _mm_cvttpd_epi32(rt);
1561 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1562 vfitab = _mm_slli_epi32(vfitab,2);
1564 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1565 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1566 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1567 GMX_MM_TRANSPOSE2_PD(Y,F);
1568 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1569 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1570 GMX_MM_TRANSPOSE2_PD(G,H);
1571 Heps = _mm_mul_pd(vfeps,H);
1572 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1573 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1574 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1578 /* Calculate temporary vectorial force */
1579 tx = _mm_mul_pd(fscal,dx11);
1580 ty = _mm_mul_pd(fscal,dy11);
1581 tz = _mm_mul_pd(fscal,dz11);
1583 /* Update vectorial force */
1584 fix1 = _mm_add_pd(fix1,tx);
1585 fiy1 = _mm_add_pd(fiy1,ty);
1586 fiz1 = _mm_add_pd(fiz1,tz);
1588 fjx1 = _mm_add_pd(fjx1,tx);
1589 fjy1 = _mm_add_pd(fjy1,ty);
1590 fjz1 = _mm_add_pd(fjz1,tz);
1592 /**************************
1593 * CALCULATE INTERACTIONS *
1594 **************************/
1596 r12 = _mm_mul_pd(rsq12,rinv12);
1598 /* Calculate table index by multiplying r with table scale and truncate to integer */
1599 rt = _mm_mul_pd(r12,vftabscale);
1600 vfitab = _mm_cvttpd_epi32(rt);
1601 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1602 vfitab = _mm_slli_epi32(vfitab,2);
1604 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1605 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1606 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1607 GMX_MM_TRANSPOSE2_PD(Y,F);
1608 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1609 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1610 GMX_MM_TRANSPOSE2_PD(G,H);
1611 Heps = _mm_mul_pd(vfeps,H);
1612 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1613 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1614 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1618 /* Calculate temporary vectorial force */
1619 tx = _mm_mul_pd(fscal,dx12);
1620 ty = _mm_mul_pd(fscal,dy12);
1621 tz = _mm_mul_pd(fscal,dz12);
1623 /* Update vectorial force */
1624 fix1 = _mm_add_pd(fix1,tx);
1625 fiy1 = _mm_add_pd(fiy1,ty);
1626 fiz1 = _mm_add_pd(fiz1,tz);
1628 fjx2 = _mm_add_pd(fjx2,tx);
1629 fjy2 = _mm_add_pd(fjy2,ty);
1630 fjz2 = _mm_add_pd(fjz2,tz);
1632 /**************************
1633 * CALCULATE INTERACTIONS *
1634 **************************/
1636 r20 = _mm_mul_pd(rsq20,rinv20);
1638 /* Calculate table index by multiplying r with table scale and truncate to integer */
1639 rt = _mm_mul_pd(r20,vftabscale);
1640 vfitab = _mm_cvttpd_epi32(rt);
1641 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1642 vfitab = _mm_slli_epi32(vfitab,2);
1644 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1645 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1646 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1647 GMX_MM_TRANSPOSE2_PD(Y,F);
1648 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1649 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1650 GMX_MM_TRANSPOSE2_PD(G,H);
1651 Heps = _mm_mul_pd(vfeps,H);
1652 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1653 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1654 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1658 /* Calculate temporary vectorial force */
1659 tx = _mm_mul_pd(fscal,dx20);
1660 ty = _mm_mul_pd(fscal,dy20);
1661 tz = _mm_mul_pd(fscal,dz20);
1663 /* Update vectorial force */
1664 fix2 = _mm_add_pd(fix2,tx);
1665 fiy2 = _mm_add_pd(fiy2,ty);
1666 fiz2 = _mm_add_pd(fiz2,tz);
1668 fjx0 = _mm_add_pd(fjx0,tx);
1669 fjy0 = _mm_add_pd(fjy0,ty);
1670 fjz0 = _mm_add_pd(fjz0,tz);
1672 /**************************
1673 * CALCULATE INTERACTIONS *
1674 **************************/
1676 r21 = _mm_mul_pd(rsq21,rinv21);
1678 /* Calculate table index by multiplying r with table scale and truncate to integer */
1679 rt = _mm_mul_pd(r21,vftabscale);
1680 vfitab = _mm_cvttpd_epi32(rt);
1681 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1682 vfitab = _mm_slli_epi32(vfitab,2);
1684 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1685 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1686 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1687 GMX_MM_TRANSPOSE2_PD(Y,F);
1688 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1689 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1690 GMX_MM_TRANSPOSE2_PD(G,H);
1691 Heps = _mm_mul_pd(vfeps,H);
1692 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1693 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1694 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1698 /* Calculate temporary vectorial force */
1699 tx = _mm_mul_pd(fscal,dx21);
1700 ty = _mm_mul_pd(fscal,dy21);
1701 tz = _mm_mul_pd(fscal,dz21);
1703 /* Update vectorial force */
1704 fix2 = _mm_add_pd(fix2,tx);
1705 fiy2 = _mm_add_pd(fiy2,ty);
1706 fiz2 = _mm_add_pd(fiz2,tz);
1708 fjx1 = _mm_add_pd(fjx1,tx);
1709 fjy1 = _mm_add_pd(fjy1,ty);
1710 fjz1 = _mm_add_pd(fjz1,tz);
1712 /**************************
1713 * CALCULATE INTERACTIONS *
1714 **************************/
1716 r22 = _mm_mul_pd(rsq22,rinv22);
1718 /* Calculate table index by multiplying r with table scale and truncate to integer */
1719 rt = _mm_mul_pd(r22,vftabscale);
1720 vfitab = _mm_cvttpd_epi32(rt);
1721 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1722 vfitab = _mm_slli_epi32(vfitab,2);
1724 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1725 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1726 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1727 GMX_MM_TRANSPOSE2_PD(Y,F);
1728 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1729 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1730 GMX_MM_TRANSPOSE2_PD(G,H);
1731 Heps = _mm_mul_pd(vfeps,H);
1732 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1733 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1734 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1738 /* Calculate temporary vectorial force */
1739 tx = _mm_mul_pd(fscal,dx22);
1740 ty = _mm_mul_pd(fscal,dy22);
1741 tz = _mm_mul_pd(fscal,dz22);
1743 /* Update vectorial force */
1744 fix2 = _mm_add_pd(fix2,tx);
1745 fiy2 = _mm_add_pd(fiy2,ty);
1746 fiz2 = _mm_add_pd(fiz2,tz);
1748 fjx2 = _mm_add_pd(fjx2,tx);
1749 fjy2 = _mm_add_pd(fjy2,ty);
1750 fjz2 = _mm_add_pd(fjz2,tz);
1752 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1754 /* Inner loop uses 351 flops */
1757 if(jidx<j_index_end)
1761 j_coord_offsetA = DIM*jnrA;
1763 /* load j atom coordinates */
1764 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1765 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1767 /* Calculate displacement vector */
1768 dx00 = _mm_sub_pd(ix0,jx0);
1769 dy00 = _mm_sub_pd(iy0,jy0);
1770 dz00 = _mm_sub_pd(iz0,jz0);
1771 dx01 = _mm_sub_pd(ix0,jx1);
1772 dy01 = _mm_sub_pd(iy0,jy1);
1773 dz01 = _mm_sub_pd(iz0,jz1);
1774 dx02 = _mm_sub_pd(ix0,jx2);
1775 dy02 = _mm_sub_pd(iy0,jy2);
1776 dz02 = _mm_sub_pd(iz0,jz2);
1777 dx10 = _mm_sub_pd(ix1,jx0);
1778 dy10 = _mm_sub_pd(iy1,jy0);
1779 dz10 = _mm_sub_pd(iz1,jz0);
1780 dx11 = _mm_sub_pd(ix1,jx1);
1781 dy11 = _mm_sub_pd(iy1,jy1);
1782 dz11 = _mm_sub_pd(iz1,jz1);
1783 dx12 = _mm_sub_pd(ix1,jx2);
1784 dy12 = _mm_sub_pd(iy1,jy2);
1785 dz12 = _mm_sub_pd(iz1,jz2);
1786 dx20 = _mm_sub_pd(ix2,jx0);
1787 dy20 = _mm_sub_pd(iy2,jy0);
1788 dz20 = _mm_sub_pd(iz2,jz0);
1789 dx21 = _mm_sub_pd(ix2,jx1);
1790 dy21 = _mm_sub_pd(iy2,jy1);
1791 dz21 = _mm_sub_pd(iz2,jz1);
1792 dx22 = _mm_sub_pd(ix2,jx2);
1793 dy22 = _mm_sub_pd(iy2,jy2);
1794 dz22 = _mm_sub_pd(iz2,jz2);
1796 /* Calculate squared distance and things based on it */
1797 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1798 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1799 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1800 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1801 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1802 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1803 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1804 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1805 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1807 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1808 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1809 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1810 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1811 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1812 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1813 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1814 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1815 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1817 fjx0 = _mm_setzero_pd();
1818 fjy0 = _mm_setzero_pd();
1819 fjz0 = _mm_setzero_pd();
1820 fjx1 = _mm_setzero_pd();
1821 fjy1 = _mm_setzero_pd();
1822 fjz1 = _mm_setzero_pd();
1823 fjx2 = _mm_setzero_pd();
1824 fjy2 = _mm_setzero_pd();
1825 fjz2 = _mm_setzero_pd();
1827 /**************************
1828 * CALCULATE INTERACTIONS *
1829 **************************/
1831 r00 = _mm_mul_pd(rsq00,rinv00);
1833 /* Calculate table index by multiplying r with table scale and truncate to integer */
1834 rt = _mm_mul_pd(r00,vftabscale);
1835 vfitab = _mm_cvttpd_epi32(rt);
1836 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1837 vfitab = _mm_slli_epi32(vfitab,2);
1839 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1840 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1841 F = _mm_setzero_pd();
1842 GMX_MM_TRANSPOSE2_PD(Y,F);
1843 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1844 H = _mm_setzero_pd();
1845 GMX_MM_TRANSPOSE2_PD(G,H);
1846 Heps = _mm_mul_pd(vfeps,H);
1847 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1848 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1849 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1853 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1855 /* Calculate temporary vectorial force */
1856 tx = _mm_mul_pd(fscal,dx00);
1857 ty = _mm_mul_pd(fscal,dy00);
1858 tz = _mm_mul_pd(fscal,dz00);
1860 /* Update vectorial force */
1861 fix0 = _mm_add_pd(fix0,tx);
1862 fiy0 = _mm_add_pd(fiy0,ty);
1863 fiz0 = _mm_add_pd(fiz0,tz);
1865 fjx0 = _mm_add_pd(fjx0,tx);
1866 fjy0 = _mm_add_pd(fjy0,ty);
1867 fjz0 = _mm_add_pd(fjz0,tz);
1869 /**************************
1870 * CALCULATE INTERACTIONS *
1871 **************************/
1873 r01 = _mm_mul_pd(rsq01,rinv01);
1875 /* Calculate table index by multiplying r with table scale and truncate to integer */
1876 rt = _mm_mul_pd(r01,vftabscale);
1877 vfitab = _mm_cvttpd_epi32(rt);
1878 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1879 vfitab = _mm_slli_epi32(vfitab,2);
1881 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1882 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1883 F = _mm_setzero_pd();
1884 GMX_MM_TRANSPOSE2_PD(Y,F);
1885 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1886 H = _mm_setzero_pd();
1887 GMX_MM_TRANSPOSE2_PD(G,H);
1888 Heps = _mm_mul_pd(vfeps,H);
1889 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1890 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1891 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1895 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1897 /* Calculate temporary vectorial force */
1898 tx = _mm_mul_pd(fscal,dx01);
1899 ty = _mm_mul_pd(fscal,dy01);
1900 tz = _mm_mul_pd(fscal,dz01);
1902 /* Update vectorial force */
1903 fix0 = _mm_add_pd(fix0,tx);
1904 fiy0 = _mm_add_pd(fiy0,ty);
1905 fiz0 = _mm_add_pd(fiz0,tz);
1907 fjx1 = _mm_add_pd(fjx1,tx);
1908 fjy1 = _mm_add_pd(fjy1,ty);
1909 fjz1 = _mm_add_pd(fjz1,tz);
1911 /**************************
1912 * CALCULATE INTERACTIONS *
1913 **************************/
1915 r02 = _mm_mul_pd(rsq02,rinv02);
1917 /* Calculate table index by multiplying r with table scale and truncate to integer */
1918 rt = _mm_mul_pd(r02,vftabscale);
1919 vfitab = _mm_cvttpd_epi32(rt);
1920 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1921 vfitab = _mm_slli_epi32(vfitab,2);
1923 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1924 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1925 F = _mm_setzero_pd();
1926 GMX_MM_TRANSPOSE2_PD(Y,F);
1927 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1928 H = _mm_setzero_pd();
1929 GMX_MM_TRANSPOSE2_PD(G,H);
1930 Heps = _mm_mul_pd(vfeps,H);
1931 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1932 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1933 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1937 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1939 /* Calculate temporary vectorial force */
1940 tx = _mm_mul_pd(fscal,dx02);
1941 ty = _mm_mul_pd(fscal,dy02);
1942 tz = _mm_mul_pd(fscal,dz02);
1944 /* Update vectorial force */
1945 fix0 = _mm_add_pd(fix0,tx);
1946 fiy0 = _mm_add_pd(fiy0,ty);
1947 fiz0 = _mm_add_pd(fiz0,tz);
1949 fjx2 = _mm_add_pd(fjx2,tx);
1950 fjy2 = _mm_add_pd(fjy2,ty);
1951 fjz2 = _mm_add_pd(fjz2,tz);
1953 /**************************
1954 * CALCULATE INTERACTIONS *
1955 **************************/
1957 r10 = _mm_mul_pd(rsq10,rinv10);
1959 /* Calculate table index by multiplying r with table scale and truncate to integer */
1960 rt = _mm_mul_pd(r10,vftabscale);
1961 vfitab = _mm_cvttpd_epi32(rt);
1962 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1963 vfitab = _mm_slli_epi32(vfitab,2);
1965 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1966 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1967 F = _mm_setzero_pd();
1968 GMX_MM_TRANSPOSE2_PD(Y,F);
1969 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1970 H = _mm_setzero_pd();
1971 GMX_MM_TRANSPOSE2_PD(G,H);
1972 Heps = _mm_mul_pd(vfeps,H);
1973 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1974 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1975 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1979 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1981 /* Calculate temporary vectorial force */
1982 tx = _mm_mul_pd(fscal,dx10);
1983 ty = _mm_mul_pd(fscal,dy10);
1984 tz = _mm_mul_pd(fscal,dz10);
1986 /* Update vectorial force */
1987 fix1 = _mm_add_pd(fix1,tx);
1988 fiy1 = _mm_add_pd(fiy1,ty);
1989 fiz1 = _mm_add_pd(fiz1,tz);
1991 fjx0 = _mm_add_pd(fjx0,tx);
1992 fjy0 = _mm_add_pd(fjy0,ty);
1993 fjz0 = _mm_add_pd(fjz0,tz);
1995 /**************************
1996 * CALCULATE INTERACTIONS *
1997 **************************/
1999 r11 = _mm_mul_pd(rsq11,rinv11);
2001 /* Calculate table index by multiplying r with table scale and truncate to integer */
2002 rt = _mm_mul_pd(r11,vftabscale);
2003 vfitab = _mm_cvttpd_epi32(rt);
2004 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2005 vfitab = _mm_slli_epi32(vfitab,2);
2007 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2008 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2009 F = _mm_setzero_pd();
2010 GMX_MM_TRANSPOSE2_PD(Y,F);
2011 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2012 H = _mm_setzero_pd();
2013 GMX_MM_TRANSPOSE2_PD(G,H);
2014 Heps = _mm_mul_pd(vfeps,H);
2015 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2016 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2017 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2021 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2023 /* Calculate temporary vectorial force */
2024 tx = _mm_mul_pd(fscal,dx11);
2025 ty = _mm_mul_pd(fscal,dy11);
2026 tz = _mm_mul_pd(fscal,dz11);
2028 /* Update vectorial force */
2029 fix1 = _mm_add_pd(fix1,tx);
2030 fiy1 = _mm_add_pd(fiy1,ty);
2031 fiz1 = _mm_add_pd(fiz1,tz);
2033 fjx1 = _mm_add_pd(fjx1,tx);
2034 fjy1 = _mm_add_pd(fjy1,ty);
2035 fjz1 = _mm_add_pd(fjz1,tz);
2037 /**************************
2038 * CALCULATE INTERACTIONS *
2039 **************************/
2041 r12 = _mm_mul_pd(rsq12,rinv12);
2043 /* Calculate table index by multiplying r with table scale and truncate to integer */
2044 rt = _mm_mul_pd(r12,vftabscale);
2045 vfitab = _mm_cvttpd_epi32(rt);
2046 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2047 vfitab = _mm_slli_epi32(vfitab,2);
2049 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2050 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2051 F = _mm_setzero_pd();
2052 GMX_MM_TRANSPOSE2_PD(Y,F);
2053 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2054 H = _mm_setzero_pd();
2055 GMX_MM_TRANSPOSE2_PD(G,H);
2056 Heps = _mm_mul_pd(vfeps,H);
2057 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2058 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2059 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2063 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2065 /* Calculate temporary vectorial force */
2066 tx = _mm_mul_pd(fscal,dx12);
2067 ty = _mm_mul_pd(fscal,dy12);
2068 tz = _mm_mul_pd(fscal,dz12);
2070 /* Update vectorial force */
2071 fix1 = _mm_add_pd(fix1,tx);
2072 fiy1 = _mm_add_pd(fiy1,ty);
2073 fiz1 = _mm_add_pd(fiz1,tz);
2075 fjx2 = _mm_add_pd(fjx2,tx);
2076 fjy2 = _mm_add_pd(fjy2,ty);
2077 fjz2 = _mm_add_pd(fjz2,tz);
2079 /**************************
2080 * CALCULATE INTERACTIONS *
2081 **************************/
2083 r20 = _mm_mul_pd(rsq20,rinv20);
2085 /* Calculate table index by multiplying r with table scale and truncate to integer */
2086 rt = _mm_mul_pd(r20,vftabscale);
2087 vfitab = _mm_cvttpd_epi32(rt);
2088 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2089 vfitab = _mm_slli_epi32(vfitab,2);
2091 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2092 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2093 F = _mm_setzero_pd();
2094 GMX_MM_TRANSPOSE2_PD(Y,F);
2095 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2096 H = _mm_setzero_pd();
2097 GMX_MM_TRANSPOSE2_PD(G,H);
2098 Heps = _mm_mul_pd(vfeps,H);
2099 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2100 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2101 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2105 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2107 /* Calculate temporary vectorial force */
2108 tx = _mm_mul_pd(fscal,dx20);
2109 ty = _mm_mul_pd(fscal,dy20);
2110 tz = _mm_mul_pd(fscal,dz20);
2112 /* Update vectorial force */
2113 fix2 = _mm_add_pd(fix2,tx);
2114 fiy2 = _mm_add_pd(fiy2,ty);
2115 fiz2 = _mm_add_pd(fiz2,tz);
2117 fjx0 = _mm_add_pd(fjx0,tx);
2118 fjy0 = _mm_add_pd(fjy0,ty);
2119 fjz0 = _mm_add_pd(fjz0,tz);
2121 /**************************
2122 * CALCULATE INTERACTIONS *
2123 **************************/
2125 r21 = _mm_mul_pd(rsq21,rinv21);
2127 /* Calculate table index by multiplying r with table scale and truncate to integer */
2128 rt = _mm_mul_pd(r21,vftabscale);
2129 vfitab = _mm_cvttpd_epi32(rt);
2130 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2131 vfitab = _mm_slli_epi32(vfitab,2);
2133 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2134 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2135 F = _mm_setzero_pd();
2136 GMX_MM_TRANSPOSE2_PD(Y,F);
2137 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2138 H = _mm_setzero_pd();
2139 GMX_MM_TRANSPOSE2_PD(G,H);
2140 Heps = _mm_mul_pd(vfeps,H);
2141 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2142 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2143 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2147 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2149 /* Calculate temporary vectorial force */
2150 tx = _mm_mul_pd(fscal,dx21);
2151 ty = _mm_mul_pd(fscal,dy21);
2152 tz = _mm_mul_pd(fscal,dz21);
2154 /* Update vectorial force */
2155 fix2 = _mm_add_pd(fix2,tx);
2156 fiy2 = _mm_add_pd(fiy2,ty);
2157 fiz2 = _mm_add_pd(fiz2,tz);
2159 fjx1 = _mm_add_pd(fjx1,tx);
2160 fjy1 = _mm_add_pd(fjy1,ty);
2161 fjz1 = _mm_add_pd(fjz1,tz);
2163 /**************************
2164 * CALCULATE INTERACTIONS *
2165 **************************/
2167 r22 = _mm_mul_pd(rsq22,rinv22);
2169 /* Calculate table index by multiplying r with table scale and truncate to integer */
2170 rt = _mm_mul_pd(r22,vftabscale);
2171 vfitab = _mm_cvttpd_epi32(rt);
2172 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2173 vfitab = _mm_slli_epi32(vfitab,2);
2175 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2176 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2177 F = _mm_setzero_pd();
2178 GMX_MM_TRANSPOSE2_PD(Y,F);
2179 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2180 H = _mm_setzero_pd();
2181 GMX_MM_TRANSPOSE2_PD(G,H);
2182 Heps = _mm_mul_pd(vfeps,H);
2183 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2184 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2185 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2189 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2191 /* Calculate temporary vectorial force */
2192 tx = _mm_mul_pd(fscal,dx22);
2193 ty = _mm_mul_pd(fscal,dy22);
2194 tz = _mm_mul_pd(fscal,dz22);
2196 /* Update vectorial force */
2197 fix2 = _mm_add_pd(fix2,tx);
2198 fiy2 = _mm_add_pd(fiy2,ty);
2199 fiz2 = _mm_add_pd(fiz2,tz);
2201 fjx2 = _mm_add_pd(fjx2,tx);
2202 fjy2 = _mm_add_pd(fjy2,ty);
2203 fjz2 = _mm_add_pd(fjz2,tz);
2205 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2207 /* Inner loop uses 351 flops */
2210 /* End of innermost loop */
2212 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2213 f+i_coord_offset,fshift+i_shift_offset);
2215 /* Increment number of inner iterations */
2216 inneriter += j_index_end - j_index_start;
2218 /* Outer loop uses 18 flops */
2221 /* Increment number of outer iterations */
2224 /* Update outer/inner flops */
2226 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*351);