2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_double
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: None
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse2_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
82 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
84 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
85 int vdwjidx1A,vdwjidx1B;
86 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
87 int vdwjidx2A,vdwjidx2B;
88 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
89 int vdwjidx3A,vdwjidx3B;
90 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
91 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
92 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
93 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
94 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
95 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
96 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
97 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
98 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
99 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
100 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
103 __m128i ifour = _mm_set1_epi32(4);
104 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
106 __m128d dummy_mask,cutoff_mask;
107 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
108 __m128d one = _mm_set1_pd(1.0);
109 __m128d two = _mm_set1_pd(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm_set1_pd(fr->ic->epsfac);
122 charge = mdatoms->chargeA;
124 vftab = kernel_data->table_elec->data;
125 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
127 /* Setup water-specific parameters */
128 inr = nlist->iinr[0];
129 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
130 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
131 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
133 jq1 = _mm_set1_pd(charge[inr+1]);
134 jq2 = _mm_set1_pd(charge[inr+2]);
135 jq3 = _mm_set1_pd(charge[inr+3]);
136 qq11 = _mm_mul_pd(iq1,jq1);
137 qq12 = _mm_mul_pd(iq1,jq2);
138 qq13 = _mm_mul_pd(iq1,jq3);
139 qq21 = _mm_mul_pd(iq2,jq1);
140 qq22 = _mm_mul_pd(iq2,jq2);
141 qq23 = _mm_mul_pd(iq2,jq3);
142 qq31 = _mm_mul_pd(iq3,jq1);
143 qq32 = _mm_mul_pd(iq3,jq2);
144 qq33 = _mm_mul_pd(iq3,jq3);
146 /* Avoid stupid compiler warnings */
154 /* Start outer loop over neighborlists */
155 for(iidx=0; iidx<nri; iidx++)
157 /* Load shift vector for this list */
158 i_shift_offset = DIM*shiftidx[iidx];
160 /* Load limits for loop over neighbors */
161 j_index_start = jindex[iidx];
162 j_index_end = jindex[iidx+1];
164 /* Get outer coordinate index */
166 i_coord_offset = DIM*inr;
168 /* Load i particle coords and add shift vector */
169 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
170 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
172 fix1 = _mm_setzero_pd();
173 fiy1 = _mm_setzero_pd();
174 fiz1 = _mm_setzero_pd();
175 fix2 = _mm_setzero_pd();
176 fiy2 = _mm_setzero_pd();
177 fiz2 = _mm_setzero_pd();
178 fix3 = _mm_setzero_pd();
179 fiy3 = _mm_setzero_pd();
180 fiz3 = _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_pd();
185 /* Start inner kernel loop */
186 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
189 /* Get j neighbor index, and coordinate index */
192 j_coord_offsetA = DIM*jnrA;
193 j_coord_offsetB = DIM*jnrB;
195 /* load j atom coordinates */
196 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
197 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
199 /* Calculate displacement vector */
200 dx11 = _mm_sub_pd(ix1,jx1);
201 dy11 = _mm_sub_pd(iy1,jy1);
202 dz11 = _mm_sub_pd(iz1,jz1);
203 dx12 = _mm_sub_pd(ix1,jx2);
204 dy12 = _mm_sub_pd(iy1,jy2);
205 dz12 = _mm_sub_pd(iz1,jz2);
206 dx13 = _mm_sub_pd(ix1,jx3);
207 dy13 = _mm_sub_pd(iy1,jy3);
208 dz13 = _mm_sub_pd(iz1,jz3);
209 dx21 = _mm_sub_pd(ix2,jx1);
210 dy21 = _mm_sub_pd(iy2,jy1);
211 dz21 = _mm_sub_pd(iz2,jz1);
212 dx22 = _mm_sub_pd(ix2,jx2);
213 dy22 = _mm_sub_pd(iy2,jy2);
214 dz22 = _mm_sub_pd(iz2,jz2);
215 dx23 = _mm_sub_pd(ix2,jx3);
216 dy23 = _mm_sub_pd(iy2,jy3);
217 dz23 = _mm_sub_pd(iz2,jz3);
218 dx31 = _mm_sub_pd(ix3,jx1);
219 dy31 = _mm_sub_pd(iy3,jy1);
220 dz31 = _mm_sub_pd(iz3,jz1);
221 dx32 = _mm_sub_pd(ix3,jx2);
222 dy32 = _mm_sub_pd(iy3,jy2);
223 dz32 = _mm_sub_pd(iz3,jz2);
224 dx33 = _mm_sub_pd(ix3,jx3);
225 dy33 = _mm_sub_pd(iy3,jy3);
226 dz33 = _mm_sub_pd(iz3,jz3);
228 /* Calculate squared distance and things based on it */
229 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
230 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
231 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
232 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
233 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
234 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
235 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
236 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
237 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
239 rinv11 = sse2_invsqrt_d(rsq11);
240 rinv12 = sse2_invsqrt_d(rsq12);
241 rinv13 = sse2_invsqrt_d(rsq13);
242 rinv21 = sse2_invsqrt_d(rsq21);
243 rinv22 = sse2_invsqrt_d(rsq22);
244 rinv23 = sse2_invsqrt_d(rsq23);
245 rinv31 = sse2_invsqrt_d(rsq31);
246 rinv32 = sse2_invsqrt_d(rsq32);
247 rinv33 = sse2_invsqrt_d(rsq33);
249 fjx1 = _mm_setzero_pd();
250 fjy1 = _mm_setzero_pd();
251 fjz1 = _mm_setzero_pd();
252 fjx2 = _mm_setzero_pd();
253 fjy2 = _mm_setzero_pd();
254 fjz2 = _mm_setzero_pd();
255 fjx3 = _mm_setzero_pd();
256 fjy3 = _mm_setzero_pd();
257 fjz3 = _mm_setzero_pd();
259 /**************************
260 * CALCULATE INTERACTIONS *
261 **************************/
263 r11 = _mm_mul_pd(rsq11,rinv11);
265 /* Calculate table index by multiplying r with table scale and truncate to integer */
266 rt = _mm_mul_pd(r11,vftabscale);
267 vfitab = _mm_cvttpd_epi32(rt);
268 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
269 vfitab = _mm_slli_epi32(vfitab,2);
271 /* CUBIC SPLINE TABLE ELECTROSTATICS */
272 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
273 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
274 GMX_MM_TRANSPOSE2_PD(Y,F);
275 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
276 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
277 GMX_MM_TRANSPOSE2_PD(G,H);
278 Heps = _mm_mul_pd(vfeps,H);
279 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
280 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
281 velec = _mm_mul_pd(qq11,VV);
282 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
283 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
285 /* Update potential sum for this i atom from the interaction with this j atom. */
286 velecsum = _mm_add_pd(velecsum,velec);
290 /* Calculate temporary vectorial force */
291 tx = _mm_mul_pd(fscal,dx11);
292 ty = _mm_mul_pd(fscal,dy11);
293 tz = _mm_mul_pd(fscal,dz11);
295 /* Update vectorial force */
296 fix1 = _mm_add_pd(fix1,tx);
297 fiy1 = _mm_add_pd(fiy1,ty);
298 fiz1 = _mm_add_pd(fiz1,tz);
300 fjx1 = _mm_add_pd(fjx1,tx);
301 fjy1 = _mm_add_pd(fjy1,ty);
302 fjz1 = _mm_add_pd(fjz1,tz);
304 /**************************
305 * CALCULATE INTERACTIONS *
306 **************************/
308 r12 = _mm_mul_pd(rsq12,rinv12);
310 /* Calculate table index by multiplying r with table scale and truncate to integer */
311 rt = _mm_mul_pd(r12,vftabscale);
312 vfitab = _mm_cvttpd_epi32(rt);
313 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
314 vfitab = _mm_slli_epi32(vfitab,2);
316 /* CUBIC SPLINE TABLE ELECTROSTATICS */
317 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
318 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
319 GMX_MM_TRANSPOSE2_PD(Y,F);
320 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
321 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
322 GMX_MM_TRANSPOSE2_PD(G,H);
323 Heps = _mm_mul_pd(vfeps,H);
324 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
325 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
326 velec = _mm_mul_pd(qq12,VV);
327 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
328 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
330 /* Update potential sum for this i atom from the interaction with this j atom. */
331 velecsum = _mm_add_pd(velecsum,velec);
335 /* Calculate temporary vectorial force */
336 tx = _mm_mul_pd(fscal,dx12);
337 ty = _mm_mul_pd(fscal,dy12);
338 tz = _mm_mul_pd(fscal,dz12);
340 /* Update vectorial force */
341 fix1 = _mm_add_pd(fix1,tx);
342 fiy1 = _mm_add_pd(fiy1,ty);
343 fiz1 = _mm_add_pd(fiz1,tz);
345 fjx2 = _mm_add_pd(fjx2,tx);
346 fjy2 = _mm_add_pd(fjy2,ty);
347 fjz2 = _mm_add_pd(fjz2,tz);
349 /**************************
350 * CALCULATE INTERACTIONS *
351 **************************/
353 r13 = _mm_mul_pd(rsq13,rinv13);
355 /* Calculate table index by multiplying r with table scale and truncate to integer */
356 rt = _mm_mul_pd(r13,vftabscale);
357 vfitab = _mm_cvttpd_epi32(rt);
358 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
359 vfitab = _mm_slli_epi32(vfitab,2);
361 /* CUBIC SPLINE TABLE ELECTROSTATICS */
362 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
363 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
364 GMX_MM_TRANSPOSE2_PD(Y,F);
365 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
366 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
367 GMX_MM_TRANSPOSE2_PD(G,H);
368 Heps = _mm_mul_pd(vfeps,H);
369 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
370 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
371 velec = _mm_mul_pd(qq13,VV);
372 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
373 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
375 /* Update potential sum for this i atom from the interaction with this j atom. */
376 velecsum = _mm_add_pd(velecsum,velec);
380 /* Calculate temporary vectorial force */
381 tx = _mm_mul_pd(fscal,dx13);
382 ty = _mm_mul_pd(fscal,dy13);
383 tz = _mm_mul_pd(fscal,dz13);
385 /* Update vectorial force */
386 fix1 = _mm_add_pd(fix1,tx);
387 fiy1 = _mm_add_pd(fiy1,ty);
388 fiz1 = _mm_add_pd(fiz1,tz);
390 fjx3 = _mm_add_pd(fjx3,tx);
391 fjy3 = _mm_add_pd(fjy3,ty);
392 fjz3 = _mm_add_pd(fjz3,tz);
394 /**************************
395 * CALCULATE INTERACTIONS *
396 **************************/
398 r21 = _mm_mul_pd(rsq21,rinv21);
400 /* Calculate table index by multiplying r with table scale and truncate to integer */
401 rt = _mm_mul_pd(r21,vftabscale);
402 vfitab = _mm_cvttpd_epi32(rt);
403 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
404 vfitab = _mm_slli_epi32(vfitab,2);
406 /* CUBIC SPLINE TABLE ELECTROSTATICS */
407 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
408 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
409 GMX_MM_TRANSPOSE2_PD(Y,F);
410 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
411 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
412 GMX_MM_TRANSPOSE2_PD(G,H);
413 Heps = _mm_mul_pd(vfeps,H);
414 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
415 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
416 velec = _mm_mul_pd(qq21,VV);
417 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
418 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
420 /* Update potential sum for this i atom from the interaction with this j atom. */
421 velecsum = _mm_add_pd(velecsum,velec);
425 /* Calculate temporary vectorial force */
426 tx = _mm_mul_pd(fscal,dx21);
427 ty = _mm_mul_pd(fscal,dy21);
428 tz = _mm_mul_pd(fscal,dz21);
430 /* Update vectorial force */
431 fix2 = _mm_add_pd(fix2,tx);
432 fiy2 = _mm_add_pd(fiy2,ty);
433 fiz2 = _mm_add_pd(fiz2,tz);
435 fjx1 = _mm_add_pd(fjx1,tx);
436 fjy1 = _mm_add_pd(fjy1,ty);
437 fjz1 = _mm_add_pd(fjz1,tz);
439 /**************************
440 * CALCULATE INTERACTIONS *
441 **************************/
443 r22 = _mm_mul_pd(rsq22,rinv22);
445 /* Calculate table index by multiplying r with table scale and truncate to integer */
446 rt = _mm_mul_pd(r22,vftabscale);
447 vfitab = _mm_cvttpd_epi32(rt);
448 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
449 vfitab = _mm_slli_epi32(vfitab,2);
451 /* CUBIC SPLINE TABLE ELECTROSTATICS */
452 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
453 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
454 GMX_MM_TRANSPOSE2_PD(Y,F);
455 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
456 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
457 GMX_MM_TRANSPOSE2_PD(G,H);
458 Heps = _mm_mul_pd(vfeps,H);
459 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
460 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
461 velec = _mm_mul_pd(qq22,VV);
462 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
463 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
465 /* Update potential sum for this i atom from the interaction with this j atom. */
466 velecsum = _mm_add_pd(velecsum,velec);
470 /* Calculate temporary vectorial force */
471 tx = _mm_mul_pd(fscal,dx22);
472 ty = _mm_mul_pd(fscal,dy22);
473 tz = _mm_mul_pd(fscal,dz22);
475 /* Update vectorial force */
476 fix2 = _mm_add_pd(fix2,tx);
477 fiy2 = _mm_add_pd(fiy2,ty);
478 fiz2 = _mm_add_pd(fiz2,tz);
480 fjx2 = _mm_add_pd(fjx2,tx);
481 fjy2 = _mm_add_pd(fjy2,ty);
482 fjz2 = _mm_add_pd(fjz2,tz);
484 /**************************
485 * CALCULATE INTERACTIONS *
486 **************************/
488 r23 = _mm_mul_pd(rsq23,rinv23);
490 /* Calculate table index by multiplying r with table scale and truncate to integer */
491 rt = _mm_mul_pd(r23,vftabscale);
492 vfitab = _mm_cvttpd_epi32(rt);
493 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
494 vfitab = _mm_slli_epi32(vfitab,2);
496 /* CUBIC SPLINE TABLE ELECTROSTATICS */
497 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
498 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
499 GMX_MM_TRANSPOSE2_PD(Y,F);
500 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
501 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
502 GMX_MM_TRANSPOSE2_PD(G,H);
503 Heps = _mm_mul_pd(vfeps,H);
504 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
505 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
506 velec = _mm_mul_pd(qq23,VV);
507 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
508 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
510 /* Update potential sum for this i atom from the interaction with this j atom. */
511 velecsum = _mm_add_pd(velecsum,velec);
515 /* Calculate temporary vectorial force */
516 tx = _mm_mul_pd(fscal,dx23);
517 ty = _mm_mul_pd(fscal,dy23);
518 tz = _mm_mul_pd(fscal,dz23);
520 /* Update vectorial force */
521 fix2 = _mm_add_pd(fix2,tx);
522 fiy2 = _mm_add_pd(fiy2,ty);
523 fiz2 = _mm_add_pd(fiz2,tz);
525 fjx3 = _mm_add_pd(fjx3,tx);
526 fjy3 = _mm_add_pd(fjy3,ty);
527 fjz3 = _mm_add_pd(fjz3,tz);
529 /**************************
530 * CALCULATE INTERACTIONS *
531 **************************/
533 r31 = _mm_mul_pd(rsq31,rinv31);
535 /* Calculate table index by multiplying r with table scale and truncate to integer */
536 rt = _mm_mul_pd(r31,vftabscale);
537 vfitab = _mm_cvttpd_epi32(rt);
538 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
539 vfitab = _mm_slli_epi32(vfitab,2);
541 /* CUBIC SPLINE TABLE ELECTROSTATICS */
542 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
543 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
544 GMX_MM_TRANSPOSE2_PD(Y,F);
545 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
546 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
547 GMX_MM_TRANSPOSE2_PD(G,H);
548 Heps = _mm_mul_pd(vfeps,H);
549 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
550 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
551 velec = _mm_mul_pd(qq31,VV);
552 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
553 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
555 /* Update potential sum for this i atom from the interaction with this j atom. */
556 velecsum = _mm_add_pd(velecsum,velec);
560 /* Calculate temporary vectorial force */
561 tx = _mm_mul_pd(fscal,dx31);
562 ty = _mm_mul_pd(fscal,dy31);
563 tz = _mm_mul_pd(fscal,dz31);
565 /* Update vectorial force */
566 fix3 = _mm_add_pd(fix3,tx);
567 fiy3 = _mm_add_pd(fiy3,ty);
568 fiz3 = _mm_add_pd(fiz3,tz);
570 fjx1 = _mm_add_pd(fjx1,tx);
571 fjy1 = _mm_add_pd(fjy1,ty);
572 fjz1 = _mm_add_pd(fjz1,tz);
574 /**************************
575 * CALCULATE INTERACTIONS *
576 **************************/
578 r32 = _mm_mul_pd(rsq32,rinv32);
580 /* Calculate table index by multiplying r with table scale and truncate to integer */
581 rt = _mm_mul_pd(r32,vftabscale);
582 vfitab = _mm_cvttpd_epi32(rt);
583 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
584 vfitab = _mm_slli_epi32(vfitab,2);
586 /* CUBIC SPLINE TABLE ELECTROSTATICS */
587 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
588 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
589 GMX_MM_TRANSPOSE2_PD(Y,F);
590 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
591 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
592 GMX_MM_TRANSPOSE2_PD(G,H);
593 Heps = _mm_mul_pd(vfeps,H);
594 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
595 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
596 velec = _mm_mul_pd(qq32,VV);
597 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
598 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
600 /* Update potential sum for this i atom from the interaction with this j atom. */
601 velecsum = _mm_add_pd(velecsum,velec);
605 /* Calculate temporary vectorial force */
606 tx = _mm_mul_pd(fscal,dx32);
607 ty = _mm_mul_pd(fscal,dy32);
608 tz = _mm_mul_pd(fscal,dz32);
610 /* Update vectorial force */
611 fix3 = _mm_add_pd(fix3,tx);
612 fiy3 = _mm_add_pd(fiy3,ty);
613 fiz3 = _mm_add_pd(fiz3,tz);
615 fjx2 = _mm_add_pd(fjx2,tx);
616 fjy2 = _mm_add_pd(fjy2,ty);
617 fjz2 = _mm_add_pd(fjz2,tz);
619 /**************************
620 * CALCULATE INTERACTIONS *
621 **************************/
623 r33 = _mm_mul_pd(rsq33,rinv33);
625 /* Calculate table index by multiplying r with table scale and truncate to integer */
626 rt = _mm_mul_pd(r33,vftabscale);
627 vfitab = _mm_cvttpd_epi32(rt);
628 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
629 vfitab = _mm_slli_epi32(vfitab,2);
631 /* CUBIC SPLINE TABLE ELECTROSTATICS */
632 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
633 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
634 GMX_MM_TRANSPOSE2_PD(Y,F);
635 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
636 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
637 GMX_MM_TRANSPOSE2_PD(G,H);
638 Heps = _mm_mul_pd(vfeps,H);
639 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
640 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
641 velec = _mm_mul_pd(qq33,VV);
642 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
643 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
645 /* Update potential sum for this i atom from the interaction with this j atom. */
646 velecsum = _mm_add_pd(velecsum,velec);
650 /* Calculate temporary vectorial force */
651 tx = _mm_mul_pd(fscal,dx33);
652 ty = _mm_mul_pd(fscal,dy33);
653 tz = _mm_mul_pd(fscal,dz33);
655 /* Update vectorial force */
656 fix3 = _mm_add_pd(fix3,tx);
657 fiy3 = _mm_add_pd(fiy3,ty);
658 fiz3 = _mm_add_pd(fiz3,tz);
660 fjx3 = _mm_add_pd(fjx3,tx);
661 fjy3 = _mm_add_pd(fjy3,ty);
662 fjz3 = _mm_add_pd(fjz3,tz);
664 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
666 /* Inner loop uses 387 flops */
673 j_coord_offsetA = DIM*jnrA;
675 /* load j atom coordinates */
676 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
677 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
679 /* Calculate displacement vector */
680 dx11 = _mm_sub_pd(ix1,jx1);
681 dy11 = _mm_sub_pd(iy1,jy1);
682 dz11 = _mm_sub_pd(iz1,jz1);
683 dx12 = _mm_sub_pd(ix1,jx2);
684 dy12 = _mm_sub_pd(iy1,jy2);
685 dz12 = _mm_sub_pd(iz1,jz2);
686 dx13 = _mm_sub_pd(ix1,jx3);
687 dy13 = _mm_sub_pd(iy1,jy3);
688 dz13 = _mm_sub_pd(iz1,jz3);
689 dx21 = _mm_sub_pd(ix2,jx1);
690 dy21 = _mm_sub_pd(iy2,jy1);
691 dz21 = _mm_sub_pd(iz2,jz1);
692 dx22 = _mm_sub_pd(ix2,jx2);
693 dy22 = _mm_sub_pd(iy2,jy2);
694 dz22 = _mm_sub_pd(iz2,jz2);
695 dx23 = _mm_sub_pd(ix2,jx3);
696 dy23 = _mm_sub_pd(iy2,jy3);
697 dz23 = _mm_sub_pd(iz2,jz3);
698 dx31 = _mm_sub_pd(ix3,jx1);
699 dy31 = _mm_sub_pd(iy3,jy1);
700 dz31 = _mm_sub_pd(iz3,jz1);
701 dx32 = _mm_sub_pd(ix3,jx2);
702 dy32 = _mm_sub_pd(iy3,jy2);
703 dz32 = _mm_sub_pd(iz3,jz2);
704 dx33 = _mm_sub_pd(ix3,jx3);
705 dy33 = _mm_sub_pd(iy3,jy3);
706 dz33 = _mm_sub_pd(iz3,jz3);
708 /* Calculate squared distance and things based on it */
709 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
710 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
711 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
712 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
713 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
714 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
715 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
716 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
717 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
719 rinv11 = sse2_invsqrt_d(rsq11);
720 rinv12 = sse2_invsqrt_d(rsq12);
721 rinv13 = sse2_invsqrt_d(rsq13);
722 rinv21 = sse2_invsqrt_d(rsq21);
723 rinv22 = sse2_invsqrt_d(rsq22);
724 rinv23 = sse2_invsqrt_d(rsq23);
725 rinv31 = sse2_invsqrt_d(rsq31);
726 rinv32 = sse2_invsqrt_d(rsq32);
727 rinv33 = sse2_invsqrt_d(rsq33);
729 fjx1 = _mm_setzero_pd();
730 fjy1 = _mm_setzero_pd();
731 fjz1 = _mm_setzero_pd();
732 fjx2 = _mm_setzero_pd();
733 fjy2 = _mm_setzero_pd();
734 fjz2 = _mm_setzero_pd();
735 fjx3 = _mm_setzero_pd();
736 fjy3 = _mm_setzero_pd();
737 fjz3 = _mm_setzero_pd();
739 /**************************
740 * CALCULATE INTERACTIONS *
741 **************************/
743 r11 = _mm_mul_pd(rsq11,rinv11);
745 /* Calculate table index by multiplying r with table scale and truncate to integer */
746 rt = _mm_mul_pd(r11,vftabscale);
747 vfitab = _mm_cvttpd_epi32(rt);
748 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
749 vfitab = _mm_slli_epi32(vfitab,2);
751 /* CUBIC SPLINE TABLE ELECTROSTATICS */
752 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
753 F = _mm_setzero_pd();
754 GMX_MM_TRANSPOSE2_PD(Y,F);
755 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
756 H = _mm_setzero_pd();
757 GMX_MM_TRANSPOSE2_PD(G,H);
758 Heps = _mm_mul_pd(vfeps,H);
759 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
760 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
761 velec = _mm_mul_pd(qq11,VV);
762 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
763 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
765 /* Update potential sum for this i atom from the interaction with this j atom. */
766 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
767 velecsum = _mm_add_pd(velecsum,velec);
771 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
773 /* Calculate temporary vectorial force */
774 tx = _mm_mul_pd(fscal,dx11);
775 ty = _mm_mul_pd(fscal,dy11);
776 tz = _mm_mul_pd(fscal,dz11);
778 /* Update vectorial force */
779 fix1 = _mm_add_pd(fix1,tx);
780 fiy1 = _mm_add_pd(fiy1,ty);
781 fiz1 = _mm_add_pd(fiz1,tz);
783 fjx1 = _mm_add_pd(fjx1,tx);
784 fjy1 = _mm_add_pd(fjy1,ty);
785 fjz1 = _mm_add_pd(fjz1,tz);
787 /**************************
788 * CALCULATE INTERACTIONS *
789 **************************/
791 r12 = _mm_mul_pd(rsq12,rinv12);
793 /* Calculate table index by multiplying r with table scale and truncate to integer */
794 rt = _mm_mul_pd(r12,vftabscale);
795 vfitab = _mm_cvttpd_epi32(rt);
796 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
797 vfitab = _mm_slli_epi32(vfitab,2);
799 /* CUBIC SPLINE TABLE ELECTROSTATICS */
800 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
801 F = _mm_setzero_pd();
802 GMX_MM_TRANSPOSE2_PD(Y,F);
803 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
804 H = _mm_setzero_pd();
805 GMX_MM_TRANSPOSE2_PD(G,H);
806 Heps = _mm_mul_pd(vfeps,H);
807 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
808 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
809 velec = _mm_mul_pd(qq12,VV);
810 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
811 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
813 /* Update potential sum for this i atom from the interaction with this j atom. */
814 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
815 velecsum = _mm_add_pd(velecsum,velec);
819 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
821 /* Calculate temporary vectorial force */
822 tx = _mm_mul_pd(fscal,dx12);
823 ty = _mm_mul_pd(fscal,dy12);
824 tz = _mm_mul_pd(fscal,dz12);
826 /* Update vectorial force */
827 fix1 = _mm_add_pd(fix1,tx);
828 fiy1 = _mm_add_pd(fiy1,ty);
829 fiz1 = _mm_add_pd(fiz1,tz);
831 fjx2 = _mm_add_pd(fjx2,tx);
832 fjy2 = _mm_add_pd(fjy2,ty);
833 fjz2 = _mm_add_pd(fjz2,tz);
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 r13 = _mm_mul_pd(rsq13,rinv13);
841 /* Calculate table index by multiplying r with table scale and truncate to integer */
842 rt = _mm_mul_pd(r13,vftabscale);
843 vfitab = _mm_cvttpd_epi32(rt);
844 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
845 vfitab = _mm_slli_epi32(vfitab,2);
847 /* CUBIC SPLINE TABLE ELECTROSTATICS */
848 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
849 F = _mm_setzero_pd();
850 GMX_MM_TRANSPOSE2_PD(Y,F);
851 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
852 H = _mm_setzero_pd();
853 GMX_MM_TRANSPOSE2_PD(G,H);
854 Heps = _mm_mul_pd(vfeps,H);
855 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
856 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
857 velec = _mm_mul_pd(qq13,VV);
858 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
859 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
861 /* Update potential sum for this i atom from the interaction with this j atom. */
862 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
863 velecsum = _mm_add_pd(velecsum,velec);
867 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
869 /* Calculate temporary vectorial force */
870 tx = _mm_mul_pd(fscal,dx13);
871 ty = _mm_mul_pd(fscal,dy13);
872 tz = _mm_mul_pd(fscal,dz13);
874 /* Update vectorial force */
875 fix1 = _mm_add_pd(fix1,tx);
876 fiy1 = _mm_add_pd(fiy1,ty);
877 fiz1 = _mm_add_pd(fiz1,tz);
879 fjx3 = _mm_add_pd(fjx3,tx);
880 fjy3 = _mm_add_pd(fjy3,ty);
881 fjz3 = _mm_add_pd(fjz3,tz);
883 /**************************
884 * CALCULATE INTERACTIONS *
885 **************************/
887 r21 = _mm_mul_pd(rsq21,rinv21);
889 /* Calculate table index by multiplying r with table scale and truncate to integer */
890 rt = _mm_mul_pd(r21,vftabscale);
891 vfitab = _mm_cvttpd_epi32(rt);
892 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
893 vfitab = _mm_slli_epi32(vfitab,2);
895 /* CUBIC SPLINE TABLE ELECTROSTATICS */
896 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
897 F = _mm_setzero_pd();
898 GMX_MM_TRANSPOSE2_PD(Y,F);
899 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
900 H = _mm_setzero_pd();
901 GMX_MM_TRANSPOSE2_PD(G,H);
902 Heps = _mm_mul_pd(vfeps,H);
903 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
904 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
905 velec = _mm_mul_pd(qq21,VV);
906 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
907 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
909 /* Update potential sum for this i atom from the interaction with this j atom. */
910 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
911 velecsum = _mm_add_pd(velecsum,velec);
915 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
917 /* Calculate temporary vectorial force */
918 tx = _mm_mul_pd(fscal,dx21);
919 ty = _mm_mul_pd(fscal,dy21);
920 tz = _mm_mul_pd(fscal,dz21);
922 /* Update vectorial force */
923 fix2 = _mm_add_pd(fix2,tx);
924 fiy2 = _mm_add_pd(fiy2,ty);
925 fiz2 = _mm_add_pd(fiz2,tz);
927 fjx1 = _mm_add_pd(fjx1,tx);
928 fjy1 = _mm_add_pd(fjy1,ty);
929 fjz1 = _mm_add_pd(fjz1,tz);
931 /**************************
932 * CALCULATE INTERACTIONS *
933 **************************/
935 r22 = _mm_mul_pd(rsq22,rinv22);
937 /* Calculate table index by multiplying r with table scale and truncate to integer */
938 rt = _mm_mul_pd(r22,vftabscale);
939 vfitab = _mm_cvttpd_epi32(rt);
940 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
941 vfitab = _mm_slli_epi32(vfitab,2);
943 /* CUBIC SPLINE TABLE ELECTROSTATICS */
944 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
945 F = _mm_setzero_pd();
946 GMX_MM_TRANSPOSE2_PD(Y,F);
947 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
948 H = _mm_setzero_pd();
949 GMX_MM_TRANSPOSE2_PD(G,H);
950 Heps = _mm_mul_pd(vfeps,H);
951 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
952 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
953 velec = _mm_mul_pd(qq22,VV);
954 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
955 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
957 /* Update potential sum for this i atom from the interaction with this j atom. */
958 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
959 velecsum = _mm_add_pd(velecsum,velec);
963 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
965 /* Calculate temporary vectorial force */
966 tx = _mm_mul_pd(fscal,dx22);
967 ty = _mm_mul_pd(fscal,dy22);
968 tz = _mm_mul_pd(fscal,dz22);
970 /* Update vectorial force */
971 fix2 = _mm_add_pd(fix2,tx);
972 fiy2 = _mm_add_pd(fiy2,ty);
973 fiz2 = _mm_add_pd(fiz2,tz);
975 fjx2 = _mm_add_pd(fjx2,tx);
976 fjy2 = _mm_add_pd(fjy2,ty);
977 fjz2 = _mm_add_pd(fjz2,tz);
979 /**************************
980 * CALCULATE INTERACTIONS *
981 **************************/
983 r23 = _mm_mul_pd(rsq23,rinv23);
985 /* Calculate table index by multiplying r with table scale and truncate to integer */
986 rt = _mm_mul_pd(r23,vftabscale);
987 vfitab = _mm_cvttpd_epi32(rt);
988 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
989 vfitab = _mm_slli_epi32(vfitab,2);
991 /* CUBIC SPLINE TABLE ELECTROSTATICS */
992 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
993 F = _mm_setzero_pd();
994 GMX_MM_TRANSPOSE2_PD(Y,F);
995 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
996 H = _mm_setzero_pd();
997 GMX_MM_TRANSPOSE2_PD(G,H);
998 Heps = _mm_mul_pd(vfeps,H);
999 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1000 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1001 velec = _mm_mul_pd(qq23,VV);
1002 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1003 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1005 /* Update potential sum for this i atom from the interaction with this j atom. */
1006 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1007 velecsum = _mm_add_pd(velecsum,velec);
1011 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1013 /* Calculate temporary vectorial force */
1014 tx = _mm_mul_pd(fscal,dx23);
1015 ty = _mm_mul_pd(fscal,dy23);
1016 tz = _mm_mul_pd(fscal,dz23);
1018 /* Update vectorial force */
1019 fix2 = _mm_add_pd(fix2,tx);
1020 fiy2 = _mm_add_pd(fiy2,ty);
1021 fiz2 = _mm_add_pd(fiz2,tz);
1023 fjx3 = _mm_add_pd(fjx3,tx);
1024 fjy3 = _mm_add_pd(fjy3,ty);
1025 fjz3 = _mm_add_pd(fjz3,tz);
1027 /**************************
1028 * CALCULATE INTERACTIONS *
1029 **************************/
1031 r31 = _mm_mul_pd(rsq31,rinv31);
1033 /* Calculate table index by multiplying r with table scale and truncate to integer */
1034 rt = _mm_mul_pd(r31,vftabscale);
1035 vfitab = _mm_cvttpd_epi32(rt);
1036 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1037 vfitab = _mm_slli_epi32(vfitab,2);
1039 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1040 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1041 F = _mm_setzero_pd();
1042 GMX_MM_TRANSPOSE2_PD(Y,F);
1043 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1044 H = _mm_setzero_pd();
1045 GMX_MM_TRANSPOSE2_PD(G,H);
1046 Heps = _mm_mul_pd(vfeps,H);
1047 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1048 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1049 velec = _mm_mul_pd(qq31,VV);
1050 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1051 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1053 /* Update potential sum for this i atom from the interaction with this j atom. */
1054 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1055 velecsum = _mm_add_pd(velecsum,velec);
1059 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1061 /* Calculate temporary vectorial force */
1062 tx = _mm_mul_pd(fscal,dx31);
1063 ty = _mm_mul_pd(fscal,dy31);
1064 tz = _mm_mul_pd(fscal,dz31);
1066 /* Update vectorial force */
1067 fix3 = _mm_add_pd(fix3,tx);
1068 fiy3 = _mm_add_pd(fiy3,ty);
1069 fiz3 = _mm_add_pd(fiz3,tz);
1071 fjx1 = _mm_add_pd(fjx1,tx);
1072 fjy1 = _mm_add_pd(fjy1,ty);
1073 fjz1 = _mm_add_pd(fjz1,tz);
1075 /**************************
1076 * CALCULATE INTERACTIONS *
1077 **************************/
1079 r32 = _mm_mul_pd(rsq32,rinv32);
1081 /* Calculate table index by multiplying r with table scale and truncate to integer */
1082 rt = _mm_mul_pd(r32,vftabscale);
1083 vfitab = _mm_cvttpd_epi32(rt);
1084 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1085 vfitab = _mm_slli_epi32(vfitab,2);
1087 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1088 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1089 F = _mm_setzero_pd();
1090 GMX_MM_TRANSPOSE2_PD(Y,F);
1091 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1092 H = _mm_setzero_pd();
1093 GMX_MM_TRANSPOSE2_PD(G,H);
1094 Heps = _mm_mul_pd(vfeps,H);
1095 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1096 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1097 velec = _mm_mul_pd(qq32,VV);
1098 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1099 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1101 /* Update potential sum for this i atom from the interaction with this j atom. */
1102 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1103 velecsum = _mm_add_pd(velecsum,velec);
1107 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1109 /* Calculate temporary vectorial force */
1110 tx = _mm_mul_pd(fscal,dx32);
1111 ty = _mm_mul_pd(fscal,dy32);
1112 tz = _mm_mul_pd(fscal,dz32);
1114 /* Update vectorial force */
1115 fix3 = _mm_add_pd(fix3,tx);
1116 fiy3 = _mm_add_pd(fiy3,ty);
1117 fiz3 = _mm_add_pd(fiz3,tz);
1119 fjx2 = _mm_add_pd(fjx2,tx);
1120 fjy2 = _mm_add_pd(fjy2,ty);
1121 fjz2 = _mm_add_pd(fjz2,tz);
1123 /**************************
1124 * CALCULATE INTERACTIONS *
1125 **************************/
1127 r33 = _mm_mul_pd(rsq33,rinv33);
1129 /* Calculate table index by multiplying r with table scale and truncate to integer */
1130 rt = _mm_mul_pd(r33,vftabscale);
1131 vfitab = _mm_cvttpd_epi32(rt);
1132 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1133 vfitab = _mm_slli_epi32(vfitab,2);
1135 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1136 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1137 F = _mm_setzero_pd();
1138 GMX_MM_TRANSPOSE2_PD(Y,F);
1139 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1140 H = _mm_setzero_pd();
1141 GMX_MM_TRANSPOSE2_PD(G,H);
1142 Heps = _mm_mul_pd(vfeps,H);
1143 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1144 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1145 velec = _mm_mul_pd(qq33,VV);
1146 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1147 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1149 /* Update potential sum for this i atom from the interaction with this j atom. */
1150 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1151 velecsum = _mm_add_pd(velecsum,velec);
1155 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1157 /* Calculate temporary vectorial force */
1158 tx = _mm_mul_pd(fscal,dx33);
1159 ty = _mm_mul_pd(fscal,dy33);
1160 tz = _mm_mul_pd(fscal,dz33);
1162 /* Update vectorial force */
1163 fix3 = _mm_add_pd(fix3,tx);
1164 fiy3 = _mm_add_pd(fiy3,ty);
1165 fiz3 = _mm_add_pd(fiz3,tz);
1167 fjx3 = _mm_add_pd(fjx3,tx);
1168 fjy3 = _mm_add_pd(fjy3,ty);
1169 fjz3 = _mm_add_pd(fjz3,tz);
1171 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1173 /* Inner loop uses 387 flops */
1176 /* End of innermost loop */
1178 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1179 f+i_coord_offset+DIM,fshift+i_shift_offset);
1182 /* Update potential energies */
1183 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1185 /* Increment number of inner iterations */
1186 inneriter += j_index_end - j_index_start;
1188 /* Outer loop uses 19 flops */
1191 /* Increment number of outer iterations */
1194 /* Update outer/inner flops */
1196 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*387);
1199 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_double
1200 * Electrostatics interaction: CubicSplineTable
1201 * VdW interaction: None
1202 * Geometry: Water4-Water4
1203 * Calculate force/pot: Force
1206 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse2_double
1207 (t_nblist * gmx_restrict nlist,
1208 rvec * gmx_restrict xx,
1209 rvec * gmx_restrict ff,
1210 struct t_forcerec * gmx_restrict fr,
1211 t_mdatoms * gmx_restrict mdatoms,
1212 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1213 t_nrnb * gmx_restrict nrnb)
1215 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1216 * just 0 for non-waters.
1217 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1218 * jnr indices corresponding to data put in the four positions in the SIMD register.
1220 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1221 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1223 int j_coord_offsetA,j_coord_offsetB;
1224 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1225 real rcutoff_scalar;
1226 real *shiftvec,*fshift,*x,*f;
1227 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1229 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1231 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1233 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1234 int vdwjidx1A,vdwjidx1B;
1235 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1236 int vdwjidx2A,vdwjidx2B;
1237 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1238 int vdwjidx3A,vdwjidx3B;
1239 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1240 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1241 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1242 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1243 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1244 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1245 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1246 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1247 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1248 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1249 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1252 __m128i ifour = _mm_set1_epi32(4);
1253 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1255 __m128d dummy_mask,cutoff_mask;
1256 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1257 __m128d one = _mm_set1_pd(1.0);
1258 __m128d two = _mm_set1_pd(2.0);
1264 jindex = nlist->jindex;
1266 shiftidx = nlist->shift;
1268 shiftvec = fr->shift_vec[0];
1269 fshift = fr->fshift[0];
1270 facel = _mm_set1_pd(fr->ic->epsfac);
1271 charge = mdatoms->chargeA;
1273 vftab = kernel_data->table_elec->data;
1274 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1276 /* Setup water-specific parameters */
1277 inr = nlist->iinr[0];
1278 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1279 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1280 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1282 jq1 = _mm_set1_pd(charge[inr+1]);
1283 jq2 = _mm_set1_pd(charge[inr+2]);
1284 jq3 = _mm_set1_pd(charge[inr+3]);
1285 qq11 = _mm_mul_pd(iq1,jq1);
1286 qq12 = _mm_mul_pd(iq1,jq2);
1287 qq13 = _mm_mul_pd(iq1,jq3);
1288 qq21 = _mm_mul_pd(iq2,jq1);
1289 qq22 = _mm_mul_pd(iq2,jq2);
1290 qq23 = _mm_mul_pd(iq2,jq3);
1291 qq31 = _mm_mul_pd(iq3,jq1);
1292 qq32 = _mm_mul_pd(iq3,jq2);
1293 qq33 = _mm_mul_pd(iq3,jq3);
1295 /* Avoid stupid compiler warnings */
1297 j_coord_offsetA = 0;
1298 j_coord_offsetB = 0;
1303 /* Start outer loop over neighborlists */
1304 for(iidx=0; iidx<nri; iidx++)
1306 /* Load shift vector for this list */
1307 i_shift_offset = DIM*shiftidx[iidx];
1309 /* Load limits for loop over neighbors */
1310 j_index_start = jindex[iidx];
1311 j_index_end = jindex[iidx+1];
1313 /* Get outer coordinate index */
1315 i_coord_offset = DIM*inr;
1317 /* Load i particle coords and add shift vector */
1318 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1319 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1321 fix1 = _mm_setzero_pd();
1322 fiy1 = _mm_setzero_pd();
1323 fiz1 = _mm_setzero_pd();
1324 fix2 = _mm_setzero_pd();
1325 fiy2 = _mm_setzero_pd();
1326 fiz2 = _mm_setzero_pd();
1327 fix3 = _mm_setzero_pd();
1328 fiy3 = _mm_setzero_pd();
1329 fiz3 = _mm_setzero_pd();
1331 /* Start inner kernel loop */
1332 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1335 /* Get j neighbor index, and coordinate index */
1337 jnrB = jjnr[jidx+1];
1338 j_coord_offsetA = DIM*jnrA;
1339 j_coord_offsetB = DIM*jnrB;
1341 /* load j atom coordinates */
1342 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1343 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1345 /* Calculate displacement vector */
1346 dx11 = _mm_sub_pd(ix1,jx1);
1347 dy11 = _mm_sub_pd(iy1,jy1);
1348 dz11 = _mm_sub_pd(iz1,jz1);
1349 dx12 = _mm_sub_pd(ix1,jx2);
1350 dy12 = _mm_sub_pd(iy1,jy2);
1351 dz12 = _mm_sub_pd(iz1,jz2);
1352 dx13 = _mm_sub_pd(ix1,jx3);
1353 dy13 = _mm_sub_pd(iy1,jy3);
1354 dz13 = _mm_sub_pd(iz1,jz3);
1355 dx21 = _mm_sub_pd(ix2,jx1);
1356 dy21 = _mm_sub_pd(iy2,jy1);
1357 dz21 = _mm_sub_pd(iz2,jz1);
1358 dx22 = _mm_sub_pd(ix2,jx2);
1359 dy22 = _mm_sub_pd(iy2,jy2);
1360 dz22 = _mm_sub_pd(iz2,jz2);
1361 dx23 = _mm_sub_pd(ix2,jx3);
1362 dy23 = _mm_sub_pd(iy2,jy3);
1363 dz23 = _mm_sub_pd(iz2,jz3);
1364 dx31 = _mm_sub_pd(ix3,jx1);
1365 dy31 = _mm_sub_pd(iy3,jy1);
1366 dz31 = _mm_sub_pd(iz3,jz1);
1367 dx32 = _mm_sub_pd(ix3,jx2);
1368 dy32 = _mm_sub_pd(iy3,jy2);
1369 dz32 = _mm_sub_pd(iz3,jz2);
1370 dx33 = _mm_sub_pd(ix3,jx3);
1371 dy33 = _mm_sub_pd(iy3,jy3);
1372 dz33 = _mm_sub_pd(iz3,jz3);
1374 /* Calculate squared distance and things based on it */
1375 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1376 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1377 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1378 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1379 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1380 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1381 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1382 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1383 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1385 rinv11 = sse2_invsqrt_d(rsq11);
1386 rinv12 = sse2_invsqrt_d(rsq12);
1387 rinv13 = sse2_invsqrt_d(rsq13);
1388 rinv21 = sse2_invsqrt_d(rsq21);
1389 rinv22 = sse2_invsqrt_d(rsq22);
1390 rinv23 = sse2_invsqrt_d(rsq23);
1391 rinv31 = sse2_invsqrt_d(rsq31);
1392 rinv32 = sse2_invsqrt_d(rsq32);
1393 rinv33 = sse2_invsqrt_d(rsq33);
1395 fjx1 = _mm_setzero_pd();
1396 fjy1 = _mm_setzero_pd();
1397 fjz1 = _mm_setzero_pd();
1398 fjx2 = _mm_setzero_pd();
1399 fjy2 = _mm_setzero_pd();
1400 fjz2 = _mm_setzero_pd();
1401 fjx3 = _mm_setzero_pd();
1402 fjy3 = _mm_setzero_pd();
1403 fjz3 = _mm_setzero_pd();
1405 /**************************
1406 * CALCULATE INTERACTIONS *
1407 **************************/
1409 r11 = _mm_mul_pd(rsq11,rinv11);
1411 /* Calculate table index by multiplying r with table scale and truncate to integer */
1412 rt = _mm_mul_pd(r11,vftabscale);
1413 vfitab = _mm_cvttpd_epi32(rt);
1414 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1415 vfitab = _mm_slli_epi32(vfitab,2);
1417 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1418 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1419 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1420 GMX_MM_TRANSPOSE2_PD(Y,F);
1421 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1422 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1423 GMX_MM_TRANSPOSE2_PD(G,H);
1424 Heps = _mm_mul_pd(vfeps,H);
1425 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1426 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1427 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1431 /* Calculate temporary vectorial force */
1432 tx = _mm_mul_pd(fscal,dx11);
1433 ty = _mm_mul_pd(fscal,dy11);
1434 tz = _mm_mul_pd(fscal,dz11);
1436 /* Update vectorial force */
1437 fix1 = _mm_add_pd(fix1,tx);
1438 fiy1 = _mm_add_pd(fiy1,ty);
1439 fiz1 = _mm_add_pd(fiz1,tz);
1441 fjx1 = _mm_add_pd(fjx1,tx);
1442 fjy1 = _mm_add_pd(fjy1,ty);
1443 fjz1 = _mm_add_pd(fjz1,tz);
1445 /**************************
1446 * CALCULATE INTERACTIONS *
1447 **************************/
1449 r12 = _mm_mul_pd(rsq12,rinv12);
1451 /* Calculate table index by multiplying r with table scale and truncate to integer */
1452 rt = _mm_mul_pd(r12,vftabscale);
1453 vfitab = _mm_cvttpd_epi32(rt);
1454 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1455 vfitab = _mm_slli_epi32(vfitab,2);
1457 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1458 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1459 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1460 GMX_MM_TRANSPOSE2_PD(Y,F);
1461 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1462 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1463 GMX_MM_TRANSPOSE2_PD(G,H);
1464 Heps = _mm_mul_pd(vfeps,H);
1465 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1466 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1467 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1471 /* Calculate temporary vectorial force */
1472 tx = _mm_mul_pd(fscal,dx12);
1473 ty = _mm_mul_pd(fscal,dy12);
1474 tz = _mm_mul_pd(fscal,dz12);
1476 /* Update vectorial force */
1477 fix1 = _mm_add_pd(fix1,tx);
1478 fiy1 = _mm_add_pd(fiy1,ty);
1479 fiz1 = _mm_add_pd(fiz1,tz);
1481 fjx2 = _mm_add_pd(fjx2,tx);
1482 fjy2 = _mm_add_pd(fjy2,ty);
1483 fjz2 = _mm_add_pd(fjz2,tz);
1485 /**************************
1486 * CALCULATE INTERACTIONS *
1487 **************************/
1489 r13 = _mm_mul_pd(rsq13,rinv13);
1491 /* Calculate table index by multiplying r with table scale and truncate to integer */
1492 rt = _mm_mul_pd(r13,vftabscale);
1493 vfitab = _mm_cvttpd_epi32(rt);
1494 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1495 vfitab = _mm_slli_epi32(vfitab,2);
1497 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1498 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1499 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1500 GMX_MM_TRANSPOSE2_PD(Y,F);
1501 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1502 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1503 GMX_MM_TRANSPOSE2_PD(G,H);
1504 Heps = _mm_mul_pd(vfeps,H);
1505 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1506 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1507 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1511 /* Calculate temporary vectorial force */
1512 tx = _mm_mul_pd(fscal,dx13);
1513 ty = _mm_mul_pd(fscal,dy13);
1514 tz = _mm_mul_pd(fscal,dz13);
1516 /* Update vectorial force */
1517 fix1 = _mm_add_pd(fix1,tx);
1518 fiy1 = _mm_add_pd(fiy1,ty);
1519 fiz1 = _mm_add_pd(fiz1,tz);
1521 fjx3 = _mm_add_pd(fjx3,tx);
1522 fjy3 = _mm_add_pd(fjy3,ty);
1523 fjz3 = _mm_add_pd(fjz3,tz);
1525 /**************************
1526 * CALCULATE INTERACTIONS *
1527 **************************/
1529 r21 = _mm_mul_pd(rsq21,rinv21);
1531 /* Calculate table index by multiplying r with table scale and truncate to integer */
1532 rt = _mm_mul_pd(r21,vftabscale);
1533 vfitab = _mm_cvttpd_epi32(rt);
1534 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1535 vfitab = _mm_slli_epi32(vfitab,2);
1537 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1538 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1539 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1540 GMX_MM_TRANSPOSE2_PD(Y,F);
1541 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1542 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1543 GMX_MM_TRANSPOSE2_PD(G,H);
1544 Heps = _mm_mul_pd(vfeps,H);
1545 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1546 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1547 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1551 /* Calculate temporary vectorial force */
1552 tx = _mm_mul_pd(fscal,dx21);
1553 ty = _mm_mul_pd(fscal,dy21);
1554 tz = _mm_mul_pd(fscal,dz21);
1556 /* Update vectorial force */
1557 fix2 = _mm_add_pd(fix2,tx);
1558 fiy2 = _mm_add_pd(fiy2,ty);
1559 fiz2 = _mm_add_pd(fiz2,tz);
1561 fjx1 = _mm_add_pd(fjx1,tx);
1562 fjy1 = _mm_add_pd(fjy1,ty);
1563 fjz1 = _mm_add_pd(fjz1,tz);
1565 /**************************
1566 * CALCULATE INTERACTIONS *
1567 **************************/
1569 r22 = _mm_mul_pd(rsq22,rinv22);
1571 /* Calculate table index by multiplying r with table scale and truncate to integer */
1572 rt = _mm_mul_pd(r22,vftabscale);
1573 vfitab = _mm_cvttpd_epi32(rt);
1574 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1575 vfitab = _mm_slli_epi32(vfitab,2);
1577 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1578 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1579 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1580 GMX_MM_TRANSPOSE2_PD(Y,F);
1581 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1582 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1583 GMX_MM_TRANSPOSE2_PD(G,H);
1584 Heps = _mm_mul_pd(vfeps,H);
1585 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1586 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1587 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1591 /* Calculate temporary vectorial force */
1592 tx = _mm_mul_pd(fscal,dx22);
1593 ty = _mm_mul_pd(fscal,dy22);
1594 tz = _mm_mul_pd(fscal,dz22);
1596 /* Update vectorial force */
1597 fix2 = _mm_add_pd(fix2,tx);
1598 fiy2 = _mm_add_pd(fiy2,ty);
1599 fiz2 = _mm_add_pd(fiz2,tz);
1601 fjx2 = _mm_add_pd(fjx2,tx);
1602 fjy2 = _mm_add_pd(fjy2,ty);
1603 fjz2 = _mm_add_pd(fjz2,tz);
1605 /**************************
1606 * CALCULATE INTERACTIONS *
1607 **************************/
1609 r23 = _mm_mul_pd(rsq23,rinv23);
1611 /* Calculate table index by multiplying r with table scale and truncate to integer */
1612 rt = _mm_mul_pd(r23,vftabscale);
1613 vfitab = _mm_cvttpd_epi32(rt);
1614 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1615 vfitab = _mm_slli_epi32(vfitab,2);
1617 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1618 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1619 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1620 GMX_MM_TRANSPOSE2_PD(Y,F);
1621 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1622 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1623 GMX_MM_TRANSPOSE2_PD(G,H);
1624 Heps = _mm_mul_pd(vfeps,H);
1625 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1626 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1627 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1631 /* Calculate temporary vectorial force */
1632 tx = _mm_mul_pd(fscal,dx23);
1633 ty = _mm_mul_pd(fscal,dy23);
1634 tz = _mm_mul_pd(fscal,dz23);
1636 /* Update vectorial force */
1637 fix2 = _mm_add_pd(fix2,tx);
1638 fiy2 = _mm_add_pd(fiy2,ty);
1639 fiz2 = _mm_add_pd(fiz2,tz);
1641 fjx3 = _mm_add_pd(fjx3,tx);
1642 fjy3 = _mm_add_pd(fjy3,ty);
1643 fjz3 = _mm_add_pd(fjz3,tz);
1645 /**************************
1646 * CALCULATE INTERACTIONS *
1647 **************************/
1649 r31 = _mm_mul_pd(rsq31,rinv31);
1651 /* Calculate table index by multiplying r with table scale and truncate to integer */
1652 rt = _mm_mul_pd(r31,vftabscale);
1653 vfitab = _mm_cvttpd_epi32(rt);
1654 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1655 vfitab = _mm_slli_epi32(vfitab,2);
1657 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1658 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1659 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1660 GMX_MM_TRANSPOSE2_PD(Y,F);
1661 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1662 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1663 GMX_MM_TRANSPOSE2_PD(G,H);
1664 Heps = _mm_mul_pd(vfeps,H);
1665 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1666 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1667 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1671 /* Calculate temporary vectorial force */
1672 tx = _mm_mul_pd(fscal,dx31);
1673 ty = _mm_mul_pd(fscal,dy31);
1674 tz = _mm_mul_pd(fscal,dz31);
1676 /* Update vectorial force */
1677 fix3 = _mm_add_pd(fix3,tx);
1678 fiy3 = _mm_add_pd(fiy3,ty);
1679 fiz3 = _mm_add_pd(fiz3,tz);
1681 fjx1 = _mm_add_pd(fjx1,tx);
1682 fjy1 = _mm_add_pd(fjy1,ty);
1683 fjz1 = _mm_add_pd(fjz1,tz);
1685 /**************************
1686 * CALCULATE INTERACTIONS *
1687 **************************/
1689 r32 = _mm_mul_pd(rsq32,rinv32);
1691 /* Calculate table index by multiplying r with table scale and truncate to integer */
1692 rt = _mm_mul_pd(r32,vftabscale);
1693 vfitab = _mm_cvttpd_epi32(rt);
1694 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1695 vfitab = _mm_slli_epi32(vfitab,2);
1697 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1698 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1699 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1700 GMX_MM_TRANSPOSE2_PD(Y,F);
1701 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1702 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1703 GMX_MM_TRANSPOSE2_PD(G,H);
1704 Heps = _mm_mul_pd(vfeps,H);
1705 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1706 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1707 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1711 /* Calculate temporary vectorial force */
1712 tx = _mm_mul_pd(fscal,dx32);
1713 ty = _mm_mul_pd(fscal,dy32);
1714 tz = _mm_mul_pd(fscal,dz32);
1716 /* Update vectorial force */
1717 fix3 = _mm_add_pd(fix3,tx);
1718 fiy3 = _mm_add_pd(fiy3,ty);
1719 fiz3 = _mm_add_pd(fiz3,tz);
1721 fjx2 = _mm_add_pd(fjx2,tx);
1722 fjy2 = _mm_add_pd(fjy2,ty);
1723 fjz2 = _mm_add_pd(fjz2,tz);
1725 /**************************
1726 * CALCULATE INTERACTIONS *
1727 **************************/
1729 r33 = _mm_mul_pd(rsq33,rinv33);
1731 /* Calculate table index by multiplying r with table scale and truncate to integer */
1732 rt = _mm_mul_pd(r33,vftabscale);
1733 vfitab = _mm_cvttpd_epi32(rt);
1734 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1735 vfitab = _mm_slli_epi32(vfitab,2);
1737 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1738 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1739 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1740 GMX_MM_TRANSPOSE2_PD(Y,F);
1741 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1742 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1743 GMX_MM_TRANSPOSE2_PD(G,H);
1744 Heps = _mm_mul_pd(vfeps,H);
1745 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1746 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1747 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1751 /* Calculate temporary vectorial force */
1752 tx = _mm_mul_pd(fscal,dx33);
1753 ty = _mm_mul_pd(fscal,dy33);
1754 tz = _mm_mul_pd(fscal,dz33);
1756 /* Update vectorial force */
1757 fix3 = _mm_add_pd(fix3,tx);
1758 fiy3 = _mm_add_pd(fiy3,ty);
1759 fiz3 = _mm_add_pd(fiz3,tz);
1761 fjx3 = _mm_add_pd(fjx3,tx);
1762 fjy3 = _mm_add_pd(fjy3,ty);
1763 fjz3 = _mm_add_pd(fjz3,tz);
1765 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1767 /* Inner loop uses 351 flops */
1770 if(jidx<j_index_end)
1774 j_coord_offsetA = DIM*jnrA;
1776 /* load j atom coordinates */
1777 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
1778 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1780 /* Calculate displacement vector */
1781 dx11 = _mm_sub_pd(ix1,jx1);
1782 dy11 = _mm_sub_pd(iy1,jy1);
1783 dz11 = _mm_sub_pd(iz1,jz1);
1784 dx12 = _mm_sub_pd(ix1,jx2);
1785 dy12 = _mm_sub_pd(iy1,jy2);
1786 dz12 = _mm_sub_pd(iz1,jz2);
1787 dx13 = _mm_sub_pd(ix1,jx3);
1788 dy13 = _mm_sub_pd(iy1,jy3);
1789 dz13 = _mm_sub_pd(iz1,jz3);
1790 dx21 = _mm_sub_pd(ix2,jx1);
1791 dy21 = _mm_sub_pd(iy2,jy1);
1792 dz21 = _mm_sub_pd(iz2,jz1);
1793 dx22 = _mm_sub_pd(ix2,jx2);
1794 dy22 = _mm_sub_pd(iy2,jy2);
1795 dz22 = _mm_sub_pd(iz2,jz2);
1796 dx23 = _mm_sub_pd(ix2,jx3);
1797 dy23 = _mm_sub_pd(iy2,jy3);
1798 dz23 = _mm_sub_pd(iz2,jz3);
1799 dx31 = _mm_sub_pd(ix3,jx1);
1800 dy31 = _mm_sub_pd(iy3,jy1);
1801 dz31 = _mm_sub_pd(iz3,jz1);
1802 dx32 = _mm_sub_pd(ix3,jx2);
1803 dy32 = _mm_sub_pd(iy3,jy2);
1804 dz32 = _mm_sub_pd(iz3,jz2);
1805 dx33 = _mm_sub_pd(ix3,jx3);
1806 dy33 = _mm_sub_pd(iy3,jy3);
1807 dz33 = _mm_sub_pd(iz3,jz3);
1809 /* Calculate squared distance and things based on it */
1810 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1811 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1812 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1813 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1814 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1815 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1816 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1817 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1818 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1820 rinv11 = sse2_invsqrt_d(rsq11);
1821 rinv12 = sse2_invsqrt_d(rsq12);
1822 rinv13 = sse2_invsqrt_d(rsq13);
1823 rinv21 = sse2_invsqrt_d(rsq21);
1824 rinv22 = sse2_invsqrt_d(rsq22);
1825 rinv23 = sse2_invsqrt_d(rsq23);
1826 rinv31 = sse2_invsqrt_d(rsq31);
1827 rinv32 = sse2_invsqrt_d(rsq32);
1828 rinv33 = sse2_invsqrt_d(rsq33);
1830 fjx1 = _mm_setzero_pd();
1831 fjy1 = _mm_setzero_pd();
1832 fjz1 = _mm_setzero_pd();
1833 fjx2 = _mm_setzero_pd();
1834 fjy2 = _mm_setzero_pd();
1835 fjz2 = _mm_setzero_pd();
1836 fjx3 = _mm_setzero_pd();
1837 fjy3 = _mm_setzero_pd();
1838 fjz3 = _mm_setzero_pd();
1840 /**************************
1841 * CALCULATE INTERACTIONS *
1842 **************************/
1844 r11 = _mm_mul_pd(rsq11,rinv11);
1846 /* Calculate table index by multiplying r with table scale and truncate to integer */
1847 rt = _mm_mul_pd(r11,vftabscale);
1848 vfitab = _mm_cvttpd_epi32(rt);
1849 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1850 vfitab = _mm_slli_epi32(vfitab,2);
1852 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1853 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1854 F = _mm_setzero_pd();
1855 GMX_MM_TRANSPOSE2_PD(Y,F);
1856 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1857 H = _mm_setzero_pd();
1858 GMX_MM_TRANSPOSE2_PD(G,H);
1859 Heps = _mm_mul_pd(vfeps,H);
1860 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1861 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1862 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1866 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1868 /* Calculate temporary vectorial force */
1869 tx = _mm_mul_pd(fscal,dx11);
1870 ty = _mm_mul_pd(fscal,dy11);
1871 tz = _mm_mul_pd(fscal,dz11);
1873 /* Update vectorial force */
1874 fix1 = _mm_add_pd(fix1,tx);
1875 fiy1 = _mm_add_pd(fiy1,ty);
1876 fiz1 = _mm_add_pd(fiz1,tz);
1878 fjx1 = _mm_add_pd(fjx1,tx);
1879 fjy1 = _mm_add_pd(fjy1,ty);
1880 fjz1 = _mm_add_pd(fjz1,tz);
1882 /**************************
1883 * CALCULATE INTERACTIONS *
1884 **************************/
1886 r12 = _mm_mul_pd(rsq12,rinv12);
1888 /* Calculate table index by multiplying r with table scale and truncate to integer */
1889 rt = _mm_mul_pd(r12,vftabscale);
1890 vfitab = _mm_cvttpd_epi32(rt);
1891 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1892 vfitab = _mm_slli_epi32(vfitab,2);
1894 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1895 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1896 F = _mm_setzero_pd();
1897 GMX_MM_TRANSPOSE2_PD(Y,F);
1898 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1899 H = _mm_setzero_pd();
1900 GMX_MM_TRANSPOSE2_PD(G,H);
1901 Heps = _mm_mul_pd(vfeps,H);
1902 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1903 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1904 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1908 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1910 /* Calculate temporary vectorial force */
1911 tx = _mm_mul_pd(fscal,dx12);
1912 ty = _mm_mul_pd(fscal,dy12);
1913 tz = _mm_mul_pd(fscal,dz12);
1915 /* Update vectorial force */
1916 fix1 = _mm_add_pd(fix1,tx);
1917 fiy1 = _mm_add_pd(fiy1,ty);
1918 fiz1 = _mm_add_pd(fiz1,tz);
1920 fjx2 = _mm_add_pd(fjx2,tx);
1921 fjy2 = _mm_add_pd(fjy2,ty);
1922 fjz2 = _mm_add_pd(fjz2,tz);
1924 /**************************
1925 * CALCULATE INTERACTIONS *
1926 **************************/
1928 r13 = _mm_mul_pd(rsq13,rinv13);
1930 /* Calculate table index by multiplying r with table scale and truncate to integer */
1931 rt = _mm_mul_pd(r13,vftabscale);
1932 vfitab = _mm_cvttpd_epi32(rt);
1933 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1934 vfitab = _mm_slli_epi32(vfitab,2);
1936 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1937 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1938 F = _mm_setzero_pd();
1939 GMX_MM_TRANSPOSE2_PD(Y,F);
1940 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1941 H = _mm_setzero_pd();
1942 GMX_MM_TRANSPOSE2_PD(G,H);
1943 Heps = _mm_mul_pd(vfeps,H);
1944 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1945 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1946 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1950 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1952 /* Calculate temporary vectorial force */
1953 tx = _mm_mul_pd(fscal,dx13);
1954 ty = _mm_mul_pd(fscal,dy13);
1955 tz = _mm_mul_pd(fscal,dz13);
1957 /* Update vectorial force */
1958 fix1 = _mm_add_pd(fix1,tx);
1959 fiy1 = _mm_add_pd(fiy1,ty);
1960 fiz1 = _mm_add_pd(fiz1,tz);
1962 fjx3 = _mm_add_pd(fjx3,tx);
1963 fjy3 = _mm_add_pd(fjy3,ty);
1964 fjz3 = _mm_add_pd(fjz3,tz);
1966 /**************************
1967 * CALCULATE INTERACTIONS *
1968 **************************/
1970 r21 = _mm_mul_pd(rsq21,rinv21);
1972 /* Calculate table index by multiplying r with table scale and truncate to integer */
1973 rt = _mm_mul_pd(r21,vftabscale);
1974 vfitab = _mm_cvttpd_epi32(rt);
1975 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1976 vfitab = _mm_slli_epi32(vfitab,2);
1978 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1979 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1980 F = _mm_setzero_pd();
1981 GMX_MM_TRANSPOSE2_PD(Y,F);
1982 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1983 H = _mm_setzero_pd();
1984 GMX_MM_TRANSPOSE2_PD(G,H);
1985 Heps = _mm_mul_pd(vfeps,H);
1986 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1987 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1988 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1992 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1994 /* Calculate temporary vectorial force */
1995 tx = _mm_mul_pd(fscal,dx21);
1996 ty = _mm_mul_pd(fscal,dy21);
1997 tz = _mm_mul_pd(fscal,dz21);
1999 /* Update vectorial force */
2000 fix2 = _mm_add_pd(fix2,tx);
2001 fiy2 = _mm_add_pd(fiy2,ty);
2002 fiz2 = _mm_add_pd(fiz2,tz);
2004 fjx1 = _mm_add_pd(fjx1,tx);
2005 fjy1 = _mm_add_pd(fjy1,ty);
2006 fjz1 = _mm_add_pd(fjz1,tz);
2008 /**************************
2009 * CALCULATE INTERACTIONS *
2010 **************************/
2012 r22 = _mm_mul_pd(rsq22,rinv22);
2014 /* Calculate table index by multiplying r with table scale and truncate to integer */
2015 rt = _mm_mul_pd(r22,vftabscale);
2016 vfitab = _mm_cvttpd_epi32(rt);
2017 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2018 vfitab = _mm_slli_epi32(vfitab,2);
2020 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2021 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2022 F = _mm_setzero_pd();
2023 GMX_MM_TRANSPOSE2_PD(Y,F);
2024 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2025 H = _mm_setzero_pd();
2026 GMX_MM_TRANSPOSE2_PD(G,H);
2027 Heps = _mm_mul_pd(vfeps,H);
2028 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2029 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2030 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2034 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2036 /* Calculate temporary vectorial force */
2037 tx = _mm_mul_pd(fscal,dx22);
2038 ty = _mm_mul_pd(fscal,dy22);
2039 tz = _mm_mul_pd(fscal,dz22);
2041 /* Update vectorial force */
2042 fix2 = _mm_add_pd(fix2,tx);
2043 fiy2 = _mm_add_pd(fiy2,ty);
2044 fiz2 = _mm_add_pd(fiz2,tz);
2046 fjx2 = _mm_add_pd(fjx2,tx);
2047 fjy2 = _mm_add_pd(fjy2,ty);
2048 fjz2 = _mm_add_pd(fjz2,tz);
2050 /**************************
2051 * CALCULATE INTERACTIONS *
2052 **************************/
2054 r23 = _mm_mul_pd(rsq23,rinv23);
2056 /* Calculate table index by multiplying r with table scale and truncate to integer */
2057 rt = _mm_mul_pd(r23,vftabscale);
2058 vfitab = _mm_cvttpd_epi32(rt);
2059 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2060 vfitab = _mm_slli_epi32(vfitab,2);
2062 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2063 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2064 F = _mm_setzero_pd();
2065 GMX_MM_TRANSPOSE2_PD(Y,F);
2066 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2067 H = _mm_setzero_pd();
2068 GMX_MM_TRANSPOSE2_PD(G,H);
2069 Heps = _mm_mul_pd(vfeps,H);
2070 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2071 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2072 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
2076 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2078 /* Calculate temporary vectorial force */
2079 tx = _mm_mul_pd(fscal,dx23);
2080 ty = _mm_mul_pd(fscal,dy23);
2081 tz = _mm_mul_pd(fscal,dz23);
2083 /* Update vectorial force */
2084 fix2 = _mm_add_pd(fix2,tx);
2085 fiy2 = _mm_add_pd(fiy2,ty);
2086 fiz2 = _mm_add_pd(fiz2,tz);
2088 fjx3 = _mm_add_pd(fjx3,tx);
2089 fjy3 = _mm_add_pd(fjy3,ty);
2090 fjz3 = _mm_add_pd(fjz3,tz);
2092 /**************************
2093 * CALCULATE INTERACTIONS *
2094 **************************/
2096 r31 = _mm_mul_pd(rsq31,rinv31);
2098 /* Calculate table index by multiplying r with table scale and truncate to integer */
2099 rt = _mm_mul_pd(r31,vftabscale);
2100 vfitab = _mm_cvttpd_epi32(rt);
2101 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2102 vfitab = _mm_slli_epi32(vfitab,2);
2104 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2105 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2106 F = _mm_setzero_pd();
2107 GMX_MM_TRANSPOSE2_PD(Y,F);
2108 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2109 H = _mm_setzero_pd();
2110 GMX_MM_TRANSPOSE2_PD(G,H);
2111 Heps = _mm_mul_pd(vfeps,H);
2112 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2113 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2114 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
2118 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2120 /* Calculate temporary vectorial force */
2121 tx = _mm_mul_pd(fscal,dx31);
2122 ty = _mm_mul_pd(fscal,dy31);
2123 tz = _mm_mul_pd(fscal,dz31);
2125 /* Update vectorial force */
2126 fix3 = _mm_add_pd(fix3,tx);
2127 fiy3 = _mm_add_pd(fiy3,ty);
2128 fiz3 = _mm_add_pd(fiz3,tz);
2130 fjx1 = _mm_add_pd(fjx1,tx);
2131 fjy1 = _mm_add_pd(fjy1,ty);
2132 fjz1 = _mm_add_pd(fjz1,tz);
2134 /**************************
2135 * CALCULATE INTERACTIONS *
2136 **************************/
2138 r32 = _mm_mul_pd(rsq32,rinv32);
2140 /* Calculate table index by multiplying r with table scale and truncate to integer */
2141 rt = _mm_mul_pd(r32,vftabscale);
2142 vfitab = _mm_cvttpd_epi32(rt);
2143 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2144 vfitab = _mm_slli_epi32(vfitab,2);
2146 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2147 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2148 F = _mm_setzero_pd();
2149 GMX_MM_TRANSPOSE2_PD(Y,F);
2150 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2151 H = _mm_setzero_pd();
2152 GMX_MM_TRANSPOSE2_PD(G,H);
2153 Heps = _mm_mul_pd(vfeps,H);
2154 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2155 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2156 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
2160 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2162 /* Calculate temporary vectorial force */
2163 tx = _mm_mul_pd(fscal,dx32);
2164 ty = _mm_mul_pd(fscal,dy32);
2165 tz = _mm_mul_pd(fscal,dz32);
2167 /* Update vectorial force */
2168 fix3 = _mm_add_pd(fix3,tx);
2169 fiy3 = _mm_add_pd(fiy3,ty);
2170 fiz3 = _mm_add_pd(fiz3,tz);
2172 fjx2 = _mm_add_pd(fjx2,tx);
2173 fjy2 = _mm_add_pd(fjy2,ty);
2174 fjz2 = _mm_add_pd(fjz2,tz);
2176 /**************************
2177 * CALCULATE INTERACTIONS *
2178 **************************/
2180 r33 = _mm_mul_pd(rsq33,rinv33);
2182 /* Calculate table index by multiplying r with table scale and truncate to integer */
2183 rt = _mm_mul_pd(r33,vftabscale);
2184 vfitab = _mm_cvttpd_epi32(rt);
2185 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2186 vfitab = _mm_slli_epi32(vfitab,2);
2188 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2189 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2190 F = _mm_setzero_pd();
2191 GMX_MM_TRANSPOSE2_PD(Y,F);
2192 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2193 H = _mm_setzero_pd();
2194 GMX_MM_TRANSPOSE2_PD(G,H);
2195 Heps = _mm_mul_pd(vfeps,H);
2196 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2197 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2198 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
2202 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2204 /* Calculate temporary vectorial force */
2205 tx = _mm_mul_pd(fscal,dx33);
2206 ty = _mm_mul_pd(fscal,dy33);
2207 tz = _mm_mul_pd(fscal,dz33);
2209 /* Update vectorial force */
2210 fix3 = _mm_add_pd(fix3,tx);
2211 fiy3 = _mm_add_pd(fiy3,ty);
2212 fiz3 = _mm_add_pd(fiz3,tz);
2214 fjx3 = _mm_add_pd(fjx3,tx);
2215 fjy3 = _mm_add_pd(fjy3,ty);
2216 fjz3 = _mm_add_pd(fjz3,tz);
2218 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2220 /* Inner loop uses 351 flops */
2223 /* End of innermost loop */
2225 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2226 f+i_coord_offset+DIM,fshift+i_shift_offset);
2228 /* Increment number of inner iterations */
2229 inneriter += j_index_end - j_index_start;
2231 /* Outer loop uses 18 flops */
2234 /* Increment number of outer iterations */
2237 /* Update outer/inner flops */
2239 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*351);