2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_double
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: None
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85 int vdwjidx0A,vdwjidx0B;
86 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
87 int vdwjidx1A,vdwjidx1B;
88 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
89 int vdwjidx2A,vdwjidx2B;
90 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
91 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
92 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
93 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
94 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
103 __m128i ifour = _mm_set1_epi32(4);
104 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
106 __m128d dummy_mask,cutoff_mask;
107 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
108 __m128d one = _mm_set1_pd(1.0);
109 __m128d two = _mm_set1_pd(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm_set1_pd(fr->ic->epsfac);
122 charge = mdatoms->chargeA;
124 vftab = kernel_data->table_elec->data;
125 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
127 /* Setup water-specific parameters */
128 inr = nlist->iinr[0];
129 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
130 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
131 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
133 jq0 = _mm_set1_pd(charge[inr+0]);
134 jq1 = _mm_set1_pd(charge[inr+1]);
135 jq2 = _mm_set1_pd(charge[inr+2]);
136 qq00 = _mm_mul_pd(iq0,jq0);
137 qq01 = _mm_mul_pd(iq0,jq1);
138 qq02 = _mm_mul_pd(iq0,jq2);
139 qq10 = _mm_mul_pd(iq1,jq0);
140 qq11 = _mm_mul_pd(iq1,jq1);
141 qq12 = _mm_mul_pd(iq1,jq2);
142 qq20 = _mm_mul_pd(iq2,jq0);
143 qq21 = _mm_mul_pd(iq2,jq1);
144 qq22 = _mm_mul_pd(iq2,jq2);
146 /* Avoid stupid compiler warnings */
154 /* Start outer loop over neighborlists */
155 for(iidx=0; iidx<nri; iidx++)
157 /* Load shift vector for this list */
158 i_shift_offset = DIM*shiftidx[iidx];
160 /* Load limits for loop over neighbors */
161 j_index_start = jindex[iidx];
162 j_index_end = jindex[iidx+1];
164 /* Get outer coordinate index */
166 i_coord_offset = DIM*inr;
168 /* Load i particle coords and add shift vector */
169 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
170 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
172 fix0 = _mm_setzero_pd();
173 fiy0 = _mm_setzero_pd();
174 fiz0 = _mm_setzero_pd();
175 fix1 = _mm_setzero_pd();
176 fiy1 = _mm_setzero_pd();
177 fiz1 = _mm_setzero_pd();
178 fix2 = _mm_setzero_pd();
179 fiy2 = _mm_setzero_pd();
180 fiz2 = _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum = _mm_setzero_pd();
185 /* Start inner kernel loop */
186 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
189 /* Get j neighbor index, and coordinate index */
192 j_coord_offsetA = DIM*jnrA;
193 j_coord_offsetB = DIM*jnrB;
195 /* load j atom coordinates */
196 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
197 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
199 /* Calculate displacement vector */
200 dx00 = _mm_sub_pd(ix0,jx0);
201 dy00 = _mm_sub_pd(iy0,jy0);
202 dz00 = _mm_sub_pd(iz0,jz0);
203 dx01 = _mm_sub_pd(ix0,jx1);
204 dy01 = _mm_sub_pd(iy0,jy1);
205 dz01 = _mm_sub_pd(iz0,jz1);
206 dx02 = _mm_sub_pd(ix0,jx2);
207 dy02 = _mm_sub_pd(iy0,jy2);
208 dz02 = _mm_sub_pd(iz0,jz2);
209 dx10 = _mm_sub_pd(ix1,jx0);
210 dy10 = _mm_sub_pd(iy1,jy0);
211 dz10 = _mm_sub_pd(iz1,jz0);
212 dx11 = _mm_sub_pd(ix1,jx1);
213 dy11 = _mm_sub_pd(iy1,jy1);
214 dz11 = _mm_sub_pd(iz1,jz1);
215 dx12 = _mm_sub_pd(ix1,jx2);
216 dy12 = _mm_sub_pd(iy1,jy2);
217 dz12 = _mm_sub_pd(iz1,jz2);
218 dx20 = _mm_sub_pd(ix2,jx0);
219 dy20 = _mm_sub_pd(iy2,jy0);
220 dz20 = _mm_sub_pd(iz2,jz0);
221 dx21 = _mm_sub_pd(ix2,jx1);
222 dy21 = _mm_sub_pd(iy2,jy1);
223 dz21 = _mm_sub_pd(iz2,jz1);
224 dx22 = _mm_sub_pd(ix2,jx2);
225 dy22 = _mm_sub_pd(iy2,jy2);
226 dz22 = _mm_sub_pd(iz2,jz2);
228 /* Calculate squared distance and things based on it */
229 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
230 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
231 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
232 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
233 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
234 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
235 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
236 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
237 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
239 rinv00 = sse2_invsqrt_d(rsq00);
240 rinv01 = sse2_invsqrt_d(rsq01);
241 rinv02 = sse2_invsqrt_d(rsq02);
242 rinv10 = sse2_invsqrt_d(rsq10);
243 rinv11 = sse2_invsqrt_d(rsq11);
244 rinv12 = sse2_invsqrt_d(rsq12);
245 rinv20 = sse2_invsqrt_d(rsq20);
246 rinv21 = sse2_invsqrt_d(rsq21);
247 rinv22 = sse2_invsqrt_d(rsq22);
249 fjx0 = _mm_setzero_pd();
250 fjy0 = _mm_setzero_pd();
251 fjz0 = _mm_setzero_pd();
252 fjx1 = _mm_setzero_pd();
253 fjy1 = _mm_setzero_pd();
254 fjz1 = _mm_setzero_pd();
255 fjx2 = _mm_setzero_pd();
256 fjy2 = _mm_setzero_pd();
257 fjz2 = _mm_setzero_pd();
259 /**************************
260 * CALCULATE INTERACTIONS *
261 **************************/
263 r00 = _mm_mul_pd(rsq00,rinv00);
265 /* Calculate table index by multiplying r with table scale and truncate to integer */
266 rt = _mm_mul_pd(r00,vftabscale);
267 vfitab = _mm_cvttpd_epi32(rt);
268 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
269 vfitab = _mm_slli_epi32(vfitab,2);
271 /* CUBIC SPLINE TABLE ELECTROSTATICS */
272 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
273 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
274 GMX_MM_TRANSPOSE2_PD(Y,F);
275 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
276 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
277 GMX_MM_TRANSPOSE2_PD(G,H);
278 Heps = _mm_mul_pd(vfeps,H);
279 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
280 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
281 velec = _mm_mul_pd(qq00,VV);
282 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
283 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
285 /* Update potential sum for this i atom from the interaction with this j atom. */
286 velecsum = _mm_add_pd(velecsum,velec);
290 /* Calculate temporary vectorial force */
291 tx = _mm_mul_pd(fscal,dx00);
292 ty = _mm_mul_pd(fscal,dy00);
293 tz = _mm_mul_pd(fscal,dz00);
295 /* Update vectorial force */
296 fix0 = _mm_add_pd(fix0,tx);
297 fiy0 = _mm_add_pd(fiy0,ty);
298 fiz0 = _mm_add_pd(fiz0,tz);
300 fjx0 = _mm_add_pd(fjx0,tx);
301 fjy0 = _mm_add_pd(fjy0,ty);
302 fjz0 = _mm_add_pd(fjz0,tz);
304 /**************************
305 * CALCULATE INTERACTIONS *
306 **************************/
308 r01 = _mm_mul_pd(rsq01,rinv01);
310 /* Calculate table index by multiplying r with table scale and truncate to integer */
311 rt = _mm_mul_pd(r01,vftabscale);
312 vfitab = _mm_cvttpd_epi32(rt);
313 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
314 vfitab = _mm_slli_epi32(vfitab,2);
316 /* CUBIC SPLINE TABLE ELECTROSTATICS */
317 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
318 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
319 GMX_MM_TRANSPOSE2_PD(Y,F);
320 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
321 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
322 GMX_MM_TRANSPOSE2_PD(G,H);
323 Heps = _mm_mul_pd(vfeps,H);
324 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
325 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
326 velec = _mm_mul_pd(qq01,VV);
327 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
328 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
330 /* Update potential sum for this i atom from the interaction with this j atom. */
331 velecsum = _mm_add_pd(velecsum,velec);
335 /* Calculate temporary vectorial force */
336 tx = _mm_mul_pd(fscal,dx01);
337 ty = _mm_mul_pd(fscal,dy01);
338 tz = _mm_mul_pd(fscal,dz01);
340 /* Update vectorial force */
341 fix0 = _mm_add_pd(fix0,tx);
342 fiy0 = _mm_add_pd(fiy0,ty);
343 fiz0 = _mm_add_pd(fiz0,tz);
345 fjx1 = _mm_add_pd(fjx1,tx);
346 fjy1 = _mm_add_pd(fjy1,ty);
347 fjz1 = _mm_add_pd(fjz1,tz);
349 /**************************
350 * CALCULATE INTERACTIONS *
351 **************************/
353 r02 = _mm_mul_pd(rsq02,rinv02);
355 /* Calculate table index by multiplying r with table scale and truncate to integer */
356 rt = _mm_mul_pd(r02,vftabscale);
357 vfitab = _mm_cvttpd_epi32(rt);
358 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
359 vfitab = _mm_slli_epi32(vfitab,2);
361 /* CUBIC SPLINE TABLE ELECTROSTATICS */
362 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
363 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
364 GMX_MM_TRANSPOSE2_PD(Y,F);
365 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
366 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
367 GMX_MM_TRANSPOSE2_PD(G,H);
368 Heps = _mm_mul_pd(vfeps,H);
369 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
370 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
371 velec = _mm_mul_pd(qq02,VV);
372 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
373 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
375 /* Update potential sum for this i atom from the interaction with this j atom. */
376 velecsum = _mm_add_pd(velecsum,velec);
380 /* Calculate temporary vectorial force */
381 tx = _mm_mul_pd(fscal,dx02);
382 ty = _mm_mul_pd(fscal,dy02);
383 tz = _mm_mul_pd(fscal,dz02);
385 /* Update vectorial force */
386 fix0 = _mm_add_pd(fix0,tx);
387 fiy0 = _mm_add_pd(fiy0,ty);
388 fiz0 = _mm_add_pd(fiz0,tz);
390 fjx2 = _mm_add_pd(fjx2,tx);
391 fjy2 = _mm_add_pd(fjy2,ty);
392 fjz2 = _mm_add_pd(fjz2,tz);
394 /**************************
395 * CALCULATE INTERACTIONS *
396 **************************/
398 r10 = _mm_mul_pd(rsq10,rinv10);
400 /* Calculate table index by multiplying r with table scale and truncate to integer */
401 rt = _mm_mul_pd(r10,vftabscale);
402 vfitab = _mm_cvttpd_epi32(rt);
403 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
404 vfitab = _mm_slli_epi32(vfitab,2);
406 /* CUBIC SPLINE TABLE ELECTROSTATICS */
407 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
408 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
409 GMX_MM_TRANSPOSE2_PD(Y,F);
410 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
411 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
412 GMX_MM_TRANSPOSE2_PD(G,H);
413 Heps = _mm_mul_pd(vfeps,H);
414 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
415 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
416 velec = _mm_mul_pd(qq10,VV);
417 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
418 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
420 /* Update potential sum for this i atom from the interaction with this j atom. */
421 velecsum = _mm_add_pd(velecsum,velec);
425 /* Calculate temporary vectorial force */
426 tx = _mm_mul_pd(fscal,dx10);
427 ty = _mm_mul_pd(fscal,dy10);
428 tz = _mm_mul_pd(fscal,dz10);
430 /* Update vectorial force */
431 fix1 = _mm_add_pd(fix1,tx);
432 fiy1 = _mm_add_pd(fiy1,ty);
433 fiz1 = _mm_add_pd(fiz1,tz);
435 fjx0 = _mm_add_pd(fjx0,tx);
436 fjy0 = _mm_add_pd(fjy0,ty);
437 fjz0 = _mm_add_pd(fjz0,tz);
439 /**************************
440 * CALCULATE INTERACTIONS *
441 **************************/
443 r11 = _mm_mul_pd(rsq11,rinv11);
445 /* Calculate table index by multiplying r with table scale and truncate to integer */
446 rt = _mm_mul_pd(r11,vftabscale);
447 vfitab = _mm_cvttpd_epi32(rt);
448 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
449 vfitab = _mm_slli_epi32(vfitab,2);
451 /* CUBIC SPLINE TABLE ELECTROSTATICS */
452 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
453 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
454 GMX_MM_TRANSPOSE2_PD(Y,F);
455 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
456 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
457 GMX_MM_TRANSPOSE2_PD(G,H);
458 Heps = _mm_mul_pd(vfeps,H);
459 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
460 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
461 velec = _mm_mul_pd(qq11,VV);
462 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
463 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
465 /* Update potential sum for this i atom from the interaction with this j atom. */
466 velecsum = _mm_add_pd(velecsum,velec);
470 /* Calculate temporary vectorial force */
471 tx = _mm_mul_pd(fscal,dx11);
472 ty = _mm_mul_pd(fscal,dy11);
473 tz = _mm_mul_pd(fscal,dz11);
475 /* Update vectorial force */
476 fix1 = _mm_add_pd(fix1,tx);
477 fiy1 = _mm_add_pd(fiy1,ty);
478 fiz1 = _mm_add_pd(fiz1,tz);
480 fjx1 = _mm_add_pd(fjx1,tx);
481 fjy1 = _mm_add_pd(fjy1,ty);
482 fjz1 = _mm_add_pd(fjz1,tz);
484 /**************************
485 * CALCULATE INTERACTIONS *
486 **************************/
488 r12 = _mm_mul_pd(rsq12,rinv12);
490 /* Calculate table index by multiplying r with table scale and truncate to integer */
491 rt = _mm_mul_pd(r12,vftabscale);
492 vfitab = _mm_cvttpd_epi32(rt);
493 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
494 vfitab = _mm_slli_epi32(vfitab,2);
496 /* CUBIC SPLINE TABLE ELECTROSTATICS */
497 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
498 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
499 GMX_MM_TRANSPOSE2_PD(Y,F);
500 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
501 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
502 GMX_MM_TRANSPOSE2_PD(G,H);
503 Heps = _mm_mul_pd(vfeps,H);
504 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
505 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
506 velec = _mm_mul_pd(qq12,VV);
507 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
508 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
510 /* Update potential sum for this i atom from the interaction with this j atom. */
511 velecsum = _mm_add_pd(velecsum,velec);
515 /* Calculate temporary vectorial force */
516 tx = _mm_mul_pd(fscal,dx12);
517 ty = _mm_mul_pd(fscal,dy12);
518 tz = _mm_mul_pd(fscal,dz12);
520 /* Update vectorial force */
521 fix1 = _mm_add_pd(fix1,tx);
522 fiy1 = _mm_add_pd(fiy1,ty);
523 fiz1 = _mm_add_pd(fiz1,tz);
525 fjx2 = _mm_add_pd(fjx2,tx);
526 fjy2 = _mm_add_pd(fjy2,ty);
527 fjz2 = _mm_add_pd(fjz2,tz);
529 /**************************
530 * CALCULATE INTERACTIONS *
531 **************************/
533 r20 = _mm_mul_pd(rsq20,rinv20);
535 /* Calculate table index by multiplying r with table scale and truncate to integer */
536 rt = _mm_mul_pd(r20,vftabscale);
537 vfitab = _mm_cvttpd_epi32(rt);
538 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
539 vfitab = _mm_slli_epi32(vfitab,2);
541 /* CUBIC SPLINE TABLE ELECTROSTATICS */
542 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
543 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
544 GMX_MM_TRANSPOSE2_PD(Y,F);
545 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
546 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
547 GMX_MM_TRANSPOSE2_PD(G,H);
548 Heps = _mm_mul_pd(vfeps,H);
549 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
550 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
551 velec = _mm_mul_pd(qq20,VV);
552 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
553 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
555 /* Update potential sum for this i atom from the interaction with this j atom. */
556 velecsum = _mm_add_pd(velecsum,velec);
560 /* Calculate temporary vectorial force */
561 tx = _mm_mul_pd(fscal,dx20);
562 ty = _mm_mul_pd(fscal,dy20);
563 tz = _mm_mul_pd(fscal,dz20);
565 /* Update vectorial force */
566 fix2 = _mm_add_pd(fix2,tx);
567 fiy2 = _mm_add_pd(fiy2,ty);
568 fiz2 = _mm_add_pd(fiz2,tz);
570 fjx0 = _mm_add_pd(fjx0,tx);
571 fjy0 = _mm_add_pd(fjy0,ty);
572 fjz0 = _mm_add_pd(fjz0,tz);
574 /**************************
575 * CALCULATE INTERACTIONS *
576 **************************/
578 r21 = _mm_mul_pd(rsq21,rinv21);
580 /* Calculate table index by multiplying r with table scale and truncate to integer */
581 rt = _mm_mul_pd(r21,vftabscale);
582 vfitab = _mm_cvttpd_epi32(rt);
583 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
584 vfitab = _mm_slli_epi32(vfitab,2);
586 /* CUBIC SPLINE TABLE ELECTROSTATICS */
587 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
588 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
589 GMX_MM_TRANSPOSE2_PD(Y,F);
590 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
591 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
592 GMX_MM_TRANSPOSE2_PD(G,H);
593 Heps = _mm_mul_pd(vfeps,H);
594 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
595 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
596 velec = _mm_mul_pd(qq21,VV);
597 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
598 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
600 /* Update potential sum for this i atom from the interaction with this j atom. */
601 velecsum = _mm_add_pd(velecsum,velec);
605 /* Calculate temporary vectorial force */
606 tx = _mm_mul_pd(fscal,dx21);
607 ty = _mm_mul_pd(fscal,dy21);
608 tz = _mm_mul_pd(fscal,dz21);
610 /* Update vectorial force */
611 fix2 = _mm_add_pd(fix2,tx);
612 fiy2 = _mm_add_pd(fiy2,ty);
613 fiz2 = _mm_add_pd(fiz2,tz);
615 fjx1 = _mm_add_pd(fjx1,tx);
616 fjy1 = _mm_add_pd(fjy1,ty);
617 fjz1 = _mm_add_pd(fjz1,tz);
619 /**************************
620 * CALCULATE INTERACTIONS *
621 **************************/
623 r22 = _mm_mul_pd(rsq22,rinv22);
625 /* Calculate table index by multiplying r with table scale and truncate to integer */
626 rt = _mm_mul_pd(r22,vftabscale);
627 vfitab = _mm_cvttpd_epi32(rt);
628 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
629 vfitab = _mm_slli_epi32(vfitab,2);
631 /* CUBIC SPLINE TABLE ELECTROSTATICS */
632 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
633 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
634 GMX_MM_TRANSPOSE2_PD(Y,F);
635 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
636 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
637 GMX_MM_TRANSPOSE2_PD(G,H);
638 Heps = _mm_mul_pd(vfeps,H);
639 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
640 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
641 velec = _mm_mul_pd(qq22,VV);
642 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
643 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
645 /* Update potential sum for this i atom from the interaction with this j atom. */
646 velecsum = _mm_add_pd(velecsum,velec);
650 /* Calculate temporary vectorial force */
651 tx = _mm_mul_pd(fscal,dx22);
652 ty = _mm_mul_pd(fscal,dy22);
653 tz = _mm_mul_pd(fscal,dz22);
655 /* Update vectorial force */
656 fix2 = _mm_add_pd(fix2,tx);
657 fiy2 = _mm_add_pd(fiy2,ty);
658 fiz2 = _mm_add_pd(fiz2,tz);
660 fjx2 = _mm_add_pd(fjx2,tx);
661 fjy2 = _mm_add_pd(fjy2,ty);
662 fjz2 = _mm_add_pd(fjz2,tz);
664 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
666 /* Inner loop uses 387 flops */
673 j_coord_offsetA = DIM*jnrA;
675 /* load j atom coordinates */
676 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
677 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
679 /* Calculate displacement vector */
680 dx00 = _mm_sub_pd(ix0,jx0);
681 dy00 = _mm_sub_pd(iy0,jy0);
682 dz00 = _mm_sub_pd(iz0,jz0);
683 dx01 = _mm_sub_pd(ix0,jx1);
684 dy01 = _mm_sub_pd(iy0,jy1);
685 dz01 = _mm_sub_pd(iz0,jz1);
686 dx02 = _mm_sub_pd(ix0,jx2);
687 dy02 = _mm_sub_pd(iy0,jy2);
688 dz02 = _mm_sub_pd(iz0,jz2);
689 dx10 = _mm_sub_pd(ix1,jx0);
690 dy10 = _mm_sub_pd(iy1,jy0);
691 dz10 = _mm_sub_pd(iz1,jz0);
692 dx11 = _mm_sub_pd(ix1,jx1);
693 dy11 = _mm_sub_pd(iy1,jy1);
694 dz11 = _mm_sub_pd(iz1,jz1);
695 dx12 = _mm_sub_pd(ix1,jx2);
696 dy12 = _mm_sub_pd(iy1,jy2);
697 dz12 = _mm_sub_pd(iz1,jz2);
698 dx20 = _mm_sub_pd(ix2,jx0);
699 dy20 = _mm_sub_pd(iy2,jy0);
700 dz20 = _mm_sub_pd(iz2,jz0);
701 dx21 = _mm_sub_pd(ix2,jx1);
702 dy21 = _mm_sub_pd(iy2,jy1);
703 dz21 = _mm_sub_pd(iz2,jz1);
704 dx22 = _mm_sub_pd(ix2,jx2);
705 dy22 = _mm_sub_pd(iy2,jy2);
706 dz22 = _mm_sub_pd(iz2,jz2);
708 /* Calculate squared distance and things based on it */
709 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
710 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
711 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
712 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
713 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
714 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
715 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
716 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
717 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
719 rinv00 = sse2_invsqrt_d(rsq00);
720 rinv01 = sse2_invsqrt_d(rsq01);
721 rinv02 = sse2_invsqrt_d(rsq02);
722 rinv10 = sse2_invsqrt_d(rsq10);
723 rinv11 = sse2_invsqrt_d(rsq11);
724 rinv12 = sse2_invsqrt_d(rsq12);
725 rinv20 = sse2_invsqrt_d(rsq20);
726 rinv21 = sse2_invsqrt_d(rsq21);
727 rinv22 = sse2_invsqrt_d(rsq22);
729 fjx0 = _mm_setzero_pd();
730 fjy0 = _mm_setzero_pd();
731 fjz0 = _mm_setzero_pd();
732 fjx1 = _mm_setzero_pd();
733 fjy1 = _mm_setzero_pd();
734 fjz1 = _mm_setzero_pd();
735 fjx2 = _mm_setzero_pd();
736 fjy2 = _mm_setzero_pd();
737 fjz2 = _mm_setzero_pd();
739 /**************************
740 * CALCULATE INTERACTIONS *
741 **************************/
743 r00 = _mm_mul_pd(rsq00,rinv00);
745 /* Calculate table index by multiplying r with table scale and truncate to integer */
746 rt = _mm_mul_pd(r00,vftabscale);
747 vfitab = _mm_cvttpd_epi32(rt);
748 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
749 vfitab = _mm_slli_epi32(vfitab,2);
751 /* CUBIC SPLINE TABLE ELECTROSTATICS */
752 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
753 F = _mm_setzero_pd();
754 GMX_MM_TRANSPOSE2_PD(Y,F);
755 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
756 H = _mm_setzero_pd();
757 GMX_MM_TRANSPOSE2_PD(G,H);
758 Heps = _mm_mul_pd(vfeps,H);
759 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
760 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
761 velec = _mm_mul_pd(qq00,VV);
762 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
763 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
765 /* Update potential sum for this i atom from the interaction with this j atom. */
766 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
767 velecsum = _mm_add_pd(velecsum,velec);
771 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
773 /* Calculate temporary vectorial force */
774 tx = _mm_mul_pd(fscal,dx00);
775 ty = _mm_mul_pd(fscal,dy00);
776 tz = _mm_mul_pd(fscal,dz00);
778 /* Update vectorial force */
779 fix0 = _mm_add_pd(fix0,tx);
780 fiy0 = _mm_add_pd(fiy0,ty);
781 fiz0 = _mm_add_pd(fiz0,tz);
783 fjx0 = _mm_add_pd(fjx0,tx);
784 fjy0 = _mm_add_pd(fjy0,ty);
785 fjz0 = _mm_add_pd(fjz0,tz);
787 /**************************
788 * CALCULATE INTERACTIONS *
789 **************************/
791 r01 = _mm_mul_pd(rsq01,rinv01);
793 /* Calculate table index by multiplying r with table scale and truncate to integer */
794 rt = _mm_mul_pd(r01,vftabscale);
795 vfitab = _mm_cvttpd_epi32(rt);
796 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
797 vfitab = _mm_slli_epi32(vfitab,2);
799 /* CUBIC SPLINE TABLE ELECTROSTATICS */
800 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
801 F = _mm_setzero_pd();
802 GMX_MM_TRANSPOSE2_PD(Y,F);
803 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
804 H = _mm_setzero_pd();
805 GMX_MM_TRANSPOSE2_PD(G,H);
806 Heps = _mm_mul_pd(vfeps,H);
807 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
808 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
809 velec = _mm_mul_pd(qq01,VV);
810 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
811 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
813 /* Update potential sum for this i atom from the interaction with this j atom. */
814 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
815 velecsum = _mm_add_pd(velecsum,velec);
819 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
821 /* Calculate temporary vectorial force */
822 tx = _mm_mul_pd(fscal,dx01);
823 ty = _mm_mul_pd(fscal,dy01);
824 tz = _mm_mul_pd(fscal,dz01);
826 /* Update vectorial force */
827 fix0 = _mm_add_pd(fix0,tx);
828 fiy0 = _mm_add_pd(fiy0,ty);
829 fiz0 = _mm_add_pd(fiz0,tz);
831 fjx1 = _mm_add_pd(fjx1,tx);
832 fjy1 = _mm_add_pd(fjy1,ty);
833 fjz1 = _mm_add_pd(fjz1,tz);
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 r02 = _mm_mul_pd(rsq02,rinv02);
841 /* Calculate table index by multiplying r with table scale and truncate to integer */
842 rt = _mm_mul_pd(r02,vftabscale);
843 vfitab = _mm_cvttpd_epi32(rt);
844 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
845 vfitab = _mm_slli_epi32(vfitab,2);
847 /* CUBIC SPLINE TABLE ELECTROSTATICS */
848 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
849 F = _mm_setzero_pd();
850 GMX_MM_TRANSPOSE2_PD(Y,F);
851 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
852 H = _mm_setzero_pd();
853 GMX_MM_TRANSPOSE2_PD(G,H);
854 Heps = _mm_mul_pd(vfeps,H);
855 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
856 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
857 velec = _mm_mul_pd(qq02,VV);
858 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
859 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
861 /* Update potential sum for this i atom from the interaction with this j atom. */
862 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
863 velecsum = _mm_add_pd(velecsum,velec);
867 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
869 /* Calculate temporary vectorial force */
870 tx = _mm_mul_pd(fscal,dx02);
871 ty = _mm_mul_pd(fscal,dy02);
872 tz = _mm_mul_pd(fscal,dz02);
874 /* Update vectorial force */
875 fix0 = _mm_add_pd(fix0,tx);
876 fiy0 = _mm_add_pd(fiy0,ty);
877 fiz0 = _mm_add_pd(fiz0,tz);
879 fjx2 = _mm_add_pd(fjx2,tx);
880 fjy2 = _mm_add_pd(fjy2,ty);
881 fjz2 = _mm_add_pd(fjz2,tz);
883 /**************************
884 * CALCULATE INTERACTIONS *
885 **************************/
887 r10 = _mm_mul_pd(rsq10,rinv10);
889 /* Calculate table index by multiplying r with table scale and truncate to integer */
890 rt = _mm_mul_pd(r10,vftabscale);
891 vfitab = _mm_cvttpd_epi32(rt);
892 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
893 vfitab = _mm_slli_epi32(vfitab,2);
895 /* CUBIC SPLINE TABLE ELECTROSTATICS */
896 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
897 F = _mm_setzero_pd();
898 GMX_MM_TRANSPOSE2_PD(Y,F);
899 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
900 H = _mm_setzero_pd();
901 GMX_MM_TRANSPOSE2_PD(G,H);
902 Heps = _mm_mul_pd(vfeps,H);
903 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
904 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
905 velec = _mm_mul_pd(qq10,VV);
906 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
907 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
909 /* Update potential sum for this i atom from the interaction with this j atom. */
910 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
911 velecsum = _mm_add_pd(velecsum,velec);
915 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
917 /* Calculate temporary vectorial force */
918 tx = _mm_mul_pd(fscal,dx10);
919 ty = _mm_mul_pd(fscal,dy10);
920 tz = _mm_mul_pd(fscal,dz10);
922 /* Update vectorial force */
923 fix1 = _mm_add_pd(fix1,tx);
924 fiy1 = _mm_add_pd(fiy1,ty);
925 fiz1 = _mm_add_pd(fiz1,tz);
927 fjx0 = _mm_add_pd(fjx0,tx);
928 fjy0 = _mm_add_pd(fjy0,ty);
929 fjz0 = _mm_add_pd(fjz0,tz);
931 /**************************
932 * CALCULATE INTERACTIONS *
933 **************************/
935 r11 = _mm_mul_pd(rsq11,rinv11);
937 /* Calculate table index by multiplying r with table scale and truncate to integer */
938 rt = _mm_mul_pd(r11,vftabscale);
939 vfitab = _mm_cvttpd_epi32(rt);
940 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
941 vfitab = _mm_slli_epi32(vfitab,2);
943 /* CUBIC SPLINE TABLE ELECTROSTATICS */
944 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
945 F = _mm_setzero_pd();
946 GMX_MM_TRANSPOSE2_PD(Y,F);
947 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
948 H = _mm_setzero_pd();
949 GMX_MM_TRANSPOSE2_PD(G,H);
950 Heps = _mm_mul_pd(vfeps,H);
951 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
952 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
953 velec = _mm_mul_pd(qq11,VV);
954 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
955 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
957 /* Update potential sum for this i atom from the interaction with this j atom. */
958 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
959 velecsum = _mm_add_pd(velecsum,velec);
963 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
965 /* Calculate temporary vectorial force */
966 tx = _mm_mul_pd(fscal,dx11);
967 ty = _mm_mul_pd(fscal,dy11);
968 tz = _mm_mul_pd(fscal,dz11);
970 /* Update vectorial force */
971 fix1 = _mm_add_pd(fix1,tx);
972 fiy1 = _mm_add_pd(fiy1,ty);
973 fiz1 = _mm_add_pd(fiz1,tz);
975 fjx1 = _mm_add_pd(fjx1,tx);
976 fjy1 = _mm_add_pd(fjy1,ty);
977 fjz1 = _mm_add_pd(fjz1,tz);
979 /**************************
980 * CALCULATE INTERACTIONS *
981 **************************/
983 r12 = _mm_mul_pd(rsq12,rinv12);
985 /* Calculate table index by multiplying r with table scale and truncate to integer */
986 rt = _mm_mul_pd(r12,vftabscale);
987 vfitab = _mm_cvttpd_epi32(rt);
988 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
989 vfitab = _mm_slli_epi32(vfitab,2);
991 /* CUBIC SPLINE TABLE ELECTROSTATICS */
992 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
993 F = _mm_setzero_pd();
994 GMX_MM_TRANSPOSE2_PD(Y,F);
995 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
996 H = _mm_setzero_pd();
997 GMX_MM_TRANSPOSE2_PD(G,H);
998 Heps = _mm_mul_pd(vfeps,H);
999 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1000 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1001 velec = _mm_mul_pd(qq12,VV);
1002 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1003 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1005 /* Update potential sum for this i atom from the interaction with this j atom. */
1006 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1007 velecsum = _mm_add_pd(velecsum,velec);
1011 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1013 /* Calculate temporary vectorial force */
1014 tx = _mm_mul_pd(fscal,dx12);
1015 ty = _mm_mul_pd(fscal,dy12);
1016 tz = _mm_mul_pd(fscal,dz12);
1018 /* Update vectorial force */
1019 fix1 = _mm_add_pd(fix1,tx);
1020 fiy1 = _mm_add_pd(fiy1,ty);
1021 fiz1 = _mm_add_pd(fiz1,tz);
1023 fjx2 = _mm_add_pd(fjx2,tx);
1024 fjy2 = _mm_add_pd(fjy2,ty);
1025 fjz2 = _mm_add_pd(fjz2,tz);
1027 /**************************
1028 * CALCULATE INTERACTIONS *
1029 **************************/
1031 r20 = _mm_mul_pd(rsq20,rinv20);
1033 /* Calculate table index by multiplying r with table scale and truncate to integer */
1034 rt = _mm_mul_pd(r20,vftabscale);
1035 vfitab = _mm_cvttpd_epi32(rt);
1036 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1037 vfitab = _mm_slli_epi32(vfitab,2);
1039 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1040 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1041 F = _mm_setzero_pd();
1042 GMX_MM_TRANSPOSE2_PD(Y,F);
1043 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1044 H = _mm_setzero_pd();
1045 GMX_MM_TRANSPOSE2_PD(G,H);
1046 Heps = _mm_mul_pd(vfeps,H);
1047 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1048 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1049 velec = _mm_mul_pd(qq20,VV);
1050 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1051 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1053 /* Update potential sum for this i atom from the interaction with this j atom. */
1054 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1055 velecsum = _mm_add_pd(velecsum,velec);
1059 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1061 /* Calculate temporary vectorial force */
1062 tx = _mm_mul_pd(fscal,dx20);
1063 ty = _mm_mul_pd(fscal,dy20);
1064 tz = _mm_mul_pd(fscal,dz20);
1066 /* Update vectorial force */
1067 fix2 = _mm_add_pd(fix2,tx);
1068 fiy2 = _mm_add_pd(fiy2,ty);
1069 fiz2 = _mm_add_pd(fiz2,tz);
1071 fjx0 = _mm_add_pd(fjx0,tx);
1072 fjy0 = _mm_add_pd(fjy0,ty);
1073 fjz0 = _mm_add_pd(fjz0,tz);
1075 /**************************
1076 * CALCULATE INTERACTIONS *
1077 **************************/
1079 r21 = _mm_mul_pd(rsq21,rinv21);
1081 /* Calculate table index by multiplying r with table scale and truncate to integer */
1082 rt = _mm_mul_pd(r21,vftabscale);
1083 vfitab = _mm_cvttpd_epi32(rt);
1084 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1085 vfitab = _mm_slli_epi32(vfitab,2);
1087 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1088 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1089 F = _mm_setzero_pd();
1090 GMX_MM_TRANSPOSE2_PD(Y,F);
1091 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1092 H = _mm_setzero_pd();
1093 GMX_MM_TRANSPOSE2_PD(G,H);
1094 Heps = _mm_mul_pd(vfeps,H);
1095 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1096 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1097 velec = _mm_mul_pd(qq21,VV);
1098 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1099 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1101 /* Update potential sum for this i atom from the interaction with this j atom. */
1102 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1103 velecsum = _mm_add_pd(velecsum,velec);
1107 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1109 /* Calculate temporary vectorial force */
1110 tx = _mm_mul_pd(fscal,dx21);
1111 ty = _mm_mul_pd(fscal,dy21);
1112 tz = _mm_mul_pd(fscal,dz21);
1114 /* Update vectorial force */
1115 fix2 = _mm_add_pd(fix2,tx);
1116 fiy2 = _mm_add_pd(fiy2,ty);
1117 fiz2 = _mm_add_pd(fiz2,tz);
1119 fjx1 = _mm_add_pd(fjx1,tx);
1120 fjy1 = _mm_add_pd(fjy1,ty);
1121 fjz1 = _mm_add_pd(fjz1,tz);
1123 /**************************
1124 * CALCULATE INTERACTIONS *
1125 **************************/
1127 r22 = _mm_mul_pd(rsq22,rinv22);
1129 /* Calculate table index by multiplying r with table scale and truncate to integer */
1130 rt = _mm_mul_pd(r22,vftabscale);
1131 vfitab = _mm_cvttpd_epi32(rt);
1132 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1133 vfitab = _mm_slli_epi32(vfitab,2);
1135 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1136 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1137 F = _mm_setzero_pd();
1138 GMX_MM_TRANSPOSE2_PD(Y,F);
1139 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1140 H = _mm_setzero_pd();
1141 GMX_MM_TRANSPOSE2_PD(G,H);
1142 Heps = _mm_mul_pd(vfeps,H);
1143 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1144 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1145 velec = _mm_mul_pd(qq22,VV);
1146 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1147 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1149 /* Update potential sum for this i atom from the interaction with this j atom. */
1150 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1151 velecsum = _mm_add_pd(velecsum,velec);
1155 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1157 /* Calculate temporary vectorial force */
1158 tx = _mm_mul_pd(fscal,dx22);
1159 ty = _mm_mul_pd(fscal,dy22);
1160 tz = _mm_mul_pd(fscal,dz22);
1162 /* Update vectorial force */
1163 fix2 = _mm_add_pd(fix2,tx);
1164 fiy2 = _mm_add_pd(fiy2,ty);
1165 fiz2 = _mm_add_pd(fiz2,tz);
1167 fjx2 = _mm_add_pd(fjx2,tx);
1168 fjy2 = _mm_add_pd(fjy2,ty);
1169 fjz2 = _mm_add_pd(fjz2,tz);
1171 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1173 /* Inner loop uses 387 flops */
1176 /* End of innermost loop */
1178 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1179 f+i_coord_offset,fshift+i_shift_offset);
1182 /* Update potential energies */
1183 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1185 /* Increment number of inner iterations */
1186 inneriter += j_index_end - j_index_start;
1188 /* Outer loop uses 19 flops */
1191 /* Increment number of outer iterations */
1194 /* Update outer/inner flops */
1196 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*387);
1199 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_double
1200 * Electrostatics interaction: CubicSplineTable
1201 * VdW interaction: None
1202 * Geometry: Water3-Water3
1203 * Calculate force/pot: Force
1206 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_double
1207 (t_nblist * gmx_restrict nlist,
1208 rvec * gmx_restrict xx,
1209 rvec * gmx_restrict ff,
1210 struct t_forcerec * gmx_restrict fr,
1211 t_mdatoms * gmx_restrict mdatoms,
1212 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1213 t_nrnb * gmx_restrict nrnb)
1215 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1216 * just 0 for non-waters.
1217 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1218 * jnr indices corresponding to data put in the four positions in the SIMD register.
1220 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1221 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1223 int j_coord_offsetA,j_coord_offsetB;
1224 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1225 real rcutoff_scalar;
1226 real *shiftvec,*fshift,*x,*f;
1227 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1229 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1231 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1233 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1234 int vdwjidx0A,vdwjidx0B;
1235 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1236 int vdwjidx1A,vdwjidx1B;
1237 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1238 int vdwjidx2A,vdwjidx2B;
1239 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1240 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1241 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1242 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1243 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1244 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1245 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1246 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1247 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1248 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1249 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1252 __m128i ifour = _mm_set1_epi32(4);
1253 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1255 __m128d dummy_mask,cutoff_mask;
1256 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1257 __m128d one = _mm_set1_pd(1.0);
1258 __m128d two = _mm_set1_pd(2.0);
1264 jindex = nlist->jindex;
1266 shiftidx = nlist->shift;
1268 shiftvec = fr->shift_vec[0];
1269 fshift = fr->fshift[0];
1270 facel = _mm_set1_pd(fr->ic->epsfac);
1271 charge = mdatoms->chargeA;
1273 vftab = kernel_data->table_elec->data;
1274 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1276 /* Setup water-specific parameters */
1277 inr = nlist->iinr[0];
1278 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1279 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1280 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1282 jq0 = _mm_set1_pd(charge[inr+0]);
1283 jq1 = _mm_set1_pd(charge[inr+1]);
1284 jq2 = _mm_set1_pd(charge[inr+2]);
1285 qq00 = _mm_mul_pd(iq0,jq0);
1286 qq01 = _mm_mul_pd(iq0,jq1);
1287 qq02 = _mm_mul_pd(iq0,jq2);
1288 qq10 = _mm_mul_pd(iq1,jq0);
1289 qq11 = _mm_mul_pd(iq1,jq1);
1290 qq12 = _mm_mul_pd(iq1,jq2);
1291 qq20 = _mm_mul_pd(iq2,jq0);
1292 qq21 = _mm_mul_pd(iq2,jq1);
1293 qq22 = _mm_mul_pd(iq2,jq2);
1295 /* Avoid stupid compiler warnings */
1297 j_coord_offsetA = 0;
1298 j_coord_offsetB = 0;
1303 /* Start outer loop over neighborlists */
1304 for(iidx=0; iidx<nri; iidx++)
1306 /* Load shift vector for this list */
1307 i_shift_offset = DIM*shiftidx[iidx];
1309 /* Load limits for loop over neighbors */
1310 j_index_start = jindex[iidx];
1311 j_index_end = jindex[iidx+1];
1313 /* Get outer coordinate index */
1315 i_coord_offset = DIM*inr;
1317 /* Load i particle coords and add shift vector */
1318 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1319 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1321 fix0 = _mm_setzero_pd();
1322 fiy0 = _mm_setzero_pd();
1323 fiz0 = _mm_setzero_pd();
1324 fix1 = _mm_setzero_pd();
1325 fiy1 = _mm_setzero_pd();
1326 fiz1 = _mm_setzero_pd();
1327 fix2 = _mm_setzero_pd();
1328 fiy2 = _mm_setzero_pd();
1329 fiz2 = _mm_setzero_pd();
1331 /* Start inner kernel loop */
1332 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1335 /* Get j neighbor index, and coordinate index */
1337 jnrB = jjnr[jidx+1];
1338 j_coord_offsetA = DIM*jnrA;
1339 j_coord_offsetB = DIM*jnrB;
1341 /* load j atom coordinates */
1342 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1343 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1345 /* Calculate displacement vector */
1346 dx00 = _mm_sub_pd(ix0,jx0);
1347 dy00 = _mm_sub_pd(iy0,jy0);
1348 dz00 = _mm_sub_pd(iz0,jz0);
1349 dx01 = _mm_sub_pd(ix0,jx1);
1350 dy01 = _mm_sub_pd(iy0,jy1);
1351 dz01 = _mm_sub_pd(iz0,jz1);
1352 dx02 = _mm_sub_pd(ix0,jx2);
1353 dy02 = _mm_sub_pd(iy0,jy2);
1354 dz02 = _mm_sub_pd(iz0,jz2);
1355 dx10 = _mm_sub_pd(ix1,jx0);
1356 dy10 = _mm_sub_pd(iy1,jy0);
1357 dz10 = _mm_sub_pd(iz1,jz0);
1358 dx11 = _mm_sub_pd(ix1,jx1);
1359 dy11 = _mm_sub_pd(iy1,jy1);
1360 dz11 = _mm_sub_pd(iz1,jz1);
1361 dx12 = _mm_sub_pd(ix1,jx2);
1362 dy12 = _mm_sub_pd(iy1,jy2);
1363 dz12 = _mm_sub_pd(iz1,jz2);
1364 dx20 = _mm_sub_pd(ix2,jx0);
1365 dy20 = _mm_sub_pd(iy2,jy0);
1366 dz20 = _mm_sub_pd(iz2,jz0);
1367 dx21 = _mm_sub_pd(ix2,jx1);
1368 dy21 = _mm_sub_pd(iy2,jy1);
1369 dz21 = _mm_sub_pd(iz2,jz1);
1370 dx22 = _mm_sub_pd(ix2,jx2);
1371 dy22 = _mm_sub_pd(iy2,jy2);
1372 dz22 = _mm_sub_pd(iz2,jz2);
1374 /* Calculate squared distance and things based on it */
1375 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1376 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1377 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1378 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1379 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1380 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1381 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1382 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1383 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1385 rinv00 = sse2_invsqrt_d(rsq00);
1386 rinv01 = sse2_invsqrt_d(rsq01);
1387 rinv02 = sse2_invsqrt_d(rsq02);
1388 rinv10 = sse2_invsqrt_d(rsq10);
1389 rinv11 = sse2_invsqrt_d(rsq11);
1390 rinv12 = sse2_invsqrt_d(rsq12);
1391 rinv20 = sse2_invsqrt_d(rsq20);
1392 rinv21 = sse2_invsqrt_d(rsq21);
1393 rinv22 = sse2_invsqrt_d(rsq22);
1395 fjx0 = _mm_setzero_pd();
1396 fjy0 = _mm_setzero_pd();
1397 fjz0 = _mm_setzero_pd();
1398 fjx1 = _mm_setzero_pd();
1399 fjy1 = _mm_setzero_pd();
1400 fjz1 = _mm_setzero_pd();
1401 fjx2 = _mm_setzero_pd();
1402 fjy2 = _mm_setzero_pd();
1403 fjz2 = _mm_setzero_pd();
1405 /**************************
1406 * CALCULATE INTERACTIONS *
1407 **************************/
1409 r00 = _mm_mul_pd(rsq00,rinv00);
1411 /* Calculate table index by multiplying r with table scale and truncate to integer */
1412 rt = _mm_mul_pd(r00,vftabscale);
1413 vfitab = _mm_cvttpd_epi32(rt);
1414 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1415 vfitab = _mm_slli_epi32(vfitab,2);
1417 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1418 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1419 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1420 GMX_MM_TRANSPOSE2_PD(Y,F);
1421 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1422 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1423 GMX_MM_TRANSPOSE2_PD(G,H);
1424 Heps = _mm_mul_pd(vfeps,H);
1425 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1426 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1427 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1431 /* Calculate temporary vectorial force */
1432 tx = _mm_mul_pd(fscal,dx00);
1433 ty = _mm_mul_pd(fscal,dy00);
1434 tz = _mm_mul_pd(fscal,dz00);
1436 /* Update vectorial force */
1437 fix0 = _mm_add_pd(fix0,tx);
1438 fiy0 = _mm_add_pd(fiy0,ty);
1439 fiz0 = _mm_add_pd(fiz0,tz);
1441 fjx0 = _mm_add_pd(fjx0,tx);
1442 fjy0 = _mm_add_pd(fjy0,ty);
1443 fjz0 = _mm_add_pd(fjz0,tz);
1445 /**************************
1446 * CALCULATE INTERACTIONS *
1447 **************************/
1449 r01 = _mm_mul_pd(rsq01,rinv01);
1451 /* Calculate table index by multiplying r with table scale and truncate to integer */
1452 rt = _mm_mul_pd(r01,vftabscale);
1453 vfitab = _mm_cvttpd_epi32(rt);
1454 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1455 vfitab = _mm_slli_epi32(vfitab,2);
1457 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1458 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1459 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1460 GMX_MM_TRANSPOSE2_PD(Y,F);
1461 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1462 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1463 GMX_MM_TRANSPOSE2_PD(G,H);
1464 Heps = _mm_mul_pd(vfeps,H);
1465 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1466 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1467 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1471 /* Calculate temporary vectorial force */
1472 tx = _mm_mul_pd(fscal,dx01);
1473 ty = _mm_mul_pd(fscal,dy01);
1474 tz = _mm_mul_pd(fscal,dz01);
1476 /* Update vectorial force */
1477 fix0 = _mm_add_pd(fix0,tx);
1478 fiy0 = _mm_add_pd(fiy0,ty);
1479 fiz0 = _mm_add_pd(fiz0,tz);
1481 fjx1 = _mm_add_pd(fjx1,tx);
1482 fjy1 = _mm_add_pd(fjy1,ty);
1483 fjz1 = _mm_add_pd(fjz1,tz);
1485 /**************************
1486 * CALCULATE INTERACTIONS *
1487 **************************/
1489 r02 = _mm_mul_pd(rsq02,rinv02);
1491 /* Calculate table index by multiplying r with table scale and truncate to integer */
1492 rt = _mm_mul_pd(r02,vftabscale);
1493 vfitab = _mm_cvttpd_epi32(rt);
1494 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1495 vfitab = _mm_slli_epi32(vfitab,2);
1497 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1498 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1499 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1500 GMX_MM_TRANSPOSE2_PD(Y,F);
1501 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1502 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1503 GMX_MM_TRANSPOSE2_PD(G,H);
1504 Heps = _mm_mul_pd(vfeps,H);
1505 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1506 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1507 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1511 /* Calculate temporary vectorial force */
1512 tx = _mm_mul_pd(fscal,dx02);
1513 ty = _mm_mul_pd(fscal,dy02);
1514 tz = _mm_mul_pd(fscal,dz02);
1516 /* Update vectorial force */
1517 fix0 = _mm_add_pd(fix0,tx);
1518 fiy0 = _mm_add_pd(fiy0,ty);
1519 fiz0 = _mm_add_pd(fiz0,tz);
1521 fjx2 = _mm_add_pd(fjx2,tx);
1522 fjy2 = _mm_add_pd(fjy2,ty);
1523 fjz2 = _mm_add_pd(fjz2,tz);
1525 /**************************
1526 * CALCULATE INTERACTIONS *
1527 **************************/
1529 r10 = _mm_mul_pd(rsq10,rinv10);
1531 /* Calculate table index by multiplying r with table scale and truncate to integer */
1532 rt = _mm_mul_pd(r10,vftabscale);
1533 vfitab = _mm_cvttpd_epi32(rt);
1534 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1535 vfitab = _mm_slli_epi32(vfitab,2);
1537 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1538 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1539 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1540 GMX_MM_TRANSPOSE2_PD(Y,F);
1541 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1542 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1543 GMX_MM_TRANSPOSE2_PD(G,H);
1544 Heps = _mm_mul_pd(vfeps,H);
1545 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1546 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1547 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1551 /* Calculate temporary vectorial force */
1552 tx = _mm_mul_pd(fscal,dx10);
1553 ty = _mm_mul_pd(fscal,dy10);
1554 tz = _mm_mul_pd(fscal,dz10);
1556 /* Update vectorial force */
1557 fix1 = _mm_add_pd(fix1,tx);
1558 fiy1 = _mm_add_pd(fiy1,ty);
1559 fiz1 = _mm_add_pd(fiz1,tz);
1561 fjx0 = _mm_add_pd(fjx0,tx);
1562 fjy0 = _mm_add_pd(fjy0,ty);
1563 fjz0 = _mm_add_pd(fjz0,tz);
1565 /**************************
1566 * CALCULATE INTERACTIONS *
1567 **************************/
1569 r11 = _mm_mul_pd(rsq11,rinv11);
1571 /* Calculate table index by multiplying r with table scale and truncate to integer */
1572 rt = _mm_mul_pd(r11,vftabscale);
1573 vfitab = _mm_cvttpd_epi32(rt);
1574 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1575 vfitab = _mm_slli_epi32(vfitab,2);
1577 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1578 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1579 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1580 GMX_MM_TRANSPOSE2_PD(Y,F);
1581 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1582 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1583 GMX_MM_TRANSPOSE2_PD(G,H);
1584 Heps = _mm_mul_pd(vfeps,H);
1585 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1586 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1587 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1591 /* Calculate temporary vectorial force */
1592 tx = _mm_mul_pd(fscal,dx11);
1593 ty = _mm_mul_pd(fscal,dy11);
1594 tz = _mm_mul_pd(fscal,dz11);
1596 /* Update vectorial force */
1597 fix1 = _mm_add_pd(fix1,tx);
1598 fiy1 = _mm_add_pd(fiy1,ty);
1599 fiz1 = _mm_add_pd(fiz1,tz);
1601 fjx1 = _mm_add_pd(fjx1,tx);
1602 fjy1 = _mm_add_pd(fjy1,ty);
1603 fjz1 = _mm_add_pd(fjz1,tz);
1605 /**************************
1606 * CALCULATE INTERACTIONS *
1607 **************************/
1609 r12 = _mm_mul_pd(rsq12,rinv12);
1611 /* Calculate table index by multiplying r with table scale and truncate to integer */
1612 rt = _mm_mul_pd(r12,vftabscale);
1613 vfitab = _mm_cvttpd_epi32(rt);
1614 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1615 vfitab = _mm_slli_epi32(vfitab,2);
1617 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1618 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1619 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1620 GMX_MM_TRANSPOSE2_PD(Y,F);
1621 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1622 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1623 GMX_MM_TRANSPOSE2_PD(G,H);
1624 Heps = _mm_mul_pd(vfeps,H);
1625 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1626 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1627 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1631 /* Calculate temporary vectorial force */
1632 tx = _mm_mul_pd(fscal,dx12);
1633 ty = _mm_mul_pd(fscal,dy12);
1634 tz = _mm_mul_pd(fscal,dz12);
1636 /* Update vectorial force */
1637 fix1 = _mm_add_pd(fix1,tx);
1638 fiy1 = _mm_add_pd(fiy1,ty);
1639 fiz1 = _mm_add_pd(fiz1,tz);
1641 fjx2 = _mm_add_pd(fjx2,tx);
1642 fjy2 = _mm_add_pd(fjy2,ty);
1643 fjz2 = _mm_add_pd(fjz2,tz);
1645 /**************************
1646 * CALCULATE INTERACTIONS *
1647 **************************/
1649 r20 = _mm_mul_pd(rsq20,rinv20);
1651 /* Calculate table index by multiplying r with table scale and truncate to integer */
1652 rt = _mm_mul_pd(r20,vftabscale);
1653 vfitab = _mm_cvttpd_epi32(rt);
1654 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1655 vfitab = _mm_slli_epi32(vfitab,2);
1657 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1658 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1659 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1660 GMX_MM_TRANSPOSE2_PD(Y,F);
1661 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1662 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1663 GMX_MM_TRANSPOSE2_PD(G,H);
1664 Heps = _mm_mul_pd(vfeps,H);
1665 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1666 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1667 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1671 /* Calculate temporary vectorial force */
1672 tx = _mm_mul_pd(fscal,dx20);
1673 ty = _mm_mul_pd(fscal,dy20);
1674 tz = _mm_mul_pd(fscal,dz20);
1676 /* Update vectorial force */
1677 fix2 = _mm_add_pd(fix2,tx);
1678 fiy2 = _mm_add_pd(fiy2,ty);
1679 fiz2 = _mm_add_pd(fiz2,tz);
1681 fjx0 = _mm_add_pd(fjx0,tx);
1682 fjy0 = _mm_add_pd(fjy0,ty);
1683 fjz0 = _mm_add_pd(fjz0,tz);
1685 /**************************
1686 * CALCULATE INTERACTIONS *
1687 **************************/
1689 r21 = _mm_mul_pd(rsq21,rinv21);
1691 /* Calculate table index by multiplying r with table scale and truncate to integer */
1692 rt = _mm_mul_pd(r21,vftabscale);
1693 vfitab = _mm_cvttpd_epi32(rt);
1694 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1695 vfitab = _mm_slli_epi32(vfitab,2);
1697 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1698 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1699 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1700 GMX_MM_TRANSPOSE2_PD(Y,F);
1701 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1702 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1703 GMX_MM_TRANSPOSE2_PD(G,H);
1704 Heps = _mm_mul_pd(vfeps,H);
1705 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1706 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1707 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1711 /* Calculate temporary vectorial force */
1712 tx = _mm_mul_pd(fscal,dx21);
1713 ty = _mm_mul_pd(fscal,dy21);
1714 tz = _mm_mul_pd(fscal,dz21);
1716 /* Update vectorial force */
1717 fix2 = _mm_add_pd(fix2,tx);
1718 fiy2 = _mm_add_pd(fiy2,ty);
1719 fiz2 = _mm_add_pd(fiz2,tz);
1721 fjx1 = _mm_add_pd(fjx1,tx);
1722 fjy1 = _mm_add_pd(fjy1,ty);
1723 fjz1 = _mm_add_pd(fjz1,tz);
1725 /**************************
1726 * CALCULATE INTERACTIONS *
1727 **************************/
1729 r22 = _mm_mul_pd(rsq22,rinv22);
1731 /* Calculate table index by multiplying r with table scale and truncate to integer */
1732 rt = _mm_mul_pd(r22,vftabscale);
1733 vfitab = _mm_cvttpd_epi32(rt);
1734 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1735 vfitab = _mm_slli_epi32(vfitab,2);
1737 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1738 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1739 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1740 GMX_MM_TRANSPOSE2_PD(Y,F);
1741 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1742 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1743 GMX_MM_TRANSPOSE2_PD(G,H);
1744 Heps = _mm_mul_pd(vfeps,H);
1745 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1746 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1747 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1751 /* Calculate temporary vectorial force */
1752 tx = _mm_mul_pd(fscal,dx22);
1753 ty = _mm_mul_pd(fscal,dy22);
1754 tz = _mm_mul_pd(fscal,dz22);
1756 /* Update vectorial force */
1757 fix2 = _mm_add_pd(fix2,tx);
1758 fiy2 = _mm_add_pd(fiy2,ty);
1759 fiz2 = _mm_add_pd(fiz2,tz);
1761 fjx2 = _mm_add_pd(fjx2,tx);
1762 fjy2 = _mm_add_pd(fjy2,ty);
1763 fjz2 = _mm_add_pd(fjz2,tz);
1765 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1767 /* Inner loop uses 351 flops */
1770 if(jidx<j_index_end)
1774 j_coord_offsetA = DIM*jnrA;
1776 /* load j atom coordinates */
1777 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1778 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1780 /* Calculate displacement vector */
1781 dx00 = _mm_sub_pd(ix0,jx0);
1782 dy00 = _mm_sub_pd(iy0,jy0);
1783 dz00 = _mm_sub_pd(iz0,jz0);
1784 dx01 = _mm_sub_pd(ix0,jx1);
1785 dy01 = _mm_sub_pd(iy0,jy1);
1786 dz01 = _mm_sub_pd(iz0,jz1);
1787 dx02 = _mm_sub_pd(ix0,jx2);
1788 dy02 = _mm_sub_pd(iy0,jy2);
1789 dz02 = _mm_sub_pd(iz0,jz2);
1790 dx10 = _mm_sub_pd(ix1,jx0);
1791 dy10 = _mm_sub_pd(iy1,jy0);
1792 dz10 = _mm_sub_pd(iz1,jz0);
1793 dx11 = _mm_sub_pd(ix1,jx1);
1794 dy11 = _mm_sub_pd(iy1,jy1);
1795 dz11 = _mm_sub_pd(iz1,jz1);
1796 dx12 = _mm_sub_pd(ix1,jx2);
1797 dy12 = _mm_sub_pd(iy1,jy2);
1798 dz12 = _mm_sub_pd(iz1,jz2);
1799 dx20 = _mm_sub_pd(ix2,jx0);
1800 dy20 = _mm_sub_pd(iy2,jy0);
1801 dz20 = _mm_sub_pd(iz2,jz0);
1802 dx21 = _mm_sub_pd(ix2,jx1);
1803 dy21 = _mm_sub_pd(iy2,jy1);
1804 dz21 = _mm_sub_pd(iz2,jz1);
1805 dx22 = _mm_sub_pd(ix2,jx2);
1806 dy22 = _mm_sub_pd(iy2,jy2);
1807 dz22 = _mm_sub_pd(iz2,jz2);
1809 /* Calculate squared distance and things based on it */
1810 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1811 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1812 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1813 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1814 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1815 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1816 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1817 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1818 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1820 rinv00 = sse2_invsqrt_d(rsq00);
1821 rinv01 = sse2_invsqrt_d(rsq01);
1822 rinv02 = sse2_invsqrt_d(rsq02);
1823 rinv10 = sse2_invsqrt_d(rsq10);
1824 rinv11 = sse2_invsqrt_d(rsq11);
1825 rinv12 = sse2_invsqrt_d(rsq12);
1826 rinv20 = sse2_invsqrt_d(rsq20);
1827 rinv21 = sse2_invsqrt_d(rsq21);
1828 rinv22 = sse2_invsqrt_d(rsq22);
1830 fjx0 = _mm_setzero_pd();
1831 fjy0 = _mm_setzero_pd();
1832 fjz0 = _mm_setzero_pd();
1833 fjx1 = _mm_setzero_pd();
1834 fjy1 = _mm_setzero_pd();
1835 fjz1 = _mm_setzero_pd();
1836 fjx2 = _mm_setzero_pd();
1837 fjy2 = _mm_setzero_pd();
1838 fjz2 = _mm_setzero_pd();
1840 /**************************
1841 * CALCULATE INTERACTIONS *
1842 **************************/
1844 r00 = _mm_mul_pd(rsq00,rinv00);
1846 /* Calculate table index by multiplying r with table scale and truncate to integer */
1847 rt = _mm_mul_pd(r00,vftabscale);
1848 vfitab = _mm_cvttpd_epi32(rt);
1849 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1850 vfitab = _mm_slli_epi32(vfitab,2);
1852 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1853 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1854 F = _mm_setzero_pd();
1855 GMX_MM_TRANSPOSE2_PD(Y,F);
1856 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1857 H = _mm_setzero_pd();
1858 GMX_MM_TRANSPOSE2_PD(G,H);
1859 Heps = _mm_mul_pd(vfeps,H);
1860 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1861 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1862 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1866 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1868 /* Calculate temporary vectorial force */
1869 tx = _mm_mul_pd(fscal,dx00);
1870 ty = _mm_mul_pd(fscal,dy00);
1871 tz = _mm_mul_pd(fscal,dz00);
1873 /* Update vectorial force */
1874 fix0 = _mm_add_pd(fix0,tx);
1875 fiy0 = _mm_add_pd(fiy0,ty);
1876 fiz0 = _mm_add_pd(fiz0,tz);
1878 fjx0 = _mm_add_pd(fjx0,tx);
1879 fjy0 = _mm_add_pd(fjy0,ty);
1880 fjz0 = _mm_add_pd(fjz0,tz);
1882 /**************************
1883 * CALCULATE INTERACTIONS *
1884 **************************/
1886 r01 = _mm_mul_pd(rsq01,rinv01);
1888 /* Calculate table index by multiplying r with table scale and truncate to integer */
1889 rt = _mm_mul_pd(r01,vftabscale);
1890 vfitab = _mm_cvttpd_epi32(rt);
1891 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1892 vfitab = _mm_slli_epi32(vfitab,2);
1894 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1895 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1896 F = _mm_setzero_pd();
1897 GMX_MM_TRANSPOSE2_PD(Y,F);
1898 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1899 H = _mm_setzero_pd();
1900 GMX_MM_TRANSPOSE2_PD(G,H);
1901 Heps = _mm_mul_pd(vfeps,H);
1902 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1903 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1904 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1908 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1910 /* Calculate temporary vectorial force */
1911 tx = _mm_mul_pd(fscal,dx01);
1912 ty = _mm_mul_pd(fscal,dy01);
1913 tz = _mm_mul_pd(fscal,dz01);
1915 /* Update vectorial force */
1916 fix0 = _mm_add_pd(fix0,tx);
1917 fiy0 = _mm_add_pd(fiy0,ty);
1918 fiz0 = _mm_add_pd(fiz0,tz);
1920 fjx1 = _mm_add_pd(fjx1,tx);
1921 fjy1 = _mm_add_pd(fjy1,ty);
1922 fjz1 = _mm_add_pd(fjz1,tz);
1924 /**************************
1925 * CALCULATE INTERACTIONS *
1926 **************************/
1928 r02 = _mm_mul_pd(rsq02,rinv02);
1930 /* Calculate table index by multiplying r with table scale and truncate to integer */
1931 rt = _mm_mul_pd(r02,vftabscale);
1932 vfitab = _mm_cvttpd_epi32(rt);
1933 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1934 vfitab = _mm_slli_epi32(vfitab,2);
1936 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1937 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1938 F = _mm_setzero_pd();
1939 GMX_MM_TRANSPOSE2_PD(Y,F);
1940 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1941 H = _mm_setzero_pd();
1942 GMX_MM_TRANSPOSE2_PD(G,H);
1943 Heps = _mm_mul_pd(vfeps,H);
1944 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1945 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1946 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1950 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1952 /* Calculate temporary vectorial force */
1953 tx = _mm_mul_pd(fscal,dx02);
1954 ty = _mm_mul_pd(fscal,dy02);
1955 tz = _mm_mul_pd(fscal,dz02);
1957 /* Update vectorial force */
1958 fix0 = _mm_add_pd(fix0,tx);
1959 fiy0 = _mm_add_pd(fiy0,ty);
1960 fiz0 = _mm_add_pd(fiz0,tz);
1962 fjx2 = _mm_add_pd(fjx2,tx);
1963 fjy2 = _mm_add_pd(fjy2,ty);
1964 fjz2 = _mm_add_pd(fjz2,tz);
1966 /**************************
1967 * CALCULATE INTERACTIONS *
1968 **************************/
1970 r10 = _mm_mul_pd(rsq10,rinv10);
1972 /* Calculate table index by multiplying r with table scale and truncate to integer */
1973 rt = _mm_mul_pd(r10,vftabscale);
1974 vfitab = _mm_cvttpd_epi32(rt);
1975 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1976 vfitab = _mm_slli_epi32(vfitab,2);
1978 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1979 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1980 F = _mm_setzero_pd();
1981 GMX_MM_TRANSPOSE2_PD(Y,F);
1982 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1983 H = _mm_setzero_pd();
1984 GMX_MM_TRANSPOSE2_PD(G,H);
1985 Heps = _mm_mul_pd(vfeps,H);
1986 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1987 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1988 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1992 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1994 /* Calculate temporary vectorial force */
1995 tx = _mm_mul_pd(fscal,dx10);
1996 ty = _mm_mul_pd(fscal,dy10);
1997 tz = _mm_mul_pd(fscal,dz10);
1999 /* Update vectorial force */
2000 fix1 = _mm_add_pd(fix1,tx);
2001 fiy1 = _mm_add_pd(fiy1,ty);
2002 fiz1 = _mm_add_pd(fiz1,tz);
2004 fjx0 = _mm_add_pd(fjx0,tx);
2005 fjy0 = _mm_add_pd(fjy0,ty);
2006 fjz0 = _mm_add_pd(fjz0,tz);
2008 /**************************
2009 * CALCULATE INTERACTIONS *
2010 **************************/
2012 r11 = _mm_mul_pd(rsq11,rinv11);
2014 /* Calculate table index by multiplying r with table scale and truncate to integer */
2015 rt = _mm_mul_pd(r11,vftabscale);
2016 vfitab = _mm_cvttpd_epi32(rt);
2017 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2018 vfitab = _mm_slli_epi32(vfitab,2);
2020 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2021 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2022 F = _mm_setzero_pd();
2023 GMX_MM_TRANSPOSE2_PD(Y,F);
2024 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2025 H = _mm_setzero_pd();
2026 GMX_MM_TRANSPOSE2_PD(G,H);
2027 Heps = _mm_mul_pd(vfeps,H);
2028 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2029 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2030 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2034 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2036 /* Calculate temporary vectorial force */
2037 tx = _mm_mul_pd(fscal,dx11);
2038 ty = _mm_mul_pd(fscal,dy11);
2039 tz = _mm_mul_pd(fscal,dz11);
2041 /* Update vectorial force */
2042 fix1 = _mm_add_pd(fix1,tx);
2043 fiy1 = _mm_add_pd(fiy1,ty);
2044 fiz1 = _mm_add_pd(fiz1,tz);
2046 fjx1 = _mm_add_pd(fjx1,tx);
2047 fjy1 = _mm_add_pd(fjy1,ty);
2048 fjz1 = _mm_add_pd(fjz1,tz);
2050 /**************************
2051 * CALCULATE INTERACTIONS *
2052 **************************/
2054 r12 = _mm_mul_pd(rsq12,rinv12);
2056 /* Calculate table index by multiplying r with table scale and truncate to integer */
2057 rt = _mm_mul_pd(r12,vftabscale);
2058 vfitab = _mm_cvttpd_epi32(rt);
2059 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2060 vfitab = _mm_slli_epi32(vfitab,2);
2062 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2063 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2064 F = _mm_setzero_pd();
2065 GMX_MM_TRANSPOSE2_PD(Y,F);
2066 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2067 H = _mm_setzero_pd();
2068 GMX_MM_TRANSPOSE2_PD(G,H);
2069 Heps = _mm_mul_pd(vfeps,H);
2070 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2071 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2072 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2076 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2078 /* Calculate temporary vectorial force */
2079 tx = _mm_mul_pd(fscal,dx12);
2080 ty = _mm_mul_pd(fscal,dy12);
2081 tz = _mm_mul_pd(fscal,dz12);
2083 /* Update vectorial force */
2084 fix1 = _mm_add_pd(fix1,tx);
2085 fiy1 = _mm_add_pd(fiy1,ty);
2086 fiz1 = _mm_add_pd(fiz1,tz);
2088 fjx2 = _mm_add_pd(fjx2,tx);
2089 fjy2 = _mm_add_pd(fjy2,ty);
2090 fjz2 = _mm_add_pd(fjz2,tz);
2092 /**************************
2093 * CALCULATE INTERACTIONS *
2094 **************************/
2096 r20 = _mm_mul_pd(rsq20,rinv20);
2098 /* Calculate table index by multiplying r with table scale and truncate to integer */
2099 rt = _mm_mul_pd(r20,vftabscale);
2100 vfitab = _mm_cvttpd_epi32(rt);
2101 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2102 vfitab = _mm_slli_epi32(vfitab,2);
2104 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2105 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2106 F = _mm_setzero_pd();
2107 GMX_MM_TRANSPOSE2_PD(Y,F);
2108 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2109 H = _mm_setzero_pd();
2110 GMX_MM_TRANSPOSE2_PD(G,H);
2111 Heps = _mm_mul_pd(vfeps,H);
2112 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2113 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2114 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2118 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2120 /* Calculate temporary vectorial force */
2121 tx = _mm_mul_pd(fscal,dx20);
2122 ty = _mm_mul_pd(fscal,dy20);
2123 tz = _mm_mul_pd(fscal,dz20);
2125 /* Update vectorial force */
2126 fix2 = _mm_add_pd(fix2,tx);
2127 fiy2 = _mm_add_pd(fiy2,ty);
2128 fiz2 = _mm_add_pd(fiz2,tz);
2130 fjx0 = _mm_add_pd(fjx0,tx);
2131 fjy0 = _mm_add_pd(fjy0,ty);
2132 fjz0 = _mm_add_pd(fjz0,tz);
2134 /**************************
2135 * CALCULATE INTERACTIONS *
2136 **************************/
2138 r21 = _mm_mul_pd(rsq21,rinv21);
2140 /* Calculate table index by multiplying r with table scale and truncate to integer */
2141 rt = _mm_mul_pd(r21,vftabscale);
2142 vfitab = _mm_cvttpd_epi32(rt);
2143 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2144 vfitab = _mm_slli_epi32(vfitab,2);
2146 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2147 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2148 F = _mm_setzero_pd();
2149 GMX_MM_TRANSPOSE2_PD(Y,F);
2150 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2151 H = _mm_setzero_pd();
2152 GMX_MM_TRANSPOSE2_PD(G,H);
2153 Heps = _mm_mul_pd(vfeps,H);
2154 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2155 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2156 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2160 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2162 /* Calculate temporary vectorial force */
2163 tx = _mm_mul_pd(fscal,dx21);
2164 ty = _mm_mul_pd(fscal,dy21);
2165 tz = _mm_mul_pd(fscal,dz21);
2167 /* Update vectorial force */
2168 fix2 = _mm_add_pd(fix2,tx);
2169 fiy2 = _mm_add_pd(fiy2,ty);
2170 fiz2 = _mm_add_pd(fiz2,tz);
2172 fjx1 = _mm_add_pd(fjx1,tx);
2173 fjy1 = _mm_add_pd(fjy1,ty);
2174 fjz1 = _mm_add_pd(fjz1,tz);
2176 /**************************
2177 * CALCULATE INTERACTIONS *
2178 **************************/
2180 r22 = _mm_mul_pd(rsq22,rinv22);
2182 /* Calculate table index by multiplying r with table scale and truncate to integer */
2183 rt = _mm_mul_pd(r22,vftabscale);
2184 vfitab = _mm_cvttpd_epi32(rt);
2185 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2186 vfitab = _mm_slli_epi32(vfitab,2);
2188 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2189 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2190 F = _mm_setzero_pd();
2191 GMX_MM_TRANSPOSE2_PD(Y,F);
2192 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2193 H = _mm_setzero_pd();
2194 GMX_MM_TRANSPOSE2_PD(G,H);
2195 Heps = _mm_mul_pd(vfeps,H);
2196 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2197 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2198 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2202 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2204 /* Calculate temporary vectorial force */
2205 tx = _mm_mul_pd(fscal,dx22);
2206 ty = _mm_mul_pd(fscal,dy22);
2207 tz = _mm_mul_pd(fscal,dz22);
2209 /* Update vectorial force */
2210 fix2 = _mm_add_pd(fix2,tx);
2211 fiy2 = _mm_add_pd(fiy2,ty);
2212 fiz2 = _mm_add_pd(fiz2,tz);
2214 fjx2 = _mm_add_pd(fjx2,tx);
2215 fjy2 = _mm_add_pd(fjy2,ty);
2216 fjz2 = _mm_add_pd(fjz2,tz);
2218 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2220 /* Inner loop uses 351 flops */
2223 /* End of innermost loop */
2225 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2226 f+i_coord_offset,fshift+i_shift_offset);
2228 /* Increment number of inner iterations */
2229 inneriter += j_index_end - j_index_start;
2231 /* Outer loop uses 18 flops */
2234 /* Increment number of outer iterations */
2237 /* Update outer/inner flops */
2239 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*351);