2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_sse4_1_double.h"
50 #include "kernelutil_x86_sse4_1_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse4_1_double
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: None
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_sse4_1_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the two positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88 int vdwjidx1A,vdwjidx1B;
89 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90 int vdwjidx2A,vdwjidx2B;
91 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92 int vdwjidx3A,vdwjidx3B;
93 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
94 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
95 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
96 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
97 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
98 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
99 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
100 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
101 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
102 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
103 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128i ifour = _mm_set1_epi32(4);
107 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
109 __m128d dummy_mask,cutoff_mask;
110 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
111 __m128d one = _mm_set1_pd(1.0);
112 __m128d two = _mm_set1_pd(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm_set1_pd(fr->epsfac);
125 charge = mdatoms->chargeA;
127 vftab = kernel_data->table_elec->data;
128 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
130 /* Setup water-specific parameters */
131 inr = nlist->iinr[0];
132 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
133 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
134 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
136 jq1 = _mm_set1_pd(charge[inr+1]);
137 jq2 = _mm_set1_pd(charge[inr+2]);
138 jq3 = _mm_set1_pd(charge[inr+3]);
139 qq11 = _mm_mul_pd(iq1,jq1);
140 qq12 = _mm_mul_pd(iq1,jq2);
141 qq13 = _mm_mul_pd(iq1,jq3);
142 qq21 = _mm_mul_pd(iq2,jq1);
143 qq22 = _mm_mul_pd(iq2,jq2);
144 qq23 = _mm_mul_pd(iq2,jq3);
145 qq31 = _mm_mul_pd(iq3,jq1);
146 qq32 = _mm_mul_pd(iq3,jq2);
147 qq33 = _mm_mul_pd(iq3,jq3);
149 /* Avoid stupid compiler warnings */
157 /* Start outer loop over neighborlists */
158 for(iidx=0; iidx<nri; iidx++)
160 /* Load shift vector for this list */
161 i_shift_offset = DIM*shiftidx[iidx];
163 /* Load limits for loop over neighbors */
164 j_index_start = jindex[iidx];
165 j_index_end = jindex[iidx+1];
167 /* Get outer coordinate index */
169 i_coord_offset = DIM*inr;
171 /* Load i particle coords and add shift vector */
172 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
173 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
175 fix1 = _mm_setzero_pd();
176 fiy1 = _mm_setzero_pd();
177 fiz1 = _mm_setzero_pd();
178 fix2 = _mm_setzero_pd();
179 fiy2 = _mm_setzero_pd();
180 fiz2 = _mm_setzero_pd();
181 fix3 = _mm_setzero_pd();
182 fiy3 = _mm_setzero_pd();
183 fiz3 = _mm_setzero_pd();
185 /* Reset potential sums */
186 velecsum = _mm_setzero_pd();
188 /* Start inner kernel loop */
189 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
192 /* Get j neighbor index, and coordinate index */
195 j_coord_offsetA = DIM*jnrA;
196 j_coord_offsetB = DIM*jnrB;
198 /* load j atom coordinates */
199 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
200 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
202 /* Calculate displacement vector */
203 dx11 = _mm_sub_pd(ix1,jx1);
204 dy11 = _mm_sub_pd(iy1,jy1);
205 dz11 = _mm_sub_pd(iz1,jz1);
206 dx12 = _mm_sub_pd(ix1,jx2);
207 dy12 = _mm_sub_pd(iy1,jy2);
208 dz12 = _mm_sub_pd(iz1,jz2);
209 dx13 = _mm_sub_pd(ix1,jx3);
210 dy13 = _mm_sub_pd(iy1,jy3);
211 dz13 = _mm_sub_pd(iz1,jz3);
212 dx21 = _mm_sub_pd(ix2,jx1);
213 dy21 = _mm_sub_pd(iy2,jy1);
214 dz21 = _mm_sub_pd(iz2,jz1);
215 dx22 = _mm_sub_pd(ix2,jx2);
216 dy22 = _mm_sub_pd(iy2,jy2);
217 dz22 = _mm_sub_pd(iz2,jz2);
218 dx23 = _mm_sub_pd(ix2,jx3);
219 dy23 = _mm_sub_pd(iy2,jy3);
220 dz23 = _mm_sub_pd(iz2,jz3);
221 dx31 = _mm_sub_pd(ix3,jx1);
222 dy31 = _mm_sub_pd(iy3,jy1);
223 dz31 = _mm_sub_pd(iz3,jz1);
224 dx32 = _mm_sub_pd(ix3,jx2);
225 dy32 = _mm_sub_pd(iy3,jy2);
226 dz32 = _mm_sub_pd(iz3,jz2);
227 dx33 = _mm_sub_pd(ix3,jx3);
228 dy33 = _mm_sub_pd(iy3,jy3);
229 dz33 = _mm_sub_pd(iz3,jz3);
231 /* Calculate squared distance and things based on it */
232 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
233 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
234 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
235 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
236 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
237 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
238 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
239 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
240 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
242 rinv11 = gmx_mm_invsqrt_pd(rsq11);
243 rinv12 = gmx_mm_invsqrt_pd(rsq12);
244 rinv13 = gmx_mm_invsqrt_pd(rsq13);
245 rinv21 = gmx_mm_invsqrt_pd(rsq21);
246 rinv22 = gmx_mm_invsqrt_pd(rsq22);
247 rinv23 = gmx_mm_invsqrt_pd(rsq23);
248 rinv31 = gmx_mm_invsqrt_pd(rsq31);
249 rinv32 = gmx_mm_invsqrt_pd(rsq32);
250 rinv33 = gmx_mm_invsqrt_pd(rsq33);
252 fjx1 = _mm_setzero_pd();
253 fjy1 = _mm_setzero_pd();
254 fjz1 = _mm_setzero_pd();
255 fjx2 = _mm_setzero_pd();
256 fjy2 = _mm_setzero_pd();
257 fjz2 = _mm_setzero_pd();
258 fjx3 = _mm_setzero_pd();
259 fjy3 = _mm_setzero_pd();
260 fjz3 = _mm_setzero_pd();
262 /**************************
263 * CALCULATE INTERACTIONS *
264 **************************/
266 r11 = _mm_mul_pd(rsq11,rinv11);
268 /* Calculate table index by multiplying r with table scale and truncate to integer */
269 rt = _mm_mul_pd(r11,vftabscale);
270 vfitab = _mm_cvttpd_epi32(rt);
271 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
272 vfitab = _mm_slli_epi32(vfitab,2);
274 /* CUBIC SPLINE TABLE ELECTROSTATICS */
275 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
276 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
277 GMX_MM_TRANSPOSE2_PD(Y,F);
278 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
279 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
280 GMX_MM_TRANSPOSE2_PD(G,H);
281 Heps = _mm_mul_pd(vfeps,H);
282 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
283 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
284 velec = _mm_mul_pd(qq11,VV);
285 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
286 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
288 /* Update potential sum for this i atom from the interaction with this j atom. */
289 velecsum = _mm_add_pd(velecsum,velec);
293 /* Calculate temporary vectorial force */
294 tx = _mm_mul_pd(fscal,dx11);
295 ty = _mm_mul_pd(fscal,dy11);
296 tz = _mm_mul_pd(fscal,dz11);
298 /* Update vectorial force */
299 fix1 = _mm_add_pd(fix1,tx);
300 fiy1 = _mm_add_pd(fiy1,ty);
301 fiz1 = _mm_add_pd(fiz1,tz);
303 fjx1 = _mm_add_pd(fjx1,tx);
304 fjy1 = _mm_add_pd(fjy1,ty);
305 fjz1 = _mm_add_pd(fjz1,tz);
307 /**************************
308 * CALCULATE INTERACTIONS *
309 **************************/
311 r12 = _mm_mul_pd(rsq12,rinv12);
313 /* Calculate table index by multiplying r with table scale and truncate to integer */
314 rt = _mm_mul_pd(r12,vftabscale);
315 vfitab = _mm_cvttpd_epi32(rt);
316 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
317 vfitab = _mm_slli_epi32(vfitab,2);
319 /* CUBIC SPLINE TABLE ELECTROSTATICS */
320 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
321 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
322 GMX_MM_TRANSPOSE2_PD(Y,F);
323 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
324 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
325 GMX_MM_TRANSPOSE2_PD(G,H);
326 Heps = _mm_mul_pd(vfeps,H);
327 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
328 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
329 velec = _mm_mul_pd(qq12,VV);
330 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
331 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
333 /* Update potential sum for this i atom from the interaction with this j atom. */
334 velecsum = _mm_add_pd(velecsum,velec);
338 /* Calculate temporary vectorial force */
339 tx = _mm_mul_pd(fscal,dx12);
340 ty = _mm_mul_pd(fscal,dy12);
341 tz = _mm_mul_pd(fscal,dz12);
343 /* Update vectorial force */
344 fix1 = _mm_add_pd(fix1,tx);
345 fiy1 = _mm_add_pd(fiy1,ty);
346 fiz1 = _mm_add_pd(fiz1,tz);
348 fjx2 = _mm_add_pd(fjx2,tx);
349 fjy2 = _mm_add_pd(fjy2,ty);
350 fjz2 = _mm_add_pd(fjz2,tz);
352 /**************************
353 * CALCULATE INTERACTIONS *
354 **************************/
356 r13 = _mm_mul_pd(rsq13,rinv13);
358 /* Calculate table index by multiplying r with table scale and truncate to integer */
359 rt = _mm_mul_pd(r13,vftabscale);
360 vfitab = _mm_cvttpd_epi32(rt);
361 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
362 vfitab = _mm_slli_epi32(vfitab,2);
364 /* CUBIC SPLINE TABLE ELECTROSTATICS */
365 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
366 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
367 GMX_MM_TRANSPOSE2_PD(Y,F);
368 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
369 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
370 GMX_MM_TRANSPOSE2_PD(G,H);
371 Heps = _mm_mul_pd(vfeps,H);
372 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
373 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
374 velec = _mm_mul_pd(qq13,VV);
375 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
376 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
378 /* Update potential sum for this i atom from the interaction with this j atom. */
379 velecsum = _mm_add_pd(velecsum,velec);
383 /* Calculate temporary vectorial force */
384 tx = _mm_mul_pd(fscal,dx13);
385 ty = _mm_mul_pd(fscal,dy13);
386 tz = _mm_mul_pd(fscal,dz13);
388 /* Update vectorial force */
389 fix1 = _mm_add_pd(fix1,tx);
390 fiy1 = _mm_add_pd(fiy1,ty);
391 fiz1 = _mm_add_pd(fiz1,tz);
393 fjx3 = _mm_add_pd(fjx3,tx);
394 fjy3 = _mm_add_pd(fjy3,ty);
395 fjz3 = _mm_add_pd(fjz3,tz);
397 /**************************
398 * CALCULATE INTERACTIONS *
399 **************************/
401 r21 = _mm_mul_pd(rsq21,rinv21);
403 /* Calculate table index by multiplying r with table scale and truncate to integer */
404 rt = _mm_mul_pd(r21,vftabscale);
405 vfitab = _mm_cvttpd_epi32(rt);
406 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
407 vfitab = _mm_slli_epi32(vfitab,2);
409 /* CUBIC SPLINE TABLE ELECTROSTATICS */
410 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
411 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
412 GMX_MM_TRANSPOSE2_PD(Y,F);
413 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
414 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
415 GMX_MM_TRANSPOSE2_PD(G,H);
416 Heps = _mm_mul_pd(vfeps,H);
417 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
418 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
419 velec = _mm_mul_pd(qq21,VV);
420 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
421 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
423 /* Update potential sum for this i atom from the interaction with this j atom. */
424 velecsum = _mm_add_pd(velecsum,velec);
428 /* Calculate temporary vectorial force */
429 tx = _mm_mul_pd(fscal,dx21);
430 ty = _mm_mul_pd(fscal,dy21);
431 tz = _mm_mul_pd(fscal,dz21);
433 /* Update vectorial force */
434 fix2 = _mm_add_pd(fix2,tx);
435 fiy2 = _mm_add_pd(fiy2,ty);
436 fiz2 = _mm_add_pd(fiz2,tz);
438 fjx1 = _mm_add_pd(fjx1,tx);
439 fjy1 = _mm_add_pd(fjy1,ty);
440 fjz1 = _mm_add_pd(fjz1,tz);
442 /**************************
443 * CALCULATE INTERACTIONS *
444 **************************/
446 r22 = _mm_mul_pd(rsq22,rinv22);
448 /* Calculate table index by multiplying r with table scale and truncate to integer */
449 rt = _mm_mul_pd(r22,vftabscale);
450 vfitab = _mm_cvttpd_epi32(rt);
451 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
452 vfitab = _mm_slli_epi32(vfitab,2);
454 /* CUBIC SPLINE TABLE ELECTROSTATICS */
455 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
456 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
457 GMX_MM_TRANSPOSE2_PD(Y,F);
458 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
459 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
460 GMX_MM_TRANSPOSE2_PD(G,H);
461 Heps = _mm_mul_pd(vfeps,H);
462 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
463 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
464 velec = _mm_mul_pd(qq22,VV);
465 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
466 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
468 /* Update potential sum for this i atom from the interaction with this j atom. */
469 velecsum = _mm_add_pd(velecsum,velec);
473 /* Calculate temporary vectorial force */
474 tx = _mm_mul_pd(fscal,dx22);
475 ty = _mm_mul_pd(fscal,dy22);
476 tz = _mm_mul_pd(fscal,dz22);
478 /* Update vectorial force */
479 fix2 = _mm_add_pd(fix2,tx);
480 fiy2 = _mm_add_pd(fiy2,ty);
481 fiz2 = _mm_add_pd(fiz2,tz);
483 fjx2 = _mm_add_pd(fjx2,tx);
484 fjy2 = _mm_add_pd(fjy2,ty);
485 fjz2 = _mm_add_pd(fjz2,tz);
487 /**************************
488 * CALCULATE INTERACTIONS *
489 **************************/
491 r23 = _mm_mul_pd(rsq23,rinv23);
493 /* Calculate table index by multiplying r with table scale and truncate to integer */
494 rt = _mm_mul_pd(r23,vftabscale);
495 vfitab = _mm_cvttpd_epi32(rt);
496 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
497 vfitab = _mm_slli_epi32(vfitab,2);
499 /* CUBIC SPLINE TABLE ELECTROSTATICS */
500 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
501 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
502 GMX_MM_TRANSPOSE2_PD(Y,F);
503 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
504 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
505 GMX_MM_TRANSPOSE2_PD(G,H);
506 Heps = _mm_mul_pd(vfeps,H);
507 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
508 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
509 velec = _mm_mul_pd(qq23,VV);
510 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
511 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
513 /* Update potential sum for this i atom from the interaction with this j atom. */
514 velecsum = _mm_add_pd(velecsum,velec);
518 /* Calculate temporary vectorial force */
519 tx = _mm_mul_pd(fscal,dx23);
520 ty = _mm_mul_pd(fscal,dy23);
521 tz = _mm_mul_pd(fscal,dz23);
523 /* Update vectorial force */
524 fix2 = _mm_add_pd(fix2,tx);
525 fiy2 = _mm_add_pd(fiy2,ty);
526 fiz2 = _mm_add_pd(fiz2,tz);
528 fjx3 = _mm_add_pd(fjx3,tx);
529 fjy3 = _mm_add_pd(fjy3,ty);
530 fjz3 = _mm_add_pd(fjz3,tz);
532 /**************************
533 * CALCULATE INTERACTIONS *
534 **************************/
536 r31 = _mm_mul_pd(rsq31,rinv31);
538 /* Calculate table index by multiplying r with table scale and truncate to integer */
539 rt = _mm_mul_pd(r31,vftabscale);
540 vfitab = _mm_cvttpd_epi32(rt);
541 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
542 vfitab = _mm_slli_epi32(vfitab,2);
544 /* CUBIC SPLINE TABLE ELECTROSTATICS */
545 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
546 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
547 GMX_MM_TRANSPOSE2_PD(Y,F);
548 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
549 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
550 GMX_MM_TRANSPOSE2_PD(G,H);
551 Heps = _mm_mul_pd(vfeps,H);
552 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
553 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
554 velec = _mm_mul_pd(qq31,VV);
555 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
556 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
558 /* Update potential sum for this i atom from the interaction with this j atom. */
559 velecsum = _mm_add_pd(velecsum,velec);
563 /* Calculate temporary vectorial force */
564 tx = _mm_mul_pd(fscal,dx31);
565 ty = _mm_mul_pd(fscal,dy31);
566 tz = _mm_mul_pd(fscal,dz31);
568 /* Update vectorial force */
569 fix3 = _mm_add_pd(fix3,tx);
570 fiy3 = _mm_add_pd(fiy3,ty);
571 fiz3 = _mm_add_pd(fiz3,tz);
573 fjx1 = _mm_add_pd(fjx1,tx);
574 fjy1 = _mm_add_pd(fjy1,ty);
575 fjz1 = _mm_add_pd(fjz1,tz);
577 /**************************
578 * CALCULATE INTERACTIONS *
579 **************************/
581 r32 = _mm_mul_pd(rsq32,rinv32);
583 /* Calculate table index by multiplying r with table scale and truncate to integer */
584 rt = _mm_mul_pd(r32,vftabscale);
585 vfitab = _mm_cvttpd_epi32(rt);
586 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
587 vfitab = _mm_slli_epi32(vfitab,2);
589 /* CUBIC SPLINE TABLE ELECTROSTATICS */
590 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
591 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
592 GMX_MM_TRANSPOSE2_PD(Y,F);
593 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
594 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
595 GMX_MM_TRANSPOSE2_PD(G,H);
596 Heps = _mm_mul_pd(vfeps,H);
597 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
598 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
599 velec = _mm_mul_pd(qq32,VV);
600 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
601 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
603 /* Update potential sum for this i atom from the interaction with this j atom. */
604 velecsum = _mm_add_pd(velecsum,velec);
608 /* Calculate temporary vectorial force */
609 tx = _mm_mul_pd(fscal,dx32);
610 ty = _mm_mul_pd(fscal,dy32);
611 tz = _mm_mul_pd(fscal,dz32);
613 /* Update vectorial force */
614 fix3 = _mm_add_pd(fix3,tx);
615 fiy3 = _mm_add_pd(fiy3,ty);
616 fiz3 = _mm_add_pd(fiz3,tz);
618 fjx2 = _mm_add_pd(fjx2,tx);
619 fjy2 = _mm_add_pd(fjy2,ty);
620 fjz2 = _mm_add_pd(fjz2,tz);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 r33 = _mm_mul_pd(rsq33,rinv33);
628 /* Calculate table index by multiplying r with table scale and truncate to integer */
629 rt = _mm_mul_pd(r33,vftabscale);
630 vfitab = _mm_cvttpd_epi32(rt);
631 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
632 vfitab = _mm_slli_epi32(vfitab,2);
634 /* CUBIC SPLINE TABLE ELECTROSTATICS */
635 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
636 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
637 GMX_MM_TRANSPOSE2_PD(Y,F);
638 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
639 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
640 GMX_MM_TRANSPOSE2_PD(G,H);
641 Heps = _mm_mul_pd(vfeps,H);
642 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
643 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
644 velec = _mm_mul_pd(qq33,VV);
645 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
646 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
648 /* Update potential sum for this i atom from the interaction with this j atom. */
649 velecsum = _mm_add_pd(velecsum,velec);
653 /* Calculate temporary vectorial force */
654 tx = _mm_mul_pd(fscal,dx33);
655 ty = _mm_mul_pd(fscal,dy33);
656 tz = _mm_mul_pd(fscal,dz33);
658 /* Update vectorial force */
659 fix3 = _mm_add_pd(fix3,tx);
660 fiy3 = _mm_add_pd(fiy3,ty);
661 fiz3 = _mm_add_pd(fiz3,tz);
663 fjx3 = _mm_add_pd(fjx3,tx);
664 fjy3 = _mm_add_pd(fjy3,ty);
665 fjz3 = _mm_add_pd(fjz3,tz);
667 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
669 /* Inner loop uses 387 flops */
676 j_coord_offsetA = DIM*jnrA;
678 /* load j atom coordinates */
679 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
680 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
682 /* Calculate displacement vector */
683 dx11 = _mm_sub_pd(ix1,jx1);
684 dy11 = _mm_sub_pd(iy1,jy1);
685 dz11 = _mm_sub_pd(iz1,jz1);
686 dx12 = _mm_sub_pd(ix1,jx2);
687 dy12 = _mm_sub_pd(iy1,jy2);
688 dz12 = _mm_sub_pd(iz1,jz2);
689 dx13 = _mm_sub_pd(ix1,jx3);
690 dy13 = _mm_sub_pd(iy1,jy3);
691 dz13 = _mm_sub_pd(iz1,jz3);
692 dx21 = _mm_sub_pd(ix2,jx1);
693 dy21 = _mm_sub_pd(iy2,jy1);
694 dz21 = _mm_sub_pd(iz2,jz1);
695 dx22 = _mm_sub_pd(ix2,jx2);
696 dy22 = _mm_sub_pd(iy2,jy2);
697 dz22 = _mm_sub_pd(iz2,jz2);
698 dx23 = _mm_sub_pd(ix2,jx3);
699 dy23 = _mm_sub_pd(iy2,jy3);
700 dz23 = _mm_sub_pd(iz2,jz3);
701 dx31 = _mm_sub_pd(ix3,jx1);
702 dy31 = _mm_sub_pd(iy3,jy1);
703 dz31 = _mm_sub_pd(iz3,jz1);
704 dx32 = _mm_sub_pd(ix3,jx2);
705 dy32 = _mm_sub_pd(iy3,jy2);
706 dz32 = _mm_sub_pd(iz3,jz2);
707 dx33 = _mm_sub_pd(ix3,jx3);
708 dy33 = _mm_sub_pd(iy3,jy3);
709 dz33 = _mm_sub_pd(iz3,jz3);
711 /* Calculate squared distance and things based on it */
712 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
713 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
714 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
715 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
716 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
717 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
718 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
719 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
720 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
722 rinv11 = gmx_mm_invsqrt_pd(rsq11);
723 rinv12 = gmx_mm_invsqrt_pd(rsq12);
724 rinv13 = gmx_mm_invsqrt_pd(rsq13);
725 rinv21 = gmx_mm_invsqrt_pd(rsq21);
726 rinv22 = gmx_mm_invsqrt_pd(rsq22);
727 rinv23 = gmx_mm_invsqrt_pd(rsq23);
728 rinv31 = gmx_mm_invsqrt_pd(rsq31);
729 rinv32 = gmx_mm_invsqrt_pd(rsq32);
730 rinv33 = gmx_mm_invsqrt_pd(rsq33);
732 fjx1 = _mm_setzero_pd();
733 fjy1 = _mm_setzero_pd();
734 fjz1 = _mm_setzero_pd();
735 fjx2 = _mm_setzero_pd();
736 fjy2 = _mm_setzero_pd();
737 fjz2 = _mm_setzero_pd();
738 fjx3 = _mm_setzero_pd();
739 fjy3 = _mm_setzero_pd();
740 fjz3 = _mm_setzero_pd();
742 /**************************
743 * CALCULATE INTERACTIONS *
744 **************************/
746 r11 = _mm_mul_pd(rsq11,rinv11);
748 /* Calculate table index by multiplying r with table scale and truncate to integer */
749 rt = _mm_mul_pd(r11,vftabscale);
750 vfitab = _mm_cvttpd_epi32(rt);
751 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
752 vfitab = _mm_slli_epi32(vfitab,2);
754 /* CUBIC SPLINE TABLE ELECTROSTATICS */
755 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
756 F = _mm_setzero_pd();
757 GMX_MM_TRANSPOSE2_PD(Y,F);
758 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
759 H = _mm_setzero_pd();
760 GMX_MM_TRANSPOSE2_PD(G,H);
761 Heps = _mm_mul_pd(vfeps,H);
762 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
763 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
764 velec = _mm_mul_pd(qq11,VV);
765 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
766 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
768 /* Update potential sum for this i atom from the interaction with this j atom. */
769 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
770 velecsum = _mm_add_pd(velecsum,velec);
774 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
776 /* Calculate temporary vectorial force */
777 tx = _mm_mul_pd(fscal,dx11);
778 ty = _mm_mul_pd(fscal,dy11);
779 tz = _mm_mul_pd(fscal,dz11);
781 /* Update vectorial force */
782 fix1 = _mm_add_pd(fix1,tx);
783 fiy1 = _mm_add_pd(fiy1,ty);
784 fiz1 = _mm_add_pd(fiz1,tz);
786 fjx1 = _mm_add_pd(fjx1,tx);
787 fjy1 = _mm_add_pd(fjy1,ty);
788 fjz1 = _mm_add_pd(fjz1,tz);
790 /**************************
791 * CALCULATE INTERACTIONS *
792 **************************/
794 r12 = _mm_mul_pd(rsq12,rinv12);
796 /* Calculate table index by multiplying r with table scale and truncate to integer */
797 rt = _mm_mul_pd(r12,vftabscale);
798 vfitab = _mm_cvttpd_epi32(rt);
799 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
800 vfitab = _mm_slli_epi32(vfitab,2);
802 /* CUBIC SPLINE TABLE ELECTROSTATICS */
803 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
804 F = _mm_setzero_pd();
805 GMX_MM_TRANSPOSE2_PD(Y,F);
806 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
807 H = _mm_setzero_pd();
808 GMX_MM_TRANSPOSE2_PD(G,H);
809 Heps = _mm_mul_pd(vfeps,H);
810 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
811 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
812 velec = _mm_mul_pd(qq12,VV);
813 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
814 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
816 /* Update potential sum for this i atom from the interaction with this j atom. */
817 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
818 velecsum = _mm_add_pd(velecsum,velec);
822 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
824 /* Calculate temporary vectorial force */
825 tx = _mm_mul_pd(fscal,dx12);
826 ty = _mm_mul_pd(fscal,dy12);
827 tz = _mm_mul_pd(fscal,dz12);
829 /* Update vectorial force */
830 fix1 = _mm_add_pd(fix1,tx);
831 fiy1 = _mm_add_pd(fiy1,ty);
832 fiz1 = _mm_add_pd(fiz1,tz);
834 fjx2 = _mm_add_pd(fjx2,tx);
835 fjy2 = _mm_add_pd(fjy2,ty);
836 fjz2 = _mm_add_pd(fjz2,tz);
838 /**************************
839 * CALCULATE INTERACTIONS *
840 **************************/
842 r13 = _mm_mul_pd(rsq13,rinv13);
844 /* Calculate table index by multiplying r with table scale and truncate to integer */
845 rt = _mm_mul_pd(r13,vftabscale);
846 vfitab = _mm_cvttpd_epi32(rt);
847 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
848 vfitab = _mm_slli_epi32(vfitab,2);
850 /* CUBIC SPLINE TABLE ELECTROSTATICS */
851 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
852 F = _mm_setzero_pd();
853 GMX_MM_TRANSPOSE2_PD(Y,F);
854 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
855 H = _mm_setzero_pd();
856 GMX_MM_TRANSPOSE2_PD(G,H);
857 Heps = _mm_mul_pd(vfeps,H);
858 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
859 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
860 velec = _mm_mul_pd(qq13,VV);
861 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
862 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
864 /* Update potential sum for this i atom from the interaction with this j atom. */
865 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
866 velecsum = _mm_add_pd(velecsum,velec);
870 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
872 /* Calculate temporary vectorial force */
873 tx = _mm_mul_pd(fscal,dx13);
874 ty = _mm_mul_pd(fscal,dy13);
875 tz = _mm_mul_pd(fscal,dz13);
877 /* Update vectorial force */
878 fix1 = _mm_add_pd(fix1,tx);
879 fiy1 = _mm_add_pd(fiy1,ty);
880 fiz1 = _mm_add_pd(fiz1,tz);
882 fjx3 = _mm_add_pd(fjx3,tx);
883 fjy3 = _mm_add_pd(fjy3,ty);
884 fjz3 = _mm_add_pd(fjz3,tz);
886 /**************************
887 * CALCULATE INTERACTIONS *
888 **************************/
890 r21 = _mm_mul_pd(rsq21,rinv21);
892 /* Calculate table index by multiplying r with table scale and truncate to integer */
893 rt = _mm_mul_pd(r21,vftabscale);
894 vfitab = _mm_cvttpd_epi32(rt);
895 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
896 vfitab = _mm_slli_epi32(vfitab,2);
898 /* CUBIC SPLINE TABLE ELECTROSTATICS */
899 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
900 F = _mm_setzero_pd();
901 GMX_MM_TRANSPOSE2_PD(Y,F);
902 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
903 H = _mm_setzero_pd();
904 GMX_MM_TRANSPOSE2_PD(G,H);
905 Heps = _mm_mul_pd(vfeps,H);
906 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
907 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
908 velec = _mm_mul_pd(qq21,VV);
909 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
910 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
912 /* Update potential sum for this i atom from the interaction with this j atom. */
913 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
914 velecsum = _mm_add_pd(velecsum,velec);
918 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
920 /* Calculate temporary vectorial force */
921 tx = _mm_mul_pd(fscal,dx21);
922 ty = _mm_mul_pd(fscal,dy21);
923 tz = _mm_mul_pd(fscal,dz21);
925 /* Update vectorial force */
926 fix2 = _mm_add_pd(fix2,tx);
927 fiy2 = _mm_add_pd(fiy2,ty);
928 fiz2 = _mm_add_pd(fiz2,tz);
930 fjx1 = _mm_add_pd(fjx1,tx);
931 fjy1 = _mm_add_pd(fjy1,ty);
932 fjz1 = _mm_add_pd(fjz1,tz);
934 /**************************
935 * CALCULATE INTERACTIONS *
936 **************************/
938 r22 = _mm_mul_pd(rsq22,rinv22);
940 /* Calculate table index by multiplying r with table scale and truncate to integer */
941 rt = _mm_mul_pd(r22,vftabscale);
942 vfitab = _mm_cvttpd_epi32(rt);
943 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
944 vfitab = _mm_slli_epi32(vfitab,2);
946 /* CUBIC SPLINE TABLE ELECTROSTATICS */
947 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
948 F = _mm_setzero_pd();
949 GMX_MM_TRANSPOSE2_PD(Y,F);
950 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
951 H = _mm_setzero_pd();
952 GMX_MM_TRANSPOSE2_PD(G,H);
953 Heps = _mm_mul_pd(vfeps,H);
954 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
955 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
956 velec = _mm_mul_pd(qq22,VV);
957 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
958 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
960 /* Update potential sum for this i atom from the interaction with this j atom. */
961 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
962 velecsum = _mm_add_pd(velecsum,velec);
966 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
968 /* Calculate temporary vectorial force */
969 tx = _mm_mul_pd(fscal,dx22);
970 ty = _mm_mul_pd(fscal,dy22);
971 tz = _mm_mul_pd(fscal,dz22);
973 /* Update vectorial force */
974 fix2 = _mm_add_pd(fix2,tx);
975 fiy2 = _mm_add_pd(fiy2,ty);
976 fiz2 = _mm_add_pd(fiz2,tz);
978 fjx2 = _mm_add_pd(fjx2,tx);
979 fjy2 = _mm_add_pd(fjy2,ty);
980 fjz2 = _mm_add_pd(fjz2,tz);
982 /**************************
983 * CALCULATE INTERACTIONS *
984 **************************/
986 r23 = _mm_mul_pd(rsq23,rinv23);
988 /* Calculate table index by multiplying r with table scale and truncate to integer */
989 rt = _mm_mul_pd(r23,vftabscale);
990 vfitab = _mm_cvttpd_epi32(rt);
991 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
992 vfitab = _mm_slli_epi32(vfitab,2);
994 /* CUBIC SPLINE TABLE ELECTROSTATICS */
995 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
996 F = _mm_setzero_pd();
997 GMX_MM_TRANSPOSE2_PD(Y,F);
998 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
999 H = _mm_setzero_pd();
1000 GMX_MM_TRANSPOSE2_PD(G,H);
1001 Heps = _mm_mul_pd(vfeps,H);
1002 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1003 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1004 velec = _mm_mul_pd(qq23,VV);
1005 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1006 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1008 /* Update potential sum for this i atom from the interaction with this j atom. */
1009 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1010 velecsum = _mm_add_pd(velecsum,velec);
1014 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1016 /* Calculate temporary vectorial force */
1017 tx = _mm_mul_pd(fscal,dx23);
1018 ty = _mm_mul_pd(fscal,dy23);
1019 tz = _mm_mul_pd(fscal,dz23);
1021 /* Update vectorial force */
1022 fix2 = _mm_add_pd(fix2,tx);
1023 fiy2 = _mm_add_pd(fiy2,ty);
1024 fiz2 = _mm_add_pd(fiz2,tz);
1026 fjx3 = _mm_add_pd(fjx3,tx);
1027 fjy3 = _mm_add_pd(fjy3,ty);
1028 fjz3 = _mm_add_pd(fjz3,tz);
1030 /**************************
1031 * CALCULATE INTERACTIONS *
1032 **************************/
1034 r31 = _mm_mul_pd(rsq31,rinv31);
1036 /* Calculate table index by multiplying r with table scale and truncate to integer */
1037 rt = _mm_mul_pd(r31,vftabscale);
1038 vfitab = _mm_cvttpd_epi32(rt);
1039 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1040 vfitab = _mm_slli_epi32(vfitab,2);
1042 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1043 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1044 F = _mm_setzero_pd();
1045 GMX_MM_TRANSPOSE2_PD(Y,F);
1046 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1047 H = _mm_setzero_pd();
1048 GMX_MM_TRANSPOSE2_PD(G,H);
1049 Heps = _mm_mul_pd(vfeps,H);
1050 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1051 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1052 velec = _mm_mul_pd(qq31,VV);
1053 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1054 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1056 /* Update potential sum for this i atom from the interaction with this j atom. */
1057 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1058 velecsum = _mm_add_pd(velecsum,velec);
1062 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1064 /* Calculate temporary vectorial force */
1065 tx = _mm_mul_pd(fscal,dx31);
1066 ty = _mm_mul_pd(fscal,dy31);
1067 tz = _mm_mul_pd(fscal,dz31);
1069 /* Update vectorial force */
1070 fix3 = _mm_add_pd(fix3,tx);
1071 fiy3 = _mm_add_pd(fiy3,ty);
1072 fiz3 = _mm_add_pd(fiz3,tz);
1074 fjx1 = _mm_add_pd(fjx1,tx);
1075 fjy1 = _mm_add_pd(fjy1,ty);
1076 fjz1 = _mm_add_pd(fjz1,tz);
1078 /**************************
1079 * CALCULATE INTERACTIONS *
1080 **************************/
1082 r32 = _mm_mul_pd(rsq32,rinv32);
1084 /* Calculate table index by multiplying r with table scale and truncate to integer */
1085 rt = _mm_mul_pd(r32,vftabscale);
1086 vfitab = _mm_cvttpd_epi32(rt);
1087 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1088 vfitab = _mm_slli_epi32(vfitab,2);
1090 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1091 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1092 F = _mm_setzero_pd();
1093 GMX_MM_TRANSPOSE2_PD(Y,F);
1094 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1095 H = _mm_setzero_pd();
1096 GMX_MM_TRANSPOSE2_PD(G,H);
1097 Heps = _mm_mul_pd(vfeps,H);
1098 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1099 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1100 velec = _mm_mul_pd(qq32,VV);
1101 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1102 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1104 /* Update potential sum for this i atom from the interaction with this j atom. */
1105 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1106 velecsum = _mm_add_pd(velecsum,velec);
1110 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1112 /* Calculate temporary vectorial force */
1113 tx = _mm_mul_pd(fscal,dx32);
1114 ty = _mm_mul_pd(fscal,dy32);
1115 tz = _mm_mul_pd(fscal,dz32);
1117 /* Update vectorial force */
1118 fix3 = _mm_add_pd(fix3,tx);
1119 fiy3 = _mm_add_pd(fiy3,ty);
1120 fiz3 = _mm_add_pd(fiz3,tz);
1122 fjx2 = _mm_add_pd(fjx2,tx);
1123 fjy2 = _mm_add_pd(fjy2,ty);
1124 fjz2 = _mm_add_pd(fjz2,tz);
1126 /**************************
1127 * CALCULATE INTERACTIONS *
1128 **************************/
1130 r33 = _mm_mul_pd(rsq33,rinv33);
1132 /* Calculate table index by multiplying r with table scale and truncate to integer */
1133 rt = _mm_mul_pd(r33,vftabscale);
1134 vfitab = _mm_cvttpd_epi32(rt);
1135 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1136 vfitab = _mm_slli_epi32(vfitab,2);
1138 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1139 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1140 F = _mm_setzero_pd();
1141 GMX_MM_TRANSPOSE2_PD(Y,F);
1142 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1143 H = _mm_setzero_pd();
1144 GMX_MM_TRANSPOSE2_PD(G,H);
1145 Heps = _mm_mul_pd(vfeps,H);
1146 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1147 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1148 velec = _mm_mul_pd(qq33,VV);
1149 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1150 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1152 /* Update potential sum for this i atom from the interaction with this j atom. */
1153 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1154 velecsum = _mm_add_pd(velecsum,velec);
1158 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1160 /* Calculate temporary vectorial force */
1161 tx = _mm_mul_pd(fscal,dx33);
1162 ty = _mm_mul_pd(fscal,dy33);
1163 tz = _mm_mul_pd(fscal,dz33);
1165 /* Update vectorial force */
1166 fix3 = _mm_add_pd(fix3,tx);
1167 fiy3 = _mm_add_pd(fiy3,ty);
1168 fiz3 = _mm_add_pd(fiz3,tz);
1170 fjx3 = _mm_add_pd(fjx3,tx);
1171 fjy3 = _mm_add_pd(fjy3,ty);
1172 fjz3 = _mm_add_pd(fjz3,tz);
1174 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1176 /* Inner loop uses 387 flops */
1179 /* End of innermost loop */
1181 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1182 f+i_coord_offset+DIM,fshift+i_shift_offset);
1185 /* Update potential energies */
1186 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1188 /* Increment number of inner iterations */
1189 inneriter += j_index_end - j_index_start;
1191 /* Outer loop uses 19 flops */
1194 /* Increment number of outer iterations */
1197 /* Update outer/inner flops */
1199 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*387);
1202 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_double
1203 * Electrostatics interaction: CubicSplineTable
1204 * VdW interaction: None
1205 * Geometry: Water4-Water4
1206 * Calculate force/pot: Force
1209 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_sse4_1_double
1210 (t_nblist * gmx_restrict nlist,
1211 rvec * gmx_restrict xx,
1212 rvec * gmx_restrict ff,
1213 t_forcerec * gmx_restrict fr,
1214 t_mdatoms * gmx_restrict mdatoms,
1215 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1216 t_nrnb * gmx_restrict nrnb)
1218 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1219 * just 0 for non-waters.
1220 * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different
1221 * jnr indices whose data are put in the two positions of the SIMD register.
1223 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1224 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1226 int j_coord_offsetA,j_coord_offsetB;
1227 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1228 real rcutoff_scalar;
1229 real *shiftvec,*fshift,*x,*f;
1230 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1232 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1234 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1236 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1237 int vdwjidx1A,vdwjidx1B;
1238 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1239 int vdwjidx2A,vdwjidx2B;
1240 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1241 int vdwjidx3A,vdwjidx3B;
1242 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1243 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1244 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1245 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1246 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1247 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1248 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1249 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1250 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1251 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1252 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1255 __m128i ifour = _mm_set1_epi32(4);
1256 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1258 __m128d dummy_mask,cutoff_mask;
1259 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1260 __m128d one = _mm_set1_pd(1.0);
1261 __m128d two = _mm_set1_pd(2.0);
1267 jindex = nlist->jindex;
1269 shiftidx = nlist->shift;
1271 shiftvec = fr->shift_vec[0];
1272 fshift = fr->fshift[0];
1273 facel = _mm_set1_pd(fr->epsfac);
1274 charge = mdatoms->chargeA;
1276 vftab = kernel_data->table_elec->data;
1277 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1279 /* Setup water-specific parameters */
1280 inr = nlist->iinr[0];
1281 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1282 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1283 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1285 jq1 = _mm_set1_pd(charge[inr+1]);
1286 jq2 = _mm_set1_pd(charge[inr+2]);
1287 jq3 = _mm_set1_pd(charge[inr+3]);
1288 qq11 = _mm_mul_pd(iq1,jq1);
1289 qq12 = _mm_mul_pd(iq1,jq2);
1290 qq13 = _mm_mul_pd(iq1,jq3);
1291 qq21 = _mm_mul_pd(iq2,jq1);
1292 qq22 = _mm_mul_pd(iq2,jq2);
1293 qq23 = _mm_mul_pd(iq2,jq3);
1294 qq31 = _mm_mul_pd(iq3,jq1);
1295 qq32 = _mm_mul_pd(iq3,jq2);
1296 qq33 = _mm_mul_pd(iq3,jq3);
1298 /* Initialize the j offsets up front to avoid compiler warnings about possibly uninitialized variables */
1300 j_coord_offsetA = 0;
1301 j_coord_offsetB = 0;
1306 /* Start outer loop over neighborlists */
1307 for(iidx=0; iidx<nri; iidx++)
1309 /* Load shift vector for this list */
1310 i_shift_offset = DIM*shiftidx[iidx];
1312 /* Load limits for loop over neighbors */
1313 j_index_start = jindex[iidx];
1314 j_index_end = jindex[iidx+1];
1316 /* Get outer coordinate index */
1318 i_coord_offset = DIM*inr;
1320 /* Load i particle coords and add shift vector */
1321 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1322 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1324 fix1 = _mm_setzero_pd();
1325 fiy1 = _mm_setzero_pd();
1326 fiz1 = _mm_setzero_pd();
1327 fix2 = _mm_setzero_pd();
1328 fiy2 = _mm_setzero_pd();
1329 fiz2 = _mm_setzero_pd();
1330 fix3 = _mm_setzero_pd();
1331 fiy3 = _mm_setzero_pd();
1332 fiz3 = _mm_setzero_pd();
1334 /* Start inner kernel loop */
1335 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1338 /* Get j neighbor index, and coordinate index */
1340 jnrB = jjnr[jidx+1];
1341 j_coord_offsetA = DIM*jnrA;
1342 j_coord_offsetB = DIM*jnrB;
1344 /* load j atom coordinates */
1345 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1346 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1348 /* Calculate displacement vector */
1349 dx11 = _mm_sub_pd(ix1,jx1);
1350 dy11 = _mm_sub_pd(iy1,jy1);
1351 dz11 = _mm_sub_pd(iz1,jz1);
1352 dx12 = _mm_sub_pd(ix1,jx2);
1353 dy12 = _mm_sub_pd(iy1,jy2);
1354 dz12 = _mm_sub_pd(iz1,jz2);
1355 dx13 = _mm_sub_pd(ix1,jx3);
1356 dy13 = _mm_sub_pd(iy1,jy3);
1357 dz13 = _mm_sub_pd(iz1,jz3);
1358 dx21 = _mm_sub_pd(ix2,jx1);
1359 dy21 = _mm_sub_pd(iy2,jy1);
1360 dz21 = _mm_sub_pd(iz2,jz1);
1361 dx22 = _mm_sub_pd(ix2,jx2);
1362 dy22 = _mm_sub_pd(iy2,jy2);
1363 dz22 = _mm_sub_pd(iz2,jz2);
1364 dx23 = _mm_sub_pd(ix2,jx3);
1365 dy23 = _mm_sub_pd(iy2,jy3);
1366 dz23 = _mm_sub_pd(iz2,jz3);
1367 dx31 = _mm_sub_pd(ix3,jx1);
1368 dy31 = _mm_sub_pd(iy3,jy1);
1369 dz31 = _mm_sub_pd(iz3,jz1);
1370 dx32 = _mm_sub_pd(ix3,jx2);
1371 dy32 = _mm_sub_pd(iy3,jy2);
1372 dz32 = _mm_sub_pd(iz3,jz2);
1373 dx33 = _mm_sub_pd(ix3,jx3);
1374 dy33 = _mm_sub_pd(iy3,jy3);
1375 dz33 = _mm_sub_pd(iz3,jz3);
1377 /* Calculate squared distance and things based on it */
1378 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1379 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1380 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1381 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1382 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1383 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1384 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1385 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1386 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1388 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1389 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1390 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1391 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1392 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1393 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1394 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1395 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1396 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1398 fjx1 = _mm_setzero_pd();
1399 fjy1 = _mm_setzero_pd();
1400 fjz1 = _mm_setzero_pd();
1401 fjx2 = _mm_setzero_pd();
1402 fjy2 = _mm_setzero_pd();
1403 fjz2 = _mm_setzero_pd();
1404 fjx3 = _mm_setzero_pd();
1405 fjy3 = _mm_setzero_pd();
1406 fjz3 = _mm_setzero_pd();
1408 /**************************
1409 * CALCULATE INTERACTIONS *
1410 **************************/
1412 r11 = _mm_mul_pd(rsq11,rinv11);
1414 /* Calculate table index by multiplying r with table scale and truncate to integer */
1415 rt = _mm_mul_pd(r11,vftabscale);
1416 vfitab = _mm_cvttpd_epi32(rt);
1417 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1418 vfitab = _mm_slli_epi32(vfitab,2);
1420 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1421 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1422 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1423 GMX_MM_TRANSPOSE2_PD(Y,F);
1424 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1425 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1426 GMX_MM_TRANSPOSE2_PD(G,H);
1427 Heps = _mm_mul_pd(vfeps,H);
1428 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1429 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1430 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1434 /* Calculate temporary vectorial force */
1435 tx = _mm_mul_pd(fscal,dx11);
1436 ty = _mm_mul_pd(fscal,dy11);
1437 tz = _mm_mul_pd(fscal,dz11);
1439 /* Update vectorial force */
1440 fix1 = _mm_add_pd(fix1,tx);
1441 fiy1 = _mm_add_pd(fiy1,ty);
1442 fiz1 = _mm_add_pd(fiz1,tz);
1444 fjx1 = _mm_add_pd(fjx1,tx);
1445 fjy1 = _mm_add_pd(fjy1,ty);
1446 fjz1 = _mm_add_pd(fjz1,tz);
1448 /**************************
1449 * CALCULATE INTERACTIONS *
1450 **************************/
1452 r12 = _mm_mul_pd(rsq12,rinv12);
1454 /* Calculate table index by multiplying r with table scale and truncate to integer */
1455 rt = _mm_mul_pd(r12,vftabscale);
1456 vfitab = _mm_cvttpd_epi32(rt);
1457 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1458 vfitab = _mm_slli_epi32(vfitab,2);
1460 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1461 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1462 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1463 GMX_MM_TRANSPOSE2_PD(Y,F);
1464 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1465 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1466 GMX_MM_TRANSPOSE2_PD(G,H);
1467 Heps = _mm_mul_pd(vfeps,H);
1468 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1469 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1470 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1474 /* Calculate temporary vectorial force */
1475 tx = _mm_mul_pd(fscal,dx12);
1476 ty = _mm_mul_pd(fscal,dy12);
1477 tz = _mm_mul_pd(fscal,dz12);
1479 /* Update vectorial force */
1480 fix1 = _mm_add_pd(fix1,tx);
1481 fiy1 = _mm_add_pd(fiy1,ty);
1482 fiz1 = _mm_add_pd(fiz1,tz);
1484 fjx2 = _mm_add_pd(fjx2,tx);
1485 fjy2 = _mm_add_pd(fjy2,ty);
1486 fjz2 = _mm_add_pd(fjz2,tz);
1488 /**************************
1489 * CALCULATE INTERACTIONS *
1490 **************************/
1492 r13 = _mm_mul_pd(rsq13,rinv13);
1494 /* Calculate table index by multiplying r with table scale and truncate to integer */
1495 rt = _mm_mul_pd(r13,vftabscale);
1496 vfitab = _mm_cvttpd_epi32(rt);
1497 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1498 vfitab = _mm_slli_epi32(vfitab,2);
1500 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1501 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1502 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1503 GMX_MM_TRANSPOSE2_PD(Y,F);
1504 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1505 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1506 GMX_MM_TRANSPOSE2_PD(G,H);
1507 Heps = _mm_mul_pd(vfeps,H);
1508 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1509 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1510 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1514 /* Calculate temporary vectorial force */
1515 tx = _mm_mul_pd(fscal,dx13);
1516 ty = _mm_mul_pd(fscal,dy13);
1517 tz = _mm_mul_pd(fscal,dz13);
1519 /* Update vectorial force */
1520 fix1 = _mm_add_pd(fix1,tx);
1521 fiy1 = _mm_add_pd(fiy1,ty);
1522 fiz1 = _mm_add_pd(fiz1,tz);
1524 fjx3 = _mm_add_pd(fjx3,tx);
1525 fjy3 = _mm_add_pd(fjy3,ty);
1526 fjz3 = _mm_add_pd(fjz3,tz);
1528 /**************************
1529 * CALCULATE INTERACTIONS *
1530 **************************/
1532 r21 = _mm_mul_pd(rsq21,rinv21);
1534 /* Calculate table index by multiplying r with table scale and truncate to integer */
1535 rt = _mm_mul_pd(r21,vftabscale);
1536 vfitab = _mm_cvttpd_epi32(rt);
1537 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1538 vfitab = _mm_slli_epi32(vfitab,2);
1540 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1541 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1542 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1543 GMX_MM_TRANSPOSE2_PD(Y,F);
1544 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1545 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1546 GMX_MM_TRANSPOSE2_PD(G,H);
1547 Heps = _mm_mul_pd(vfeps,H);
1548 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1549 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1550 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1554 /* Calculate temporary vectorial force */
1555 tx = _mm_mul_pd(fscal,dx21);
1556 ty = _mm_mul_pd(fscal,dy21);
1557 tz = _mm_mul_pd(fscal,dz21);
1559 /* Update vectorial force */
1560 fix2 = _mm_add_pd(fix2,tx);
1561 fiy2 = _mm_add_pd(fiy2,ty);
1562 fiz2 = _mm_add_pd(fiz2,tz);
1564 fjx1 = _mm_add_pd(fjx1,tx);
1565 fjy1 = _mm_add_pd(fjy1,ty);
1566 fjz1 = _mm_add_pd(fjz1,tz);
1568 /**************************
1569 * CALCULATE INTERACTIONS *
1570 **************************/
1572 r22 = _mm_mul_pd(rsq22,rinv22);
1574 /* Calculate table index by multiplying r with table scale and truncate to integer */
1575 rt = _mm_mul_pd(r22,vftabscale);
1576 vfitab = _mm_cvttpd_epi32(rt);
1577 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1578 vfitab = _mm_slli_epi32(vfitab,2);
1580 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1581 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1582 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1583 GMX_MM_TRANSPOSE2_PD(Y,F);
1584 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1585 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1586 GMX_MM_TRANSPOSE2_PD(G,H);
1587 Heps = _mm_mul_pd(vfeps,H);
1588 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1589 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1590 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1594 /* Calculate temporary vectorial force */
1595 tx = _mm_mul_pd(fscal,dx22);
1596 ty = _mm_mul_pd(fscal,dy22);
1597 tz = _mm_mul_pd(fscal,dz22);
1599 /* Update vectorial force */
1600 fix2 = _mm_add_pd(fix2,tx);
1601 fiy2 = _mm_add_pd(fiy2,ty);
1602 fiz2 = _mm_add_pd(fiz2,tz);
1604 fjx2 = _mm_add_pd(fjx2,tx);
1605 fjy2 = _mm_add_pd(fjy2,ty);
1606 fjz2 = _mm_add_pd(fjz2,tz);
1608 /**************************
1609 * CALCULATE INTERACTIONS *
1610 **************************/
1612 r23 = _mm_mul_pd(rsq23,rinv23);
1614 /* Calculate table index by multiplying r with table scale and truncate to integer */
1615 rt = _mm_mul_pd(r23,vftabscale);
1616 vfitab = _mm_cvttpd_epi32(rt);
1617 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1618 vfitab = _mm_slli_epi32(vfitab,2);
1620 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1621 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1622 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1623 GMX_MM_TRANSPOSE2_PD(Y,F);
1624 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1625 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1626 GMX_MM_TRANSPOSE2_PD(G,H);
1627 Heps = _mm_mul_pd(vfeps,H);
1628 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1629 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1630 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1634 /* Calculate temporary vectorial force */
1635 tx = _mm_mul_pd(fscal,dx23);
1636 ty = _mm_mul_pd(fscal,dy23);
1637 tz = _mm_mul_pd(fscal,dz23);
1639 /* Update vectorial force */
1640 fix2 = _mm_add_pd(fix2,tx);
1641 fiy2 = _mm_add_pd(fiy2,ty);
1642 fiz2 = _mm_add_pd(fiz2,tz);
1644 fjx3 = _mm_add_pd(fjx3,tx);
1645 fjy3 = _mm_add_pd(fjy3,ty);
1646 fjz3 = _mm_add_pd(fjz3,tz);
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 r31 = _mm_mul_pd(rsq31,rinv31);
1654 /* Calculate table index by multiplying r with table scale and truncate to integer */
1655 rt = _mm_mul_pd(r31,vftabscale);
1656 vfitab = _mm_cvttpd_epi32(rt);
1657 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1658 vfitab = _mm_slli_epi32(vfitab,2);
1660 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1661 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1662 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1663 GMX_MM_TRANSPOSE2_PD(Y,F);
1664 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1665 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1666 GMX_MM_TRANSPOSE2_PD(G,H);
1667 Heps = _mm_mul_pd(vfeps,H);
1668 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1669 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1670 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1674 /* Calculate temporary vectorial force */
1675 tx = _mm_mul_pd(fscal,dx31);
1676 ty = _mm_mul_pd(fscal,dy31);
1677 tz = _mm_mul_pd(fscal,dz31);
1679 /* Update vectorial force */
1680 fix3 = _mm_add_pd(fix3,tx);
1681 fiy3 = _mm_add_pd(fiy3,ty);
1682 fiz3 = _mm_add_pd(fiz3,tz);
1684 fjx1 = _mm_add_pd(fjx1,tx);
1685 fjy1 = _mm_add_pd(fjy1,ty);
1686 fjz1 = _mm_add_pd(fjz1,tz);
1688 /**************************
1689 * CALCULATE INTERACTIONS *
1690 **************************/
1692 r32 = _mm_mul_pd(rsq32,rinv32);
1694 /* Calculate table index by multiplying r with table scale and truncate to integer */
1695 rt = _mm_mul_pd(r32,vftabscale);
1696 vfitab = _mm_cvttpd_epi32(rt);
1697 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1698 vfitab = _mm_slli_epi32(vfitab,2);
1700 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1701 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1702 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1703 GMX_MM_TRANSPOSE2_PD(Y,F);
1704 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1705 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1706 GMX_MM_TRANSPOSE2_PD(G,H);
1707 Heps = _mm_mul_pd(vfeps,H);
1708 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1709 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1710 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1714 /* Calculate temporary vectorial force */
1715 tx = _mm_mul_pd(fscal,dx32);
1716 ty = _mm_mul_pd(fscal,dy32);
1717 tz = _mm_mul_pd(fscal,dz32);
1719 /* Update vectorial force */
1720 fix3 = _mm_add_pd(fix3,tx);
1721 fiy3 = _mm_add_pd(fiy3,ty);
1722 fiz3 = _mm_add_pd(fiz3,tz);
1724 fjx2 = _mm_add_pd(fjx2,tx);
1725 fjy2 = _mm_add_pd(fjy2,ty);
1726 fjz2 = _mm_add_pd(fjz2,tz);
1728 /**************************
1729 * CALCULATE INTERACTIONS *
1730 **************************/
1732 r33 = _mm_mul_pd(rsq33,rinv33);
1734 /* Calculate table index by multiplying r with table scale and truncate to integer */
1735 rt = _mm_mul_pd(r33,vftabscale);
1736 vfitab = _mm_cvttpd_epi32(rt);
1737 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1738 vfitab = _mm_slli_epi32(vfitab,2);
1740 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1741 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1742 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1743 GMX_MM_TRANSPOSE2_PD(Y,F);
1744 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1745 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1746 GMX_MM_TRANSPOSE2_PD(G,H);
1747 Heps = _mm_mul_pd(vfeps,H);
1748 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1749 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1750 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1754 /* Calculate temporary vectorial force */
1755 tx = _mm_mul_pd(fscal,dx33);
1756 ty = _mm_mul_pd(fscal,dy33);
1757 tz = _mm_mul_pd(fscal,dz33);
1759 /* Update vectorial force */
1760 fix3 = _mm_add_pd(fix3,tx);
1761 fiy3 = _mm_add_pd(fiy3,ty);
1762 fiz3 = _mm_add_pd(fiz3,tz);
1764 fjx3 = _mm_add_pd(fjx3,tx);
1765 fjy3 = _mm_add_pd(fjy3,ty);
1766 fjz3 = _mm_add_pd(fjz3,tz);
1768 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA+DIM,f+j_coord_offsetB+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1770 /* Inner loop uses 351 flops */
1773 if(jidx<j_index_end)
1777 j_coord_offsetA = DIM*jnrA;
1779 /* load j atom coordinates */
1780 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA+DIM,
1781 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1783 /* Calculate displacement vector */
1784 dx11 = _mm_sub_pd(ix1,jx1);
1785 dy11 = _mm_sub_pd(iy1,jy1);
1786 dz11 = _mm_sub_pd(iz1,jz1);
1787 dx12 = _mm_sub_pd(ix1,jx2);
1788 dy12 = _mm_sub_pd(iy1,jy2);
1789 dz12 = _mm_sub_pd(iz1,jz2);
1790 dx13 = _mm_sub_pd(ix1,jx3);
1791 dy13 = _mm_sub_pd(iy1,jy3);
1792 dz13 = _mm_sub_pd(iz1,jz3);
1793 dx21 = _mm_sub_pd(ix2,jx1);
1794 dy21 = _mm_sub_pd(iy2,jy1);
1795 dz21 = _mm_sub_pd(iz2,jz1);
1796 dx22 = _mm_sub_pd(ix2,jx2);
1797 dy22 = _mm_sub_pd(iy2,jy2);
1798 dz22 = _mm_sub_pd(iz2,jz2);
1799 dx23 = _mm_sub_pd(ix2,jx3);
1800 dy23 = _mm_sub_pd(iy2,jy3);
1801 dz23 = _mm_sub_pd(iz2,jz3);
1802 dx31 = _mm_sub_pd(ix3,jx1);
1803 dy31 = _mm_sub_pd(iy3,jy1);
1804 dz31 = _mm_sub_pd(iz3,jz1);
1805 dx32 = _mm_sub_pd(ix3,jx2);
1806 dy32 = _mm_sub_pd(iy3,jy2);
1807 dz32 = _mm_sub_pd(iz3,jz2);
1808 dx33 = _mm_sub_pd(ix3,jx3);
1809 dy33 = _mm_sub_pd(iy3,jy3);
1810 dz33 = _mm_sub_pd(iz3,jz3);
1812 /* Calculate squared distance and things based on it */
1813 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1814 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1815 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1816 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1817 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1818 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1819 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1820 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1821 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1823 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1824 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1825 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1826 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1827 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1828 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1829 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1830 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1831 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1833 fjx1 = _mm_setzero_pd();
1834 fjy1 = _mm_setzero_pd();
1835 fjz1 = _mm_setzero_pd();
1836 fjx2 = _mm_setzero_pd();
1837 fjy2 = _mm_setzero_pd();
1838 fjz2 = _mm_setzero_pd();
1839 fjx3 = _mm_setzero_pd();
1840 fjy3 = _mm_setzero_pd();
1841 fjz3 = _mm_setzero_pd();
1843 /**************************
1844 * CALCULATE INTERACTIONS *
1845 **************************/
1847 r11 = _mm_mul_pd(rsq11,rinv11);
1849 /* Calculate table index by multiplying r with table scale and truncate to integer */
1850 rt = _mm_mul_pd(r11,vftabscale);
1851 vfitab = _mm_cvttpd_epi32(rt);
1852 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1853 vfitab = _mm_slli_epi32(vfitab,2);
1855 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1856 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1857 F = _mm_setzero_pd();
1858 GMX_MM_TRANSPOSE2_PD(Y,F);
1859 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1860 H = _mm_setzero_pd();
1861 GMX_MM_TRANSPOSE2_PD(G,H);
1862 Heps = _mm_mul_pd(vfeps,H);
1863 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1864 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1865 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1869 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1871 /* Calculate temporary vectorial force */
1872 tx = _mm_mul_pd(fscal,dx11);
1873 ty = _mm_mul_pd(fscal,dy11);
1874 tz = _mm_mul_pd(fscal,dz11);
1876 /* Update vectorial force */
1877 fix1 = _mm_add_pd(fix1,tx);
1878 fiy1 = _mm_add_pd(fiy1,ty);
1879 fiz1 = _mm_add_pd(fiz1,tz);
1881 fjx1 = _mm_add_pd(fjx1,tx);
1882 fjy1 = _mm_add_pd(fjy1,ty);
1883 fjz1 = _mm_add_pd(fjz1,tz);
1885 /**************************
1886 * CALCULATE INTERACTIONS *
1887 **************************/
1889 r12 = _mm_mul_pd(rsq12,rinv12);
1891 /* Calculate table index by multiplying r with table scale and truncate to integer */
1892 rt = _mm_mul_pd(r12,vftabscale);
1893 vfitab = _mm_cvttpd_epi32(rt);
1894 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1895 vfitab = _mm_slli_epi32(vfitab,2);
1897 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1898 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1899 F = _mm_setzero_pd();
1900 GMX_MM_TRANSPOSE2_PD(Y,F);
1901 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1902 H = _mm_setzero_pd();
1903 GMX_MM_TRANSPOSE2_PD(G,H);
1904 Heps = _mm_mul_pd(vfeps,H);
1905 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1906 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1907 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1911 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1913 /* Calculate temporary vectorial force */
1914 tx = _mm_mul_pd(fscal,dx12);
1915 ty = _mm_mul_pd(fscal,dy12);
1916 tz = _mm_mul_pd(fscal,dz12);
1918 /* Update vectorial force */
1919 fix1 = _mm_add_pd(fix1,tx);
1920 fiy1 = _mm_add_pd(fiy1,ty);
1921 fiz1 = _mm_add_pd(fiz1,tz);
1923 fjx2 = _mm_add_pd(fjx2,tx);
1924 fjy2 = _mm_add_pd(fjy2,ty);
1925 fjz2 = _mm_add_pd(fjz2,tz);
1927 /**************************
1928 * CALCULATE INTERACTIONS *
1929 **************************/
1931 r13 = _mm_mul_pd(rsq13,rinv13);
1933 /* Calculate table index by multiplying r with table scale and truncate to integer */
1934 rt = _mm_mul_pd(r13,vftabscale);
1935 vfitab = _mm_cvttpd_epi32(rt);
1936 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1937 vfitab = _mm_slli_epi32(vfitab,2);
1939 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1940 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1941 F = _mm_setzero_pd();
1942 GMX_MM_TRANSPOSE2_PD(Y,F);
1943 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1944 H = _mm_setzero_pd();
1945 GMX_MM_TRANSPOSE2_PD(G,H);
1946 Heps = _mm_mul_pd(vfeps,H);
1947 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1948 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1949 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1953 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1955 /* Calculate temporary vectorial force */
1956 tx = _mm_mul_pd(fscal,dx13);
1957 ty = _mm_mul_pd(fscal,dy13);
1958 tz = _mm_mul_pd(fscal,dz13);
1960 /* Update vectorial force */
1961 fix1 = _mm_add_pd(fix1,tx);
1962 fiy1 = _mm_add_pd(fiy1,ty);
1963 fiz1 = _mm_add_pd(fiz1,tz);
1965 fjx3 = _mm_add_pd(fjx3,tx);
1966 fjy3 = _mm_add_pd(fjy3,ty);
1967 fjz3 = _mm_add_pd(fjz3,tz);
1969 /**************************
1970 * CALCULATE INTERACTIONS *
1971 **************************/
1973 r21 = _mm_mul_pd(rsq21,rinv21);
1975 /* Calculate table index by multiplying r with table scale and truncate to integer */
1976 rt = _mm_mul_pd(r21,vftabscale);
1977 vfitab = _mm_cvttpd_epi32(rt);
1978 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1979 vfitab = _mm_slli_epi32(vfitab,2);
1981 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1982 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1983 F = _mm_setzero_pd();
1984 GMX_MM_TRANSPOSE2_PD(Y,F);
1985 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1986 H = _mm_setzero_pd();
1987 GMX_MM_TRANSPOSE2_PD(G,H);
1988 Heps = _mm_mul_pd(vfeps,H);
1989 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1990 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1991 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1995 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1997 /* Calculate temporary vectorial force */
1998 tx = _mm_mul_pd(fscal,dx21);
1999 ty = _mm_mul_pd(fscal,dy21);
2000 tz = _mm_mul_pd(fscal,dz21);
2002 /* Update vectorial force */
2003 fix2 = _mm_add_pd(fix2,tx);
2004 fiy2 = _mm_add_pd(fiy2,ty);
2005 fiz2 = _mm_add_pd(fiz2,tz);
2007 fjx1 = _mm_add_pd(fjx1,tx);
2008 fjy1 = _mm_add_pd(fjy1,ty);
2009 fjz1 = _mm_add_pd(fjz1,tz);
2011 /**************************
2012 * CALCULATE INTERACTIONS *
2013 **************************/
2015 r22 = _mm_mul_pd(rsq22,rinv22);
2017 /* Calculate table index by multiplying r with table scale and truncate to integer */
2018 rt = _mm_mul_pd(r22,vftabscale);
2019 vfitab = _mm_cvttpd_epi32(rt);
2020 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2021 vfitab = _mm_slli_epi32(vfitab,2);
2023 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2024 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2025 F = _mm_setzero_pd();
2026 GMX_MM_TRANSPOSE2_PD(Y,F);
2027 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2028 H = _mm_setzero_pd();
2029 GMX_MM_TRANSPOSE2_PD(G,H);
2030 Heps = _mm_mul_pd(vfeps,H);
2031 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2032 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2033 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2037 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2039 /* Calculate temporary vectorial force */
2040 tx = _mm_mul_pd(fscal,dx22);
2041 ty = _mm_mul_pd(fscal,dy22);
2042 tz = _mm_mul_pd(fscal,dz22);
2044 /* Update vectorial force */
2045 fix2 = _mm_add_pd(fix2,tx);
2046 fiy2 = _mm_add_pd(fiy2,ty);
2047 fiz2 = _mm_add_pd(fiz2,tz);
2049 fjx2 = _mm_add_pd(fjx2,tx);
2050 fjy2 = _mm_add_pd(fjy2,ty);
2051 fjz2 = _mm_add_pd(fjz2,tz);
2053 /**************************
2054 * CALCULATE INTERACTIONS *
2055 **************************/
2057 r23 = _mm_mul_pd(rsq23,rinv23);
2059 /* Calculate table index by multiplying r with table scale and truncate to integer */
2060 rt = _mm_mul_pd(r23,vftabscale);
2061 vfitab = _mm_cvttpd_epi32(rt);
2062 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2063 vfitab = _mm_slli_epi32(vfitab,2);
2065 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2066 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2067 F = _mm_setzero_pd();
2068 GMX_MM_TRANSPOSE2_PD(Y,F);
2069 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2070 H = _mm_setzero_pd();
2071 GMX_MM_TRANSPOSE2_PD(G,H);
2072 Heps = _mm_mul_pd(vfeps,H);
2073 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2074 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2075 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
2079 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2081 /* Calculate temporary vectorial force */
2082 tx = _mm_mul_pd(fscal,dx23);
2083 ty = _mm_mul_pd(fscal,dy23);
2084 tz = _mm_mul_pd(fscal,dz23);
2086 /* Update vectorial force */
2087 fix2 = _mm_add_pd(fix2,tx);
2088 fiy2 = _mm_add_pd(fiy2,ty);
2089 fiz2 = _mm_add_pd(fiz2,tz);
2091 fjx3 = _mm_add_pd(fjx3,tx);
2092 fjy3 = _mm_add_pd(fjy3,ty);
2093 fjz3 = _mm_add_pd(fjz3,tz);
2095 /**************************
2096 * CALCULATE INTERACTIONS *
2097 **************************/
2099 r31 = _mm_mul_pd(rsq31,rinv31);
2101 /* Calculate table index by multiplying r with table scale and truncate to integer */
2102 rt = _mm_mul_pd(r31,vftabscale);
2103 vfitab = _mm_cvttpd_epi32(rt);
2104 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2105 vfitab = _mm_slli_epi32(vfitab,2);
2107 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2108 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2109 F = _mm_setzero_pd();
2110 GMX_MM_TRANSPOSE2_PD(Y,F);
2111 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2112 H = _mm_setzero_pd();
2113 GMX_MM_TRANSPOSE2_PD(G,H);
2114 Heps = _mm_mul_pd(vfeps,H);
2115 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2116 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2117 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
2121 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2123 /* Calculate temporary vectorial force */
2124 tx = _mm_mul_pd(fscal,dx31);
2125 ty = _mm_mul_pd(fscal,dy31);
2126 tz = _mm_mul_pd(fscal,dz31);
2128 /* Update vectorial force */
2129 fix3 = _mm_add_pd(fix3,tx);
2130 fiy3 = _mm_add_pd(fiy3,ty);
2131 fiz3 = _mm_add_pd(fiz3,tz);
2133 fjx1 = _mm_add_pd(fjx1,tx);
2134 fjy1 = _mm_add_pd(fjy1,ty);
2135 fjz1 = _mm_add_pd(fjz1,tz);
2137 /**************************
2138 * CALCULATE INTERACTIONS *
2139 **************************/
2141 r32 = _mm_mul_pd(rsq32,rinv32);
2143 /* Calculate table index by multiplying r with table scale and truncate to integer */
2144 rt = _mm_mul_pd(r32,vftabscale);
2145 vfitab = _mm_cvttpd_epi32(rt);
2146 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2147 vfitab = _mm_slli_epi32(vfitab,2);
2149 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2150 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2151 F = _mm_setzero_pd();
2152 GMX_MM_TRANSPOSE2_PD(Y,F);
2153 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2154 H = _mm_setzero_pd();
2155 GMX_MM_TRANSPOSE2_PD(G,H);
2156 Heps = _mm_mul_pd(vfeps,H);
2157 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2158 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2159 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
2163 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2165 /* Calculate temporary vectorial force */
2166 tx = _mm_mul_pd(fscal,dx32);
2167 ty = _mm_mul_pd(fscal,dy32);
2168 tz = _mm_mul_pd(fscal,dz32);
2170 /* Update vectorial force */
2171 fix3 = _mm_add_pd(fix3,tx);
2172 fiy3 = _mm_add_pd(fiy3,ty);
2173 fiz3 = _mm_add_pd(fiz3,tz);
2175 fjx2 = _mm_add_pd(fjx2,tx);
2176 fjy2 = _mm_add_pd(fjy2,ty);
2177 fjz2 = _mm_add_pd(fjz2,tz);
2179 /**************************
2180 * CALCULATE INTERACTIONS *
2181 **************************/
2183 r33 = _mm_mul_pd(rsq33,rinv33);
2185 /* Calculate table index by multiplying r with table scale and truncate to integer */
2186 rt = _mm_mul_pd(r33,vftabscale);
2187 vfitab = _mm_cvttpd_epi32(rt);
2188 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2189 vfitab = _mm_slli_epi32(vfitab,2);
2191 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2192 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2193 F = _mm_setzero_pd();
2194 GMX_MM_TRANSPOSE2_PD(Y,F);
2195 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2196 H = _mm_setzero_pd();
2197 GMX_MM_TRANSPOSE2_PD(G,H);
2198 Heps = _mm_mul_pd(vfeps,H);
2199 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2200 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2201 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
2205 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2207 /* Calculate temporary vectorial force */
2208 tx = _mm_mul_pd(fscal,dx33);
2209 ty = _mm_mul_pd(fscal,dy33);
2210 tz = _mm_mul_pd(fscal,dz33);
2212 /* Update vectorial force */
2213 fix3 = _mm_add_pd(fix3,tx);
2214 fiy3 = _mm_add_pd(fiy3,ty);
2215 fiz3 = _mm_add_pd(fiz3,tz);
2217 fjx3 = _mm_add_pd(fjx3,tx);
2218 fjy3 = _mm_add_pd(fjy3,ty);
2219 fjz3 = _mm_add_pd(fjz3,tz);
2221 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA+DIM,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2223 /* Inner loop uses 351 flops */
2226 /* End of innermost loop */
2228 gmx_mm_update_iforce_3atom_swizzle_pd(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2229 f+i_coord_offset+DIM,fshift+i_shift_offset);
2231 /* Increment number of inner iterations */
2232 inneriter += j_index_end - j_index_start;
2234 /* Outer loop uses 18 flops */
2237 /* Increment number of outer iterations */
2240 /* Update outer/inner flops */
2242 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*351);