2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "gromacs/math/vec.h"
49 #include "gromacs/simd/math_x86_sse4_1_double.h"
50 #include "kernelutil_x86_sse4_1_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse4_1_double
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: None
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse4_1_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the two positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128i ifour = _mm_set1_epi32(4);
107 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
109 __m128d dummy_mask,cutoff_mask;
110 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
111 __m128d one = _mm_set1_pd(1.0);
112 __m128d two = _mm_set1_pd(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm_set1_pd(fr->epsfac);
125 charge = mdatoms->chargeA;
127 vftab = kernel_data->table_elec->data;
128 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
130 /* Setup water-specific parameters */
131 inr = nlist->iinr[0];
132 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
133 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
134 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
136 jq0 = _mm_set1_pd(charge[inr+0]);
137 jq1 = _mm_set1_pd(charge[inr+1]);
138 jq2 = _mm_set1_pd(charge[inr+2]);
139 qq00 = _mm_mul_pd(iq0,jq0);
140 qq01 = _mm_mul_pd(iq0,jq1);
141 qq02 = _mm_mul_pd(iq0,jq2);
142 qq10 = _mm_mul_pd(iq1,jq0);
143 qq11 = _mm_mul_pd(iq1,jq1);
144 qq12 = _mm_mul_pd(iq1,jq2);
145 qq20 = _mm_mul_pd(iq2,jq0);
146 qq21 = _mm_mul_pd(iq2,jq1);
147 qq22 = _mm_mul_pd(iq2,jq2);
149 /* Avoid stupid compiler warnings */
157 /* Start outer loop over neighborlists */
158 for(iidx=0; iidx<nri; iidx++)
160 /* Load shift vector for this list */
161 i_shift_offset = DIM*shiftidx[iidx];
163 /* Load limits for loop over neighbors */
164 j_index_start = jindex[iidx];
165 j_index_end = jindex[iidx+1];
167 /* Get outer coordinate index */
169 i_coord_offset = DIM*inr;
171 /* Load i particle coords and add shift vector */
172 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
173 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
175 fix0 = _mm_setzero_pd();
176 fiy0 = _mm_setzero_pd();
177 fiz0 = _mm_setzero_pd();
178 fix1 = _mm_setzero_pd();
179 fiy1 = _mm_setzero_pd();
180 fiz1 = _mm_setzero_pd();
181 fix2 = _mm_setzero_pd();
182 fiy2 = _mm_setzero_pd();
183 fiz2 = _mm_setzero_pd();
185 /* Reset potential sums */
186 velecsum = _mm_setzero_pd();
188 /* Start inner kernel loop */
189 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
192 /* Get j neighbor index, and coordinate index */
195 j_coord_offsetA = DIM*jnrA;
196 j_coord_offsetB = DIM*jnrB;
198 /* load j atom coordinates */
199 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
200 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
202 /* Calculate displacement vector */
203 dx00 = _mm_sub_pd(ix0,jx0);
204 dy00 = _mm_sub_pd(iy0,jy0);
205 dz00 = _mm_sub_pd(iz0,jz0);
206 dx01 = _mm_sub_pd(ix0,jx1);
207 dy01 = _mm_sub_pd(iy0,jy1);
208 dz01 = _mm_sub_pd(iz0,jz1);
209 dx02 = _mm_sub_pd(ix0,jx2);
210 dy02 = _mm_sub_pd(iy0,jy2);
211 dz02 = _mm_sub_pd(iz0,jz2);
212 dx10 = _mm_sub_pd(ix1,jx0);
213 dy10 = _mm_sub_pd(iy1,jy0);
214 dz10 = _mm_sub_pd(iz1,jz0);
215 dx11 = _mm_sub_pd(ix1,jx1);
216 dy11 = _mm_sub_pd(iy1,jy1);
217 dz11 = _mm_sub_pd(iz1,jz1);
218 dx12 = _mm_sub_pd(ix1,jx2);
219 dy12 = _mm_sub_pd(iy1,jy2);
220 dz12 = _mm_sub_pd(iz1,jz2);
221 dx20 = _mm_sub_pd(ix2,jx0);
222 dy20 = _mm_sub_pd(iy2,jy0);
223 dz20 = _mm_sub_pd(iz2,jz0);
224 dx21 = _mm_sub_pd(ix2,jx1);
225 dy21 = _mm_sub_pd(iy2,jy1);
226 dz21 = _mm_sub_pd(iz2,jz1);
227 dx22 = _mm_sub_pd(ix2,jx2);
228 dy22 = _mm_sub_pd(iy2,jy2);
229 dz22 = _mm_sub_pd(iz2,jz2);
231 /* Calculate squared distance and things based on it */
232 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
233 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
234 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
235 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
236 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
237 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
238 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
239 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
240 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
242 rinv00 = gmx_mm_invsqrt_pd(rsq00);
243 rinv01 = gmx_mm_invsqrt_pd(rsq01);
244 rinv02 = gmx_mm_invsqrt_pd(rsq02);
245 rinv10 = gmx_mm_invsqrt_pd(rsq10);
246 rinv11 = gmx_mm_invsqrt_pd(rsq11);
247 rinv12 = gmx_mm_invsqrt_pd(rsq12);
248 rinv20 = gmx_mm_invsqrt_pd(rsq20);
249 rinv21 = gmx_mm_invsqrt_pd(rsq21);
250 rinv22 = gmx_mm_invsqrt_pd(rsq22);
252 fjx0 = _mm_setzero_pd();
253 fjy0 = _mm_setzero_pd();
254 fjz0 = _mm_setzero_pd();
255 fjx1 = _mm_setzero_pd();
256 fjy1 = _mm_setzero_pd();
257 fjz1 = _mm_setzero_pd();
258 fjx2 = _mm_setzero_pd();
259 fjy2 = _mm_setzero_pd();
260 fjz2 = _mm_setzero_pd();
262 /**************************
263 * CALCULATE INTERACTIONS *
264 **************************/
266 r00 = _mm_mul_pd(rsq00,rinv00);
268 /* Calculate table index by multiplying r with table scale and truncate to integer */
269 rt = _mm_mul_pd(r00,vftabscale);
270 vfitab = _mm_cvttpd_epi32(rt);
271 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
272 vfitab = _mm_slli_epi32(vfitab,2);
274 /* CUBIC SPLINE TABLE ELECTROSTATICS */
275 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
276 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
277 GMX_MM_TRANSPOSE2_PD(Y,F);
278 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
279 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
280 GMX_MM_TRANSPOSE2_PD(G,H);
281 Heps = _mm_mul_pd(vfeps,H);
282 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
283 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
284 velec = _mm_mul_pd(qq00,VV);
285 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
286 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
288 /* Update potential sum for this i atom from the interaction with this j atom. */
289 velecsum = _mm_add_pd(velecsum,velec);
293 /* Calculate temporary vectorial force */
294 tx = _mm_mul_pd(fscal,dx00);
295 ty = _mm_mul_pd(fscal,dy00);
296 tz = _mm_mul_pd(fscal,dz00);
298 /* Update vectorial force */
299 fix0 = _mm_add_pd(fix0,tx);
300 fiy0 = _mm_add_pd(fiy0,ty);
301 fiz0 = _mm_add_pd(fiz0,tz);
303 fjx0 = _mm_add_pd(fjx0,tx);
304 fjy0 = _mm_add_pd(fjy0,ty);
305 fjz0 = _mm_add_pd(fjz0,tz);
307 /**************************
308 * CALCULATE INTERACTIONS *
309 **************************/
311 r01 = _mm_mul_pd(rsq01,rinv01);
313 /* Calculate table index by multiplying r with table scale and truncate to integer */
314 rt = _mm_mul_pd(r01,vftabscale);
315 vfitab = _mm_cvttpd_epi32(rt);
316 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
317 vfitab = _mm_slli_epi32(vfitab,2);
319 /* CUBIC SPLINE TABLE ELECTROSTATICS */
320 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
321 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
322 GMX_MM_TRANSPOSE2_PD(Y,F);
323 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
324 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
325 GMX_MM_TRANSPOSE2_PD(G,H);
326 Heps = _mm_mul_pd(vfeps,H);
327 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
328 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
329 velec = _mm_mul_pd(qq01,VV);
330 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
331 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
333 /* Update potential sum for this i atom from the interaction with this j atom. */
334 velecsum = _mm_add_pd(velecsum,velec);
338 /* Calculate temporary vectorial force */
339 tx = _mm_mul_pd(fscal,dx01);
340 ty = _mm_mul_pd(fscal,dy01);
341 tz = _mm_mul_pd(fscal,dz01);
343 /* Update vectorial force */
344 fix0 = _mm_add_pd(fix0,tx);
345 fiy0 = _mm_add_pd(fiy0,ty);
346 fiz0 = _mm_add_pd(fiz0,tz);
348 fjx1 = _mm_add_pd(fjx1,tx);
349 fjy1 = _mm_add_pd(fjy1,ty);
350 fjz1 = _mm_add_pd(fjz1,tz);
352 /**************************
353 * CALCULATE INTERACTIONS *
354 **************************/
356 r02 = _mm_mul_pd(rsq02,rinv02);
358 /* Calculate table index by multiplying r with table scale and truncate to integer */
359 rt = _mm_mul_pd(r02,vftabscale);
360 vfitab = _mm_cvttpd_epi32(rt);
361 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
362 vfitab = _mm_slli_epi32(vfitab,2);
364 /* CUBIC SPLINE TABLE ELECTROSTATICS */
365 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
366 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
367 GMX_MM_TRANSPOSE2_PD(Y,F);
368 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
369 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
370 GMX_MM_TRANSPOSE2_PD(G,H);
371 Heps = _mm_mul_pd(vfeps,H);
372 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
373 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
374 velec = _mm_mul_pd(qq02,VV);
375 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
376 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
378 /* Update potential sum for this i atom from the interaction with this j atom. */
379 velecsum = _mm_add_pd(velecsum,velec);
383 /* Calculate temporary vectorial force */
384 tx = _mm_mul_pd(fscal,dx02);
385 ty = _mm_mul_pd(fscal,dy02);
386 tz = _mm_mul_pd(fscal,dz02);
388 /* Update vectorial force */
389 fix0 = _mm_add_pd(fix0,tx);
390 fiy0 = _mm_add_pd(fiy0,ty);
391 fiz0 = _mm_add_pd(fiz0,tz);
393 fjx2 = _mm_add_pd(fjx2,tx);
394 fjy2 = _mm_add_pd(fjy2,ty);
395 fjz2 = _mm_add_pd(fjz2,tz);
397 /**************************
398 * CALCULATE INTERACTIONS *
399 **************************/
401 r10 = _mm_mul_pd(rsq10,rinv10);
403 /* Calculate table index by multiplying r with table scale and truncate to integer */
404 rt = _mm_mul_pd(r10,vftabscale);
405 vfitab = _mm_cvttpd_epi32(rt);
406 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
407 vfitab = _mm_slli_epi32(vfitab,2);
409 /* CUBIC SPLINE TABLE ELECTROSTATICS */
410 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
411 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
412 GMX_MM_TRANSPOSE2_PD(Y,F);
413 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
414 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
415 GMX_MM_TRANSPOSE2_PD(G,H);
416 Heps = _mm_mul_pd(vfeps,H);
417 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
418 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
419 velec = _mm_mul_pd(qq10,VV);
420 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
421 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
423 /* Update potential sum for this i atom from the interaction with this j atom. */
424 velecsum = _mm_add_pd(velecsum,velec);
428 /* Calculate temporary vectorial force */
429 tx = _mm_mul_pd(fscal,dx10);
430 ty = _mm_mul_pd(fscal,dy10);
431 tz = _mm_mul_pd(fscal,dz10);
433 /* Update vectorial force */
434 fix1 = _mm_add_pd(fix1,tx);
435 fiy1 = _mm_add_pd(fiy1,ty);
436 fiz1 = _mm_add_pd(fiz1,tz);
438 fjx0 = _mm_add_pd(fjx0,tx);
439 fjy0 = _mm_add_pd(fjy0,ty);
440 fjz0 = _mm_add_pd(fjz0,tz);
442 /**************************
443 * CALCULATE INTERACTIONS *
444 **************************/
446 r11 = _mm_mul_pd(rsq11,rinv11);
448 /* Calculate table index by multiplying r with table scale and truncate to integer */
449 rt = _mm_mul_pd(r11,vftabscale);
450 vfitab = _mm_cvttpd_epi32(rt);
451 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
452 vfitab = _mm_slli_epi32(vfitab,2);
454 /* CUBIC SPLINE TABLE ELECTROSTATICS */
455 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
456 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
457 GMX_MM_TRANSPOSE2_PD(Y,F);
458 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
459 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
460 GMX_MM_TRANSPOSE2_PD(G,H);
461 Heps = _mm_mul_pd(vfeps,H);
462 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
463 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
464 velec = _mm_mul_pd(qq11,VV);
465 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
466 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
468 /* Update potential sum for this i atom from the interaction with this j atom. */
469 velecsum = _mm_add_pd(velecsum,velec);
473 /* Calculate temporary vectorial force */
474 tx = _mm_mul_pd(fscal,dx11);
475 ty = _mm_mul_pd(fscal,dy11);
476 tz = _mm_mul_pd(fscal,dz11);
478 /* Update vectorial force */
479 fix1 = _mm_add_pd(fix1,tx);
480 fiy1 = _mm_add_pd(fiy1,ty);
481 fiz1 = _mm_add_pd(fiz1,tz);
483 fjx1 = _mm_add_pd(fjx1,tx);
484 fjy1 = _mm_add_pd(fjy1,ty);
485 fjz1 = _mm_add_pd(fjz1,tz);
487 /**************************
488 * CALCULATE INTERACTIONS *
489 **************************/
491 r12 = _mm_mul_pd(rsq12,rinv12);
493 /* Calculate table index by multiplying r with table scale and truncate to integer */
494 rt = _mm_mul_pd(r12,vftabscale);
495 vfitab = _mm_cvttpd_epi32(rt);
496 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
497 vfitab = _mm_slli_epi32(vfitab,2);
499 /* CUBIC SPLINE TABLE ELECTROSTATICS */
500 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
501 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
502 GMX_MM_TRANSPOSE2_PD(Y,F);
503 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
504 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
505 GMX_MM_TRANSPOSE2_PD(G,H);
506 Heps = _mm_mul_pd(vfeps,H);
507 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
508 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
509 velec = _mm_mul_pd(qq12,VV);
510 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
511 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
513 /* Update potential sum for this i atom from the interaction with this j atom. */
514 velecsum = _mm_add_pd(velecsum,velec);
518 /* Calculate temporary vectorial force */
519 tx = _mm_mul_pd(fscal,dx12);
520 ty = _mm_mul_pd(fscal,dy12);
521 tz = _mm_mul_pd(fscal,dz12);
523 /* Update vectorial force */
524 fix1 = _mm_add_pd(fix1,tx);
525 fiy1 = _mm_add_pd(fiy1,ty);
526 fiz1 = _mm_add_pd(fiz1,tz);
528 fjx2 = _mm_add_pd(fjx2,tx);
529 fjy2 = _mm_add_pd(fjy2,ty);
530 fjz2 = _mm_add_pd(fjz2,tz);
532 /**************************
533 * CALCULATE INTERACTIONS *
534 **************************/
536 r20 = _mm_mul_pd(rsq20,rinv20);
538 /* Calculate table index by multiplying r with table scale and truncate to integer */
539 rt = _mm_mul_pd(r20,vftabscale);
540 vfitab = _mm_cvttpd_epi32(rt);
541 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
542 vfitab = _mm_slli_epi32(vfitab,2);
544 /* CUBIC SPLINE TABLE ELECTROSTATICS */
545 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
546 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
547 GMX_MM_TRANSPOSE2_PD(Y,F);
548 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
549 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
550 GMX_MM_TRANSPOSE2_PD(G,H);
551 Heps = _mm_mul_pd(vfeps,H);
552 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
553 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
554 velec = _mm_mul_pd(qq20,VV);
555 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
556 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
558 /* Update potential sum for this i atom from the interaction with this j atom. */
559 velecsum = _mm_add_pd(velecsum,velec);
563 /* Calculate temporary vectorial force */
564 tx = _mm_mul_pd(fscal,dx20);
565 ty = _mm_mul_pd(fscal,dy20);
566 tz = _mm_mul_pd(fscal,dz20);
568 /* Update vectorial force */
569 fix2 = _mm_add_pd(fix2,tx);
570 fiy2 = _mm_add_pd(fiy2,ty);
571 fiz2 = _mm_add_pd(fiz2,tz);
573 fjx0 = _mm_add_pd(fjx0,tx);
574 fjy0 = _mm_add_pd(fjy0,ty);
575 fjz0 = _mm_add_pd(fjz0,tz);
577 /**************************
578 * CALCULATE INTERACTIONS *
579 **************************/
581 r21 = _mm_mul_pd(rsq21,rinv21);
583 /* Calculate table index by multiplying r with table scale and truncate to integer */
584 rt = _mm_mul_pd(r21,vftabscale);
585 vfitab = _mm_cvttpd_epi32(rt);
586 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
587 vfitab = _mm_slli_epi32(vfitab,2);
589 /* CUBIC SPLINE TABLE ELECTROSTATICS */
590 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
591 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
592 GMX_MM_TRANSPOSE2_PD(Y,F);
593 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
594 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
595 GMX_MM_TRANSPOSE2_PD(G,H);
596 Heps = _mm_mul_pd(vfeps,H);
597 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
598 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
599 velec = _mm_mul_pd(qq21,VV);
600 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
601 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
603 /* Update potential sum for this i atom from the interaction with this j atom. */
604 velecsum = _mm_add_pd(velecsum,velec);
608 /* Calculate temporary vectorial force */
609 tx = _mm_mul_pd(fscal,dx21);
610 ty = _mm_mul_pd(fscal,dy21);
611 tz = _mm_mul_pd(fscal,dz21);
613 /* Update vectorial force */
614 fix2 = _mm_add_pd(fix2,tx);
615 fiy2 = _mm_add_pd(fiy2,ty);
616 fiz2 = _mm_add_pd(fiz2,tz);
618 fjx1 = _mm_add_pd(fjx1,tx);
619 fjy1 = _mm_add_pd(fjy1,ty);
620 fjz1 = _mm_add_pd(fjz1,tz);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 r22 = _mm_mul_pd(rsq22,rinv22);
628 /* Calculate table index by multiplying r with table scale and truncate to integer */
629 rt = _mm_mul_pd(r22,vftabscale);
630 vfitab = _mm_cvttpd_epi32(rt);
631 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
632 vfitab = _mm_slli_epi32(vfitab,2);
634 /* CUBIC SPLINE TABLE ELECTROSTATICS */
635 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
636 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
637 GMX_MM_TRANSPOSE2_PD(Y,F);
638 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
639 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
640 GMX_MM_TRANSPOSE2_PD(G,H);
641 Heps = _mm_mul_pd(vfeps,H);
642 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
643 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
644 velec = _mm_mul_pd(qq22,VV);
645 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
646 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
648 /* Update potential sum for this i atom from the interaction with this j atom. */
649 velecsum = _mm_add_pd(velecsum,velec);
653 /* Calculate temporary vectorial force */
654 tx = _mm_mul_pd(fscal,dx22);
655 ty = _mm_mul_pd(fscal,dy22);
656 tz = _mm_mul_pd(fscal,dz22);
658 /* Update vectorial force */
659 fix2 = _mm_add_pd(fix2,tx);
660 fiy2 = _mm_add_pd(fiy2,ty);
661 fiz2 = _mm_add_pd(fiz2,tz);
663 fjx2 = _mm_add_pd(fjx2,tx);
664 fjy2 = _mm_add_pd(fjy2,ty);
665 fjz2 = _mm_add_pd(fjz2,tz);
667 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
669 /* Inner loop uses 387 flops */
676 j_coord_offsetA = DIM*jnrA;
678 /* load j atom coordinates */
679 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
680 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
682 /* Calculate displacement vector */
683 dx00 = _mm_sub_pd(ix0,jx0);
684 dy00 = _mm_sub_pd(iy0,jy0);
685 dz00 = _mm_sub_pd(iz0,jz0);
686 dx01 = _mm_sub_pd(ix0,jx1);
687 dy01 = _mm_sub_pd(iy0,jy1);
688 dz01 = _mm_sub_pd(iz0,jz1);
689 dx02 = _mm_sub_pd(ix0,jx2);
690 dy02 = _mm_sub_pd(iy0,jy2);
691 dz02 = _mm_sub_pd(iz0,jz2);
692 dx10 = _mm_sub_pd(ix1,jx0);
693 dy10 = _mm_sub_pd(iy1,jy0);
694 dz10 = _mm_sub_pd(iz1,jz0);
695 dx11 = _mm_sub_pd(ix1,jx1);
696 dy11 = _mm_sub_pd(iy1,jy1);
697 dz11 = _mm_sub_pd(iz1,jz1);
698 dx12 = _mm_sub_pd(ix1,jx2);
699 dy12 = _mm_sub_pd(iy1,jy2);
700 dz12 = _mm_sub_pd(iz1,jz2);
701 dx20 = _mm_sub_pd(ix2,jx0);
702 dy20 = _mm_sub_pd(iy2,jy0);
703 dz20 = _mm_sub_pd(iz2,jz0);
704 dx21 = _mm_sub_pd(ix2,jx1);
705 dy21 = _mm_sub_pd(iy2,jy1);
706 dz21 = _mm_sub_pd(iz2,jz1);
707 dx22 = _mm_sub_pd(ix2,jx2);
708 dy22 = _mm_sub_pd(iy2,jy2);
709 dz22 = _mm_sub_pd(iz2,jz2);
711 /* Calculate squared distance and things based on it */
712 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
713 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
714 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
715 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
716 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
717 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
718 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
719 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
720 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
722 rinv00 = gmx_mm_invsqrt_pd(rsq00);
723 rinv01 = gmx_mm_invsqrt_pd(rsq01);
724 rinv02 = gmx_mm_invsqrt_pd(rsq02);
725 rinv10 = gmx_mm_invsqrt_pd(rsq10);
726 rinv11 = gmx_mm_invsqrt_pd(rsq11);
727 rinv12 = gmx_mm_invsqrt_pd(rsq12);
728 rinv20 = gmx_mm_invsqrt_pd(rsq20);
729 rinv21 = gmx_mm_invsqrt_pd(rsq21);
730 rinv22 = gmx_mm_invsqrt_pd(rsq22);
732 fjx0 = _mm_setzero_pd();
733 fjy0 = _mm_setzero_pd();
734 fjz0 = _mm_setzero_pd();
735 fjx1 = _mm_setzero_pd();
736 fjy1 = _mm_setzero_pd();
737 fjz1 = _mm_setzero_pd();
738 fjx2 = _mm_setzero_pd();
739 fjy2 = _mm_setzero_pd();
740 fjz2 = _mm_setzero_pd();
742 /**************************
743 * CALCULATE INTERACTIONS *
744 **************************/
746 r00 = _mm_mul_pd(rsq00,rinv00);
748 /* Calculate table index by multiplying r with table scale and truncate to integer */
749 rt = _mm_mul_pd(r00,vftabscale);
750 vfitab = _mm_cvttpd_epi32(rt);
751 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
752 vfitab = _mm_slli_epi32(vfitab,2);
754 /* CUBIC SPLINE TABLE ELECTROSTATICS */
755 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
756 F = _mm_setzero_pd();
757 GMX_MM_TRANSPOSE2_PD(Y,F);
758 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
759 H = _mm_setzero_pd();
760 GMX_MM_TRANSPOSE2_PD(G,H);
761 Heps = _mm_mul_pd(vfeps,H);
762 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
763 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
764 velec = _mm_mul_pd(qq00,VV);
765 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
766 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
768 /* Update potential sum for this i atom from the interaction with this j atom. */
769 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
770 velecsum = _mm_add_pd(velecsum,velec);
774 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
776 /* Calculate temporary vectorial force */
777 tx = _mm_mul_pd(fscal,dx00);
778 ty = _mm_mul_pd(fscal,dy00);
779 tz = _mm_mul_pd(fscal,dz00);
781 /* Update vectorial force */
782 fix0 = _mm_add_pd(fix0,tx);
783 fiy0 = _mm_add_pd(fiy0,ty);
784 fiz0 = _mm_add_pd(fiz0,tz);
786 fjx0 = _mm_add_pd(fjx0,tx);
787 fjy0 = _mm_add_pd(fjy0,ty);
788 fjz0 = _mm_add_pd(fjz0,tz);
790 /**************************
791 * CALCULATE INTERACTIONS *
792 **************************/
794 r01 = _mm_mul_pd(rsq01,rinv01);
796 /* Calculate table index by multiplying r with table scale and truncate to integer */
797 rt = _mm_mul_pd(r01,vftabscale);
798 vfitab = _mm_cvttpd_epi32(rt);
799 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
800 vfitab = _mm_slli_epi32(vfitab,2);
802 /* CUBIC SPLINE TABLE ELECTROSTATICS */
803 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
804 F = _mm_setzero_pd();
805 GMX_MM_TRANSPOSE2_PD(Y,F);
806 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
807 H = _mm_setzero_pd();
808 GMX_MM_TRANSPOSE2_PD(G,H);
809 Heps = _mm_mul_pd(vfeps,H);
810 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
811 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
812 velec = _mm_mul_pd(qq01,VV);
813 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
814 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
816 /* Update potential sum for this i atom from the interaction with this j atom. */
817 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
818 velecsum = _mm_add_pd(velecsum,velec);
822 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
824 /* Calculate temporary vectorial force */
825 tx = _mm_mul_pd(fscal,dx01);
826 ty = _mm_mul_pd(fscal,dy01);
827 tz = _mm_mul_pd(fscal,dz01);
829 /* Update vectorial force */
830 fix0 = _mm_add_pd(fix0,tx);
831 fiy0 = _mm_add_pd(fiy0,ty);
832 fiz0 = _mm_add_pd(fiz0,tz);
834 fjx1 = _mm_add_pd(fjx1,tx);
835 fjy1 = _mm_add_pd(fjy1,ty);
836 fjz1 = _mm_add_pd(fjz1,tz);
838 /**************************
839 * CALCULATE INTERACTIONS *
840 **************************/
842 r02 = _mm_mul_pd(rsq02,rinv02);
844 /* Calculate table index by multiplying r with table scale and truncate to integer */
845 rt = _mm_mul_pd(r02,vftabscale);
846 vfitab = _mm_cvttpd_epi32(rt);
847 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
848 vfitab = _mm_slli_epi32(vfitab,2);
850 /* CUBIC SPLINE TABLE ELECTROSTATICS */
851 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
852 F = _mm_setzero_pd();
853 GMX_MM_TRANSPOSE2_PD(Y,F);
854 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
855 H = _mm_setzero_pd();
856 GMX_MM_TRANSPOSE2_PD(G,H);
857 Heps = _mm_mul_pd(vfeps,H);
858 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
859 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
860 velec = _mm_mul_pd(qq02,VV);
861 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
862 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
864 /* Update potential sum for this i atom from the interaction with this j atom. */
865 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
866 velecsum = _mm_add_pd(velecsum,velec);
870 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
872 /* Calculate temporary vectorial force */
873 tx = _mm_mul_pd(fscal,dx02);
874 ty = _mm_mul_pd(fscal,dy02);
875 tz = _mm_mul_pd(fscal,dz02);
877 /* Update vectorial force */
878 fix0 = _mm_add_pd(fix0,tx);
879 fiy0 = _mm_add_pd(fiy0,ty);
880 fiz0 = _mm_add_pd(fiz0,tz);
882 fjx2 = _mm_add_pd(fjx2,tx);
883 fjy2 = _mm_add_pd(fjy2,ty);
884 fjz2 = _mm_add_pd(fjz2,tz);
886 /**************************
887 * CALCULATE INTERACTIONS *
888 **************************/
890 r10 = _mm_mul_pd(rsq10,rinv10);
892 /* Calculate table index by multiplying r with table scale and truncate to integer */
893 rt = _mm_mul_pd(r10,vftabscale);
894 vfitab = _mm_cvttpd_epi32(rt);
895 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
896 vfitab = _mm_slli_epi32(vfitab,2);
898 /* CUBIC SPLINE TABLE ELECTROSTATICS */
899 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
900 F = _mm_setzero_pd();
901 GMX_MM_TRANSPOSE2_PD(Y,F);
902 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
903 H = _mm_setzero_pd();
904 GMX_MM_TRANSPOSE2_PD(G,H);
905 Heps = _mm_mul_pd(vfeps,H);
906 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
907 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
908 velec = _mm_mul_pd(qq10,VV);
909 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
910 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
912 /* Update potential sum for this i atom from the interaction with this j atom. */
913 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
914 velecsum = _mm_add_pd(velecsum,velec);
918 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
920 /* Calculate temporary vectorial force */
921 tx = _mm_mul_pd(fscal,dx10);
922 ty = _mm_mul_pd(fscal,dy10);
923 tz = _mm_mul_pd(fscal,dz10);
925 /* Update vectorial force */
926 fix1 = _mm_add_pd(fix1,tx);
927 fiy1 = _mm_add_pd(fiy1,ty);
928 fiz1 = _mm_add_pd(fiz1,tz);
930 fjx0 = _mm_add_pd(fjx0,tx);
931 fjy0 = _mm_add_pd(fjy0,ty);
932 fjz0 = _mm_add_pd(fjz0,tz);
934 /**************************
935 * CALCULATE INTERACTIONS *
936 **************************/
938 r11 = _mm_mul_pd(rsq11,rinv11);
940 /* Calculate table index by multiplying r with table scale and truncate to integer */
941 rt = _mm_mul_pd(r11,vftabscale);
942 vfitab = _mm_cvttpd_epi32(rt);
943 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
944 vfitab = _mm_slli_epi32(vfitab,2);
946 /* CUBIC SPLINE TABLE ELECTROSTATICS */
947 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
948 F = _mm_setzero_pd();
949 GMX_MM_TRANSPOSE2_PD(Y,F);
950 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
951 H = _mm_setzero_pd();
952 GMX_MM_TRANSPOSE2_PD(G,H);
953 Heps = _mm_mul_pd(vfeps,H);
954 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
955 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
956 velec = _mm_mul_pd(qq11,VV);
957 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
958 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
960 /* Update potential sum for this i atom from the interaction with this j atom. */
961 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
962 velecsum = _mm_add_pd(velecsum,velec);
966 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
968 /* Calculate temporary vectorial force */
969 tx = _mm_mul_pd(fscal,dx11);
970 ty = _mm_mul_pd(fscal,dy11);
971 tz = _mm_mul_pd(fscal,dz11);
973 /* Update vectorial force */
974 fix1 = _mm_add_pd(fix1,tx);
975 fiy1 = _mm_add_pd(fiy1,ty);
976 fiz1 = _mm_add_pd(fiz1,tz);
978 fjx1 = _mm_add_pd(fjx1,tx);
979 fjy1 = _mm_add_pd(fjy1,ty);
980 fjz1 = _mm_add_pd(fjz1,tz);
982 /**************************
983 * CALCULATE INTERACTIONS *
984 **************************/
986 r12 = _mm_mul_pd(rsq12,rinv12);
988 /* Calculate table index by multiplying r with table scale and truncate to integer */
989 rt = _mm_mul_pd(r12,vftabscale);
990 vfitab = _mm_cvttpd_epi32(rt);
991 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
992 vfitab = _mm_slli_epi32(vfitab,2);
994 /* CUBIC SPLINE TABLE ELECTROSTATICS */
995 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
996 F = _mm_setzero_pd();
997 GMX_MM_TRANSPOSE2_PD(Y,F);
998 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
999 H = _mm_setzero_pd();
1000 GMX_MM_TRANSPOSE2_PD(G,H);
1001 Heps = _mm_mul_pd(vfeps,H);
1002 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1003 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1004 velec = _mm_mul_pd(qq12,VV);
1005 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1006 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1008 /* Update potential sum for this i atom from the interaction with this j atom. */
1009 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1010 velecsum = _mm_add_pd(velecsum,velec);
1014 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1016 /* Calculate temporary vectorial force */
1017 tx = _mm_mul_pd(fscal,dx12);
1018 ty = _mm_mul_pd(fscal,dy12);
1019 tz = _mm_mul_pd(fscal,dz12);
1021 /* Update vectorial force */
1022 fix1 = _mm_add_pd(fix1,tx);
1023 fiy1 = _mm_add_pd(fiy1,ty);
1024 fiz1 = _mm_add_pd(fiz1,tz);
1026 fjx2 = _mm_add_pd(fjx2,tx);
1027 fjy2 = _mm_add_pd(fjy2,ty);
1028 fjz2 = _mm_add_pd(fjz2,tz);
1030 /**************************
1031 * CALCULATE INTERACTIONS *
1032 **************************/
1034 r20 = _mm_mul_pd(rsq20,rinv20);
1036 /* Calculate table index by multiplying r with table scale and truncate to integer */
1037 rt = _mm_mul_pd(r20,vftabscale);
1038 vfitab = _mm_cvttpd_epi32(rt);
1039 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1040 vfitab = _mm_slli_epi32(vfitab,2);
1042 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1043 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1044 F = _mm_setzero_pd();
1045 GMX_MM_TRANSPOSE2_PD(Y,F);
1046 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1047 H = _mm_setzero_pd();
1048 GMX_MM_TRANSPOSE2_PD(G,H);
1049 Heps = _mm_mul_pd(vfeps,H);
1050 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1051 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1052 velec = _mm_mul_pd(qq20,VV);
1053 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1054 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1056 /* Update potential sum for this i atom from the interaction with this j atom. */
1057 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1058 velecsum = _mm_add_pd(velecsum,velec);
1062 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1064 /* Calculate temporary vectorial force */
1065 tx = _mm_mul_pd(fscal,dx20);
1066 ty = _mm_mul_pd(fscal,dy20);
1067 tz = _mm_mul_pd(fscal,dz20);
1069 /* Update vectorial force */
1070 fix2 = _mm_add_pd(fix2,tx);
1071 fiy2 = _mm_add_pd(fiy2,ty);
1072 fiz2 = _mm_add_pd(fiz2,tz);
1074 fjx0 = _mm_add_pd(fjx0,tx);
1075 fjy0 = _mm_add_pd(fjy0,ty);
1076 fjz0 = _mm_add_pd(fjz0,tz);
1078 /**************************
1079 * CALCULATE INTERACTIONS *
1080 **************************/
1082 r21 = _mm_mul_pd(rsq21,rinv21);
1084 /* Calculate table index by multiplying r with table scale and truncate to integer */
1085 rt = _mm_mul_pd(r21,vftabscale);
1086 vfitab = _mm_cvttpd_epi32(rt);
1087 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1088 vfitab = _mm_slli_epi32(vfitab,2);
1090 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1091 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1092 F = _mm_setzero_pd();
1093 GMX_MM_TRANSPOSE2_PD(Y,F);
1094 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1095 H = _mm_setzero_pd();
1096 GMX_MM_TRANSPOSE2_PD(G,H);
1097 Heps = _mm_mul_pd(vfeps,H);
1098 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1099 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1100 velec = _mm_mul_pd(qq21,VV);
1101 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1102 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1104 /* Update potential sum for this i atom from the interaction with this j atom. */
1105 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1106 velecsum = _mm_add_pd(velecsum,velec);
1110 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1112 /* Calculate temporary vectorial force */
1113 tx = _mm_mul_pd(fscal,dx21);
1114 ty = _mm_mul_pd(fscal,dy21);
1115 tz = _mm_mul_pd(fscal,dz21);
1117 /* Update vectorial force */
1118 fix2 = _mm_add_pd(fix2,tx);
1119 fiy2 = _mm_add_pd(fiy2,ty);
1120 fiz2 = _mm_add_pd(fiz2,tz);
1122 fjx1 = _mm_add_pd(fjx1,tx);
1123 fjy1 = _mm_add_pd(fjy1,ty);
1124 fjz1 = _mm_add_pd(fjz1,tz);
1126 /**************************
1127 * CALCULATE INTERACTIONS *
1128 **************************/
1130 r22 = _mm_mul_pd(rsq22,rinv22);
1132 /* Calculate table index by multiplying r with table scale and truncate to integer */
1133 rt = _mm_mul_pd(r22,vftabscale);
1134 vfitab = _mm_cvttpd_epi32(rt);
1135 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1136 vfitab = _mm_slli_epi32(vfitab,2);
1138 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1139 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1140 F = _mm_setzero_pd();
1141 GMX_MM_TRANSPOSE2_PD(Y,F);
1142 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1143 H = _mm_setzero_pd();
1144 GMX_MM_TRANSPOSE2_PD(G,H);
1145 Heps = _mm_mul_pd(vfeps,H);
1146 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1147 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1148 velec = _mm_mul_pd(qq22,VV);
1149 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1150 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1152 /* Update potential sum for this i atom from the interaction with this j atom. */
1153 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1154 velecsum = _mm_add_pd(velecsum,velec);
1158 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1160 /* Calculate temporary vectorial force */
1161 tx = _mm_mul_pd(fscal,dx22);
1162 ty = _mm_mul_pd(fscal,dy22);
1163 tz = _mm_mul_pd(fscal,dz22);
1165 /* Update vectorial force */
1166 fix2 = _mm_add_pd(fix2,tx);
1167 fiy2 = _mm_add_pd(fiy2,ty);
1168 fiz2 = _mm_add_pd(fiz2,tz);
1170 fjx2 = _mm_add_pd(fjx2,tx);
1171 fjy2 = _mm_add_pd(fjy2,ty);
1172 fjz2 = _mm_add_pd(fjz2,tz);
1174 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1176 /* Inner loop uses 387 flops */
1179 /* End of innermost loop */
1181 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1182 f+i_coord_offset,fshift+i_shift_offset);
1185 /* Update potential energies */
1186 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1188 /* Increment number of inner iterations */
1189 inneriter += j_index_end - j_index_start;
1191 /* Outer loop uses 19 flops */
1194 /* Increment number of outer iterations */
1197 /* Update outer/inner flops */
1199 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*387);
1202 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse4_1_double
1203 * Electrostatics interaction: CubicSplineTable
1204 * VdW interaction: None
1205 * Geometry: Water3-Water3
1206 * Calculate force/pot: Force
1209 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse4_1_double
1210 (t_nblist * gmx_restrict nlist,
1211 rvec * gmx_restrict xx,
1212 rvec * gmx_restrict ff,
1213 t_forcerec * gmx_restrict fr,
1214 t_mdatoms * gmx_restrict mdatoms,
1215 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1216 t_nrnb * gmx_restrict nrnb)
1218 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1219 * just 0 for non-waters.
1220 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1221 * jnr indices corresponding to data put in the two positions in the SIMD register.
1223 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1224 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1226 int j_coord_offsetA,j_coord_offsetB;
1227 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1228 real rcutoff_scalar;
1229 real *shiftvec,*fshift,*x,*f;
1230 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1232 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1234 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1236 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1237 int vdwjidx0A,vdwjidx0B;
1238 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1239 int vdwjidx1A,vdwjidx1B;
1240 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1241 int vdwjidx2A,vdwjidx2B;
1242 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1243 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1244 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1245 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1246 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1247 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1248 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1249 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1250 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1251 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1252 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1255 __m128i ifour = _mm_set1_epi32(4);
1256 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1258 __m128d dummy_mask,cutoff_mask;
1259 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1260 __m128d one = _mm_set1_pd(1.0);
1261 __m128d two = _mm_set1_pd(2.0);
1267 jindex = nlist->jindex;
1269 shiftidx = nlist->shift;
1271 shiftvec = fr->shift_vec[0];
1272 fshift = fr->fshift[0];
1273 facel = _mm_set1_pd(fr->epsfac);
1274 charge = mdatoms->chargeA;
1276 vftab = kernel_data->table_elec->data;
1277 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1279 /* Setup water-specific parameters */
1280 inr = nlist->iinr[0];
1281 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1282 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1283 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1285 jq0 = _mm_set1_pd(charge[inr+0]);
1286 jq1 = _mm_set1_pd(charge[inr+1]);
1287 jq2 = _mm_set1_pd(charge[inr+2]);
1288 qq00 = _mm_mul_pd(iq0,jq0);
1289 qq01 = _mm_mul_pd(iq0,jq1);
1290 qq02 = _mm_mul_pd(iq0,jq2);
1291 qq10 = _mm_mul_pd(iq1,jq0);
1292 qq11 = _mm_mul_pd(iq1,jq1);
1293 qq12 = _mm_mul_pd(iq1,jq2);
1294 qq20 = _mm_mul_pd(iq2,jq0);
1295 qq21 = _mm_mul_pd(iq2,jq1);
1296 qq22 = _mm_mul_pd(iq2,jq2);
1298 /* Avoid stupid compiler warnings */
1300 j_coord_offsetA = 0;
1301 j_coord_offsetB = 0;
1306 /* Start outer loop over neighborlists */
1307 for(iidx=0; iidx<nri; iidx++)
1309 /* Load shift vector for this list */
1310 i_shift_offset = DIM*shiftidx[iidx];
1312 /* Load limits for loop over neighbors */
1313 j_index_start = jindex[iidx];
1314 j_index_end = jindex[iidx+1];
1316 /* Get outer coordinate index */
1318 i_coord_offset = DIM*inr;
1320 /* Load i particle coords and add shift vector */
1321 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1322 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1324 fix0 = _mm_setzero_pd();
1325 fiy0 = _mm_setzero_pd();
1326 fiz0 = _mm_setzero_pd();
1327 fix1 = _mm_setzero_pd();
1328 fiy1 = _mm_setzero_pd();
1329 fiz1 = _mm_setzero_pd();
1330 fix2 = _mm_setzero_pd();
1331 fiy2 = _mm_setzero_pd();
1332 fiz2 = _mm_setzero_pd();
1334 /* Start inner kernel loop */
1335 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1338 /* Get j neighbor index, and coordinate index */
1340 jnrB = jjnr[jidx+1];
1341 j_coord_offsetA = DIM*jnrA;
1342 j_coord_offsetB = DIM*jnrB;
1344 /* load j atom coordinates */
1345 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1346 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1348 /* Calculate displacement vector */
1349 dx00 = _mm_sub_pd(ix0,jx0);
1350 dy00 = _mm_sub_pd(iy0,jy0);
1351 dz00 = _mm_sub_pd(iz0,jz0);
1352 dx01 = _mm_sub_pd(ix0,jx1);
1353 dy01 = _mm_sub_pd(iy0,jy1);
1354 dz01 = _mm_sub_pd(iz0,jz1);
1355 dx02 = _mm_sub_pd(ix0,jx2);
1356 dy02 = _mm_sub_pd(iy0,jy2);
1357 dz02 = _mm_sub_pd(iz0,jz2);
1358 dx10 = _mm_sub_pd(ix1,jx0);
1359 dy10 = _mm_sub_pd(iy1,jy0);
1360 dz10 = _mm_sub_pd(iz1,jz0);
1361 dx11 = _mm_sub_pd(ix1,jx1);
1362 dy11 = _mm_sub_pd(iy1,jy1);
1363 dz11 = _mm_sub_pd(iz1,jz1);
1364 dx12 = _mm_sub_pd(ix1,jx2);
1365 dy12 = _mm_sub_pd(iy1,jy2);
1366 dz12 = _mm_sub_pd(iz1,jz2);
1367 dx20 = _mm_sub_pd(ix2,jx0);
1368 dy20 = _mm_sub_pd(iy2,jy0);
1369 dz20 = _mm_sub_pd(iz2,jz0);
1370 dx21 = _mm_sub_pd(ix2,jx1);
1371 dy21 = _mm_sub_pd(iy2,jy1);
1372 dz21 = _mm_sub_pd(iz2,jz1);
1373 dx22 = _mm_sub_pd(ix2,jx2);
1374 dy22 = _mm_sub_pd(iy2,jy2);
1375 dz22 = _mm_sub_pd(iz2,jz2);
1377 /* Calculate squared distance and things based on it */
1378 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1379 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1380 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1381 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1382 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1383 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1384 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1385 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1386 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1388 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1389 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1390 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1391 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1392 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1393 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1394 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1395 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1396 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1398 fjx0 = _mm_setzero_pd();
1399 fjy0 = _mm_setzero_pd();
1400 fjz0 = _mm_setzero_pd();
1401 fjx1 = _mm_setzero_pd();
1402 fjy1 = _mm_setzero_pd();
1403 fjz1 = _mm_setzero_pd();
1404 fjx2 = _mm_setzero_pd();
1405 fjy2 = _mm_setzero_pd();
1406 fjz2 = _mm_setzero_pd();
1408 /**************************
1409 * CALCULATE INTERACTIONS *
1410 **************************/
1412 r00 = _mm_mul_pd(rsq00,rinv00);
1414 /* Calculate table index by multiplying r with table scale and truncate to integer */
1415 rt = _mm_mul_pd(r00,vftabscale);
1416 vfitab = _mm_cvttpd_epi32(rt);
1417 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1418 vfitab = _mm_slli_epi32(vfitab,2);
1420 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1421 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1422 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1423 GMX_MM_TRANSPOSE2_PD(Y,F);
1424 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1425 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1426 GMX_MM_TRANSPOSE2_PD(G,H);
1427 Heps = _mm_mul_pd(vfeps,H);
1428 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1429 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1430 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1434 /* Calculate temporary vectorial force */
1435 tx = _mm_mul_pd(fscal,dx00);
1436 ty = _mm_mul_pd(fscal,dy00);
1437 tz = _mm_mul_pd(fscal,dz00);
1439 /* Update vectorial force */
1440 fix0 = _mm_add_pd(fix0,tx);
1441 fiy0 = _mm_add_pd(fiy0,ty);
1442 fiz0 = _mm_add_pd(fiz0,tz);
1444 fjx0 = _mm_add_pd(fjx0,tx);
1445 fjy0 = _mm_add_pd(fjy0,ty);
1446 fjz0 = _mm_add_pd(fjz0,tz);
1448 /**************************
1449 * CALCULATE INTERACTIONS *
1450 **************************/
1452 r01 = _mm_mul_pd(rsq01,rinv01);
1454 /* Calculate table index by multiplying r with table scale and truncate to integer */
1455 rt = _mm_mul_pd(r01,vftabscale);
1456 vfitab = _mm_cvttpd_epi32(rt);
1457 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1458 vfitab = _mm_slli_epi32(vfitab,2);
1460 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1461 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1462 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1463 GMX_MM_TRANSPOSE2_PD(Y,F);
1464 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1465 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1466 GMX_MM_TRANSPOSE2_PD(G,H);
1467 Heps = _mm_mul_pd(vfeps,H);
1468 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1469 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1470 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1474 /* Calculate temporary vectorial force */
1475 tx = _mm_mul_pd(fscal,dx01);
1476 ty = _mm_mul_pd(fscal,dy01);
1477 tz = _mm_mul_pd(fscal,dz01);
1479 /* Update vectorial force */
1480 fix0 = _mm_add_pd(fix0,tx);
1481 fiy0 = _mm_add_pd(fiy0,ty);
1482 fiz0 = _mm_add_pd(fiz0,tz);
1484 fjx1 = _mm_add_pd(fjx1,tx);
1485 fjy1 = _mm_add_pd(fjy1,ty);
1486 fjz1 = _mm_add_pd(fjz1,tz);
1488 /**************************
1489 * CALCULATE INTERACTIONS *
1490 **************************/
1492 r02 = _mm_mul_pd(rsq02,rinv02);
1494 /* Calculate table index by multiplying r with table scale and truncate to integer */
1495 rt = _mm_mul_pd(r02,vftabscale);
1496 vfitab = _mm_cvttpd_epi32(rt);
1497 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1498 vfitab = _mm_slli_epi32(vfitab,2);
1500 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1501 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1502 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1503 GMX_MM_TRANSPOSE2_PD(Y,F);
1504 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1505 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1506 GMX_MM_TRANSPOSE2_PD(G,H);
1507 Heps = _mm_mul_pd(vfeps,H);
1508 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1509 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1510 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1514 /* Calculate temporary vectorial force */
1515 tx = _mm_mul_pd(fscal,dx02);
1516 ty = _mm_mul_pd(fscal,dy02);
1517 tz = _mm_mul_pd(fscal,dz02);
1519 /* Update vectorial force */
1520 fix0 = _mm_add_pd(fix0,tx);
1521 fiy0 = _mm_add_pd(fiy0,ty);
1522 fiz0 = _mm_add_pd(fiz0,tz);
1524 fjx2 = _mm_add_pd(fjx2,tx);
1525 fjy2 = _mm_add_pd(fjy2,ty);
1526 fjz2 = _mm_add_pd(fjz2,tz);
1528 /**************************
1529 * CALCULATE INTERACTIONS *
1530 **************************/
1532 r10 = _mm_mul_pd(rsq10,rinv10);
1534 /* Calculate table index by multiplying r with table scale and truncate to integer */
1535 rt = _mm_mul_pd(r10,vftabscale);
1536 vfitab = _mm_cvttpd_epi32(rt);
1537 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1538 vfitab = _mm_slli_epi32(vfitab,2);
1540 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1541 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1542 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1543 GMX_MM_TRANSPOSE2_PD(Y,F);
1544 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1545 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1546 GMX_MM_TRANSPOSE2_PD(G,H);
1547 Heps = _mm_mul_pd(vfeps,H);
1548 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1549 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1550 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1554 /* Calculate temporary vectorial force */
1555 tx = _mm_mul_pd(fscal,dx10);
1556 ty = _mm_mul_pd(fscal,dy10);
1557 tz = _mm_mul_pd(fscal,dz10);
1559 /* Update vectorial force */
1560 fix1 = _mm_add_pd(fix1,tx);
1561 fiy1 = _mm_add_pd(fiy1,ty);
1562 fiz1 = _mm_add_pd(fiz1,tz);
1564 fjx0 = _mm_add_pd(fjx0,tx);
1565 fjy0 = _mm_add_pd(fjy0,ty);
1566 fjz0 = _mm_add_pd(fjz0,tz);
1568 /**************************
1569 * CALCULATE INTERACTIONS *
1570 **************************/
1572 r11 = _mm_mul_pd(rsq11,rinv11);
1574 /* Calculate table index by multiplying r with table scale and truncate to integer */
1575 rt = _mm_mul_pd(r11,vftabscale);
1576 vfitab = _mm_cvttpd_epi32(rt);
1577 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1578 vfitab = _mm_slli_epi32(vfitab,2);
1580 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1581 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1582 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1583 GMX_MM_TRANSPOSE2_PD(Y,F);
1584 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1585 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1586 GMX_MM_TRANSPOSE2_PD(G,H);
1587 Heps = _mm_mul_pd(vfeps,H);
1588 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1589 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1590 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1594 /* Calculate temporary vectorial force */
1595 tx = _mm_mul_pd(fscal,dx11);
1596 ty = _mm_mul_pd(fscal,dy11);
1597 tz = _mm_mul_pd(fscal,dz11);
1599 /* Update vectorial force */
1600 fix1 = _mm_add_pd(fix1,tx);
1601 fiy1 = _mm_add_pd(fiy1,ty);
1602 fiz1 = _mm_add_pd(fiz1,tz);
1604 fjx1 = _mm_add_pd(fjx1,tx);
1605 fjy1 = _mm_add_pd(fjy1,ty);
1606 fjz1 = _mm_add_pd(fjz1,tz);
1608 /**************************
1609 * CALCULATE INTERACTIONS *
1610 **************************/
1612 r12 = _mm_mul_pd(rsq12,rinv12);
1614 /* Calculate table index by multiplying r with table scale and truncate to integer */
1615 rt = _mm_mul_pd(r12,vftabscale);
1616 vfitab = _mm_cvttpd_epi32(rt);
1617 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1618 vfitab = _mm_slli_epi32(vfitab,2);
1620 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1621 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1622 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1623 GMX_MM_TRANSPOSE2_PD(Y,F);
1624 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1625 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1626 GMX_MM_TRANSPOSE2_PD(G,H);
1627 Heps = _mm_mul_pd(vfeps,H);
1628 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1629 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1630 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1634 /* Calculate temporary vectorial force */
1635 tx = _mm_mul_pd(fscal,dx12);
1636 ty = _mm_mul_pd(fscal,dy12);
1637 tz = _mm_mul_pd(fscal,dz12);
1639 /* Update vectorial force */
1640 fix1 = _mm_add_pd(fix1,tx);
1641 fiy1 = _mm_add_pd(fiy1,ty);
1642 fiz1 = _mm_add_pd(fiz1,tz);
1644 fjx2 = _mm_add_pd(fjx2,tx);
1645 fjy2 = _mm_add_pd(fjy2,ty);
1646 fjz2 = _mm_add_pd(fjz2,tz);
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 r20 = _mm_mul_pd(rsq20,rinv20);
1654 /* Calculate table index by multiplying r with table scale and truncate to integer */
1655 rt = _mm_mul_pd(r20,vftabscale);
1656 vfitab = _mm_cvttpd_epi32(rt);
1657 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1658 vfitab = _mm_slli_epi32(vfitab,2);
1660 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1661 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1662 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1663 GMX_MM_TRANSPOSE2_PD(Y,F);
1664 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1665 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1666 GMX_MM_TRANSPOSE2_PD(G,H);
1667 Heps = _mm_mul_pd(vfeps,H);
1668 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1669 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1670 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1674 /* Calculate temporary vectorial force */
1675 tx = _mm_mul_pd(fscal,dx20);
1676 ty = _mm_mul_pd(fscal,dy20);
1677 tz = _mm_mul_pd(fscal,dz20);
1679 /* Update vectorial force */
1680 fix2 = _mm_add_pd(fix2,tx);
1681 fiy2 = _mm_add_pd(fiy2,ty);
1682 fiz2 = _mm_add_pd(fiz2,tz);
1684 fjx0 = _mm_add_pd(fjx0,tx);
1685 fjy0 = _mm_add_pd(fjy0,ty);
1686 fjz0 = _mm_add_pd(fjz0,tz);
1688 /**************************
1689 * CALCULATE INTERACTIONS *
1690 **************************/
1692 r21 = _mm_mul_pd(rsq21,rinv21);
1694 /* Calculate table index by multiplying r with table scale and truncate to integer */
1695 rt = _mm_mul_pd(r21,vftabscale);
1696 vfitab = _mm_cvttpd_epi32(rt);
1697 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1698 vfitab = _mm_slli_epi32(vfitab,2);
1700 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1701 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1702 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1703 GMX_MM_TRANSPOSE2_PD(Y,F);
1704 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1705 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1706 GMX_MM_TRANSPOSE2_PD(G,H);
1707 Heps = _mm_mul_pd(vfeps,H);
1708 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1709 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1710 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1714 /* Calculate temporary vectorial force */
1715 tx = _mm_mul_pd(fscal,dx21);
1716 ty = _mm_mul_pd(fscal,dy21);
1717 tz = _mm_mul_pd(fscal,dz21);
1719 /* Update vectorial force */
1720 fix2 = _mm_add_pd(fix2,tx);
1721 fiy2 = _mm_add_pd(fiy2,ty);
1722 fiz2 = _mm_add_pd(fiz2,tz);
1724 fjx1 = _mm_add_pd(fjx1,tx);
1725 fjy1 = _mm_add_pd(fjy1,ty);
1726 fjz1 = _mm_add_pd(fjz1,tz);
1728 /**************************
1729 * CALCULATE INTERACTIONS *
1730 **************************/
1732 r22 = _mm_mul_pd(rsq22,rinv22);
1734 /* Calculate table index by multiplying r with table scale and truncate to integer */
1735 rt = _mm_mul_pd(r22,vftabscale);
1736 vfitab = _mm_cvttpd_epi32(rt);
1737 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1738 vfitab = _mm_slli_epi32(vfitab,2);
1740 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1741 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1742 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1743 GMX_MM_TRANSPOSE2_PD(Y,F);
1744 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1745 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1746 GMX_MM_TRANSPOSE2_PD(G,H);
1747 Heps = _mm_mul_pd(vfeps,H);
1748 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1749 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1750 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1754 /* Calculate temporary vectorial force */
1755 tx = _mm_mul_pd(fscal,dx22);
1756 ty = _mm_mul_pd(fscal,dy22);
1757 tz = _mm_mul_pd(fscal,dz22);
1759 /* Update vectorial force */
1760 fix2 = _mm_add_pd(fix2,tx);
1761 fiy2 = _mm_add_pd(fiy2,ty);
1762 fiz2 = _mm_add_pd(fiz2,tz);
1764 fjx2 = _mm_add_pd(fjx2,tx);
1765 fjy2 = _mm_add_pd(fjy2,ty);
1766 fjz2 = _mm_add_pd(fjz2,tz);
1768 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1770 /* Inner loop uses 351 flops */
1773 if(jidx<j_index_end)
1777 j_coord_offsetA = DIM*jnrA;
1779 /* load j atom coordinates */
1780 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1781 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1783 /* Calculate displacement vector */
1784 dx00 = _mm_sub_pd(ix0,jx0);
1785 dy00 = _mm_sub_pd(iy0,jy0);
1786 dz00 = _mm_sub_pd(iz0,jz0);
1787 dx01 = _mm_sub_pd(ix0,jx1);
1788 dy01 = _mm_sub_pd(iy0,jy1);
1789 dz01 = _mm_sub_pd(iz0,jz1);
1790 dx02 = _mm_sub_pd(ix0,jx2);
1791 dy02 = _mm_sub_pd(iy0,jy2);
1792 dz02 = _mm_sub_pd(iz0,jz2);
1793 dx10 = _mm_sub_pd(ix1,jx0);
1794 dy10 = _mm_sub_pd(iy1,jy0);
1795 dz10 = _mm_sub_pd(iz1,jz0);
1796 dx11 = _mm_sub_pd(ix1,jx1);
1797 dy11 = _mm_sub_pd(iy1,jy1);
1798 dz11 = _mm_sub_pd(iz1,jz1);
1799 dx12 = _mm_sub_pd(ix1,jx2);
1800 dy12 = _mm_sub_pd(iy1,jy2);
1801 dz12 = _mm_sub_pd(iz1,jz2);
1802 dx20 = _mm_sub_pd(ix2,jx0);
1803 dy20 = _mm_sub_pd(iy2,jy0);
1804 dz20 = _mm_sub_pd(iz2,jz0);
1805 dx21 = _mm_sub_pd(ix2,jx1);
1806 dy21 = _mm_sub_pd(iy2,jy1);
1807 dz21 = _mm_sub_pd(iz2,jz1);
1808 dx22 = _mm_sub_pd(ix2,jx2);
1809 dy22 = _mm_sub_pd(iy2,jy2);
1810 dz22 = _mm_sub_pd(iz2,jz2);
1812 /* Calculate squared distance and things based on it */
1813 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1814 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1815 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1816 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1817 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1818 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1819 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1820 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1821 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1823 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1824 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1825 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1826 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1827 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1828 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1829 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1830 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1831 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1833 fjx0 = _mm_setzero_pd();
1834 fjy0 = _mm_setzero_pd();
1835 fjz0 = _mm_setzero_pd();
1836 fjx1 = _mm_setzero_pd();
1837 fjy1 = _mm_setzero_pd();
1838 fjz1 = _mm_setzero_pd();
1839 fjx2 = _mm_setzero_pd();
1840 fjy2 = _mm_setzero_pd();
1841 fjz2 = _mm_setzero_pd();
1843 /**************************
1844 * CALCULATE INTERACTIONS *
1845 **************************/
1847 r00 = _mm_mul_pd(rsq00,rinv00);
1849 /* Calculate table index by multiplying r with table scale and truncate to integer */
1850 rt = _mm_mul_pd(r00,vftabscale);
1851 vfitab = _mm_cvttpd_epi32(rt);
1852 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1853 vfitab = _mm_slli_epi32(vfitab,2);
1855 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1856 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1857 F = _mm_setzero_pd();
1858 GMX_MM_TRANSPOSE2_PD(Y,F);
1859 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1860 H = _mm_setzero_pd();
1861 GMX_MM_TRANSPOSE2_PD(G,H);
1862 Heps = _mm_mul_pd(vfeps,H);
1863 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1864 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1865 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1869 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1871 /* Calculate temporary vectorial force */
1872 tx = _mm_mul_pd(fscal,dx00);
1873 ty = _mm_mul_pd(fscal,dy00);
1874 tz = _mm_mul_pd(fscal,dz00);
1876 /* Update vectorial force */
1877 fix0 = _mm_add_pd(fix0,tx);
1878 fiy0 = _mm_add_pd(fiy0,ty);
1879 fiz0 = _mm_add_pd(fiz0,tz);
1881 fjx0 = _mm_add_pd(fjx0,tx);
1882 fjy0 = _mm_add_pd(fjy0,ty);
1883 fjz0 = _mm_add_pd(fjz0,tz);
1885 /**************************
1886 * CALCULATE INTERACTIONS *
1887 **************************/
1889 r01 = _mm_mul_pd(rsq01,rinv01);
1892 /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
1892 rt = _mm_mul_pd(r01,vftabscale);
1893 vfitab = _mm_cvttpd_epi32(rt);
1894 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1895 vfitab = _mm_slli_epi32(vfitab,2);
1897 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1898 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1899 F = _mm_setzero_pd();
1900 GMX_MM_TRANSPOSE2_PD(Y,F);
1901 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1902 H = _mm_setzero_pd();
1903 GMX_MM_TRANSPOSE2_PD(G,H);
1904 Heps = _mm_mul_pd(vfeps,H);
1905 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1906 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1907 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1911 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1913 /* Calculate temporary vectorial force */
1914 tx = _mm_mul_pd(fscal,dx01);
1915 ty = _mm_mul_pd(fscal,dy01);
1916 tz = _mm_mul_pd(fscal,dz01);
1918 /* Update vectorial force */
1919 fix0 = _mm_add_pd(fix0,tx);
1920 fiy0 = _mm_add_pd(fiy0,ty);
1921 fiz0 = _mm_add_pd(fiz0,tz);
1923 fjx1 = _mm_add_pd(fjx1,tx);
1924 fjy1 = _mm_add_pd(fjy1,ty);
1925 fjz1 = _mm_add_pd(fjz1,tz);
1927 /**************************
1928 * CALCULATE INTERACTIONS *
1929 **************************/
1931 r02 = _mm_mul_pd(rsq02,rinv02);
1934 /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
1934 rt = _mm_mul_pd(r02,vftabscale);
1935 vfitab = _mm_cvttpd_epi32(rt);
1936 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1937 vfitab = _mm_slli_epi32(vfitab,2);
1939 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1940 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1941 F = _mm_setzero_pd();
1942 GMX_MM_TRANSPOSE2_PD(Y,F);
1943 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1944 H = _mm_setzero_pd();
1945 GMX_MM_TRANSPOSE2_PD(G,H);
1946 Heps = _mm_mul_pd(vfeps,H);
1947 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1948 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1949 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1953 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1955 /* Calculate temporary vectorial force */
1956 tx = _mm_mul_pd(fscal,dx02);
1957 ty = _mm_mul_pd(fscal,dy02);
1958 tz = _mm_mul_pd(fscal,dz02);
1960 /* Update vectorial force */
1961 fix0 = _mm_add_pd(fix0,tx);
1962 fiy0 = _mm_add_pd(fiy0,ty);
1963 fiz0 = _mm_add_pd(fiz0,tz);
1965 fjx2 = _mm_add_pd(fjx2,tx);
1966 fjy2 = _mm_add_pd(fjy2,ty);
1967 fjz2 = _mm_add_pd(fjz2,tz);
1969 /**************************
1970 * CALCULATE INTERACTIONS *
1971 **************************/
1973 r10 = _mm_mul_pd(rsq10,rinv10);
1976 /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
1976 rt = _mm_mul_pd(r10,vftabscale);
1977 vfitab = _mm_cvttpd_epi32(rt);
1978 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1979 vfitab = _mm_slli_epi32(vfitab,2);
1981 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1982 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1983 F = _mm_setzero_pd();
1984 GMX_MM_TRANSPOSE2_PD(Y,F);
1985 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1986 H = _mm_setzero_pd();
1987 GMX_MM_TRANSPOSE2_PD(G,H);
1988 Heps = _mm_mul_pd(vfeps,H);
1989 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1990 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1991 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1995 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1997 /* Calculate temporary vectorial force */
1998 tx = _mm_mul_pd(fscal,dx10);
1999 ty = _mm_mul_pd(fscal,dy10);
2000 tz = _mm_mul_pd(fscal,dz10);
2002 /* Update vectorial force */
2003 fix1 = _mm_add_pd(fix1,tx);
2004 fiy1 = _mm_add_pd(fiy1,ty);
2005 fiz1 = _mm_add_pd(fiz1,tz);
2007 fjx0 = _mm_add_pd(fjx0,tx);
2008 fjy0 = _mm_add_pd(fjy0,ty);
2009 fjz0 = _mm_add_pd(fjz0,tz);
2011 /**************************
2012 * CALCULATE INTERACTIONS *
2013 **************************/
2015 r11 = _mm_mul_pd(rsq11,rinv11);
2018 /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
2018 rt = _mm_mul_pd(r11,vftabscale);
2019 vfitab = _mm_cvttpd_epi32(rt);
2020 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2021 vfitab = _mm_slli_epi32(vfitab,2);
2023 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2024 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2025 F = _mm_setzero_pd();
2026 GMX_MM_TRANSPOSE2_PD(Y,F);
2027 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2028 H = _mm_setzero_pd();
2029 GMX_MM_TRANSPOSE2_PD(G,H);
2030 Heps = _mm_mul_pd(vfeps,H);
2031 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2032 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2033 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2037 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2039 /* Calculate temporary vectorial force */
2040 tx = _mm_mul_pd(fscal,dx11);
2041 ty = _mm_mul_pd(fscal,dy11);
2042 tz = _mm_mul_pd(fscal,dz11);
2044 /* Update vectorial force */
2045 fix1 = _mm_add_pd(fix1,tx);
2046 fiy1 = _mm_add_pd(fiy1,ty);
2047 fiz1 = _mm_add_pd(fiz1,tz);
2049 fjx1 = _mm_add_pd(fjx1,tx);
2050 fjy1 = _mm_add_pd(fjy1,ty);
2051 fjz1 = _mm_add_pd(fjz1,tz);
2053 /**************************
2054 * CALCULATE INTERACTIONS *
2055 **************************/
2057 r12 = _mm_mul_pd(rsq12,rinv12);
2060 /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
2060 rt = _mm_mul_pd(r12,vftabscale);
2061 vfitab = _mm_cvttpd_epi32(rt);
2062 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2063 vfitab = _mm_slli_epi32(vfitab,2);
2065 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2066 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2067 F = _mm_setzero_pd();
2068 GMX_MM_TRANSPOSE2_PD(Y,F);
2069 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2070 H = _mm_setzero_pd();
2071 GMX_MM_TRANSPOSE2_PD(G,H);
2072 Heps = _mm_mul_pd(vfeps,H);
2073 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2074 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2075 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2079 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2081 /* Calculate temporary vectorial force */
2082 tx = _mm_mul_pd(fscal,dx12);
2083 ty = _mm_mul_pd(fscal,dy12);
2084 tz = _mm_mul_pd(fscal,dz12);
2086 /* Update vectorial force */
2087 fix1 = _mm_add_pd(fix1,tx);
2088 fiy1 = _mm_add_pd(fiy1,ty);
2089 fiz1 = _mm_add_pd(fiz1,tz);
2091 fjx2 = _mm_add_pd(fjx2,tx);
2092 fjy2 = _mm_add_pd(fjy2,ty);
2093 fjz2 = _mm_add_pd(fjz2,tz);
2095 /**************************
2096 * CALCULATE INTERACTIONS *
2097 **************************/
2099 r20 = _mm_mul_pd(rsq20,rinv20);
2102 /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
2102 rt = _mm_mul_pd(r20,vftabscale);
2103 vfitab = _mm_cvttpd_epi32(rt);
2104 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2105 vfitab = _mm_slli_epi32(vfitab,2);
2107 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2108 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2109 F = _mm_setzero_pd();
2110 GMX_MM_TRANSPOSE2_PD(Y,F);
2111 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2112 H = _mm_setzero_pd();
2113 GMX_MM_TRANSPOSE2_PD(G,H);
2114 Heps = _mm_mul_pd(vfeps,H);
2115 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2116 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2117 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2121 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2123 /* Calculate temporary vectorial force */
2124 tx = _mm_mul_pd(fscal,dx20);
2125 ty = _mm_mul_pd(fscal,dy20);
2126 tz = _mm_mul_pd(fscal,dz20);
2128 /* Update vectorial force */
2129 fix2 = _mm_add_pd(fix2,tx);
2130 fiy2 = _mm_add_pd(fiy2,ty);
2131 fiz2 = _mm_add_pd(fiz2,tz);
2133 fjx0 = _mm_add_pd(fjx0,tx);
2134 fjy0 = _mm_add_pd(fjy0,ty);
2135 fjz0 = _mm_add_pd(fjz0,tz);
2137 /**************************
2138 * CALCULATE INTERACTIONS *
2139 **************************/
2141 r21 = _mm_mul_pd(rsq21,rinv21);
2144 /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
2144 rt = _mm_mul_pd(r21,vftabscale);
2145 vfitab = _mm_cvttpd_epi32(rt);
2146 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2147 vfitab = _mm_slli_epi32(vfitab,2);
2149 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2150 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2151 F = _mm_setzero_pd();
2152 GMX_MM_TRANSPOSE2_PD(Y,F);
2153 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2154 H = _mm_setzero_pd();
2155 GMX_MM_TRANSPOSE2_PD(G,H);
2156 Heps = _mm_mul_pd(vfeps,H);
2157 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2158 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2159 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2163 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2165 /* Calculate temporary vectorial force */
2166 tx = _mm_mul_pd(fscal,dx21);
2167 ty = _mm_mul_pd(fscal,dy21);
2168 tz = _mm_mul_pd(fscal,dz21);
2170 /* Update vectorial force */
2171 fix2 = _mm_add_pd(fix2,tx);
2172 fiy2 = _mm_add_pd(fiy2,ty);
2173 fiz2 = _mm_add_pd(fiz2,tz);
2175 fjx1 = _mm_add_pd(fjx1,tx);
2176 fjy1 = _mm_add_pd(fjy1,ty);
2177 fjz1 = _mm_add_pd(fjz1,tz);
2179 /**************************
2180 * CALCULATE INTERACTIONS *
2181 **************************/
2183 r22 = _mm_mul_pd(rsq22,rinv22);
2186 /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
2186 rt = _mm_mul_pd(r22,vftabscale);
2187 vfitab = _mm_cvttpd_epi32(rt);
2188 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2189 vfitab = _mm_slli_epi32(vfitab,2);
2191 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2192 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2193 F = _mm_setzero_pd();
2194 GMX_MM_TRANSPOSE2_PD(Y,F);
2195 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2196 H = _mm_setzero_pd();
2197 GMX_MM_TRANSPOSE2_PD(G,H);
2198 Heps = _mm_mul_pd(vfeps,H);
2199 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2200 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2201 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2205 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2207 /* Calculate temporary vectorial force */
2208 tx = _mm_mul_pd(fscal,dx22);
2209 ty = _mm_mul_pd(fscal,dy22);
2210 tz = _mm_mul_pd(fscal,dz22);
2212 /* Update vectorial force */
2213 fix2 = _mm_add_pd(fix2,tx);
2214 fiy2 = _mm_add_pd(fiy2,ty);
2215 fiz2 = _mm_add_pd(fiz2,tz);
2217 fjx2 = _mm_add_pd(fjx2,tx);
2218 fjy2 = _mm_add_pd(fjy2,ty);
2219 fjz2 = _mm_add_pd(fjz2,tz);
2221 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2223 /* Inner loop uses 351 flops */
2226 /* End of innermost loop */
2228 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2229 f+i_coord_offset,fshift+i_shift_offset);
2231 /* Increment number of inner iterations */
2232 inneriter += j_index_end - j_index_start;
2234 /* Outer loop uses 18 flops */
2237 /* Increment number of outer iterations */
2240 /* Update outer/inner flops */
2242 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*351);