2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_sse4_1_double.h"
50 #include "kernelutil_x86_sse4_1_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse4_1_double
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: CubicSplineTable
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse4_1_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the two positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
110 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
112 __m128i ifour = _mm_set1_epi32(4);
113 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
115 __m128d dummy_mask,cutoff_mask;
116 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
117 __m128d one = _mm_set1_pd(1.0);
118 __m128d two = _mm_set1_pd(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm_set1_pd(fr->epsfac);
131 charge = mdatoms->chargeA;
132 nvdwtype = fr->ntype;
134 vdwtype = mdatoms->typeA;
136 vftab = kernel_data->table_elec_vdw->data;
137 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
142 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
143 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
144 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
146 jq0 = _mm_set1_pd(charge[inr+0]);
147 jq1 = _mm_set1_pd(charge[inr+1]);
148 jq2 = _mm_set1_pd(charge[inr+2]);
149 vdwjidx0A = 2*vdwtype[inr+0];
150 qq00 = _mm_mul_pd(iq0,jq0);
151 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
152 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
153 qq01 = _mm_mul_pd(iq0,jq1);
154 qq02 = _mm_mul_pd(iq0,jq2);
155 qq10 = _mm_mul_pd(iq1,jq0);
156 qq11 = _mm_mul_pd(iq1,jq1);
157 qq12 = _mm_mul_pd(iq1,jq2);
158 qq20 = _mm_mul_pd(iq2,jq0);
159 qq21 = _mm_mul_pd(iq2,jq1);
160 qq22 = _mm_mul_pd(iq2,jq2);
162 /* Avoid stupid compiler warnings */
170 /* Start outer loop over neighborlists */
171 for(iidx=0; iidx<nri; iidx++)
173 /* Load shift vector for this list */
174 i_shift_offset = DIM*shiftidx[iidx];
176 /* Load limits for loop over neighbors */
177 j_index_start = jindex[iidx];
178 j_index_end = jindex[iidx+1];
180 /* Get outer coordinate index */
182 i_coord_offset = DIM*inr;
184 /* Load i particle coords and add shift vector */
185 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
186 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
188 fix0 = _mm_setzero_pd();
189 fiy0 = _mm_setzero_pd();
190 fiz0 = _mm_setzero_pd();
191 fix1 = _mm_setzero_pd();
192 fiy1 = _mm_setzero_pd();
193 fiz1 = _mm_setzero_pd();
194 fix2 = _mm_setzero_pd();
195 fiy2 = _mm_setzero_pd();
196 fiz2 = _mm_setzero_pd();
198 /* Reset potential sums */
199 velecsum = _mm_setzero_pd();
200 vvdwsum = _mm_setzero_pd();
202 /* Start inner kernel loop */
203 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
206 /* Get j neighbor index, and coordinate index */
209 j_coord_offsetA = DIM*jnrA;
210 j_coord_offsetB = DIM*jnrB;
212 /* load j atom coordinates */
213 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
216 /* Calculate displacement vector */
217 dx00 = _mm_sub_pd(ix0,jx0);
218 dy00 = _mm_sub_pd(iy0,jy0);
219 dz00 = _mm_sub_pd(iz0,jz0);
220 dx01 = _mm_sub_pd(ix0,jx1);
221 dy01 = _mm_sub_pd(iy0,jy1);
222 dz01 = _mm_sub_pd(iz0,jz1);
223 dx02 = _mm_sub_pd(ix0,jx2);
224 dy02 = _mm_sub_pd(iy0,jy2);
225 dz02 = _mm_sub_pd(iz0,jz2);
226 dx10 = _mm_sub_pd(ix1,jx0);
227 dy10 = _mm_sub_pd(iy1,jy0);
228 dz10 = _mm_sub_pd(iz1,jz0);
229 dx11 = _mm_sub_pd(ix1,jx1);
230 dy11 = _mm_sub_pd(iy1,jy1);
231 dz11 = _mm_sub_pd(iz1,jz1);
232 dx12 = _mm_sub_pd(ix1,jx2);
233 dy12 = _mm_sub_pd(iy1,jy2);
234 dz12 = _mm_sub_pd(iz1,jz2);
235 dx20 = _mm_sub_pd(ix2,jx0);
236 dy20 = _mm_sub_pd(iy2,jy0);
237 dz20 = _mm_sub_pd(iz2,jz0);
238 dx21 = _mm_sub_pd(ix2,jx1);
239 dy21 = _mm_sub_pd(iy2,jy1);
240 dz21 = _mm_sub_pd(iz2,jz1);
241 dx22 = _mm_sub_pd(ix2,jx2);
242 dy22 = _mm_sub_pd(iy2,jy2);
243 dz22 = _mm_sub_pd(iz2,jz2);
245 /* Calculate squared distance and things based on it */
246 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
247 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
248 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
249 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
250 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
251 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
252 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
253 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
254 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
256 rinv00 = gmx_mm_invsqrt_pd(rsq00);
257 rinv01 = gmx_mm_invsqrt_pd(rsq01);
258 rinv02 = gmx_mm_invsqrt_pd(rsq02);
259 rinv10 = gmx_mm_invsqrt_pd(rsq10);
260 rinv11 = gmx_mm_invsqrt_pd(rsq11);
261 rinv12 = gmx_mm_invsqrt_pd(rsq12);
262 rinv20 = gmx_mm_invsqrt_pd(rsq20);
263 rinv21 = gmx_mm_invsqrt_pd(rsq21);
264 rinv22 = gmx_mm_invsqrt_pd(rsq22);
266 fjx0 = _mm_setzero_pd();
267 fjy0 = _mm_setzero_pd();
268 fjz0 = _mm_setzero_pd();
269 fjx1 = _mm_setzero_pd();
270 fjy1 = _mm_setzero_pd();
271 fjz1 = _mm_setzero_pd();
272 fjx2 = _mm_setzero_pd();
273 fjy2 = _mm_setzero_pd();
274 fjz2 = _mm_setzero_pd();
276 /**************************
277 * CALCULATE INTERACTIONS *
278 **************************/
280 r00 = _mm_mul_pd(rsq00,rinv00);
282 /* Calculate table index by multiplying r with table scale and truncate to integer */
283 rt = _mm_mul_pd(r00,vftabscale);
284 vfitab = _mm_cvttpd_epi32(rt);
285 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
286 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
288 /* CUBIC SPLINE TABLE ELECTROSTATICS */
289 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
290 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
291 GMX_MM_TRANSPOSE2_PD(Y,F);
292 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
293 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
294 GMX_MM_TRANSPOSE2_PD(G,H);
295 Heps = _mm_mul_pd(vfeps,H);
296 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
297 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
298 velec = _mm_mul_pd(qq00,VV);
299 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
300 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
302 /* CUBIC SPLINE TABLE DISPERSION */
303 vfitab = _mm_add_epi32(vfitab,ifour);
304 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
305 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
306 GMX_MM_TRANSPOSE2_PD(Y,F);
307 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
308 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
309 GMX_MM_TRANSPOSE2_PD(G,H);
310 Heps = _mm_mul_pd(vfeps,H);
311 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
312 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
313 vvdw6 = _mm_mul_pd(c6_00,VV);
314 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
315 fvdw6 = _mm_mul_pd(c6_00,FF);
317 /* CUBIC SPLINE TABLE REPULSION */
318 vfitab = _mm_add_epi32(vfitab,ifour);
319 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
320 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
321 GMX_MM_TRANSPOSE2_PD(Y,F);
322 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
323 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
324 GMX_MM_TRANSPOSE2_PD(G,H);
325 Heps = _mm_mul_pd(vfeps,H);
326 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
327 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
328 vvdw12 = _mm_mul_pd(c12_00,VV);
329 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
330 fvdw12 = _mm_mul_pd(c12_00,FF);
331 vvdw = _mm_add_pd(vvdw12,vvdw6);
332 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
334 /* Update potential sum for this i atom from the interaction with this j atom. */
335 velecsum = _mm_add_pd(velecsum,velec);
336 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
338 fscal = _mm_add_pd(felec,fvdw);
340 /* Calculate temporary vectorial force */
341 tx = _mm_mul_pd(fscal,dx00);
342 ty = _mm_mul_pd(fscal,dy00);
343 tz = _mm_mul_pd(fscal,dz00);
345 /* Update vectorial force */
346 fix0 = _mm_add_pd(fix0,tx);
347 fiy0 = _mm_add_pd(fiy0,ty);
348 fiz0 = _mm_add_pd(fiz0,tz);
350 fjx0 = _mm_add_pd(fjx0,tx);
351 fjy0 = _mm_add_pd(fjy0,ty);
352 fjz0 = _mm_add_pd(fjz0,tz);
354 /**************************
355 * CALCULATE INTERACTIONS *
356 **************************/
358 r01 = _mm_mul_pd(rsq01,rinv01);
360 /* Calculate table index by multiplying r with table scale and truncate to integer */
361 rt = _mm_mul_pd(r01,vftabscale);
362 vfitab = _mm_cvttpd_epi32(rt);
363 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
364 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
366 /* CUBIC SPLINE TABLE ELECTROSTATICS */
367 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
368 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
369 GMX_MM_TRANSPOSE2_PD(Y,F);
370 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
371 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
372 GMX_MM_TRANSPOSE2_PD(G,H);
373 Heps = _mm_mul_pd(vfeps,H);
374 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
375 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
376 velec = _mm_mul_pd(qq01,VV);
377 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
378 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
380 /* Update potential sum for this i atom from the interaction with this j atom. */
381 velecsum = _mm_add_pd(velecsum,velec);
385 /* Calculate temporary vectorial force */
386 tx = _mm_mul_pd(fscal,dx01);
387 ty = _mm_mul_pd(fscal,dy01);
388 tz = _mm_mul_pd(fscal,dz01);
390 /* Update vectorial force */
391 fix0 = _mm_add_pd(fix0,tx);
392 fiy0 = _mm_add_pd(fiy0,ty);
393 fiz0 = _mm_add_pd(fiz0,tz);
395 fjx1 = _mm_add_pd(fjx1,tx);
396 fjy1 = _mm_add_pd(fjy1,ty);
397 fjz1 = _mm_add_pd(fjz1,tz);
399 /**************************
400 * CALCULATE INTERACTIONS *
401 **************************/
403 r02 = _mm_mul_pd(rsq02,rinv02);
405 /* Calculate table index by multiplying r with table scale and truncate to integer */
406 rt = _mm_mul_pd(r02,vftabscale);
407 vfitab = _mm_cvttpd_epi32(rt);
408 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
409 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
411 /* CUBIC SPLINE TABLE ELECTROSTATICS */
412 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
413 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
414 GMX_MM_TRANSPOSE2_PD(Y,F);
415 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
416 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
417 GMX_MM_TRANSPOSE2_PD(G,H);
418 Heps = _mm_mul_pd(vfeps,H);
419 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
420 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
421 velec = _mm_mul_pd(qq02,VV);
422 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
423 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
425 /* Update potential sum for this i atom from the interaction with this j atom. */
426 velecsum = _mm_add_pd(velecsum,velec);
430 /* Calculate temporary vectorial force */
431 tx = _mm_mul_pd(fscal,dx02);
432 ty = _mm_mul_pd(fscal,dy02);
433 tz = _mm_mul_pd(fscal,dz02);
435 /* Update vectorial force */
436 fix0 = _mm_add_pd(fix0,tx);
437 fiy0 = _mm_add_pd(fiy0,ty);
438 fiz0 = _mm_add_pd(fiz0,tz);
440 fjx2 = _mm_add_pd(fjx2,tx);
441 fjy2 = _mm_add_pd(fjy2,ty);
442 fjz2 = _mm_add_pd(fjz2,tz);
444 /**************************
445 * CALCULATE INTERACTIONS *
446 **************************/
448 r10 = _mm_mul_pd(rsq10,rinv10);
450 /* Calculate table index by multiplying r with table scale and truncate to integer */
451 rt = _mm_mul_pd(r10,vftabscale);
452 vfitab = _mm_cvttpd_epi32(rt);
453 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
454 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
456 /* CUBIC SPLINE TABLE ELECTROSTATICS */
457 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
458 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
459 GMX_MM_TRANSPOSE2_PD(Y,F);
460 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
461 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
462 GMX_MM_TRANSPOSE2_PD(G,H);
463 Heps = _mm_mul_pd(vfeps,H);
464 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
465 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
466 velec = _mm_mul_pd(qq10,VV);
467 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
468 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velecsum = _mm_add_pd(velecsum,velec);
475 /* Calculate temporary vectorial force */
476 tx = _mm_mul_pd(fscal,dx10);
477 ty = _mm_mul_pd(fscal,dy10);
478 tz = _mm_mul_pd(fscal,dz10);
480 /* Update vectorial force */
481 fix1 = _mm_add_pd(fix1,tx);
482 fiy1 = _mm_add_pd(fiy1,ty);
483 fiz1 = _mm_add_pd(fiz1,tz);
485 fjx0 = _mm_add_pd(fjx0,tx);
486 fjy0 = _mm_add_pd(fjy0,ty);
487 fjz0 = _mm_add_pd(fjz0,tz);
489 /**************************
490 * CALCULATE INTERACTIONS *
491 **************************/
493 r11 = _mm_mul_pd(rsq11,rinv11);
495 /* Calculate table index by multiplying r with table scale and truncate to integer */
496 rt = _mm_mul_pd(r11,vftabscale);
497 vfitab = _mm_cvttpd_epi32(rt);
498 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
499 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
501 /* CUBIC SPLINE TABLE ELECTROSTATICS */
502 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
503 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
504 GMX_MM_TRANSPOSE2_PD(Y,F);
505 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
506 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
507 GMX_MM_TRANSPOSE2_PD(G,H);
508 Heps = _mm_mul_pd(vfeps,H);
509 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
510 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
511 velec = _mm_mul_pd(qq11,VV);
512 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
513 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
515 /* Update potential sum for this i atom from the interaction with this j atom. */
516 velecsum = _mm_add_pd(velecsum,velec);
520 /* Calculate temporary vectorial force */
521 tx = _mm_mul_pd(fscal,dx11);
522 ty = _mm_mul_pd(fscal,dy11);
523 tz = _mm_mul_pd(fscal,dz11);
525 /* Update vectorial force */
526 fix1 = _mm_add_pd(fix1,tx);
527 fiy1 = _mm_add_pd(fiy1,ty);
528 fiz1 = _mm_add_pd(fiz1,tz);
530 fjx1 = _mm_add_pd(fjx1,tx);
531 fjy1 = _mm_add_pd(fjy1,ty);
532 fjz1 = _mm_add_pd(fjz1,tz);
534 /**************************
535 * CALCULATE INTERACTIONS *
536 **************************/
538 r12 = _mm_mul_pd(rsq12,rinv12);
540 /* Calculate table index by multiplying r with table scale and truncate to integer */
541 rt = _mm_mul_pd(r12,vftabscale);
542 vfitab = _mm_cvttpd_epi32(rt);
543 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
544 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
546 /* CUBIC SPLINE TABLE ELECTROSTATICS */
547 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
548 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
549 GMX_MM_TRANSPOSE2_PD(Y,F);
550 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
551 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
552 GMX_MM_TRANSPOSE2_PD(G,H);
553 Heps = _mm_mul_pd(vfeps,H);
554 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
555 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
556 velec = _mm_mul_pd(qq12,VV);
557 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
558 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
560 /* Update potential sum for this i atom from the interaction with this j atom. */
561 velecsum = _mm_add_pd(velecsum,velec);
565 /* Calculate temporary vectorial force */
566 tx = _mm_mul_pd(fscal,dx12);
567 ty = _mm_mul_pd(fscal,dy12);
568 tz = _mm_mul_pd(fscal,dz12);
570 /* Update vectorial force */
571 fix1 = _mm_add_pd(fix1,tx);
572 fiy1 = _mm_add_pd(fiy1,ty);
573 fiz1 = _mm_add_pd(fiz1,tz);
575 fjx2 = _mm_add_pd(fjx2,tx);
576 fjy2 = _mm_add_pd(fjy2,ty);
577 fjz2 = _mm_add_pd(fjz2,tz);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 r20 = _mm_mul_pd(rsq20,rinv20);
585 /* Calculate table index by multiplying r with table scale and truncate to integer */
586 rt = _mm_mul_pd(r20,vftabscale);
587 vfitab = _mm_cvttpd_epi32(rt);
588 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
589 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
591 /* CUBIC SPLINE TABLE ELECTROSTATICS */
592 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
593 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
594 GMX_MM_TRANSPOSE2_PD(Y,F);
595 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
596 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
597 GMX_MM_TRANSPOSE2_PD(G,H);
598 Heps = _mm_mul_pd(vfeps,H);
599 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
600 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
601 velec = _mm_mul_pd(qq20,VV);
602 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
603 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
605 /* Update potential sum for this i atom from the interaction with this j atom. */
606 velecsum = _mm_add_pd(velecsum,velec);
610 /* Calculate temporary vectorial force */
611 tx = _mm_mul_pd(fscal,dx20);
612 ty = _mm_mul_pd(fscal,dy20);
613 tz = _mm_mul_pd(fscal,dz20);
615 /* Update vectorial force */
616 fix2 = _mm_add_pd(fix2,tx);
617 fiy2 = _mm_add_pd(fiy2,ty);
618 fiz2 = _mm_add_pd(fiz2,tz);
620 fjx0 = _mm_add_pd(fjx0,tx);
621 fjy0 = _mm_add_pd(fjy0,ty);
622 fjz0 = _mm_add_pd(fjz0,tz);
624 /**************************
625 * CALCULATE INTERACTIONS *
626 **************************/
628 r21 = _mm_mul_pd(rsq21,rinv21);
630 /* Calculate table index by multiplying r with table scale and truncate to integer */
631 rt = _mm_mul_pd(r21,vftabscale);
632 vfitab = _mm_cvttpd_epi32(rt);
633 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
634 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
636 /* CUBIC SPLINE TABLE ELECTROSTATICS */
637 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
638 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
639 GMX_MM_TRANSPOSE2_PD(Y,F);
640 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
641 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
642 GMX_MM_TRANSPOSE2_PD(G,H);
643 Heps = _mm_mul_pd(vfeps,H);
644 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
645 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
646 velec = _mm_mul_pd(qq21,VV);
647 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
648 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
650 /* Update potential sum for this i atom from the interaction with this j atom. */
651 velecsum = _mm_add_pd(velecsum,velec);
655 /* Calculate temporary vectorial force */
656 tx = _mm_mul_pd(fscal,dx21);
657 ty = _mm_mul_pd(fscal,dy21);
658 tz = _mm_mul_pd(fscal,dz21);
660 /* Update vectorial force */
661 fix2 = _mm_add_pd(fix2,tx);
662 fiy2 = _mm_add_pd(fiy2,ty);
663 fiz2 = _mm_add_pd(fiz2,tz);
665 fjx1 = _mm_add_pd(fjx1,tx);
666 fjy1 = _mm_add_pd(fjy1,ty);
667 fjz1 = _mm_add_pd(fjz1,tz);
669 /**************************
670 * CALCULATE INTERACTIONS *
671 **************************/
673 r22 = _mm_mul_pd(rsq22,rinv22);
675 /* Calculate table index by multiplying r with table scale and truncate to integer */
676 rt = _mm_mul_pd(r22,vftabscale);
677 vfitab = _mm_cvttpd_epi32(rt);
678 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
679 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
681 /* CUBIC SPLINE TABLE ELECTROSTATICS */
682 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
683 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
684 GMX_MM_TRANSPOSE2_PD(Y,F);
685 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
686 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
687 GMX_MM_TRANSPOSE2_PD(G,H);
688 Heps = _mm_mul_pd(vfeps,H);
689 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
690 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
691 velec = _mm_mul_pd(qq22,VV);
692 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
693 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
695 /* Update potential sum for this i atom from the interaction with this j atom. */
696 velecsum = _mm_add_pd(velecsum,velec);
700 /* Calculate temporary vectorial force */
701 tx = _mm_mul_pd(fscal,dx22);
702 ty = _mm_mul_pd(fscal,dy22);
703 tz = _mm_mul_pd(fscal,dz22);
705 /* Update vectorial force */
706 fix2 = _mm_add_pd(fix2,tx);
707 fiy2 = _mm_add_pd(fiy2,ty);
708 fiz2 = _mm_add_pd(fiz2,tz);
710 fjx2 = _mm_add_pd(fjx2,tx);
711 fjy2 = _mm_add_pd(fjy2,ty);
712 fjz2 = _mm_add_pd(fjz2,tz);
714 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
716 /* Inner loop uses 417 flops */
723 j_coord_offsetA = DIM*jnrA;
725 /* load j atom coordinates */
726 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
727 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
729 /* Calculate displacement vector */
730 dx00 = _mm_sub_pd(ix0,jx0);
731 dy00 = _mm_sub_pd(iy0,jy0);
732 dz00 = _mm_sub_pd(iz0,jz0);
733 dx01 = _mm_sub_pd(ix0,jx1);
734 dy01 = _mm_sub_pd(iy0,jy1);
735 dz01 = _mm_sub_pd(iz0,jz1);
736 dx02 = _mm_sub_pd(ix0,jx2);
737 dy02 = _mm_sub_pd(iy0,jy2);
738 dz02 = _mm_sub_pd(iz0,jz2);
739 dx10 = _mm_sub_pd(ix1,jx0);
740 dy10 = _mm_sub_pd(iy1,jy0);
741 dz10 = _mm_sub_pd(iz1,jz0);
742 dx11 = _mm_sub_pd(ix1,jx1);
743 dy11 = _mm_sub_pd(iy1,jy1);
744 dz11 = _mm_sub_pd(iz1,jz1);
745 dx12 = _mm_sub_pd(ix1,jx2);
746 dy12 = _mm_sub_pd(iy1,jy2);
747 dz12 = _mm_sub_pd(iz1,jz2);
748 dx20 = _mm_sub_pd(ix2,jx0);
749 dy20 = _mm_sub_pd(iy2,jy0);
750 dz20 = _mm_sub_pd(iz2,jz0);
751 dx21 = _mm_sub_pd(ix2,jx1);
752 dy21 = _mm_sub_pd(iy2,jy1);
753 dz21 = _mm_sub_pd(iz2,jz1);
754 dx22 = _mm_sub_pd(ix2,jx2);
755 dy22 = _mm_sub_pd(iy2,jy2);
756 dz22 = _mm_sub_pd(iz2,jz2);
758 /* Calculate squared distance and things based on it */
759 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
760 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
761 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
762 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
763 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
764 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
765 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
766 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
767 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
769 rinv00 = gmx_mm_invsqrt_pd(rsq00);
770 rinv01 = gmx_mm_invsqrt_pd(rsq01);
771 rinv02 = gmx_mm_invsqrt_pd(rsq02);
772 rinv10 = gmx_mm_invsqrt_pd(rsq10);
773 rinv11 = gmx_mm_invsqrt_pd(rsq11);
774 rinv12 = gmx_mm_invsqrt_pd(rsq12);
775 rinv20 = gmx_mm_invsqrt_pd(rsq20);
776 rinv21 = gmx_mm_invsqrt_pd(rsq21);
777 rinv22 = gmx_mm_invsqrt_pd(rsq22);
779 fjx0 = _mm_setzero_pd();
780 fjy0 = _mm_setzero_pd();
781 fjz0 = _mm_setzero_pd();
782 fjx1 = _mm_setzero_pd();
783 fjy1 = _mm_setzero_pd();
784 fjz1 = _mm_setzero_pd();
785 fjx2 = _mm_setzero_pd();
786 fjy2 = _mm_setzero_pd();
787 fjz2 = _mm_setzero_pd();
789 /**************************
790 * CALCULATE INTERACTIONS *
791 **************************/
793 r00 = _mm_mul_pd(rsq00,rinv00);
795 /* Calculate table index by multiplying r with table scale and truncate to integer */
796 rt = _mm_mul_pd(r00,vftabscale);
797 vfitab = _mm_cvttpd_epi32(rt);
798 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
799 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
801 /* CUBIC SPLINE TABLE ELECTROSTATICS */
802 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
803 F = _mm_setzero_pd();
804 GMX_MM_TRANSPOSE2_PD(Y,F);
805 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
806 H = _mm_setzero_pd();
807 GMX_MM_TRANSPOSE2_PD(G,H);
808 Heps = _mm_mul_pd(vfeps,H);
809 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
810 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
811 velec = _mm_mul_pd(qq00,VV);
812 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
813 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
815 /* CUBIC SPLINE TABLE DISPERSION */
816 vfitab = _mm_add_epi32(vfitab,ifour);
817 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
818 F = _mm_setzero_pd();
819 GMX_MM_TRANSPOSE2_PD(Y,F);
820 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
821 H = _mm_setzero_pd();
822 GMX_MM_TRANSPOSE2_PD(G,H);
823 Heps = _mm_mul_pd(vfeps,H);
824 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
825 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
826 vvdw6 = _mm_mul_pd(c6_00,VV);
827 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
828 fvdw6 = _mm_mul_pd(c6_00,FF);
830 /* CUBIC SPLINE TABLE REPULSION */
831 vfitab = _mm_add_epi32(vfitab,ifour);
832 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
833 F = _mm_setzero_pd();
834 GMX_MM_TRANSPOSE2_PD(Y,F);
835 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
836 H = _mm_setzero_pd();
837 GMX_MM_TRANSPOSE2_PD(G,H);
838 Heps = _mm_mul_pd(vfeps,H);
839 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
840 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
841 vvdw12 = _mm_mul_pd(c12_00,VV);
842 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
843 fvdw12 = _mm_mul_pd(c12_00,FF);
844 vvdw = _mm_add_pd(vvdw12,vvdw6);
845 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
847 /* Update potential sum for this i atom from the interaction with this j atom. */
848 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
849 velecsum = _mm_add_pd(velecsum,velec);
850 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
851 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
853 fscal = _mm_add_pd(felec,fvdw);
855 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
857 /* Calculate temporary vectorial force */
858 tx = _mm_mul_pd(fscal,dx00);
859 ty = _mm_mul_pd(fscal,dy00);
860 tz = _mm_mul_pd(fscal,dz00);
862 /* Update vectorial force */
863 fix0 = _mm_add_pd(fix0,tx);
864 fiy0 = _mm_add_pd(fiy0,ty);
865 fiz0 = _mm_add_pd(fiz0,tz);
867 fjx0 = _mm_add_pd(fjx0,tx);
868 fjy0 = _mm_add_pd(fjy0,ty);
869 fjz0 = _mm_add_pd(fjz0,tz);
871 /**************************
872 * CALCULATE INTERACTIONS *
873 **************************/
875 r01 = _mm_mul_pd(rsq01,rinv01);
877 /* Calculate table index by multiplying r with table scale and truncate to integer */
878 rt = _mm_mul_pd(r01,vftabscale);
879 vfitab = _mm_cvttpd_epi32(rt);
880 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
881 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
883 /* CUBIC SPLINE TABLE ELECTROSTATICS */
884 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
885 F = _mm_setzero_pd();
886 GMX_MM_TRANSPOSE2_PD(Y,F);
887 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
888 H = _mm_setzero_pd();
889 GMX_MM_TRANSPOSE2_PD(G,H);
890 Heps = _mm_mul_pd(vfeps,H);
891 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
892 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
893 velec = _mm_mul_pd(qq01,VV);
894 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
895 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
897 /* Update potential sum for this i atom from the interaction with this j atom. */
898 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
899 velecsum = _mm_add_pd(velecsum,velec);
903 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
905 /* Calculate temporary vectorial force */
906 tx = _mm_mul_pd(fscal,dx01);
907 ty = _mm_mul_pd(fscal,dy01);
908 tz = _mm_mul_pd(fscal,dz01);
910 /* Update vectorial force */
911 fix0 = _mm_add_pd(fix0,tx);
912 fiy0 = _mm_add_pd(fiy0,ty);
913 fiz0 = _mm_add_pd(fiz0,tz);
915 fjx1 = _mm_add_pd(fjx1,tx);
916 fjy1 = _mm_add_pd(fjy1,ty);
917 fjz1 = _mm_add_pd(fjz1,tz);
919 /**************************
920 * CALCULATE INTERACTIONS *
921 **************************/
923 r02 = _mm_mul_pd(rsq02,rinv02);
925 /* Calculate table index by multiplying r with table scale and truncate to integer */
926 rt = _mm_mul_pd(r02,vftabscale);
927 vfitab = _mm_cvttpd_epi32(rt);
928 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
929 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
931 /* CUBIC SPLINE TABLE ELECTROSTATICS */
932 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
933 F = _mm_setzero_pd();
934 GMX_MM_TRANSPOSE2_PD(Y,F);
935 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
936 H = _mm_setzero_pd();
937 GMX_MM_TRANSPOSE2_PD(G,H);
938 Heps = _mm_mul_pd(vfeps,H);
939 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
940 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
941 velec = _mm_mul_pd(qq02,VV);
942 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
943 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
945 /* Update potential sum for this i atom from the interaction with this j atom. */
946 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
947 velecsum = _mm_add_pd(velecsum,velec);
951 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
953 /* Calculate temporary vectorial force */
954 tx = _mm_mul_pd(fscal,dx02);
955 ty = _mm_mul_pd(fscal,dy02);
956 tz = _mm_mul_pd(fscal,dz02);
958 /* Update vectorial force */
959 fix0 = _mm_add_pd(fix0,tx);
960 fiy0 = _mm_add_pd(fiy0,ty);
961 fiz0 = _mm_add_pd(fiz0,tz);
963 fjx2 = _mm_add_pd(fjx2,tx);
964 fjy2 = _mm_add_pd(fjy2,ty);
965 fjz2 = _mm_add_pd(fjz2,tz);
967 /**************************
968 * CALCULATE INTERACTIONS *
969 **************************/
971 r10 = _mm_mul_pd(rsq10,rinv10);
973 /* Calculate table index by multiplying r with table scale and truncate to integer */
974 rt = _mm_mul_pd(r10,vftabscale);
975 vfitab = _mm_cvttpd_epi32(rt);
976 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
977 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
979 /* CUBIC SPLINE TABLE ELECTROSTATICS */
980 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
981 F = _mm_setzero_pd();
982 GMX_MM_TRANSPOSE2_PD(Y,F);
983 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
984 H = _mm_setzero_pd();
985 GMX_MM_TRANSPOSE2_PD(G,H);
986 Heps = _mm_mul_pd(vfeps,H);
987 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
988 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
989 velec = _mm_mul_pd(qq10,VV);
990 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
991 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
993 /* Update potential sum for this i atom from the interaction with this j atom. */
994 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
995 velecsum = _mm_add_pd(velecsum,velec);
999 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1001 /* Calculate temporary vectorial force */
1002 tx = _mm_mul_pd(fscal,dx10);
1003 ty = _mm_mul_pd(fscal,dy10);
1004 tz = _mm_mul_pd(fscal,dz10);
1006 /* Update vectorial force */
1007 fix1 = _mm_add_pd(fix1,tx);
1008 fiy1 = _mm_add_pd(fiy1,ty);
1009 fiz1 = _mm_add_pd(fiz1,tz);
1011 fjx0 = _mm_add_pd(fjx0,tx);
1012 fjy0 = _mm_add_pd(fjy0,ty);
1013 fjz0 = _mm_add_pd(fjz0,tz);
1015 /**************************
1016 * CALCULATE INTERACTIONS *
1017 **************************/
1019 r11 = _mm_mul_pd(rsq11,rinv11);
1021 /* Calculate table index by multiplying r with table scale and truncate to integer */
1022 rt = _mm_mul_pd(r11,vftabscale);
1023 vfitab = _mm_cvttpd_epi32(rt);
1024 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1025 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1027 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1028 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1029 F = _mm_setzero_pd();
1030 GMX_MM_TRANSPOSE2_PD(Y,F);
1031 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1032 H = _mm_setzero_pd();
1033 GMX_MM_TRANSPOSE2_PD(G,H);
1034 Heps = _mm_mul_pd(vfeps,H);
1035 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1036 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1037 velec = _mm_mul_pd(qq11,VV);
1038 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1039 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1041 /* Update potential sum for this i atom from the interaction with this j atom. */
1042 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1043 velecsum = _mm_add_pd(velecsum,velec);
1047 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1049 /* Calculate temporary vectorial force */
1050 tx = _mm_mul_pd(fscal,dx11);
1051 ty = _mm_mul_pd(fscal,dy11);
1052 tz = _mm_mul_pd(fscal,dz11);
1054 /* Update vectorial force */
1055 fix1 = _mm_add_pd(fix1,tx);
1056 fiy1 = _mm_add_pd(fiy1,ty);
1057 fiz1 = _mm_add_pd(fiz1,tz);
1059 fjx1 = _mm_add_pd(fjx1,tx);
1060 fjy1 = _mm_add_pd(fjy1,ty);
1061 fjz1 = _mm_add_pd(fjz1,tz);
1063 /**************************
1064 * CALCULATE INTERACTIONS *
1065 **************************/
1067 r12 = _mm_mul_pd(rsq12,rinv12);
1069 /* Calculate table index by multiplying r with table scale and truncate to integer */
1070 rt = _mm_mul_pd(r12,vftabscale);
1071 vfitab = _mm_cvttpd_epi32(rt);
1072 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1073 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1075 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1076 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1077 F = _mm_setzero_pd();
1078 GMX_MM_TRANSPOSE2_PD(Y,F);
1079 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1080 H = _mm_setzero_pd();
1081 GMX_MM_TRANSPOSE2_PD(G,H);
1082 Heps = _mm_mul_pd(vfeps,H);
1083 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1084 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1085 velec = _mm_mul_pd(qq12,VV);
1086 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1087 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1089 /* Update potential sum for this i atom from the interaction with this j atom. */
1090 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1091 velecsum = _mm_add_pd(velecsum,velec);
1095 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1097 /* Calculate temporary vectorial force */
1098 tx = _mm_mul_pd(fscal,dx12);
1099 ty = _mm_mul_pd(fscal,dy12);
1100 tz = _mm_mul_pd(fscal,dz12);
1102 /* Update vectorial force */
1103 fix1 = _mm_add_pd(fix1,tx);
1104 fiy1 = _mm_add_pd(fiy1,ty);
1105 fiz1 = _mm_add_pd(fiz1,tz);
1107 fjx2 = _mm_add_pd(fjx2,tx);
1108 fjy2 = _mm_add_pd(fjy2,ty);
1109 fjz2 = _mm_add_pd(fjz2,tz);
1111 /**************************
1112 * CALCULATE INTERACTIONS *
1113 **************************/
1115 r20 = _mm_mul_pd(rsq20,rinv20);
1117 /* Calculate table index by multiplying r with table scale and truncate to integer */
1118 rt = _mm_mul_pd(r20,vftabscale);
1119 vfitab = _mm_cvttpd_epi32(rt);
1120 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1121 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1123 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1124 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1125 F = _mm_setzero_pd();
1126 GMX_MM_TRANSPOSE2_PD(Y,F);
1127 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1128 H = _mm_setzero_pd();
1129 GMX_MM_TRANSPOSE2_PD(G,H);
1130 Heps = _mm_mul_pd(vfeps,H);
1131 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1132 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1133 velec = _mm_mul_pd(qq20,VV);
1134 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1135 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1137 /* Update potential sum for this i atom from the interaction with this j atom. */
1138 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1139 velecsum = _mm_add_pd(velecsum,velec);
1143 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1145 /* Calculate temporary vectorial force */
1146 tx = _mm_mul_pd(fscal,dx20);
1147 ty = _mm_mul_pd(fscal,dy20);
1148 tz = _mm_mul_pd(fscal,dz20);
1150 /* Update vectorial force */
1151 fix2 = _mm_add_pd(fix2,tx);
1152 fiy2 = _mm_add_pd(fiy2,ty);
1153 fiz2 = _mm_add_pd(fiz2,tz);
1155 fjx0 = _mm_add_pd(fjx0,tx);
1156 fjy0 = _mm_add_pd(fjy0,ty);
1157 fjz0 = _mm_add_pd(fjz0,tz);
1159 /**************************
1160 * CALCULATE INTERACTIONS *
1161 **************************/
1163 r21 = _mm_mul_pd(rsq21,rinv21);
1165 /* Calculate table index by multiplying r with table scale and truncate to integer */
1166 rt = _mm_mul_pd(r21,vftabscale);
1167 vfitab = _mm_cvttpd_epi32(rt);
1168 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1169 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1171 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1172 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1173 F = _mm_setzero_pd();
1174 GMX_MM_TRANSPOSE2_PD(Y,F);
1175 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1176 H = _mm_setzero_pd();
1177 GMX_MM_TRANSPOSE2_PD(G,H);
1178 Heps = _mm_mul_pd(vfeps,H);
1179 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1180 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1181 velec = _mm_mul_pd(qq21,VV);
1182 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1183 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1185 /* Update potential sum for this i atom from the interaction with this j atom. */
1186 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1187 velecsum = _mm_add_pd(velecsum,velec);
1191 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1193 /* Calculate temporary vectorial force */
1194 tx = _mm_mul_pd(fscal,dx21);
1195 ty = _mm_mul_pd(fscal,dy21);
1196 tz = _mm_mul_pd(fscal,dz21);
1198 /* Update vectorial force */
1199 fix2 = _mm_add_pd(fix2,tx);
1200 fiy2 = _mm_add_pd(fiy2,ty);
1201 fiz2 = _mm_add_pd(fiz2,tz);
1203 fjx1 = _mm_add_pd(fjx1,tx);
1204 fjy1 = _mm_add_pd(fjy1,ty);
1205 fjz1 = _mm_add_pd(fjz1,tz);
1207 /**************************
1208 * CALCULATE INTERACTIONS *
1209 **************************/
1211 r22 = _mm_mul_pd(rsq22,rinv22);
1213 /* Calculate table index by multiplying r with table scale and truncate to integer */
1214 rt = _mm_mul_pd(r22,vftabscale);
1215 vfitab = _mm_cvttpd_epi32(rt);
1216 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1217 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1219 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1220 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1221 F = _mm_setzero_pd();
1222 GMX_MM_TRANSPOSE2_PD(Y,F);
1223 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1224 H = _mm_setzero_pd();
1225 GMX_MM_TRANSPOSE2_PD(G,H);
1226 Heps = _mm_mul_pd(vfeps,H);
1227 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1228 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1229 velec = _mm_mul_pd(qq22,VV);
1230 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1231 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1233 /* Update potential sum for this i atom from the interaction with this j atom. */
1234 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1235 velecsum = _mm_add_pd(velecsum,velec);
1239 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1241 /* Calculate temporary vectorial force */
1242 tx = _mm_mul_pd(fscal,dx22);
1243 ty = _mm_mul_pd(fscal,dy22);
1244 tz = _mm_mul_pd(fscal,dz22);
1246 /* Update vectorial force */
1247 fix2 = _mm_add_pd(fix2,tx);
1248 fiy2 = _mm_add_pd(fiy2,ty);
1249 fiz2 = _mm_add_pd(fiz2,tz);
1251 fjx2 = _mm_add_pd(fjx2,tx);
1252 fjy2 = _mm_add_pd(fjy2,ty);
1253 fjz2 = _mm_add_pd(fjz2,tz);
1255 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1257 /* Inner loop uses 417 flops */
1260 /* End of innermost loop */
1262 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1263 f+i_coord_offset,fshift+i_shift_offset);
1266 /* Update potential energies */
1267 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1268 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1270 /* Increment number of inner iterations */
1271 inneriter += j_index_end - j_index_start;
1273 /* Outer loop uses 20 flops */
1276 /* Increment number of outer iterations */
1279 /* Update outer/inner flops */
1281 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*417);
1284 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse4_1_double
1285 * Electrostatics interaction: CubicSplineTable
1286 * VdW interaction: CubicSplineTable
1287 * Geometry: Water3-Water3
1288 * Calculate force/pot: Force
1291 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse4_1_double
1292 (t_nblist * gmx_restrict nlist,
1293 rvec * gmx_restrict xx,
1294 rvec * gmx_restrict ff,
1295 t_forcerec * gmx_restrict fr,
1296 t_mdatoms * gmx_restrict mdatoms,
1297 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1298 t_nrnb * gmx_restrict nrnb)
1300 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1301 * just 0 for non-waters.
1302 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1303 * jnr indices corresponding to data put in the two positions in the SIMD register.
1305 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1306 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1308 int j_coord_offsetA,j_coord_offsetB;
1309 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1310 real rcutoff_scalar;
1311 real *shiftvec,*fshift,*x,*f;
1312 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1314 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1316 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1318 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1319 int vdwjidx0A,vdwjidx0B;
1320 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1321 int vdwjidx1A,vdwjidx1B;
1322 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1323 int vdwjidx2A,vdwjidx2B;
1324 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1325 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1326 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1327 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1328 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1329 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1330 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1331 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1332 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1333 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1334 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1337 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1340 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1341 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1343 __m128i ifour = _mm_set1_epi32(4);
1344 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1346 __m128d dummy_mask,cutoff_mask;
1347 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1348 __m128d one = _mm_set1_pd(1.0);
1349 __m128d two = _mm_set1_pd(2.0);
1355 jindex = nlist->jindex;
1357 shiftidx = nlist->shift;
1359 shiftvec = fr->shift_vec[0];
1360 fshift = fr->fshift[0];
1361 facel = _mm_set1_pd(fr->epsfac);
1362 charge = mdatoms->chargeA;
1363 nvdwtype = fr->ntype;
1364 vdwparam = fr->nbfp;
1365 vdwtype = mdatoms->typeA;
1367 vftab = kernel_data->table_elec_vdw->data;
1368 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1370 /* Setup water-specific parameters */
1371 inr = nlist->iinr[0];
1372 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1373 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1374 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1375 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1377 jq0 = _mm_set1_pd(charge[inr+0]);
1378 jq1 = _mm_set1_pd(charge[inr+1]);
1379 jq2 = _mm_set1_pd(charge[inr+2]);
1380 vdwjidx0A = 2*vdwtype[inr+0];
1381 qq00 = _mm_mul_pd(iq0,jq0);
1382 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1383 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1384 qq01 = _mm_mul_pd(iq0,jq1);
1385 qq02 = _mm_mul_pd(iq0,jq2);
1386 qq10 = _mm_mul_pd(iq1,jq0);
1387 qq11 = _mm_mul_pd(iq1,jq1);
1388 qq12 = _mm_mul_pd(iq1,jq2);
1389 qq20 = _mm_mul_pd(iq2,jq0);
1390 qq21 = _mm_mul_pd(iq2,jq1);
1391 qq22 = _mm_mul_pd(iq2,jq2);
1393 /* Avoid stupid compiler warnings */
1395 j_coord_offsetA = 0;
1396 j_coord_offsetB = 0;
1401 /* Start outer loop over neighborlists */
1402 for(iidx=0; iidx<nri; iidx++)
1404 /* Load shift vector for this list */
1405 i_shift_offset = DIM*shiftidx[iidx];
1407 /* Load limits for loop over neighbors */
1408 j_index_start = jindex[iidx];
1409 j_index_end = jindex[iidx+1];
1411 /* Get outer coordinate index */
1413 i_coord_offset = DIM*inr;
1415 /* Load i particle coords and add shift vector */
1416 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1417 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1419 fix0 = _mm_setzero_pd();
1420 fiy0 = _mm_setzero_pd();
1421 fiz0 = _mm_setzero_pd();
1422 fix1 = _mm_setzero_pd();
1423 fiy1 = _mm_setzero_pd();
1424 fiz1 = _mm_setzero_pd();
1425 fix2 = _mm_setzero_pd();
1426 fiy2 = _mm_setzero_pd();
1427 fiz2 = _mm_setzero_pd();
1429 /* Start inner kernel loop */
1430 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1433 /* Get j neighbor index, and coordinate index */
1435 jnrB = jjnr[jidx+1];
1436 j_coord_offsetA = DIM*jnrA;
1437 j_coord_offsetB = DIM*jnrB;
1439 /* load j atom coordinates */
1440 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1441 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1443 /* Calculate displacement vector */
1444 dx00 = _mm_sub_pd(ix0,jx0);
1445 dy00 = _mm_sub_pd(iy0,jy0);
1446 dz00 = _mm_sub_pd(iz0,jz0);
1447 dx01 = _mm_sub_pd(ix0,jx1);
1448 dy01 = _mm_sub_pd(iy0,jy1);
1449 dz01 = _mm_sub_pd(iz0,jz1);
1450 dx02 = _mm_sub_pd(ix0,jx2);
1451 dy02 = _mm_sub_pd(iy0,jy2);
1452 dz02 = _mm_sub_pd(iz0,jz2);
1453 dx10 = _mm_sub_pd(ix1,jx0);
1454 dy10 = _mm_sub_pd(iy1,jy0);
1455 dz10 = _mm_sub_pd(iz1,jz0);
1456 dx11 = _mm_sub_pd(ix1,jx1);
1457 dy11 = _mm_sub_pd(iy1,jy1);
1458 dz11 = _mm_sub_pd(iz1,jz1);
1459 dx12 = _mm_sub_pd(ix1,jx2);
1460 dy12 = _mm_sub_pd(iy1,jy2);
1461 dz12 = _mm_sub_pd(iz1,jz2);
1462 dx20 = _mm_sub_pd(ix2,jx0);
1463 dy20 = _mm_sub_pd(iy2,jy0);
1464 dz20 = _mm_sub_pd(iz2,jz0);
1465 dx21 = _mm_sub_pd(ix2,jx1);
1466 dy21 = _mm_sub_pd(iy2,jy1);
1467 dz21 = _mm_sub_pd(iz2,jz1);
1468 dx22 = _mm_sub_pd(ix2,jx2);
1469 dy22 = _mm_sub_pd(iy2,jy2);
1470 dz22 = _mm_sub_pd(iz2,jz2);
1472 /* Calculate squared distance and things based on it */
1473 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1474 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1475 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1476 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1477 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1478 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1479 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1480 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1481 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1483 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1484 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1485 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1486 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1487 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1488 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1489 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1490 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1491 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1493 fjx0 = _mm_setzero_pd();
1494 fjy0 = _mm_setzero_pd();
1495 fjz0 = _mm_setzero_pd();
1496 fjx1 = _mm_setzero_pd();
1497 fjy1 = _mm_setzero_pd();
1498 fjz1 = _mm_setzero_pd();
1499 fjx2 = _mm_setzero_pd();
1500 fjy2 = _mm_setzero_pd();
1501 fjz2 = _mm_setzero_pd();
1503 /**************************
1504 * CALCULATE INTERACTIONS *
1505 **************************/
1507 r00 = _mm_mul_pd(rsq00,rinv00);
1509 /* Calculate table index by multiplying r with table scale and truncate to integer */
1510 rt = _mm_mul_pd(r00,vftabscale);
1511 vfitab = _mm_cvttpd_epi32(rt);
1512 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1513 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1515 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1516 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1517 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1518 GMX_MM_TRANSPOSE2_PD(Y,F);
1519 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1520 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1521 GMX_MM_TRANSPOSE2_PD(G,H);
1522 Heps = _mm_mul_pd(vfeps,H);
1523 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1524 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1525 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1527 /* CUBIC SPLINE TABLE DISPERSION */
1528 vfitab = _mm_add_epi32(vfitab,ifour);
1529 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1530 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1531 GMX_MM_TRANSPOSE2_PD(Y,F);
1532 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1533 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1534 GMX_MM_TRANSPOSE2_PD(G,H);
1535 Heps = _mm_mul_pd(vfeps,H);
1536 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1537 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1538 fvdw6 = _mm_mul_pd(c6_00,FF);
1540 /* CUBIC SPLINE TABLE REPULSION */
1541 vfitab = _mm_add_epi32(vfitab,ifour);
1542 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1543 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1544 GMX_MM_TRANSPOSE2_PD(Y,F);
1545 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1546 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1547 GMX_MM_TRANSPOSE2_PD(G,H);
1548 Heps = _mm_mul_pd(vfeps,H);
1549 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1550 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1551 fvdw12 = _mm_mul_pd(c12_00,FF);
1552 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1554 fscal = _mm_add_pd(felec,fvdw);
1556 /* Calculate temporary vectorial force */
1557 tx = _mm_mul_pd(fscal,dx00);
1558 ty = _mm_mul_pd(fscal,dy00);
1559 tz = _mm_mul_pd(fscal,dz00);
1561 /* Update vectorial force */
1562 fix0 = _mm_add_pd(fix0,tx);
1563 fiy0 = _mm_add_pd(fiy0,ty);
1564 fiz0 = _mm_add_pd(fiz0,tz);
1566 fjx0 = _mm_add_pd(fjx0,tx);
1567 fjy0 = _mm_add_pd(fjy0,ty);
1568 fjz0 = _mm_add_pd(fjz0,tz);
1570 /**************************
1571 * CALCULATE INTERACTIONS *
1572 **************************/
1574 r01 = _mm_mul_pd(rsq01,rinv01);
1576 /* Calculate table index by multiplying r with table scale and truncate to integer */
1577 rt = _mm_mul_pd(r01,vftabscale);
1578 vfitab = _mm_cvttpd_epi32(rt);
1579 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1580 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1582 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1583 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1584 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1585 GMX_MM_TRANSPOSE2_PD(Y,F);
1586 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1587 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1588 GMX_MM_TRANSPOSE2_PD(G,H);
1589 Heps = _mm_mul_pd(vfeps,H);
1590 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1591 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1592 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1596 /* Calculate temporary vectorial force */
1597 tx = _mm_mul_pd(fscal,dx01);
1598 ty = _mm_mul_pd(fscal,dy01);
1599 tz = _mm_mul_pd(fscal,dz01);
1601 /* Update vectorial force */
1602 fix0 = _mm_add_pd(fix0,tx);
1603 fiy0 = _mm_add_pd(fiy0,ty);
1604 fiz0 = _mm_add_pd(fiz0,tz);
1606 fjx1 = _mm_add_pd(fjx1,tx);
1607 fjy1 = _mm_add_pd(fjy1,ty);
1608 fjz1 = _mm_add_pd(fjz1,tz);
1610 /**************************
1611 * CALCULATE INTERACTIONS *
1612 **************************/
1614 r02 = _mm_mul_pd(rsq02,rinv02);
1616 /* Calculate table index by multiplying r with table scale and truncate to integer */
1617 rt = _mm_mul_pd(r02,vftabscale);
1618 vfitab = _mm_cvttpd_epi32(rt);
1619 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1620 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1622 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1623 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1624 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1625 GMX_MM_TRANSPOSE2_PD(Y,F);
1626 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1627 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1628 GMX_MM_TRANSPOSE2_PD(G,H);
1629 Heps = _mm_mul_pd(vfeps,H);
1630 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1631 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1632 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1636 /* Calculate temporary vectorial force */
1637 tx = _mm_mul_pd(fscal,dx02);
1638 ty = _mm_mul_pd(fscal,dy02);
1639 tz = _mm_mul_pd(fscal,dz02);
1641 /* Update vectorial force */
1642 fix0 = _mm_add_pd(fix0,tx);
1643 fiy0 = _mm_add_pd(fiy0,ty);
1644 fiz0 = _mm_add_pd(fiz0,tz);
1646 fjx2 = _mm_add_pd(fjx2,tx);
1647 fjy2 = _mm_add_pd(fjy2,ty);
1648 fjz2 = _mm_add_pd(fjz2,tz);
1650 /**************************
1651 * CALCULATE INTERACTIONS *
1652 **************************/
1654 r10 = _mm_mul_pd(rsq10,rinv10);
1656 /* Calculate table index by multiplying r with table scale and truncate to integer */
1657 rt = _mm_mul_pd(r10,vftabscale);
1658 vfitab = _mm_cvttpd_epi32(rt);
1659 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1660 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1662 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1663 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1664 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1665 GMX_MM_TRANSPOSE2_PD(Y,F);
1666 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1667 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1668 GMX_MM_TRANSPOSE2_PD(G,H);
1669 Heps = _mm_mul_pd(vfeps,H);
1670 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1671 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1672 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1676 /* Calculate temporary vectorial force */
1677 tx = _mm_mul_pd(fscal,dx10);
1678 ty = _mm_mul_pd(fscal,dy10);
1679 tz = _mm_mul_pd(fscal,dz10);
1681 /* Update vectorial force */
1682 fix1 = _mm_add_pd(fix1,tx);
1683 fiy1 = _mm_add_pd(fiy1,ty);
1684 fiz1 = _mm_add_pd(fiz1,tz);
1686 fjx0 = _mm_add_pd(fjx0,tx);
1687 fjy0 = _mm_add_pd(fjy0,ty);
1688 fjz0 = _mm_add_pd(fjz0,tz);
1690 /**************************
1691 * CALCULATE INTERACTIONS *
1692 **************************/
1694 r11 = _mm_mul_pd(rsq11,rinv11);
1696 /* Calculate table index by multiplying r with table scale and truncate to integer */
1697 rt = _mm_mul_pd(r11,vftabscale);
1698 vfitab = _mm_cvttpd_epi32(rt);
1699 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1700 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1702 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1703 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1704 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1705 GMX_MM_TRANSPOSE2_PD(Y,F);
1706 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1707 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1708 GMX_MM_TRANSPOSE2_PD(G,H);
1709 Heps = _mm_mul_pd(vfeps,H);
1710 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1711 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1712 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1716 /* Calculate temporary vectorial force */
1717 tx = _mm_mul_pd(fscal,dx11);
1718 ty = _mm_mul_pd(fscal,dy11);
1719 tz = _mm_mul_pd(fscal,dz11);
1721 /* Update vectorial force */
1722 fix1 = _mm_add_pd(fix1,tx);
1723 fiy1 = _mm_add_pd(fiy1,ty);
1724 fiz1 = _mm_add_pd(fiz1,tz);
1726 fjx1 = _mm_add_pd(fjx1,tx);
1727 fjy1 = _mm_add_pd(fjy1,ty);
1728 fjz1 = _mm_add_pd(fjz1,tz);
1730 /**************************
1731 * CALCULATE INTERACTIONS *
1732 **************************/
1734 r12 = _mm_mul_pd(rsq12,rinv12);
1736 /* Calculate table index by multiplying r with table scale and truncate to integer */
1737 rt = _mm_mul_pd(r12,vftabscale);
1738 vfitab = _mm_cvttpd_epi32(rt);
1739 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1740 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1742 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1743 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1744 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1745 GMX_MM_TRANSPOSE2_PD(Y,F);
1746 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1747 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1748 GMX_MM_TRANSPOSE2_PD(G,H);
1749 Heps = _mm_mul_pd(vfeps,H);
1750 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1751 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1752 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1756 /* Calculate temporary vectorial force */
1757 tx = _mm_mul_pd(fscal,dx12);
1758 ty = _mm_mul_pd(fscal,dy12);
1759 tz = _mm_mul_pd(fscal,dz12);
1761 /* Update vectorial force */
1762 fix1 = _mm_add_pd(fix1,tx);
1763 fiy1 = _mm_add_pd(fiy1,ty);
1764 fiz1 = _mm_add_pd(fiz1,tz);
1766 fjx2 = _mm_add_pd(fjx2,tx);
1767 fjy2 = _mm_add_pd(fjy2,ty);
1768 fjz2 = _mm_add_pd(fjz2,tz);
1770 /**************************
1771 * CALCULATE INTERACTIONS *
1772 **************************/
1774 r20 = _mm_mul_pd(rsq20,rinv20);
1776 /* Calculate table index by multiplying r with table scale and truncate to integer */
1777 rt = _mm_mul_pd(r20,vftabscale);
1778 vfitab = _mm_cvttpd_epi32(rt);
1779 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1780 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1782 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1783 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1784 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1785 GMX_MM_TRANSPOSE2_PD(Y,F);
1786 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1787 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1788 GMX_MM_TRANSPOSE2_PD(G,H);
1789 Heps = _mm_mul_pd(vfeps,H);
1790 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1791 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1792 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1796 /* Calculate temporary vectorial force */
1797 tx = _mm_mul_pd(fscal,dx20);
1798 ty = _mm_mul_pd(fscal,dy20);
1799 tz = _mm_mul_pd(fscal,dz20);
1801 /* Update vectorial force */
1802 fix2 = _mm_add_pd(fix2,tx);
1803 fiy2 = _mm_add_pd(fiy2,ty);
1804 fiz2 = _mm_add_pd(fiz2,tz);
1806 fjx0 = _mm_add_pd(fjx0,tx);
1807 fjy0 = _mm_add_pd(fjy0,ty);
1808 fjz0 = _mm_add_pd(fjz0,tz);
1810 /**************************
1811 * CALCULATE INTERACTIONS *
1812 **************************/
1814 r21 = _mm_mul_pd(rsq21,rinv21);
1816 /* Calculate table index by multiplying r with table scale and truncate to integer */
1817 rt = _mm_mul_pd(r21,vftabscale);
1818 vfitab = _mm_cvttpd_epi32(rt);
1819 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1820 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1822 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1823 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1824 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1825 GMX_MM_TRANSPOSE2_PD(Y,F);
1826 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1827 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1828 GMX_MM_TRANSPOSE2_PD(G,H);
1829 Heps = _mm_mul_pd(vfeps,H);
1830 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1831 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1832 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1836 /* Calculate temporary vectorial force */
1837 tx = _mm_mul_pd(fscal,dx21);
1838 ty = _mm_mul_pd(fscal,dy21);
1839 tz = _mm_mul_pd(fscal,dz21);
1841 /* Update vectorial force */
1842 fix2 = _mm_add_pd(fix2,tx);
1843 fiy2 = _mm_add_pd(fiy2,ty);
1844 fiz2 = _mm_add_pd(fiz2,tz);
1846 fjx1 = _mm_add_pd(fjx1,tx);
1847 fjy1 = _mm_add_pd(fjy1,ty);
1848 fjz1 = _mm_add_pd(fjz1,tz);
1850 /**************************
1851 * CALCULATE INTERACTIONS *
1852 **************************/
1854 r22 = _mm_mul_pd(rsq22,rinv22);
1856 /* Calculate table index by multiplying r with table scale and truncate to integer */
1857 rt = _mm_mul_pd(r22,vftabscale);
1858 vfitab = _mm_cvttpd_epi32(rt);
1859 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1860 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1862 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1863 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1864 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1865 GMX_MM_TRANSPOSE2_PD(Y,F);
1866 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1867 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1868 GMX_MM_TRANSPOSE2_PD(G,H);
1869 Heps = _mm_mul_pd(vfeps,H);
1870 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1871 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1872 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1876 /* Calculate temporary vectorial force */
1877 tx = _mm_mul_pd(fscal,dx22);
1878 ty = _mm_mul_pd(fscal,dy22);
1879 tz = _mm_mul_pd(fscal,dz22);
1881 /* Update vectorial force */
1882 fix2 = _mm_add_pd(fix2,tx);
1883 fiy2 = _mm_add_pd(fiy2,ty);
1884 fiz2 = _mm_add_pd(fiz2,tz);
1886 fjx2 = _mm_add_pd(fjx2,tx);
1887 fjy2 = _mm_add_pd(fjy2,ty);
1888 fjz2 = _mm_add_pd(fjz2,tz);
1890 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1892 /* Inner loop uses 373 flops */
1895 if(jidx<j_index_end)
1899 j_coord_offsetA = DIM*jnrA;
1901 /* load j atom coordinates */
1902 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1903 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1905 /* Calculate displacement vector */
1906 dx00 = _mm_sub_pd(ix0,jx0);
1907 dy00 = _mm_sub_pd(iy0,jy0);
1908 dz00 = _mm_sub_pd(iz0,jz0);
1909 dx01 = _mm_sub_pd(ix0,jx1);
1910 dy01 = _mm_sub_pd(iy0,jy1);
1911 dz01 = _mm_sub_pd(iz0,jz1);
1912 dx02 = _mm_sub_pd(ix0,jx2);
1913 dy02 = _mm_sub_pd(iy0,jy2);
1914 dz02 = _mm_sub_pd(iz0,jz2);
1915 dx10 = _mm_sub_pd(ix1,jx0);
1916 dy10 = _mm_sub_pd(iy1,jy0);
1917 dz10 = _mm_sub_pd(iz1,jz0);
1918 dx11 = _mm_sub_pd(ix1,jx1);
1919 dy11 = _mm_sub_pd(iy1,jy1);
1920 dz11 = _mm_sub_pd(iz1,jz1);
1921 dx12 = _mm_sub_pd(ix1,jx2);
1922 dy12 = _mm_sub_pd(iy1,jy2);
1923 dz12 = _mm_sub_pd(iz1,jz2);
1924 dx20 = _mm_sub_pd(ix2,jx0);
1925 dy20 = _mm_sub_pd(iy2,jy0);
1926 dz20 = _mm_sub_pd(iz2,jz0);
1927 dx21 = _mm_sub_pd(ix2,jx1);
1928 dy21 = _mm_sub_pd(iy2,jy1);
1929 dz21 = _mm_sub_pd(iz2,jz1);
1930 dx22 = _mm_sub_pd(ix2,jx2);
1931 dy22 = _mm_sub_pd(iy2,jy2);
1932 dz22 = _mm_sub_pd(iz2,jz2);
1934 /* Calculate squared distance and things based on it */
1935 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1936 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1937 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1938 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1939 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1940 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1941 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1942 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1943 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1945 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1946 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1947 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1948 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1949 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1950 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1951 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1952 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1953 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1955 fjx0 = _mm_setzero_pd();
1956 fjy0 = _mm_setzero_pd();
1957 fjz0 = _mm_setzero_pd();
1958 fjx1 = _mm_setzero_pd();
1959 fjy1 = _mm_setzero_pd();
1960 fjz1 = _mm_setzero_pd();
1961 fjx2 = _mm_setzero_pd();
1962 fjy2 = _mm_setzero_pd();
1963 fjz2 = _mm_setzero_pd();
1965 /**************************
1966 * CALCULATE INTERACTIONS *
1967 **************************/
1969 r00 = _mm_mul_pd(rsq00,rinv00);
1971 /* Calculate table index by multiplying r with table scale and truncate to integer */
1972 rt = _mm_mul_pd(r00,vftabscale);
1973 vfitab = _mm_cvttpd_epi32(rt);
1974 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1975 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1977 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1978 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1979 F = _mm_setzero_pd();
1980 GMX_MM_TRANSPOSE2_PD(Y,F);
1981 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1982 H = _mm_setzero_pd();
1983 GMX_MM_TRANSPOSE2_PD(G,H);
1984 Heps = _mm_mul_pd(vfeps,H);
1985 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1986 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1987 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1989 /* CUBIC SPLINE TABLE DISPERSION */
1990 vfitab = _mm_add_epi32(vfitab,ifour);
1991 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1992 F = _mm_setzero_pd();
1993 GMX_MM_TRANSPOSE2_PD(Y,F);
1994 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1995 H = _mm_setzero_pd();
1996 GMX_MM_TRANSPOSE2_PD(G,H);
1997 Heps = _mm_mul_pd(vfeps,H);
1998 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1999 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2000 fvdw6 = _mm_mul_pd(c6_00,FF);
2002 /* CUBIC SPLINE TABLE REPULSION */
2003 vfitab = _mm_add_epi32(vfitab,ifour);
2004 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2005 F = _mm_setzero_pd();
2006 GMX_MM_TRANSPOSE2_PD(Y,F);
2007 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2008 H = _mm_setzero_pd();
2009 GMX_MM_TRANSPOSE2_PD(G,H);
2010 Heps = _mm_mul_pd(vfeps,H);
2011 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2012 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2013 fvdw12 = _mm_mul_pd(c12_00,FF);
2014 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
2016 fscal = _mm_add_pd(felec,fvdw);
2018 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2020 /* Calculate temporary vectorial force */
2021 tx = _mm_mul_pd(fscal,dx00);
2022 ty = _mm_mul_pd(fscal,dy00);
2023 tz = _mm_mul_pd(fscal,dz00);
2025 /* Update vectorial force */
2026 fix0 = _mm_add_pd(fix0,tx);
2027 fiy0 = _mm_add_pd(fiy0,ty);
2028 fiz0 = _mm_add_pd(fiz0,tz);
2030 fjx0 = _mm_add_pd(fjx0,tx);
2031 fjy0 = _mm_add_pd(fjy0,ty);
2032 fjz0 = _mm_add_pd(fjz0,tz);
2034 /**************************
2035 * CALCULATE INTERACTIONS *
2036 **************************/
2038 r01 = _mm_mul_pd(rsq01,rinv01);
2040 /* Calculate table index by multiplying r with table scale and truncate to integer */
2041 rt = _mm_mul_pd(r01,vftabscale);
2042 vfitab = _mm_cvttpd_epi32(rt);
2043 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2044 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2046 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2047 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2048 F = _mm_setzero_pd();
2049 GMX_MM_TRANSPOSE2_PD(Y,F);
2050 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2051 H = _mm_setzero_pd();
2052 GMX_MM_TRANSPOSE2_PD(G,H);
2053 Heps = _mm_mul_pd(vfeps,H);
2054 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2055 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2056 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
2060 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2062 /* Calculate temporary vectorial force */
2063 tx = _mm_mul_pd(fscal,dx01);
2064 ty = _mm_mul_pd(fscal,dy01);
2065 tz = _mm_mul_pd(fscal,dz01);
2067 /* Update vectorial force */
2068 fix0 = _mm_add_pd(fix0,tx);
2069 fiy0 = _mm_add_pd(fiy0,ty);
2070 fiz0 = _mm_add_pd(fiz0,tz);
2072 fjx1 = _mm_add_pd(fjx1,tx);
2073 fjy1 = _mm_add_pd(fjy1,ty);
2074 fjz1 = _mm_add_pd(fjz1,tz);
2076 /**************************
2077 * CALCULATE INTERACTIONS *
2078 **************************/
2080 r02 = _mm_mul_pd(rsq02,rinv02);
2082 /* Calculate table index by multiplying r with table scale and truncate to integer */
2083 rt = _mm_mul_pd(r02,vftabscale);
2084 vfitab = _mm_cvttpd_epi32(rt);
2085 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2086 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2088 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2089 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2090 F = _mm_setzero_pd();
2091 GMX_MM_TRANSPOSE2_PD(Y,F);
2092 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2093 H = _mm_setzero_pd();
2094 GMX_MM_TRANSPOSE2_PD(G,H);
2095 Heps = _mm_mul_pd(vfeps,H);
2096 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2097 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2098 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2102 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2104 /* Calculate temporary vectorial force */
2105 tx = _mm_mul_pd(fscal,dx02);
2106 ty = _mm_mul_pd(fscal,dy02);
2107 tz = _mm_mul_pd(fscal,dz02);
2109 /* Update vectorial force */
2110 fix0 = _mm_add_pd(fix0,tx);
2111 fiy0 = _mm_add_pd(fiy0,ty);
2112 fiz0 = _mm_add_pd(fiz0,tz);
2114 fjx2 = _mm_add_pd(fjx2,tx);
2115 fjy2 = _mm_add_pd(fjy2,ty);
2116 fjz2 = _mm_add_pd(fjz2,tz);
2118 /**************************
2119 * CALCULATE INTERACTIONS *
2120 **************************/
2122 r10 = _mm_mul_pd(rsq10,rinv10);
2124 /* Calculate table index by multiplying r with table scale and truncate to integer */
2125 rt = _mm_mul_pd(r10,vftabscale);
2126 vfitab = _mm_cvttpd_epi32(rt);
2127 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2128 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2130 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2131 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2132 F = _mm_setzero_pd();
2133 GMX_MM_TRANSPOSE2_PD(Y,F);
2134 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2135 H = _mm_setzero_pd();
2136 GMX_MM_TRANSPOSE2_PD(G,H);
2137 Heps = _mm_mul_pd(vfeps,H);
2138 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2139 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2140 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2144 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2146 /* Calculate temporary vectorial force */
2147 tx = _mm_mul_pd(fscal,dx10);
2148 ty = _mm_mul_pd(fscal,dy10);
2149 tz = _mm_mul_pd(fscal,dz10);
2151 /* Update vectorial force */
2152 fix1 = _mm_add_pd(fix1,tx);
2153 fiy1 = _mm_add_pd(fiy1,ty);
2154 fiz1 = _mm_add_pd(fiz1,tz);
2156 fjx0 = _mm_add_pd(fjx0,tx);
2157 fjy0 = _mm_add_pd(fjy0,ty);
2158 fjz0 = _mm_add_pd(fjz0,tz);
2160 /**************************
2161 * CALCULATE INTERACTIONS *
2162 **************************/
2164 r11 = _mm_mul_pd(rsq11,rinv11);
2166 /* Calculate table index by multiplying r with table scale and truncate to integer */
2167 rt = _mm_mul_pd(r11,vftabscale);
2168 vfitab = _mm_cvttpd_epi32(rt);
2169 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2170 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2172 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2173 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2174 F = _mm_setzero_pd();
2175 GMX_MM_TRANSPOSE2_PD(Y,F);
2176 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2177 H = _mm_setzero_pd();
2178 GMX_MM_TRANSPOSE2_PD(G,H);
2179 Heps = _mm_mul_pd(vfeps,H);
2180 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2181 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2182 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2186 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2188 /* Calculate temporary vectorial force */
2189 tx = _mm_mul_pd(fscal,dx11);
2190 ty = _mm_mul_pd(fscal,dy11);
2191 tz = _mm_mul_pd(fscal,dz11);
2193 /* Update vectorial force */
2194 fix1 = _mm_add_pd(fix1,tx);
2195 fiy1 = _mm_add_pd(fiy1,ty);
2196 fiz1 = _mm_add_pd(fiz1,tz);
2198 fjx1 = _mm_add_pd(fjx1,tx);
2199 fjy1 = _mm_add_pd(fjy1,ty);
2200 fjz1 = _mm_add_pd(fjz1,tz);
2202 /**************************
2203 * CALCULATE INTERACTIONS *
2204 **************************/
2206 r12 = _mm_mul_pd(rsq12,rinv12);
2208 /* Calculate table index by multiplying r with table scale and truncate to integer */
2209 rt = _mm_mul_pd(r12,vftabscale);
2210 vfitab = _mm_cvttpd_epi32(rt);
2211 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2212 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2214 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2215 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2216 F = _mm_setzero_pd();
2217 GMX_MM_TRANSPOSE2_PD(Y,F);
2218 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2219 H = _mm_setzero_pd();
2220 GMX_MM_TRANSPOSE2_PD(G,H);
2221 Heps = _mm_mul_pd(vfeps,H);
2222 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2223 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2224 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2228 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2230 /* Calculate temporary vectorial force */
2231 tx = _mm_mul_pd(fscal,dx12);
2232 ty = _mm_mul_pd(fscal,dy12);
2233 tz = _mm_mul_pd(fscal,dz12);
2235 /* Update vectorial force */
2236 fix1 = _mm_add_pd(fix1,tx);
2237 fiy1 = _mm_add_pd(fiy1,ty);
2238 fiz1 = _mm_add_pd(fiz1,tz);
2240 fjx2 = _mm_add_pd(fjx2,tx);
2241 fjy2 = _mm_add_pd(fjy2,ty);
2242 fjz2 = _mm_add_pd(fjz2,tz);
2244 /**************************
2245 * CALCULATE INTERACTIONS *
2246 **************************/
2248 r20 = _mm_mul_pd(rsq20,rinv20);
2250 /* Calculate table index by multiplying r with table scale and truncate to integer */
2251 rt = _mm_mul_pd(r20,vftabscale);
2252 vfitab = _mm_cvttpd_epi32(rt);
2253 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2254 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2256 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2257 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2258 F = _mm_setzero_pd();
2259 GMX_MM_TRANSPOSE2_PD(Y,F);
2260 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2261 H = _mm_setzero_pd();
2262 GMX_MM_TRANSPOSE2_PD(G,H);
2263 Heps = _mm_mul_pd(vfeps,H);
2264 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2265 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2266 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2270 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2272 /* Calculate temporary vectorial force */
2273 tx = _mm_mul_pd(fscal,dx20);
2274 ty = _mm_mul_pd(fscal,dy20);
2275 tz = _mm_mul_pd(fscal,dz20);
2277 /* Update vectorial force */
2278 fix2 = _mm_add_pd(fix2,tx);
2279 fiy2 = _mm_add_pd(fiy2,ty);
2280 fiz2 = _mm_add_pd(fiz2,tz);
2282 fjx0 = _mm_add_pd(fjx0,tx);
2283 fjy0 = _mm_add_pd(fjy0,ty);
2284 fjz0 = _mm_add_pd(fjz0,tz);
2286 /**************************
2287 * CALCULATE INTERACTIONS *
2288 **************************/
2290 r21 = _mm_mul_pd(rsq21,rinv21);
2292 /* Calculate table index by multiplying r with table scale and truncate to integer */
2293 rt = _mm_mul_pd(r21,vftabscale);
2294 vfitab = _mm_cvttpd_epi32(rt);
2295 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2296 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2298 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2299 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2300 F = _mm_setzero_pd();
2301 GMX_MM_TRANSPOSE2_PD(Y,F);
2302 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2303 H = _mm_setzero_pd();
2304 GMX_MM_TRANSPOSE2_PD(G,H);
2305 Heps = _mm_mul_pd(vfeps,H);
2306 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2307 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2308 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2312 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2314 /* Calculate temporary vectorial force */
2315 tx = _mm_mul_pd(fscal,dx21);
2316 ty = _mm_mul_pd(fscal,dy21);
2317 tz = _mm_mul_pd(fscal,dz21);
2319 /* Update vectorial force */
2320 fix2 = _mm_add_pd(fix2,tx);
2321 fiy2 = _mm_add_pd(fiy2,ty);
2322 fiz2 = _mm_add_pd(fiz2,tz);
2324 fjx1 = _mm_add_pd(fjx1,tx);
2325 fjy1 = _mm_add_pd(fjy1,ty);
2326 fjz1 = _mm_add_pd(fjz1,tz);
2328 /**************************
2329 * CALCULATE INTERACTIONS *
2330 **************************/
2332 r22 = _mm_mul_pd(rsq22,rinv22);
2334 /* Calculate table index by multiplying r with table scale and truncate to integer */
2335 rt = _mm_mul_pd(r22,vftabscale);
2336 vfitab = _mm_cvttpd_epi32(rt);
2337 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2338 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2340 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2341 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2342 F = _mm_setzero_pd();
2343 GMX_MM_TRANSPOSE2_PD(Y,F);
2344 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2345 H = _mm_setzero_pd();
2346 GMX_MM_TRANSPOSE2_PD(G,H);
2347 Heps = _mm_mul_pd(vfeps,H);
2348 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2349 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2350 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2354 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2356 /* Calculate temporary vectorial force */
2357 tx = _mm_mul_pd(fscal,dx22);
2358 ty = _mm_mul_pd(fscal,dy22);
2359 tz = _mm_mul_pd(fscal,dz22);
2361 /* Update vectorial force */
2362 fix2 = _mm_add_pd(fix2,tx);
2363 fiy2 = _mm_add_pd(fiy2,ty);
2364 fiz2 = _mm_add_pd(fiz2,tz);
2366 fjx2 = _mm_add_pd(fjx2,tx);
2367 fjy2 = _mm_add_pd(fjy2,ty);
2368 fjz2 = _mm_add_pd(fjz2,tz);
2370 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2372 /* Inner loop uses 373 flops */
2375 /* End of innermost loop */
2377 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2378 f+i_coord_offset,fshift+i_shift_offset);
2380 /* Increment number of inner iterations */
2381 inneriter += j_index_end - j_index_start;
2383 /* Outer loop uses 18 flops */
2386 /* Increment number of outer iterations */
2389 /* Update outer/inner flops */
2391 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*373);