2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87 int vdwjidx0A,vdwjidx0B;
88 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89 int vdwjidx1A,vdwjidx1B;
90 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91 int vdwjidx2A,vdwjidx2B;
92 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93 int vdwjidx3A,vdwjidx3B;
94 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
99 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
102 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
103 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
104 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
105 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
108 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
112 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
114 __m128i ifour = _mm_set1_epi32(4);
115 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
117 __m128d dummy_mask,cutoff_mask;
118 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
119 __m128d one = _mm_set1_pd(1.0);
120 __m128d two = _mm_set1_pd(2.0);
126 jindex = nlist->jindex;
128 shiftidx = nlist->shift;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm_set1_pd(fr->ic->epsfac);
133 charge = mdatoms->chargeA;
134 nvdwtype = fr->ntype;
136 vdwtype = mdatoms->typeA;
138 vftab = kernel_data->table_elec->data;
139 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
144 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
145 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
146 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
148 jq1 = _mm_set1_pd(charge[inr+1]);
149 jq2 = _mm_set1_pd(charge[inr+2]);
150 jq3 = _mm_set1_pd(charge[inr+3]);
151 vdwjidx0A = 2*vdwtype[inr+0];
152 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq11 = _mm_mul_pd(iq1,jq1);
155 qq12 = _mm_mul_pd(iq1,jq2);
156 qq13 = _mm_mul_pd(iq1,jq3);
157 qq21 = _mm_mul_pd(iq2,jq1);
158 qq22 = _mm_mul_pd(iq2,jq2);
159 qq23 = _mm_mul_pd(iq2,jq3);
160 qq31 = _mm_mul_pd(iq3,jq1);
161 qq32 = _mm_mul_pd(iq3,jq2);
162 qq33 = _mm_mul_pd(iq3,jq3);
164 /* Avoid stupid compiler warnings */
172 /* Start outer loop over neighborlists */
173 for(iidx=0; iidx<nri; iidx++)
175 /* Load shift vector for this list */
176 i_shift_offset = DIM*shiftidx[iidx];
178 /* Load limits for loop over neighbors */
179 j_index_start = jindex[iidx];
180 j_index_end = jindex[iidx+1];
182 /* Get outer coordinate index */
184 i_coord_offset = DIM*inr;
186 /* Load i particle coords and add shift vector */
187 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
188 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
190 fix0 = _mm_setzero_pd();
191 fiy0 = _mm_setzero_pd();
192 fiz0 = _mm_setzero_pd();
193 fix1 = _mm_setzero_pd();
194 fiy1 = _mm_setzero_pd();
195 fiz1 = _mm_setzero_pd();
196 fix2 = _mm_setzero_pd();
197 fiy2 = _mm_setzero_pd();
198 fiz2 = _mm_setzero_pd();
199 fix3 = _mm_setzero_pd();
200 fiy3 = _mm_setzero_pd();
201 fiz3 = _mm_setzero_pd();
203 /* Reset potential sums */
204 velecsum = _mm_setzero_pd();
205 vvdwsum = _mm_setzero_pd();
207 /* Start inner kernel loop */
208 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
211 /* Get j neighbor index, and coordinate index */
214 j_coord_offsetA = DIM*jnrA;
215 j_coord_offsetB = DIM*jnrB;
217 /* load j atom coordinates */
218 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
219 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
220 &jy2,&jz2,&jx3,&jy3,&jz3);
222 /* Calculate displacement vector */
223 dx00 = _mm_sub_pd(ix0,jx0);
224 dy00 = _mm_sub_pd(iy0,jy0);
225 dz00 = _mm_sub_pd(iz0,jz0);
226 dx11 = _mm_sub_pd(ix1,jx1);
227 dy11 = _mm_sub_pd(iy1,jy1);
228 dz11 = _mm_sub_pd(iz1,jz1);
229 dx12 = _mm_sub_pd(ix1,jx2);
230 dy12 = _mm_sub_pd(iy1,jy2);
231 dz12 = _mm_sub_pd(iz1,jz2);
232 dx13 = _mm_sub_pd(ix1,jx3);
233 dy13 = _mm_sub_pd(iy1,jy3);
234 dz13 = _mm_sub_pd(iz1,jz3);
235 dx21 = _mm_sub_pd(ix2,jx1);
236 dy21 = _mm_sub_pd(iy2,jy1);
237 dz21 = _mm_sub_pd(iz2,jz1);
238 dx22 = _mm_sub_pd(ix2,jx2);
239 dy22 = _mm_sub_pd(iy2,jy2);
240 dz22 = _mm_sub_pd(iz2,jz2);
241 dx23 = _mm_sub_pd(ix2,jx3);
242 dy23 = _mm_sub_pd(iy2,jy3);
243 dz23 = _mm_sub_pd(iz2,jz3);
244 dx31 = _mm_sub_pd(ix3,jx1);
245 dy31 = _mm_sub_pd(iy3,jy1);
246 dz31 = _mm_sub_pd(iz3,jz1);
247 dx32 = _mm_sub_pd(ix3,jx2);
248 dy32 = _mm_sub_pd(iy3,jy2);
249 dz32 = _mm_sub_pd(iz3,jz2);
250 dx33 = _mm_sub_pd(ix3,jx3);
251 dy33 = _mm_sub_pd(iy3,jy3);
252 dz33 = _mm_sub_pd(iz3,jz3);
254 /* Calculate squared distance and things based on it */
255 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
256 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
257 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
258 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
259 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
260 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
261 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
262 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
263 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
264 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
266 rinv11 = sse2_invsqrt_d(rsq11);
267 rinv12 = sse2_invsqrt_d(rsq12);
268 rinv13 = sse2_invsqrt_d(rsq13);
269 rinv21 = sse2_invsqrt_d(rsq21);
270 rinv22 = sse2_invsqrt_d(rsq22);
271 rinv23 = sse2_invsqrt_d(rsq23);
272 rinv31 = sse2_invsqrt_d(rsq31);
273 rinv32 = sse2_invsqrt_d(rsq32);
274 rinv33 = sse2_invsqrt_d(rsq33);
276 rinvsq00 = sse2_inv_d(rsq00);
278 fjx0 = _mm_setzero_pd();
279 fjy0 = _mm_setzero_pd();
280 fjz0 = _mm_setzero_pd();
281 fjx1 = _mm_setzero_pd();
282 fjy1 = _mm_setzero_pd();
283 fjz1 = _mm_setzero_pd();
284 fjx2 = _mm_setzero_pd();
285 fjy2 = _mm_setzero_pd();
286 fjz2 = _mm_setzero_pd();
287 fjx3 = _mm_setzero_pd();
288 fjy3 = _mm_setzero_pd();
289 fjz3 = _mm_setzero_pd();
291 /**************************
292 * CALCULATE INTERACTIONS *
293 **************************/
295 /* LENNARD-JONES DISPERSION/REPULSION */
297 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
298 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
299 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
300 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
301 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
303 /* Update potential sum for this i atom from the interaction with this j atom. */
304 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
308 /* Calculate temporary vectorial force */
309 tx = _mm_mul_pd(fscal,dx00);
310 ty = _mm_mul_pd(fscal,dy00);
311 tz = _mm_mul_pd(fscal,dz00);
313 /* Update vectorial force */
314 fix0 = _mm_add_pd(fix0,tx);
315 fiy0 = _mm_add_pd(fiy0,ty);
316 fiz0 = _mm_add_pd(fiz0,tz);
318 fjx0 = _mm_add_pd(fjx0,tx);
319 fjy0 = _mm_add_pd(fjy0,ty);
320 fjz0 = _mm_add_pd(fjz0,tz);
322 /**************************
323 * CALCULATE INTERACTIONS *
324 **************************/
326 r11 = _mm_mul_pd(rsq11,rinv11);
328 /* Calculate table index by multiplying r with table scale and truncate to integer */
329 rt = _mm_mul_pd(r11,vftabscale);
330 vfitab = _mm_cvttpd_epi32(rt);
331 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
332 vfitab = _mm_slli_epi32(vfitab,2);
334 /* CUBIC SPLINE TABLE ELECTROSTATICS */
335 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
336 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
337 GMX_MM_TRANSPOSE2_PD(Y,F);
338 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
339 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
340 GMX_MM_TRANSPOSE2_PD(G,H);
341 Heps = _mm_mul_pd(vfeps,H);
342 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
343 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
344 velec = _mm_mul_pd(qq11,VV);
345 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
346 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
348 /* Update potential sum for this i atom from the interaction with this j atom. */
349 velecsum = _mm_add_pd(velecsum,velec);
353 /* Calculate temporary vectorial force */
354 tx = _mm_mul_pd(fscal,dx11);
355 ty = _mm_mul_pd(fscal,dy11);
356 tz = _mm_mul_pd(fscal,dz11);
358 /* Update vectorial force */
359 fix1 = _mm_add_pd(fix1,tx);
360 fiy1 = _mm_add_pd(fiy1,ty);
361 fiz1 = _mm_add_pd(fiz1,tz);
363 fjx1 = _mm_add_pd(fjx1,tx);
364 fjy1 = _mm_add_pd(fjy1,ty);
365 fjz1 = _mm_add_pd(fjz1,tz);
367 /**************************
368 * CALCULATE INTERACTIONS *
369 **************************/
371 r12 = _mm_mul_pd(rsq12,rinv12);
373 /* Calculate table index by multiplying r with table scale and truncate to integer */
374 rt = _mm_mul_pd(r12,vftabscale);
375 vfitab = _mm_cvttpd_epi32(rt);
376 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
377 vfitab = _mm_slli_epi32(vfitab,2);
379 /* CUBIC SPLINE TABLE ELECTROSTATICS */
380 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
381 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
382 GMX_MM_TRANSPOSE2_PD(Y,F);
383 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
384 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
385 GMX_MM_TRANSPOSE2_PD(G,H);
386 Heps = _mm_mul_pd(vfeps,H);
387 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
388 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
389 velec = _mm_mul_pd(qq12,VV);
390 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
391 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velecsum = _mm_add_pd(velecsum,velec);
398 /* Calculate temporary vectorial force */
399 tx = _mm_mul_pd(fscal,dx12);
400 ty = _mm_mul_pd(fscal,dy12);
401 tz = _mm_mul_pd(fscal,dz12);
403 /* Update vectorial force */
404 fix1 = _mm_add_pd(fix1,tx);
405 fiy1 = _mm_add_pd(fiy1,ty);
406 fiz1 = _mm_add_pd(fiz1,tz);
408 fjx2 = _mm_add_pd(fjx2,tx);
409 fjy2 = _mm_add_pd(fjy2,ty);
410 fjz2 = _mm_add_pd(fjz2,tz);
412 /**************************
413 * CALCULATE INTERACTIONS *
414 **************************/
416 r13 = _mm_mul_pd(rsq13,rinv13);
418 /* Calculate table index by multiplying r with table scale and truncate to integer */
419 rt = _mm_mul_pd(r13,vftabscale);
420 vfitab = _mm_cvttpd_epi32(rt);
421 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
422 vfitab = _mm_slli_epi32(vfitab,2);
424 /* CUBIC SPLINE TABLE ELECTROSTATICS */
425 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
426 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
427 GMX_MM_TRANSPOSE2_PD(Y,F);
428 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
429 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
430 GMX_MM_TRANSPOSE2_PD(G,H);
431 Heps = _mm_mul_pd(vfeps,H);
432 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
433 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
434 velec = _mm_mul_pd(qq13,VV);
435 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
436 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
438 /* Update potential sum for this i atom from the interaction with this j atom. */
439 velecsum = _mm_add_pd(velecsum,velec);
443 /* Calculate temporary vectorial force */
444 tx = _mm_mul_pd(fscal,dx13);
445 ty = _mm_mul_pd(fscal,dy13);
446 tz = _mm_mul_pd(fscal,dz13);
448 /* Update vectorial force */
449 fix1 = _mm_add_pd(fix1,tx);
450 fiy1 = _mm_add_pd(fiy1,ty);
451 fiz1 = _mm_add_pd(fiz1,tz);
453 fjx3 = _mm_add_pd(fjx3,tx);
454 fjy3 = _mm_add_pd(fjy3,ty);
455 fjz3 = _mm_add_pd(fjz3,tz);
457 /**************************
458 * CALCULATE INTERACTIONS *
459 **************************/
461 r21 = _mm_mul_pd(rsq21,rinv21);
463 /* Calculate table index by multiplying r with table scale and truncate to integer */
464 rt = _mm_mul_pd(r21,vftabscale);
465 vfitab = _mm_cvttpd_epi32(rt);
466 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
467 vfitab = _mm_slli_epi32(vfitab,2);
469 /* CUBIC SPLINE TABLE ELECTROSTATICS */
470 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
471 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
472 GMX_MM_TRANSPOSE2_PD(Y,F);
473 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
474 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
475 GMX_MM_TRANSPOSE2_PD(G,H);
476 Heps = _mm_mul_pd(vfeps,H);
477 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
478 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
479 velec = _mm_mul_pd(qq21,VV);
480 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
481 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velecsum = _mm_add_pd(velecsum,velec);
488 /* Calculate temporary vectorial force */
489 tx = _mm_mul_pd(fscal,dx21);
490 ty = _mm_mul_pd(fscal,dy21);
491 tz = _mm_mul_pd(fscal,dz21);
493 /* Update vectorial force */
494 fix2 = _mm_add_pd(fix2,tx);
495 fiy2 = _mm_add_pd(fiy2,ty);
496 fiz2 = _mm_add_pd(fiz2,tz);
498 fjx1 = _mm_add_pd(fjx1,tx);
499 fjy1 = _mm_add_pd(fjy1,ty);
500 fjz1 = _mm_add_pd(fjz1,tz);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 r22 = _mm_mul_pd(rsq22,rinv22);
508 /* Calculate table index by multiplying r with table scale and truncate to integer */
509 rt = _mm_mul_pd(r22,vftabscale);
510 vfitab = _mm_cvttpd_epi32(rt);
511 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
512 vfitab = _mm_slli_epi32(vfitab,2);
514 /* CUBIC SPLINE TABLE ELECTROSTATICS */
515 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
516 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
517 GMX_MM_TRANSPOSE2_PD(Y,F);
518 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
519 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
520 GMX_MM_TRANSPOSE2_PD(G,H);
521 Heps = _mm_mul_pd(vfeps,H);
522 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
523 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
524 velec = _mm_mul_pd(qq22,VV);
525 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
526 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
528 /* Update potential sum for this i atom from the interaction with this j atom. */
529 velecsum = _mm_add_pd(velecsum,velec);
533 /* Calculate temporary vectorial force */
534 tx = _mm_mul_pd(fscal,dx22);
535 ty = _mm_mul_pd(fscal,dy22);
536 tz = _mm_mul_pd(fscal,dz22);
538 /* Update vectorial force */
539 fix2 = _mm_add_pd(fix2,tx);
540 fiy2 = _mm_add_pd(fiy2,ty);
541 fiz2 = _mm_add_pd(fiz2,tz);
543 fjx2 = _mm_add_pd(fjx2,tx);
544 fjy2 = _mm_add_pd(fjy2,ty);
545 fjz2 = _mm_add_pd(fjz2,tz);
547 /**************************
548 * CALCULATE INTERACTIONS *
549 **************************/
551 r23 = _mm_mul_pd(rsq23,rinv23);
553 /* Calculate table index by multiplying r with table scale and truncate to integer */
554 rt = _mm_mul_pd(r23,vftabscale);
555 vfitab = _mm_cvttpd_epi32(rt);
556 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
557 vfitab = _mm_slli_epi32(vfitab,2);
559 /* CUBIC SPLINE TABLE ELECTROSTATICS */
560 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
561 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
562 GMX_MM_TRANSPOSE2_PD(Y,F);
563 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
564 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
565 GMX_MM_TRANSPOSE2_PD(G,H);
566 Heps = _mm_mul_pd(vfeps,H);
567 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
568 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
569 velec = _mm_mul_pd(qq23,VV);
570 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
571 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
573 /* Update potential sum for this i atom from the interaction with this j atom. */
574 velecsum = _mm_add_pd(velecsum,velec);
578 /* Calculate temporary vectorial force */
579 tx = _mm_mul_pd(fscal,dx23);
580 ty = _mm_mul_pd(fscal,dy23);
581 tz = _mm_mul_pd(fscal,dz23);
583 /* Update vectorial force */
584 fix2 = _mm_add_pd(fix2,tx);
585 fiy2 = _mm_add_pd(fiy2,ty);
586 fiz2 = _mm_add_pd(fiz2,tz);
588 fjx3 = _mm_add_pd(fjx3,tx);
589 fjy3 = _mm_add_pd(fjy3,ty);
590 fjz3 = _mm_add_pd(fjz3,tz);
592 /**************************
593 * CALCULATE INTERACTIONS *
594 **************************/
596 r31 = _mm_mul_pd(rsq31,rinv31);
598 /* Calculate table index by multiplying r with table scale and truncate to integer */
599 rt = _mm_mul_pd(r31,vftabscale);
600 vfitab = _mm_cvttpd_epi32(rt);
601 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
602 vfitab = _mm_slli_epi32(vfitab,2);
604 /* CUBIC SPLINE TABLE ELECTROSTATICS */
605 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
606 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
607 GMX_MM_TRANSPOSE2_PD(Y,F);
608 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
609 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
610 GMX_MM_TRANSPOSE2_PD(G,H);
611 Heps = _mm_mul_pd(vfeps,H);
612 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
613 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
614 velec = _mm_mul_pd(qq31,VV);
615 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
616 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
618 /* Update potential sum for this i atom from the interaction with this j atom. */
619 velecsum = _mm_add_pd(velecsum,velec);
623 /* Calculate temporary vectorial force */
624 tx = _mm_mul_pd(fscal,dx31);
625 ty = _mm_mul_pd(fscal,dy31);
626 tz = _mm_mul_pd(fscal,dz31);
628 /* Update vectorial force */
629 fix3 = _mm_add_pd(fix3,tx);
630 fiy3 = _mm_add_pd(fiy3,ty);
631 fiz3 = _mm_add_pd(fiz3,tz);
633 fjx1 = _mm_add_pd(fjx1,tx);
634 fjy1 = _mm_add_pd(fjy1,ty);
635 fjz1 = _mm_add_pd(fjz1,tz);
637 /**************************
638 * CALCULATE INTERACTIONS *
639 **************************/
641 r32 = _mm_mul_pd(rsq32,rinv32);
643 /* Calculate table index by multiplying r with table scale and truncate to integer */
644 rt = _mm_mul_pd(r32,vftabscale);
645 vfitab = _mm_cvttpd_epi32(rt);
646 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
647 vfitab = _mm_slli_epi32(vfitab,2);
649 /* CUBIC SPLINE TABLE ELECTROSTATICS */
650 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
651 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
652 GMX_MM_TRANSPOSE2_PD(Y,F);
653 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
654 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
655 GMX_MM_TRANSPOSE2_PD(G,H);
656 Heps = _mm_mul_pd(vfeps,H);
657 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
658 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
659 velec = _mm_mul_pd(qq32,VV);
660 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
661 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
663 /* Update potential sum for this i atom from the interaction with this j atom. */
664 velecsum = _mm_add_pd(velecsum,velec);
668 /* Calculate temporary vectorial force */
669 tx = _mm_mul_pd(fscal,dx32);
670 ty = _mm_mul_pd(fscal,dy32);
671 tz = _mm_mul_pd(fscal,dz32);
673 /* Update vectorial force */
674 fix3 = _mm_add_pd(fix3,tx);
675 fiy3 = _mm_add_pd(fiy3,ty);
676 fiz3 = _mm_add_pd(fiz3,tz);
678 fjx2 = _mm_add_pd(fjx2,tx);
679 fjy2 = _mm_add_pd(fjy2,ty);
680 fjz2 = _mm_add_pd(fjz2,tz);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 r33 = _mm_mul_pd(rsq33,rinv33);
688 /* Calculate table index by multiplying r with table scale and truncate to integer */
689 rt = _mm_mul_pd(r33,vftabscale);
690 vfitab = _mm_cvttpd_epi32(rt);
691 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
692 vfitab = _mm_slli_epi32(vfitab,2);
694 /* CUBIC SPLINE TABLE ELECTROSTATICS */
695 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
696 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
697 GMX_MM_TRANSPOSE2_PD(Y,F);
698 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
699 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
700 GMX_MM_TRANSPOSE2_PD(G,H);
701 Heps = _mm_mul_pd(vfeps,H);
702 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
703 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
704 velec = _mm_mul_pd(qq33,VV);
705 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
706 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
708 /* Update potential sum for this i atom from the interaction with this j atom. */
709 velecsum = _mm_add_pd(velecsum,velec);
713 /* Calculate temporary vectorial force */
714 tx = _mm_mul_pd(fscal,dx33);
715 ty = _mm_mul_pd(fscal,dy33);
716 tz = _mm_mul_pd(fscal,dz33);
718 /* Update vectorial force */
719 fix3 = _mm_add_pd(fix3,tx);
720 fiy3 = _mm_add_pd(fiy3,ty);
721 fiz3 = _mm_add_pd(fiz3,tz);
723 fjx3 = _mm_add_pd(fjx3,tx);
724 fjy3 = _mm_add_pd(fjy3,ty);
725 fjz3 = _mm_add_pd(fjz3,tz);
727 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
729 /* Inner loop uses 422 flops */
736 j_coord_offsetA = DIM*jnrA;
738 /* load j atom coordinates */
739 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
740 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
741 &jy2,&jz2,&jx3,&jy3,&jz3);
743 /* Calculate displacement vector */
744 dx00 = _mm_sub_pd(ix0,jx0);
745 dy00 = _mm_sub_pd(iy0,jy0);
746 dz00 = _mm_sub_pd(iz0,jz0);
747 dx11 = _mm_sub_pd(ix1,jx1);
748 dy11 = _mm_sub_pd(iy1,jy1);
749 dz11 = _mm_sub_pd(iz1,jz1);
750 dx12 = _mm_sub_pd(ix1,jx2);
751 dy12 = _mm_sub_pd(iy1,jy2);
752 dz12 = _mm_sub_pd(iz1,jz2);
753 dx13 = _mm_sub_pd(ix1,jx3);
754 dy13 = _mm_sub_pd(iy1,jy3);
755 dz13 = _mm_sub_pd(iz1,jz3);
756 dx21 = _mm_sub_pd(ix2,jx1);
757 dy21 = _mm_sub_pd(iy2,jy1);
758 dz21 = _mm_sub_pd(iz2,jz1);
759 dx22 = _mm_sub_pd(ix2,jx2);
760 dy22 = _mm_sub_pd(iy2,jy2);
761 dz22 = _mm_sub_pd(iz2,jz2);
762 dx23 = _mm_sub_pd(ix2,jx3);
763 dy23 = _mm_sub_pd(iy2,jy3);
764 dz23 = _mm_sub_pd(iz2,jz3);
765 dx31 = _mm_sub_pd(ix3,jx1);
766 dy31 = _mm_sub_pd(iy3,jy1);
767 dz31 = _mm_sub_pd(iz3,jz1);
768 dx32 = _mm_sub_pd(ix3,jx2);
769 dy32 = _mm_sub_pd(iy3,jy2);
770 dz32 = _mm_sub_pd(iz3,jz2);
771 dx33 = _mm_sub_pd(ix3,jx3);
772 dy33 = _mm_sub_pd(iy3,jy3);
773 dz33 = _mm_sub_pd(iz3,jz3);
775 /* Calculate squared distance and things based on it */
776 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
777 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
778 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
779 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
780 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
781 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
782 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
783 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
784 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
785 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
787 rinv11 = sse2_invsqrt_d(rsq11);
788 rinv12 = sse2_invsqrt_d(rsq12);
789 rinv13 = sse2_invsqrt_d(rsq13);
790 rinv21 = sse2_invsqrt_d(rsq21);
791 rinv22 = sse2_invsqrt_d(rsq22);
792 rinv23 = sse2_invsqrt_d(rsq23);
793 rinv31 = sse2_invsqrt_d(rsq31);
794 rinv32 = sse2_invsqrt_d(rsq32);
795 rinv33 = sse2_invsqrt_d(rsq33);
797 rinvsq00 = sse2_inv_d(rsq00);
799 fjx0 = _mm_setzero_pd();
800 fjy0 = _mm_setzero_pd();
801 fjz0 = _mm_setzero_pd();
802 fjx1 = _mm_setzero_pd();
803 fjy1 = _mm_setzero_pd();
804 fjz1 = _mm_setzero_pd();
805 fjx2 = _mm_setzero_pd();
806 fjy2 = _mm_setzero_pd();
807 fjz2 = _mm_setzero_pd();
808 fjx3 = _mm_setzero_pd();
809 fjy3 = _mm_setzero_pd();
810 fjz3 = _mm_setzero_pd();
812 /**************************
813 * CALCULATE INTERACTIONS *
814 **************************/
816 /* LENNARD-JONES DISPERSION/REPULSION */
818 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
819 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
820 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
821 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
822 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
824 /* Update potential sum for this i atom from the interaction with this j atom. */
825 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
826 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
830 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
832 /* Calculate temporary vectorial force */
833 tx = _mm_mul_pd(fscal,dx00);
834 ty = _mm_mul_pd(fscal,dy00);
835 tz = _mm_mul_pd(fscal,dz00);
837 /* Update vectorial force */
838 fix0 = _mm_add_pd(fix0,tx);
839 fiy0 = _mm_add_pd(fiy0,ty);
840 fiz0 = _mm_add_pd(fiz0,tz);
842 fjx0 = _mm_add_pd(fjx0,tx);
843 fjy0 = _mm_add_pd(fjy0,ty);
844 fjz0 = _mm_add_pd(fjz0,tz);
846 /**************************
847 * CALCULATE INTERACTIONS *
848 **************************/
850 r11 = _mm_mul_pd(rsq11,rinv11);
852 /* Calculate table index by multiplying r with table scale and truncate to integer */
853 rt = _mm_mul_pd(r11,vftabscale);
854 vfitab = _mm_cvttpd_epi32(rt);
855 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
856 vfitab = _mm_slli_epi32(vfitab,2);
858 /* CUBIC SPLINE TABLE ELECTROSTATICS */
859 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
860 F = _mm_setzero_pd();
861 GMX_MM_TRANSPOSE2_PD(Y,F);
862 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
863 H = _mm_setzero_pd();
864 GMX_MM_TRANSPOSE2_PD(G,H);
865 Heps = _mm_mul_pd(vfeps,H);
866 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
867 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
868 velec = _mm_mul_pd(qq11,VV);
869 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
870 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
872 /* Update potential sum for this i atom from the interaction with this j atom. */
873 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
874 velecsum = _mm_add_pd(velecsum,velec);
878 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
880 /* Calculate temporary vectorial force */
881 tx = _mm_mul_pd(fscal,dx11);
882 ty = _mm_mul_pd(fscal,dy11);
883 tz = _mm_mul_pd(fscal,dz11);
885 /* Update vectorial force */
886 fix1 = _mm_add_pd(fix1,tx);
887 fiy1 = _mm_add_pd(fiy1,ty);
888 fiz1 = _mm_add_pd(fiz1,tz);
890 fjx1 = _mm_add_pd(fjx1,tx);
891 fjy1 = _mm_add_pd(fjy1,ty);
892 fjz1 = _mm_add_pd(fjz1,tz);
894 /**************************
895 * CALCULATE INTERACTIONS *
896 **************************/
898 r12 = _mm_mul_pd(rsq12,rinv12);
900 /* Calculate table index by multiplying r with table scale and truncate to integer */
901 rt = _mm_mul_pd(r12,vftabscale);
902 vfitab = _mm_cvttpd_epi32(rt);
903 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
904 vfitab = _mm_slli_epi32(vfitab,2);
906 /* CUBIC SPLINE TABLE ELECTROSTATICS */
907 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
908 F = _mm_setzero_pd();
909 GMX_MM_TRANSPOSE2_PD(Y,F);
910 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
911 H = _mm_setzero_pd();
912 GMX_MM_TRANSPOSE2_PD(G,H);
913 Heps = _mm_mul_pd(vfeps,H);
914 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
915 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
916 velec = _mm_mul_pd(qq12,VV);
917 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
918 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
920 /* Update potential sum for this i atom from the interaction with this j atom. */
921 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
922 velecsum = _mm_add_pd(velecsum,velec);
926 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
928 /* Calculate temporary vectorial force */
929 tx = _mm_mul_pd(fscal,dx12);
930 ty = _mm_mul_pd(fscal,dy12);
931 tz = _mm_mul_pd(fscal,dz12);
933 /* Update vectorial force */
934 fix1 = _mm_add_pd(fix1,tx);
935 fiy1 = _mm_add_pd(fiy1,ty);
936 fiz1 = _mm_add_pd(fiz1,tz);
938 fjx2 = _mm_add_pd(fjx2,tx);
939 fjy2 = _mm_add_pd(fjy2,ty);
940 fjz2 = _mm_add_pd(fjz2,tz);
942 /**************************
943 * CALCULATE INTERACTIONS *
944 **************************/
946 r13 = _mm_mul_pd(rsq13,rinv13);
948 /* Calculate table index by multiplying r with table scale and truncate to integer */
949 rt = _mm_mul_pd(r13,vftabscale);
950 vfitab = _mm_cvttpd_epi32(rt);
951 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
952 vfitab = _mm_slli_epi32(vfitab,2);
954 /* CUBIC SPLINE TABLE ELECTROSTATICS */
955 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
956 F = _mm_setzero_pd();
957 GMX_MM_TRANSPOSE2_PD(Y,F);
958 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
959 H = _mm_setzero_pd();
960 GMX_MM_TRANSPOSE2_PD(G,H);
961 Heps = _mm_mul_pd(vfeps,H);
962 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
963 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
964 velec = _mm_mul_pd(qq13,VV);
965 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
966 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
968 /* Update potential sum for this i atom from the interaction with this j atom. */
969 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
970 velecsum = _mm_add_pd(velecsum,velec);
974 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
976 /* Calculate temporary vectorial force */
977 tx = _mm_mul_pd(fscal,dx13);
978 ty = _mm_mul_pd(fscal,dy13);
979 tz = _mm_mul_pd(fscal,dz13);
981 /* Update vectorial force */
982 fix1 = _mm_add_pd(fix1,tx);
983 fiy1 = _mm_add_pd(fiy1,ty);
984 fiz1 = _mm_add_pd(fiz1,tz);
986 fjx3 = _mm_add_pd(fjx3,tx);
987 fjy3 = _mm_add_pd(fjy3,ty);
988 fjz3 = _mm_add_pd(fjz3,tz);
990 /**************************
991 * CALCULATE INTERACTIONS *
992 **************************/
994 r21 = _mm_mul_pd(rsq21,rinv21);
996 /* Calculate table index by multiplying r with table scale and truncate to integer */
997 rt = _mm_mul_pd(r21,vftabscale);
998 vfitab = _mm_cvttpd_epi32(rt);
999 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1000 vfitab = _mm_slli_epi32(vfitab,2);
1002 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1003 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1004 F = _mm_setzero_pd();
1005 GMX_MM_TRANSPOSE2_PD(Y,F);
1006 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1007 H = _mm_setzero_pd();
1008 GMX_MM_TRANSPOSE2_PD(G,H);
1009 Heps = _mm_mul_pd(vfeps,H);
1010 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1011 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1012 velec = _mm_mul_pd(qq21,VV);
1013 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1014 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1016 /* Update potential sum for this i atom from the interaction with this j atom. */
1017 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1018 velecsum = _mm_add_pd(velecsum,velec);
1022 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1024 /* Calculate temporary vectorial force */
1025 tx = _mm_mul_pd(fscal,dx21);
1026 ty = _mm_mul_pd(fscal,dy21);
1027 tz = _mm_mul_pd(fscal,dz21);
1029 /* Update vectorial force */
1030 fix2 = _mm_add_pd(fix2,tx);
1031 fiy2 = _mm_add_pd(fiy2,ty);
1032 fiz2 = _mm_add_pd(fiz2,tz);
1034 fjx1 = _mm_add_pd(fjx1,tx);
1035 fjy1 = _mm_add_pd(fjy1,ty);
1036 fjz1 = _mm_add_pd(fjz1,tz);
1038 /**************************
1039 * CALCULATE INTERACTIONS *
1040 **************************/
1042 r22 = _mm_mul_pd(rsq22,rinv22);
1044 /* Calculate table index by multiplying r with table scale and truncate to integer */
1045 rt = _mm_mul_pd(r22,vftabscale);
1046 vfitab = _mm_cvttpd_epi32(rt);
1047 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1048 vfitab = _mm_slli_epi32(vfitab,2);
1050 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1051 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1052 F = _mm_setzero_pd();
1053 GMX_MM_TRANSPOSE2_PD(Y,F);
1054 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1055 H = _mm_setzero_pd();
1056 GMX_MM_TRANSPOSE2_PD(G,H);
1057 Heps = _mm_mul_pd(vfeps,H);
1058 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1059 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1060 velec = _mm_mul_pd(qq22,VV);
1061 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1062 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1064 /* Update potential sum for this i atom from the interaction with this j atom. */
1065 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1066 velecsum = _mm_add_pd(velecsum,velec);
1070 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1072 /* Calculate temporary vectorial force */
1073 tx = _mm_mul_pd(fscal,dx22);
1074 ty = _mm_mul_pd(fscal,dy22);
1075 tz = _mm_mul_pd(fscal,dz22);
1077 /* Update vectorial force */
1078 fix2 = _mm_add_pd(fix2,tx);
1079 fiy2 = _mm_add_pd(fiy2,ty);
1080 fiz2 = _mm_add_pd(fiz2,tz);
1082 fjx2 = _mm_add_pd(fjx2,tx);
1083 fjy2 = _mm_add_pd(fjy2,ty);
1084 fjz2 = _mm_add_pd(fjz2,tz);
1086 /**************************
1087 * CALCULATE INTERACTIONS *
1088 **************************/
1090 r23 = _mm_mul_pd(rsq23,rinv23);
1092 /* Calculate table index by multiplying r with table scale and truncate to integer */
1093 rt = _mm_mul_pd(r23,vftabscale);
1094 vfitab = _mm_cvttpd_epi32(rt);
1095 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1096 vfitab = _mm_slli_epi32(vfitab,2);
1098 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1099 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1100 F = _mm_setzero_pd();
1101 GMX_MM_TRANSPOSE2_PD(Y,F);
1102 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1103 H = _mm_setzero_pd();
1104 GMX_MM_TRANSPOSE2_PD(G,H);
1105 Heps = _mm_mul_pd(vfeps,H);
1106 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1107 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1108 velec = _mm_mul_pd(qq23,VV);
1109 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1110 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1112 /* Update potential sum for this i atom from the interaction with this j atom. */
1113 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1114 velecsum = _mm_add_pd(velecsum,velec);
1118 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1120 /* Calculate temporary vectorial force */
1121 tx = _mm_mul_pd(fscal,dx23);
1122 ty = _mm_mul_pd(fscal,dy23);
1123 tz = _mm_mul_pd(fscal,dz23);
1125 /* Update vectorial force */
1126 fix2 = _mm_add_pd(fix2,tx);
1127 fiy2 = _mm_add_pd(fiy2,ty);
1128 fiz2 = _mm_add_pd(fiz2,tz);
1130 fjx3 = _mm_add_pd(fjx3,tx);
1131 fjy3 = _mm_add_pd(fjy3,ty);
1132 fjz3 = _mm_add_pd(fjz3,tz);
1134 /**************************
1135 * CALCULATE INTERACTIONS *
1136 **************************/
1138 r31 = _mm_mul_pd(rsq31,rinv31);
1140 /* Calculate table index by multiplying r with table scale and truncate to integer */
1141 rt = _mm_mul_pd(r31,vftabscale);
1142 vfitab = _mm_cvttpd_epi32(rt);
1143 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1144 vfitab = _mm_slli_epi32(vfitab,2);
1146 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1147 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1148 F = _mm_setzero_pd();
1149 GMX_MM_TRANSPOSE2_PD(Y,F);
1150 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1151 H = _mm_setzero_pd();
1152 GMX_MM_TRANSPOSE2_PD(G,H);
1153 Heps = _mm_mul_pd(vfeps,H);
1154 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1155 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1156 velec = _mm_mul_pd(qq31,VV);
1157 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1158 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1160 /* Update potential sum for this i atom from the interaction with this j atom. */
1161 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1162 velecsum = _mm_add_pd(velecsum,velec);
1166 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1168 /* Calculate temporary vectorial force */
1169 tx = _mm_mul_pd(fscal,dx31);
1170 ty = _mm_mul_pd(fscal,dy31);
1171 tz = _mm_mul_pd(fscal,dz31);
1173 /* Update vectorial force */
1174 fix3 = _mm_add_pd(fix3,tx);
1175 fiy3 = _mm_add_pd(fiy3,ty);
1176 fiz3 = _mm_add_pd(fiz3,tz);
1178 fjx1 = _mm_add_pd(fjx1,tx);
1179 fjy1 = _mm_add_pd(fjy1,ty);
1180 fjz1 = _mm_add_pd(fjz1,tz);
1182 /**************************
1183 * CALCULATE INTERACTIONS *
1184 **************************/
1186 r32 = _mm_mul_pd(rsq32,rinv32);
1188 /* Calculate table index by multiplying r with table scale and truncate to integer */
1189 rt = _mm_mul_pd(r32,vftabscale);
1190 vfitab = _mm_cvttpd_epi32(rt);
1191 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1192 vfitab = _mm_slli_epi32(vfitab,2);
1194 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1195 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1196 F = _mm_setzero_pd();
1197 GMX_MM_TRANSPOSE2_PD(Y,F);
1198 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1199 H = _mm_setzero_pd();
1200 GMX_MM_TRANSPOSE2_PD(G,H);
1201 Heps = _mm_mul_pd(vfeps,H);
1202 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1203 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1204 velec = _mm_mul_pd(qq32,VV);
1205 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1206 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1208 /* Update potential sum for this i atom from the interaction with this j atom. */
1209 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1210 velecsum = _mm_add_pd(velecsum,velec);
1214 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1216 /* Calculate temporary vectorial force */
1217 tx = _mm_mul_pd(fscal,dx32);
1218 ty = _mm_mul_pd(fscal,dy32);
1219 tz = _mm_mul_pd(fscal,dz32);
1221 /* Update vectorial force */
1222 fix3 = _mm_add_pd(fix3,tx);
1223 fiy3 = _mm_add_pd(fiy3,ty);
1224 fiz3 = _mm_add_pd(fiz3,tz);
1226 fjx2 = _mm_add_pd(fjx2,tx);
1227 fjy2 = _mm_add_pd(fjy2,ty);
1228 fjz2 = _mm_add_pd(fjz2,tz);
1230 /**************************
1231 * CALCULATE INTERACTIONS *
1232 **************************/
1234 r33 = _mm_mul_pd(rsq33,rinv33);
1236 /* Calculate table index by multiplying r with table scale and truncate to integer */
1237 rt = _mm_mul_pd(r33,vftabscale);
1238 vfitab = _mm_cvttpd_epi32(rt);
1239 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1240 vfitab = _mm_slli_epi32(vfitab,2);
1242 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1243 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1244 F = _mm_setzero_pd();
1245 GMX_MM_TRANSPOSE2_PD(Y,F);
1246 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1247 H = _mm_setzero_pd();
1248 GMX_MM_TRANSPOSE2_PD(G,H);
1249 Heps = _mm_mul_pd(vfeps,H);
1250 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1251 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1252 velec = _mm_mul_pd(qq33,VV);
1253 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1254 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1256 /* Update potential sum for this i atom from the interaction with this j atom. */
1257 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1258 velecsum = _mm_add_pd(velecsum,velec);
1262 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1264 /* Calculate temporary vectorial force */
1265 tx = _mm_mul_pd(fscal,dx33);
1266 ty = _mm_mul_pd(fscal,dy33);
1267 tz = _mm_mul_pd(fscal,dz33);
1269 /* Update vectorial force */
1270 fix3 = _mm_add_pd(fix3,tx);
1271 fiy3 = _mm_add_pd(fiy3,ty);
1272 fiz3 = _mm_add_pd(fiz3,tz);
1274 fjx3 = _mm_add_pd(fjx3,tx);
1275 fjy3 = _mm_add_pd(fjy3,ty);
1276 fjz3 = _mm_add_pd(fjz3,tz);
1278 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1280 /* Inner loop uses 422 flops */
1283 /* End of innermost loop */
1285 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1286 f+i_coord_offset,fshift+i_shift_offset);
1289 /* Update potential energies */
1290 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1291 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1293 /* Increment number of inner iterations */
1294 inneriter += j_index_end - j_index_start;
1296 /* Outer loop uses 26 flops */
1299 /* Increment number of outer iterations */
1302 /* Update outer/inner flops */
1304 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*422);
1307 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double
1308 * Electrostatics interaction: CubicSplineTable
1309 * VdW interaction: LennardJones
1310 * Geometry: Water4-Water4
1311 * Calculate force/pot: Force
1314 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_double
1315 (t_nblist * gmx_restrict nlist,
1316 rvec * gmx_restrict xx,
1317 rvec * gmx_restrict ff,
1318 struct t_forcerec * gmx_restrict fr,
1319 t_mdatoms * gmx_restrict mdatoms,
1320 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1321 t_nrnb * gmx_restrict nrnb)
1323 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1324 * just 0 for non-waters.
1325 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1326 * jnr indices corresponding to data put in the four positions in the SIMD register.
1328 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1329 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1331 int j_coord_offsetA,j_coord_offsetB;
1332 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1333 real rcutoff_scalar;
1334 real *shiftvec,*fshift,*x,*f;
1335 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1337 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1339 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1341 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1343 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1344 int vdwjidx0A,vdwjidx0B;
1345 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1346 int vdwjidx1A,vdwjidx1B;
1347 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1348 int vdwjidx2A,vdwjidx2B;
1349 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1350 int vdwjidx3A,vdwjidx3B;
1351 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1352 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1353 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1354 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1355 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1356 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1357 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1358 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1359 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1360 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1361 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1362 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1365 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1368 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1369 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1371 __m128i ifour = _mm_set1_epi32(4);
1372 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1374 __m128d dummy_mask,cutoff_mask;
1375 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1376 __m128d one = _mm_set1_pd(1.0);
1377 __m128d two = _mm_set1_pd(2.0);
1383 jindex = nlist->jindex;
1385 shiftidx = nlist->shift;
1387 shiftvec = fr->shift_vec[0];
1388 fshift = fr->fshift[0];
1389 facel = _mm_set1_pd(fr->ic->epsfac);
1390 charge = mdatoms->chargeA;
1391 nvdwtype = fr->ntype;
1392 vdwparam = fr->nbfp;
1393 vdwtype = mdatoms->typeA;
1395 vftab = kernel_data->table_elec->data;
1396 vftabscale = _mm_set1_pd(kernel_data->table_elec->scale);
1398 /* Setup water-specific parameters */
1399 inr = nlist->iinr[0];
1400 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1401 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1402 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1403 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1405 jq1 = _mm_set1_pd(charge[inr+1]);
1406 jq2 = _mm_set1_pd(charge[inr+2]);
1407 jq3 = _mm_set1_pd(charge[inr+3]);
1408 vdwjidx0A = 2*vdwtype[inr+0];
1409 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1410 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1411 qq11 = _mm_mul_pd(iq1,jq1);
1412 qq12 = _mm_mul_pd(iq1,jq2);
1413 qq13 = _mm_mul_pd(iq1,jq3);
1414 qq21 = _mm_mul_pd(iq2,jq1);
1415 qq22 = _mm_mul_pd(iq2,jq2);
1416 qq23 = _mm_mul_pd(iq2,jq3);
1417 qq31 = _mm_mul_pd(iq3,jq1);
1418 qq32 = _mm_mul_pd(iq3,jq2);
1419 qq33 = _mm_mul_pd(iq3,jq3);
1421 /* Avoid stupid compiler warnings */
1423 j_coord_offsetA = 0;
1424 j_coord_offsetB = 0;
1429 /* Start outer loop over neighborlists */
1430 for(iidx=0; iidx<nri; iidx++)
1432 /* Load shift vector for this list */
1433 i_shift_offset = DIM*shiftidx[iidx];
1435 /* Load limits for loop over neighbors */
1436 j_index_start = jindex[iidx];
1437 j_index_end = jindex[iidx+1];
1439 /* Get outer coordinate index */
1441 i_coord_offset = DIM*inr;
1443 /* Load i particle coords and add shift vector */
1444 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1445 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1447 fix0 = _mm_setzero_pd();
1448 fiy0 = _mm_setzero_pd();
1449 fiz0 = _mm_setzero_pd();
1450 fix1 = _mm_setzero_pd();
1451 fiy1 = _mm_setzero_pd();
1452 fiz1 = _mm_setzero_pd();
1453 fix2 = _mm_setzero_pd();
1454 fiy2 = _mm_setzero_pd();
1455 fiz2 = _mm_setzero_pd();
1456 fix3 = _mm_setzero_pd();
1457 fiy3 = _mm_setzero_pd();
1458 fiz3 = _mm_setzero_pd();
1460 /* Start inner kernel loop */
1461 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1464 /* Get j neighbor index, and coordinate index */
1466 jnrB = jjnr[jidx+1];
1467 j_coord_offsetA = DIM*jnrA;
1468 j_coord_offsetB = DIM*jnrB;
1470 /* load j atom coordinates */
1471 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1472 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1473 &jy2,&jz2,&jx3,&jy3,&jz3);
1475 /* Calculate displacement vector */
1476 dx00 = _mm_sub_pd(ix0,jx0);
1477 dy00 = _mm_sub_pd(iy0,jy0);
1478 dz00 = _mm_sub_pd(iz0,jz0);
1479 dx11 = _mm_sub_pd(ix1,jx1);
1480 dy11 = _mm_sub_pd(iy1,jy1);
1481 dz11 = _mm_sub_pd(iz1,jz1);
1482 dx12 = _mm_sub_pd(ix1,jx2);
1483 dy12 = _mm_sub_pd(iy1,jy2);
1484 dz12 = _mm_sub_pd(iz1,jz2);
1485 dx13 = _mm_sub_pd(ix1,jx3);
1486 dy13 = _mm_sub_pd(iy1,jy3);
1487 dz13 = _mm_sub_pd(iz1,jz3);
1488 dx21 = _mm_sub_pd(ix2,jx1);
1489 dy21 = _mm_sub_pd(iy2,jy1);
1490 dz21 = _mm_sub_pd(iz2,jz1);
1491 dx22 = _mm_sub_pd(ix2,jx2);
1492 dy22 = _mm_sub_pd(iy2,jy2);
1493 dz22 = _mm_sub_pd(iz2,jz2);
1494 dx23 = _mm_sub_pd(ix2,jx3);
1495 dy23 = _mm_sub_pd(iy2,jy3);
1496 dz23 = _mm_sub_pd(iz2,jz3);
1497 dx31 = _mm_sub_pd(ix3,jx1);
1498 dy31 = _mm_sub_pd(iy3,jy1);
1499 dz31 = _mm_sub_pd(iz3,jz1);
1500 dx32 = _mm_sub_pd(ix3,jx2);
1501 dy32 = _mm_sub_pd(iy3,jy2);
1502 dz32 = _mm_sub_pd(iz3,jz2);
1503 dx33 = _mm_sub_pd(ix3,jx3);
1504 dy33 = _mm_sub_pd(iy3,jy3);
1505 dz33 = _mm_sub_pd(iz3,jz3);
1507 /* Calculate squared distance and things based on it */
1508 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1509 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1510 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1511 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1512 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1513 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1514 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1515 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1516 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1517 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1519 rinv11 = sse2_invsqrt_d(rsq11);
1520 rinv12 = sse2_invsqrt_d(rsq12);
1521 rinv13 = sse2_invsqrt_d(rsq13);
1522 rinv21 = sse2_invsqrt_d(rsq21);
1523 rinv22 = sse2_invsqrt_d(rsq22);
1524 rinv23 = sse2_invsqrt_d(rsq23);
1525 rinv31 = sse2_invsqrt_d(rsq31);
1526 rinv32 = sse2_invsqrt_d(rsq32);
1527 rinv33 = sse2_invsqrt_d(rsq33);
1529 rinvsq00 = sse2_inv_d(rsq00);
1531 fjx0 = _mm_setzero_pd();
1532 fjy0 = _mm_setzero_pd();
1533 fjz0 = _mm_setzero_pd();
1534 fjx1 = _mm_setzero_pd();
1535 fjy1 = _mm_setzero_pd();
1536 fjz1 = _mm_setzero_pd();
1537 fjx2 = _mm_setzero_pd();
1538 fjy2 = _mm_setzero_pd();
1539 fjz2 = _mm_setzero_pd();
1540 fjx3 = _mm_setzero_pd();
1541 fjy3 = _mm_setzero_pd();
1542 fjz3 = _mm_setzero_pd();
1544 /**************************
1545 * CALCULATE INTERACTIONS *
1546 **************************/
1548 /* LENNARD-JONES DISPERSION/REPULSION */
1550 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1551 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1555 /* Calculate temporary vectorial force */
1556 tx = _mm_mul_pd(fscal,dx00);
1557 ty = _mm_mul_pd(fscal,dy00);
1558 tz = _mm_mul_pd(fscal,dz00);
1560 /* Update vectorial force */
1561 fix0 = _mm_add_pd(fix0,tx);
1562 fiy0 = _mm_add_pd(fiy0,ty);
1563 fiz0 = _mm_add_pd(fiz0,tz);
1565 fjx0 = _mm_add_pd(fjx0,tx);
1566 fjy0 = _mm_add_pd(fjy0,ty);
1567 fjz0 = _mm_add_pd(fjz0,tz);
1569 /**************************
1570 * CALCULATE INTERACTIONS *
1571 **************************/
1573 r11 = _mm_mul_pd(rsq11,rinv11);
1575 /* Calculate table index by multiplying r with table scale and truncate to integer */
1576 rt = _mm_mul_pd(r11,vftabscale);
1577 vfitab = _mm_cvttpd_epi32(rt);
1578 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1579 vfitab = _mm_slli_epi32(vfitab,2);
1581 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1582 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1583 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1584 GMX_MM_TRANSPOSE2_PD(Y,F);
1585 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1586 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1587 GMX_MM_TRANSPOSE2_PD(G,H);
1588 Heps = _mm_mul_pd(vfeps,H);
1589 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1590 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1591 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1595 /* Calculate temporary vectorial force */
1596 tx = _mm_mul_pd(fscal,dx11);
1597 ty = _mm_mul_pd(fscal,dy11);
1598 tz = _mm_mul_pd(fscal,dz11);
1600 /* Update vectorial force */
1601 fix1 = _mm_add_pd(fix1,tx);
1602 fiy1 = _mm_add_pd(fiy1,ty);
1603 fiz1 = _mm_add_pd(fiz1,tz);
1605 fjx1 = _mm_add_pd(fjx1,tx);
1606 fjy1 = _mm_add_pd(fjy1,ty);
1607 fjz1 = _mm_add_pd(fjz1,tz);
1609 /**************************
1610 * CALCULATE INTERACTIONS *
1611 **************************/
1613 r12 = _mm_mul_pd(rsq12,rinv12);
1615 /* Calculate table index by multiplying r with table scale and truncate to integer */
1616 rt = _mm_mul_pd(r12,vftabscale);
1617 vfitab = _mm_cvttpd_epi32(rt);
1618 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1619 vfitab = _mm_slli_epi32(vfitab,2);
1621 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1622 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1623 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1624 GMX_MM_TRANSPOSE2_PD(Y,F);
1625 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1626 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1627 GMX_MM_TRANSPOSE2_PD(G,H);
1628 Heps = _mm_mul_pd(vfeps,H);
1629 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1630 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1631 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1635 /* Calculate temporary vectorial force */
1636 tx = _mm_mul_pd(fscal,dx12);
1637 ty = _mm_mul_pd(fscal,dy12);
1638 tz = _mm_mul_pd(fscal,dz12);
1640 /* Update vectorial force */
1641 fix1 = _mm_add_pd(fix1,tx);
1642 fiy1 = _mm_add_pd(fiy1,ty);
1643 fiz1 = _mm_add_pd(fiz1,tz);
1645 fjx2 = _mm_add_pd(fjx2,tx);
1646 fjy2 = _mm_add_pd(fjy2,ty);
1647 fjz2 = _mm_add_pd(fjz2,tz);
1649 /**************************
1650 * CALCULATE INTERACTIONS *
1651 **************************/
1653 r13 = _mm_mul_pd(rsq13,rinv13);
1655 /* Calculate table index by multiplying r with table scale and truncate to integer */
1656 rt = _mm_mul_pd(r13,vftabscale);
1657 vfitab = _mm_cvttpd_epi32(rt);
1658 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1659 vfitab = _mm_slli_epi32(vfitab,2);
1661 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1662 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1663 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1664 GMX_MM_TRANSPOSE2_PD(Y,F);
1665 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1666 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1667 GMX_MM_TRANSPOSE2_PD(G,H);
1668 Heps = _mm_mul_pd(vfeps,H);
1669 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1670 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1671 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1675 /* Calculate temporary vectorial force */
1676 tx = _mm_mul_pd(fscal,dx13);
1677 ty = _mm_mul_pd(fscal,dy13);
1678 tz = _mm_mul_pd(fscal,dz13);
1680 /* Update vectorial force */
1681 fix1 = _mm_add_pd(fix1,tx);
1682 fiy1 = _mm_add_pd(fiy1,ty);
1683 fiz1 = _mm_add_pd(fiz1,tz);
1685 fjx3 = _mm_add_pd(fjx3,tx);
1686 fjy3 = _mm_add_pd(fjy3,ty);
1687 fjz3 = _mm_add_pd(fjz3,tz);
1689 /**************************
1690 * CALCULATE INTERACTIONS *
1691 **************************/
1693 r21 = _mm_mul_pd(rsq21,rinv21);
1695 /* Calculate table index by multiplying r with table scale and truncate to integer */
1696 rt = _mm_mul_pd(r21,vftabscale);
1697 vfitab = _mm_cvttpd_epi32(rt);
1698 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1699 vfitab = _mm_slli_epi32(vfitab,2);
1701 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1702 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1703 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1704 GMX_MM_TRANSPOSE2_PD(Y,F);
1705 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1706 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1707 GMX_MM_TRANSPOSE2_PD(G,H);
1708 Heps = _mm_mul_pd(vfeps,H);
1709 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1710 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1711 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1715 /* Calculate temporary vectorial force */
1716 tx = _mm_mul_pd(fscal,dx21);
1717 ty = _mm_mul_pd(fscal,dy21);
1718 tz = _mm_mul_pd(fscal,dz21);
1720 /* Update vectorial force */
1721 fix2 = _mm_add_pd(fix2,tx);
1722 fiy2 = _mm_add_pd(fiy2,ty);
1723 fiz2 = _mm_add_pd(fiz2,tz);
1725 fjx1 = _mm_add_pd(fjx1,tx);
1726 fjy1 = _mm_add_pd(fjy1,ty);
1727 fjz1 = _mm_add_pd(fjz1,tz);
1729 /**************************
1730 * CALCULATE INTERACTIONS *
1731 **************************/
1733 r22 = _mm_mul_pd(rsq22,rinv22);
1735 /* Calculate table index by multiplying r with table scale and truncate to integer */
1736 rt = _mm_mul_pd(r22,vftabscale);
1737 vfitab = _mm_cvttpd_epi32(rt);
1738 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1739 vfitab = _mm_slli_epi32(vfitab,2);
1741 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1742 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1743 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1744 GMX_MM_TRANSPOSE2_PD(Y,F);
1745 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1746 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1747 GMX_MM_TRANSPOSE2_PD(G,H);
1748 Heps = _mm_mul_pd(vfeps,H);
1749 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1750 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1751 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1755 /* Calculate temporary vectorial force */
1756 tx = _mm_mul_pd(fscal,dx22);
1757 ty = _mm_mul_pd(fscal,dy22);
1758 tz = _mm_mul_pd(fscal,dz22);
1760 /* Update vectorial force */
1761 fix2 = _mm_add_pd(fix2,tx);
1762 fiy2 = _mm_add_pd(fiy2,ty);
1763 fiz2 = _mm_add_pd(fiz2,tz);
1765 fjx2 = _mm_add_pd(fjx2,tx);
1766 fjy2 = _mm_add_pd(fjy2,ty);
1767 fjz2 = _mm_add_pd(fjz2,tz);
1769 /**************************
1770 * CALCULATE INTERACTIONS *
1771 **************************/
1773 r23 = _mm_mul_pd(rsq23,rinv23);
1775 /* Calculate table index by multiplying r with table scale and truncate to integer */
1776 rt = _mm_mul_pd(r23,vftabscale);
1777 vfitab = _mm_cvttpd_epi32(rt);
1778 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1779 vfitab = _mm_slli_epi32(vfitab,2);
1781 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1782 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1783 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1784 GMX_MM_TRANSPOSE2_PD(Y,F);
1785 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1786 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1787 GMX_MM_TRANSPOSE2_PD(G,H);
1788 Heps = _mm_mul_pd(vfeps,H);
1789 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1790 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1791 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1795 /* Calculate temporary vectorial force */
1796 tx = _mm_mul_pd(fscal,dx23);
1797 ty = _mm_mul_pd(fscal,dy23);
1798 tz = _mm_mul_pd(fscal,dz23);
1800 /* Update vectorial force */
1801 fix2 = _mm_add_pd(fix2,tx);
1802 fiy2 = _mm_add_pd(fiy2,ty);
1803 fiz2 = _mm_add_pd(fiz2,tz);
1805 fjx3 = _mm_add_pd(fjx3,tx);
1806 fjy3 = _mm_add_pd(fjy3,ty);
1807 fjz3 = _mm_add_pd(fjz3,tz);
1809 /**************************
1810 * CALCULATE INTERACTIONS *
1811 **************************/
1813 r31 = _mm_mul_pd(rsq31,rinv31);
1815 /* Calculate table index by multiplying r with table scale and truncate to integer */
1816 rt = _mm_mul_pd(r31,vftabscale);
1817 vfitab = _mm_cvttpd_epi32(rt);
1818 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1819 vfitab = _mm_slli_epi32(vfitab,2);
1821 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1822 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1823 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1824 GMX_MM_TRANSPOSE2_PD(Y,F);
1825 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1826 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1827 GMX_MM_TRANSPOSE2_PD(G,H);
1828 Heps = _mm_mul_pd(vfeps,H);
1829 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1830 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1831 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1835 /* Calculate temporary vectorial force */
1836 tx = _mm_mul_pd(fscal,dx31);
1837 ty = _mm_mul_pd(fscal,dy31);
1838 tz = _mm_mul_pd(fscal,dz31);
1840 /* Update vectorial force */
1841 fix3 = _mm_add_pd(fix3,tx);
1842 fiy3 = _mm_add_pd(fiy3,ty);
1843 fiz3 = _mm_add_pd(fiz3,tz);
1845 fjx1 = _mm_add_pd(fjx1,tx);
1846 fjy1 = _mm_add_pd(fjy1,ty);
1847 fjz1 = _mm_add_pd(fjz1,tz);
1849 /**************************
1850 * CALCULATE INTERACTIONS *
1851 **************************/
1853 r32 = _mm_mul_pd(rsq32,rinv32);
1855 /* Calculate table index by multiplying r with table scale and truncate to integer */
1856 rt = _mm_mul_pd(r32,vftabscale);
1857 vfitab = _mm_cvttpd_epi32(rt);
1858 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1859 vfitab = _mm_slli_epi32(vfitab,2);
1861 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1862 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1863 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1864 GMX_MM_TRANSPOSE2_PD(Y,F);
1865 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1866 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1867 GMX_MM_TRANSPOSE2_PD(G,H);
1868 Heps = _mm_mul_pd(vfeps,H);
1869 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1870 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1871 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1875 /* Calculate temporary vectorial force */
1876 tx = _mm_mul_pd(fscal,dx32);
1877 ty = _mm_mul_pd(fscal,dy32);
1878 tz = _mm_mul_pd(fscal,dz32);
1880 /* Update vectorial force */
1881 fix3 = _mm_add_pd(fix3,tx);
1882 fiy3 = _mm_add_pd(fiy3,ty);
1883 fiz3 = _mm_add_pd(fiz3,tz);
1885 fjx2 = _mm_add_pd(fjx2,tx);
1886 fjy2 = _mm_add_pd(fjy2,ty);
1887 fjz2 = _mm_add_pd(fjz2,tz);
1889 /**************************
1890 * CALCULATE INTERACTIONS *
1891 **************************/
1893 r33 = _mm_mul_pd(rsq33,rinv33);
1895 /* Calculate table index by multiplying r with table scale and truncate to integer */
1896 rt = _mm_mul_pd(r33,vftabscale);
1897 vfitab = _mm_cvttpd_epi32(rt);
1898 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1899 vfitab = _mm_slli_epi32(vfitab,2);
1901 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1902 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1903 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1904 GMX_MM_TRANSPOSE2_PD(Y,F);
1905 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1906 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1907 GMX_MM_TRANSPOSE2_PD(G,H);
1908 Heps = _mm_mul_pd(vfeps,H);
1909 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1910 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1911 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1915 /* Calculate temporary vectorial force */
1916 tx = _mm_mul_pd(fscal,dx33);
1917 ty = _mm_mul_pd(fscal,dy33);
1918 tz = _mm_mul_pd(fscal,dz33);
1920 /* Update vectorial force */
1921 fix3 = _mm_add_pd(fix3,tx);
1922 fiy3 = _mm_add_pd(fiy3,ty);
1923 fiz3 = _mm_add_pd(fiz3,tz);
1925 fjx3 = _mm_add_pd(fjx3,tx);
1926 fjy3 = _mm_add_pd(fjy3,ty);
1927 fjz3 = _mm_add_pd(fjz3,tz);
1929 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1931 /* Inner loop uses 381 flops */
1934 if(jidx<j_index_end)
1938 j_coord_offsetA = DIM*jnrA;
1940 /* load j atom coordinates */
1941 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1942 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1943 &jy2,&jz2,&jx3,&jy3,&jz3);
1945 /* Calculate displacement vector */
1946 dx00 = _mm_sub_pd(ix0,jx0);
1947 dy00 = _mm_sub_pd(iy0,jy0);
1948 dz00 = _mm_sub_pd(iz0,jz0);
1949 dx11 = _mm_sub_pd(ix1,jx1);
1950 dy11 = _mm_sub_pd(iy1,jy1);
1951 dz11 = _mm_sub_pd(iz1,jz1);
1952 dx12 = _mm_sub_pd(ix1,jx2);
1953 dy12 = _mm_sub_pd(iy1,jy2);
1954 dz12 = _mm_sub_pd(iz1,jz2);
1955 dx13 = _mm_sub_pd(ix1,jx3);
1956 dy13 = _mm_sub_pd(iy1,jy3);
1957 dz13 = _mm_sub_pd(iz1,jz3);
1958 dx21 = _mm_sub_pd(ix2,jx1);
1959 dy21 = _mm_sub_pd(iy2,jy1);
1960 dz21 = _mm_sub_pd(iz2,jz1);
1961 dx22 = _mm_sub_pd(ix2,jx2);
1962 dy22 = _mm_sub_pd(iy2,jy2);
1963 dz22 = _mm_sub_pd(iz2,jz2);
1964 dx23 = _mm_sub_pd(ix2,jx3);
1965 dy23 = _mm_sub_pd(iy2,jy3);
1966 dz23 = _mm_sub_pd(iz2,jz3);
1967 dx31 = _mm_sub_pd(ix3,jx1);
1968 dy31 = _mm_sub_pd(iy3,jy1);
1969 dz31 = _mm_sub_pd(iz3,jz1);
1970 dx32 = _mm_sub_pd(ix3,jx2);
1971 dy32 = _mm_sub_pd(iy3,jy2);
1972 dz32 = _mm_sub_pd(iz3,jz2);
1973 dx33 = _mm_sub_pd(ix3,jx3);
1974 dy33 = _mm_sub_pd(iy3,jy3);
1975 dz33 = _mm_sub_pd(iz3,jz3);
1977 /* Calculate squared distance and things based on it */
1978 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1979 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1980 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1981 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1982 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1983 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1984 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1985 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1986 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1987 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1989 rinv11 = sse2_invsqrt_d(rsq11);
1990 rinv12 = sse2_invsqrt_d(rsq12);
1991 rinv13 = sse2_invsqrt_d(rsq13);
1992 rinv21 = sse2_invsqrt_d(rsq21);
1993 rinv22 = sse2_invsqrt_d(rsq22);
1994 rinv23 = sse2_invsqrt_d(rsq23);
1995 rinv31 = sse2_invsqrt_d(rsq31);
1996 rinv32 = sse2_invsqrt_d(rsq32);
1997 rinv33 = sse2_invsqrt_d(rsq33);
1999 rinvsq00 = sse2_inv_d(rsq00);
2001 fjx0 = _mm_setzero_pd();
2002 fjy0 = _mm_setzero_pd();
2003 fjz0 = _mm_setzero_pd();
2004 fjx1 = _mm_setzero_pd();
2005 fjy1 = _mm_setzero_pd();
2006 fjz1 = _mm_setzero_pd();
2007 fjx2 = _mm_setzero_pd();
2008 fjy2 = _mm_setzero_pd();
2009 fjz2 = _mm_setzero_pd();
2010 fjx3 = _mm_setzero_pd();
2011 fjy3 = _mm_setzero_pd();
2012 fjz3 = _mm_setzero_pd();
2014 /**************************
2015 * CALCULATE INTERACTIONS *
2016 **************************/
2018 /* LENNARD-JONES DISPERSION/REPULSION */
2020 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2021 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
2025 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2027 /* Calculate temporary vectorial force */
2028 tx = _mm_mul_pd(fscal,dx00);
2029 ty = _mm_mul_pd(fscal,dy00);
2030 tz = _mm_mul_pd(fscal,dz00);
2032 /* Update vectorial force */
2033 fix0 = _mm_add_pd(fix0,tx);
2034 fiy0 = _mm_add_pd(fiy0,ty);
2035 fiz0 = _mm_add_pd(fiz0,tz);
2037 fjx0 = _mm_add_pd(fjx0,tx);
2038 fjy0 = _mm_add_pd(fjy0,ty);
2039 fjz0 = _mm_add_pd(fjz0,tz);
2041 /**************************
2042 * CALCULATE INTERACTIONS *
2043 **************************/
2045 r11 = _mm_mul_pd(rsq11,rinv11);
2047 /* Calculate table index by multiplying r with table scale and truncate to integer */
2048 rt = _mm_mul_pd(r11,vftabscale);
2049 vfitab = _mm_cvttpd_epi32(rt);
2050 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2051 vfitab = _mm_slli_epi32(vfitab,2);
2053 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2054 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2055 F = _mm_setzero_pd();
2056 GMX_MM_TRANSPOSE2_PD(Y,F);
2057 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2058 H = _mm_setzero_pd();
2059 GMX_MM_TRANSPOSE2_PD(G,H);
2060 Heps = _mm_mul_pd(vfeps,H);
2061 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2062 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2063 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2067 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2069 /* Calculate temporary vectorial force */
2070 tx = _mm_mul_pd(fscal,dx11);
2071 ty = _mm_mul_pd(fscal,dy11);
2072 tz = _mm_mul_pd(fscal,dz11);
2074 /* Update vectorial force */
2075 fix1 = _mm_add_pd(fix1,tx);
2076 fiy1 = _mm_add_pd(fiy1,ty);
2077 fiz1 = _mm_add_pd(fiz1,tz);
2079 fjx1 = _mm_add_pd(fjx1,tx);
2080 fjy1 = _mm_add_pd(fjy1,ty);
2081 fjz1 = _mm_add_pd(fjz1,tz);
2083 /**************************
2084 * CALCULATE INTERACTIONS *
2085 **************************/
2087 r12 = _mm_mul_pd(rsq12,rinv12);
2089 /* Calculate table index by multiplying r with table scale and truncate to integer */
2090 rt = _mm_mul_pd(r12,vftabscale);
2091 vfitab = _mm_cvttpd_epi32(rt);
2092 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2093 vfitab = _mm_slli_epi32(vfitab,2);
2095 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2096 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2097 F = _mm_setzero_pd();
2098 GMX_MM_TRANSPOSE2_PD(Y,F);
2099 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2100 H = _mm_setzero_pd();
2101 GMX_MM_TRANSPOSE2_PD(G,H);
2102 Heps = _mm_mul_pd(vfeps,H);
2103 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2104 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2105 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2109 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2111 /* Calculate temporary vectorial force */
2112 tx = _mm_mul_pd(fscal,dx12);
2113 ty = _mm_mul_pd(fscal,dy12);
2114 tz = _mm_mul_pd(fscal,dz12);
2116 /* Update vectorial force */
2117 fix1 = _mm_add_pd(fix1,tx);
2118 fiy1 = _mm_add_pd(fiy1,ty);
2119 fiz1 = _mm_add_pd(fiz1,tz);
2121 fjx2 = _mm_add_pd(fjx2,tx);
2122 fjy2 = _mm_add_pd(fjy2,ty);
2123 fjz2 = _mm_add_pd(fjz2,tz);
2125 /**************************
2126 * CALCULATE INTERACTIONS *
2127 **************************/
2129 r13 = _mm_mul_pd(rsq13,rinv13);
2131 /* Calculate table index by multiplying r with table scale and truncate to integer */
2132 rt = _mm_mul_pd(r13,vftabscale);
2133 vfitab = _mm_cvttpd_epi32(rt);
2134 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2135 vfitab = _mm_slli_epi32(vfitab,2);
2137 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2138 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2139 F = _mm_setzero_pd();
2140 GMX_MM_TRANSPOSE2_PD(Y,F);
2141 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2142 H = _mm_setzero_pd();
2143 GMX_MM_TRANSPOSE2_PD(G,H);
2144 Heps = _mm_mul_pd(vfeps,H);
2145 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2146 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2147 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
2151 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2153 /* Calculate temporary vectorial force */
2154 tx = _mm_mul_pd(fscal,dx13);
2155 ty = _mm_mul_pd(fscal,dy13);
2156 tz = _mm_mul_pd(fscal,dz13);
2158 /* Update vectorial force */
2159 fix1 = _mm_add_pd(fix1,tx);
2160 fiy1 = _mm_add_pd(fiy1,ty);
2161 fiz1 = _mm_add_pd(fiz1,tz);
2163 fjx3 = _mm_add_pd(fjx3,tx);
2164 fjy3 = _mm_add_pd(fjy3,ty);
2165 fjz3 = _mm_add_pd(fjz3,tz);
2167 /**************************
2168 * CALCULATE INTERACTIONS *
2169 **************************/
2171 r21 = _mm_mul_pd(rsq21,rinv21);
2173 /* Calculate table index by multiplying r with table scale and truncate to integer */
2174 rt = _mm_mul_pd(r21,vftabscale);
2175 vfitab = _mm_cvttpd_epi32(rt);
2176 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2177 vfitab = _mm_slli_epi32(vfitab,2);
2179 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2180 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2181 F = _mm_setzero_pd();
2182 GMX_MM_TRANSPOSE2_PD(Y,F);
2183 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2184 H = _mm_setzero_pd();
2185 GMX_MM_TRANSPOSE2_PD(G,H);
2186 Heps = _mm_mul_pd(vfeps,H);
2187 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2188 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2189 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2193 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2195 /* Calculate temporary vectorial force */
2196 tx = _mm_mul_pd(fscal,dx21);
2197 ty = _mm_mul_pd(fscal,dy21);
2198 tz = _mm_mul_pd(fscal,dz21);
2200 /* Update vectorial force */
2201 fix2 = _mm_add_pd(fix2,tx);
2202 fiy2 = _mm_add_pd(fiy2,ty);
2203 fiz2 = _mm_add_pd(fiz2,tz);
2205 fjx1 = _mm_add_pd(fjx1,tx);
2206 fjy1 = _mm_add_pd(fjy1,ty);
2207 fjz1 = _mm_add_pd(fjz1,tz);
2209 /**************************
2210 * CALCULATE INTERACTIONS *
2211 **************************/
2213 r22 = _mm_mul_pd(rsq22,rinv22);
2215 /* Calculate table index by multiplying r with table scale and truncate to integer */
2216 rt = _mm_mul_pd(r22,vftabscale);
2217 vfitab = _mm_cvttpd_epi32(rt);
2218 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2219 vfitab = _mm_slli_epi32(vfitab,2);
2221 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2222 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2223 F = _mm_setzero_pd();
2224 GMX_MM_TRANSPOSE2_PD(Y,F);
2225 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2226 H = _mm_setzero_pd();
2227 GMX_MM_TRANSPOSE2_PD(G,H);
2228 Heps = _mm_mul_pd(vfeps,H);
2229 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2230 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2231 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2235 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2237 /* Calculate temporary vectorial force */
2238 tx = _mm_mul_pd(fscal,dx22);
2239 ty = _mm_mul_pd(fscal,dy22);
2240 tz = _mm_mul_pd(fscal,dz22);
2242 /* Update vectorial force */
2243 fix2 = _mm_add_pd(fix2,tx);
2244 fiy2 = _mm_add_pd(fiy2,ty);
2245 fiz2 = _mm_add_pd(fiz2,tz);
2247 fjx2 = _mm_add_pd(fjx2,tx);
2248 fjy2 = _mm_add_pd(fjy2,ty);
2249 fjz2 = _mm_add_pd(fjz2,tz);
2251 /**************************
2252 * CALCULATE INTERACTIONS *
2253 **************************/
2255 r23 = _mm_mul_pd(rsq23,rinv23);
2257 /* Calculate table index by multiplying r with table scale and truncate to integer */
2258 rt = _mm_mul_pd(r23,vftabscale);
2259 vfitab = _mm_cvttpd_epi32(rt);
2260 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2261 vfitab = _mm_slli_epi32(vfitab,2);
2263 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2264 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2265 F = _mm_setzero_pd();
2266 GMX_MM_TRANSPOSE2_PD(Y,F);
2267 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2268 H = _mm_setzero_pd();
2269 GMX_MM_TRANSPOSE2_PD(G,H);
2270 Heps = _mm_mul_pd(vfeps,H);
2271 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2272 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2273 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
2277 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2279 /* Calculate temporary vectorial force */
2280 tx = _mm_mul_pd(fscal,dx23);
2281 ty = _mm_mul_pd(fscal,dy23);
2282 tz = _mm_mul_pd(fscal,dz23);
2284 /* Update vectorial force */
2285 fix2 = _mm_add_pd(fix2,tx);
2286 fiy2 = _mm_add_pd(fiy2,ty);
2287 fiz2 = _mm_add_pd(fiz2,tz);
2289 fjx3 = _mm_add_pd(fjx3,tx);
2290 fjy3 = _mm_add_pd(fjy3,ty);
2291 fjz3 = _mm_add_pd(fjz3,tz);
2293 /**************************
2294 * CALCULATE INTERACTIONS *
2295 **************************/
2297 r31 = _mm_mul_pd(rsq31,rinv31);
2299 /* Calculate table index by multiplying r with table scale and truncate to integer */
2300 rt = _mm_mul_pd(r31,vftabscale);
2301 vfitab = _mm_cvttpd_epi32(rt);
2302 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2303 vfitab = _mm_slli_epi32(vfitab,2);
2305 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2306 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2307 F = _mm_setzero_pd();
2308 GMX_MM_TRANSPOSE2_PD(Y,F);
2309 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2310 H = _mm_setzero_pd();
2311 GMX_MM_TRANSPOSE2_PD(G,H);
2312 Heps = _mm_mul_pd(vfeps,H);
2313 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2314 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2315 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
2319 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2321 /* Calculate temporary vectorial force */
2322 tx = _mm_mul_pd(fscal,dx31);
2323 ty = _mm_mul_pd(fscal,dy31);
2324 tz = _mm_mul_pd(fscal,dz31);
2326 /* Update vectorial force */
2327 fix3 = _mm_add_pd(fix3,tx);
2328 fiy3 = _mm_add_pd(fiy3,ty);
2329 fiz3 = _mm_add_pd(fiz3,tz);
2331 fjx1 = _mm_add_pd(fjx1,tx);
2332 fjy1 = _mm_add_pd(fjy1,ty);
2333 fjz1 = _mm_add_pd(fjz1,tz);
2335 /**************************
2336 * CALCULATE INTERACTIONS *
2337 **************************/
2339 r32 = _mm_mul_pd(rsq32,rinv32);
2341 /* Calculate table index by multiplying r with table scale and truncate to integer */
2342 rt = _mm_mul_pd(r32,vftabscale);
2343 vfitab = _mm_cvttpd_epi32(rt);
2344 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2345 vfitab = _mm_slli_epi32(vfitab,2);
2347 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2348 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2349 F = _mm_setzero_pd();
2350 GMX_MM_TRANSPOSE2_PD(Y,F);
2351 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2352 H = _mm_setzero_pd();
2353 GMX_MM_TRANSPOSE2_PD(G,H);
2354 Heps = _mm_mul_pd(vfeps,H);
2355 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2356 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2357 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
2361 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2363 /* Calculate temporary vectorial force */
2364 tx = _mm_mul_pd(fscal,dx32);
2365 ty = _mm_mul_pd(fscal,dy32);
2366 tz = _mm_mul_pd(fscal,dz32);
2368 /* Update vectorial force */
2369 fix3 = _mm_add_pd(fix3,tx);
2370 fiy3 = _mm_add_pd(fiy3,ty);
2371 fiz3 = _mm_add_pd(fiz3,tz);
2373 fjx2 = _mm_add_pd(fjx2,tx);
2374 fjy2 = _mm_add_pd(fjy2,ty);
2375 fjz2 = _mm_add_pd(fjz2,tz);
2377 /**************************
2378 * CALCULATE INTERACTIONS *
2379 **************************/
2381 r33 = _mm_mul_pd(rsq33,rinv33);
2383 /* Calculate table index by multiplying r with table scale and truncate to integer */
2384 rt = _mm_mul_pd(r33,vftabscale);
2385 vfitab = _mm_cvttpd_epi32(rt);
2386 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2387 vfitab = _mm_slli_epi32(vfitab,2);
2389 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2390 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2391 F = _mm_setzero_pd();
2392 GMX_MM_TRANSPOSE2_PD(Y,F);
2393 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2394 H = _mm_setzero_pd();
2395 GMX_MM_TRANSPOSE2_PD(G,H);
2396 Heps = _mm_mul_pd(vfeps,H);
2397 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2398 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2399 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
2403 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2405 /* Calculate temporary vectorial force */
2406 tx = _mm_mul_pd(fscal,dx33);
2407 ty = _mm_mul_pd(fscal,dy33);
2408 tz = _mm_mul_pd(fscal,dz33);
2410 /* Update vectorial force */
2411 fix3 = _mm_add_pd(fix3,tx);
2412 fiy3 = _mm_add_pd(fiy3,ty);
2413 fiz3 = _mm_add_pd(fiz3,tz);
2415 fjx3 = _mm_add_pd(fjx3,tx);
2416 fjy3 = _mm_add_pd(fjy3,ty);
2417 fjz3 = _mm_add_pd(fjz3,tz);
2419 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2421 /* Inner loop uses 381 flops */
2424 /* End of innermost loop */
2426 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2427 f+i_coord_offset,fshift+i_shift_offset);
2429 /* Increment number of inner iterations */
2430 inneriter += j_index_end - j_index_start;
2432 /* Outer loop uses 24 flops */
2435 /* Increment number of outer iterations */
2438 /* Update outer/inner flops */
2440 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*381);