2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the two positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85 int vdwjidx0A,vdwjidx0B;
86 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
87 int vdwjidx1A,vdwjidx1B;
88 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
89 int vdwjidx2A,vdwjidx2B;
90 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
91 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
92 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
93 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
94 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
96 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
97 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
99 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
100 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
103 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
106 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
107 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
109 __m128i ifour = _mm_set1_epi32(4);
110 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
112 __m128d dummy_mask,cutoff_mask;
113 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
114 __m128d one = _mm_set1_pd(1.0);
115 __m128d two = _mm_set1_pd(2.0);
121 jindex = nlist->jindex;
123 shiftidx = nlist->shift;
125 shiftvec = fr->shift_vec[0];
126 fshift = fr->fshift[0];
127 facel = _mm_set1_pd(fr->ic->epsfac);
128 charge = mdatoms->chargeA;
129 nvdwtype = fr->ntype;
131 vdwtype = mdatoms->typeA;
133 vftab = kernel_data->table_elec_vdw->data;
134 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
136 /* Setup water-specific parameters */
137 inr = nlist->iinr[0];
138 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
139 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
140 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
141 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
143 jq0 = _mm_set1_pd(charge[inr+0]);
144 jq1 = _mm_set1_pd(charge[inr+1]);
145 jq2 = _mm_set1_pd(charge[inr+2]);
146 vdwjidx0A = 2*vdwtype[inr+0];
147 qq00 = _mm_mul_pd(iq0,jq0);
148 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
149 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
150 qq01 = _mm_mul_pd(iq0,jq1);
151 qq02 = _mm_mul_pd(iq0,jq2);
152 qq10 = _mm_mul_pd(iq1,jq0);
153 qq11 = _mm_mul_pd(iq1,jq1);
154 qq12 = _mm_mul_pd(iq1,jq2);
155 qq20 = _mm_mul_pd(iq2,jq0);
156 qq21 = _mm_mul_pd(iq2,jq1);
157 qq22 = _mm_mul_pd(iq2,jq2);
159 /* Avoid stupid compiler warnings */
167 /* Start outer loop over neighborlists */
168 for(iidx=0; iidx<nri; iidx++)
170 /* Load shift vector for this list */
171 i_shift_offset = DIM*shiftidx[iidx];
173 /* Load limits for loop over neighbors */
174 j_index_start = jindex[iidx];
175 j_index_end = jindex[iidx+1];
177 /* Get outer coordinate index */
179 i_coord_offset = DIM*inr;
181 /* Load i particle coords and add shift vector */
182 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
183 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
185 fix0 = _mm_setzero_pd();
186 fiy0 = _mm_setzero_pd();
187 fiz0 = _mm_setzero_pd();
188 fix1 = _mm_setzero_pd();
189 fiy1 = _mm_setzero_pd();
190 fiz1 = _mm_setzero_pd();
191 fix2 = _mm_setzero_pd();
192 fiy2 = _mm_setzero_pd();
193 fiz2 = _mm_setzero_pd();
195 /* Reset potential sums */
196 velecsum = _mm_setzero_pd();
197 vvdwsum = _mm_setzero_pd();
199 /* Start inner kernel loop */
200 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
203 /* Get j neighbor index, and coordinate index */
206 j_coord_offsetA = DIM*jnrA;
207 j_coord_offsetB = DIM*jnrB;
209 /* load j atom coordinates */
210 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
211 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
213 /* Calculate displacement vector */
214 dx00 = _mm_sub_pd(ix0,jx0);
215 dy00 = _mm_sub_pd(iy0,jy0);
216 dz00 = _mm_sub_pd(iz0,jz0);
217 dx01 = _mm_sub_pd(ix0,jx1);
218 dy01 = _mm_sub_pd(iy0,jy1);
219 dz01 = _mm_sub_pd(iz0,jz1);
220 dx02 = _mm_sub_pd(ix0,jx2);
221 dy02 = _mm_sub_pd(iy0,jy2);
222 dz02 = _mm_sub_pd(iz0,jz2);
223 dx10 = _mm_sub_pd(ix1,jx0);
224 dy10 = _mm_sub_pd(iy1,jy0);
225 dz10 = _mm_sub_pd(iz1,jz0);
226 dx11 = _mm_sub_pd(ix1,jx1);
227 dy11 = _mm_sub_pd(iy1,jy1);
228 dz11 = _mm_sub_pd(iz1,jz1);
229 dx12 = _mm_sub_pd(ix1,jx2);
230 dy12 = _mm_sub_pd(iy1,jy2);
231 dz12 = _mm_sub_pd(iz1,jz2);
232 dx20 = _mm_sub_pd(ix2,jx0);
233 dy20 = _mm_sub_pd(iy2,jy0);
234 dz20 = _mm_sub_pd(iz2,jz0);
235 dx21 = _mm_sub_pd(ix2,jx1);
236 dy21 = _mm_sub_pd(iy2,jy1);
237 dz21 = _mm_sub_pd(iz2,jz1);
238 dx22 = _mm_sub_pd(ix2,jx2);
239 dy22 = _mm_sub_pd(iy2,jy2);
240 dz22 = _mm_sub_pd(iz2,jz2);
242 /* Calculate squared distance and things based on it */
243 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
244 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
245 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
246 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
247 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
248 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
249 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
250 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
251 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
253 rinv00 = sse2_invsqrt_d(rsq00);
254 rinv01 = sse2_invsqrt_d(rsq01);
255 rinv02 = sse2_invsqrt_d(rsq02);
256 rinv10 = sse2_invsqrt_d(rsq10);
257 rinv11 = sse2_invsqrt_d(rsq11);
258 rinv12 = sse2_invsqrt_d(rsq12);
259 rinv20 = sse2_invsqrt_d(rsq20);
260 rinv21 = sse2_invsqrt_d(rsq21);
261 rinv22 = sse2_invsqrt_d(rsq22);
263 fjx0 = _mm_setzero_pd();
264 fjy0 = _mm_setzero_pd();
265 fjz0 = _mm_setzero_pd();
266 fjx1 = _mm_setzero_pd();
267 fjy1 = _mm_setzero_pd();
268 fjz1 = _mm_setzero_pd();
269 fjx2 = _mm_setzero_pd();
270 fjy2 = _mm_setzero_pd();
271 fjz2 = _mm_setzero_pd();
273 /**************************
274 * CALCULATE INTERACTIONS *
275 **************************/
277 r00 = _mm_mul_pd(rsq00,rinv00);
279 /* Calculate table index by multiplying r with table scale and truncate to integer */
280 rt = _mm_mul_pd(r00,vftabscale);
281 vfitab = _mm_cvttpd_epi32(rt);
282 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
283 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
285 /* CUBIC SPLINE TABLE ELECTROSTATICS */
286 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
287 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
288 GMX_MM_TRANSPOSE2_PD(Y,F);
289 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
290 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
291 GMX_MM_TRANSPOSE2_PD(G,H);
292 Heps = _mm_mul_pd(vfeps,H);
293 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
294 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
295 velec = _mm_mul_pd(qq00,VV);
296 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
297 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
299 /* CUBIC SPLINE TABLE DISPERSION */
300 vfitab = _mm_add_epi32(vfitab,ifour);
301 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
302 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
303 GMX_MM_TRANSPOSE2_PD(Y,F);
304 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
305 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
306 GMX_MM_TRANSPOSE2_PD(G,H);
307 Heps = _mm_mul_pd(vfeps,H);
308 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
309 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
310 vvdw6 = _mm_mul_pd(c6_00,VV);
311 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
312 fvdw6 = _mm_mul_pd(c6_00,FF);
314 /* CUBIC SPLINE TABLE REPULSION */
315 vfitab = _mm_add_epi32(vfitab,ifour);
316 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
317 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
318 GMX_MM_TRANSPOSE2_PD(Y,F);
319 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
320 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
321 GMX_MM_TRANSPOSE2_PD(G,H);
322 Heps = _mm_mul_pd(vfeps,H);
323 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
324 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
325 vvdw12 = _mm_mul_pd(c12_00,VV);
326 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
327 fvdw12 = _mm_mul_pd(c12_00,FF);
328 vvdw = _mm_add_pd(vvdw12,vvdw6);
329 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
331 /* Update potential sum for this i atom from the interaction with this j atom. */
332 velecsum = _mm_add_pd(velecsum,velec);
333 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
335 fscal = _mm_add_pd(felec,fvdw);
337 /* Calculate temporary vectorial force */
338 tx = _mm_mul_pd(fscal,dx00);
339 ty = _mm_mul_pd(fscal,dy00);
340 tz = _mm_mul_pd(fscal,dz00);
342 /* Update vectorial force */
343 fix0 = _mm_add_pd(fix0,tx);
344 fiy0 = _mm_add_pd(fiy0,ty);
345 fiz0 = _mm_add_pd(fiz0,tz);
347 fjx0 = _mm_add_pd(fjx0,tx);
348 fjy0 = _mm_add_pd(fjy0,ty);
349 fjz0 = _mm_add_pd(fjz0,tz);
351 /**************************
352 * CALCULATE INTERACTIONS *
353 **************************/
355 r01 = _mm_mul_pd(rsq01,rinv01);
357 /* Calculate table index by multiplying r with table scale and truncate to integer */
358 rt = _mm_mul_pd(r01,vftabscale);
359 vfitab = _mm_cvttpd_epi32(rt);
360 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
361 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
363 /* CUBIC SPLINE TABLE ELECTROSTATICS */
364 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
365 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
366 GMX_MM_TRANSPOSE2_PD(Y,F);
367 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
368 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
369 GMX_MM_TRANSPOSE2_PD(G,H);
370 Heps = _mm_mul_pd(vfeps,H);
371 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
372 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
373 velec = _mm_mul_pd(qq01,VV);
374 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
375 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
377 /* Update potential sum for this i atom from the interaction with this j atom. */
378 velecsum = _mm_add_pd(velecsum,velec);
382 /* Calculate temporary vectorial force */
383 tx = _mm_mul_pd(fscal,dx01);
384 ty = _mm_mul_pd(fscal,dy01);
385 tz = _mm_mul_pd(fscal,dz01);
387 /* Update vectorial force */
388 fix0 = _mm_add_pd(fix0,tx);
389 fiy0 = _mm_add_pd(fiy0,ty);
390 fiz0 = _mm_add_pd(fiz0,tz);
392 fjx1 = _mm_add_pd(fjx1,tx);
393 fjy1 = _mm_add_pd(fjy1,ty);
394 fjz1 = _mm_add_pd(fjz1,tz);
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
400 r02 = _mm_mul_pd(rsq02,rinv02);
402 /* Calculate table index by multiplying r with table scale and truncate to integer */
403 rt = _mm_mul_pd(r02,vftabscale);
404 vfitab = _mm_cvttpd_epi32(rt);
405 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
406 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
408 /* CUBIC SPLINE TABLE ELECTROSTATICS */
409 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
410 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
411 GMX_MM_TRANSPOSE2_PD(Y,F);
412 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
413 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
414 GMX_MM_TRANSPOSE2_PD(G,H);
415 Heps = _mm_mul_pd(vfeps,H);
416 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
417 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
418 velec = _mm_mul_pd(qq02,VV);
419 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
420 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
422 /* Update potential sum for this i atom from the interaction with this j atom. */
423 velecsum = _mm_add_pd(velecsum,velec);
427 /* Calculate temporary vectorial force */
428 tx = _mm_mul_pd(fscal,dx02);
429 ty = _mm_mul_pd(fscal,dy02);
430 tz = _mm_mul_pd(fscal,dz02);
432 /* Update vectorial force */
433 fix0 = _mm_add_pd(fix0,tx);
434 fiy0 = _mm_add_pd(fiy0,ty);
435 fiz0 = _mm_add_pd(fiz0,tz);
437 fjx2 = _mm_add_pd(fjx2,tx);
438 fjy2 = _mm_add_pd(fjy2,ty);
439 fjz2 = _mm_add_pd(fjz2,tz);
441 /**************************
442 * CALCULATE INTERACTIONS *
443 **************************/
445 r10 = _mm_mul_pd(rsq10,rinv10);
447 /* Calculate table index by multiplying r with table scale and truncate to integer */
448 rt = _mm_mul_pd(r10,vftabscale);
449 vfitab = _mm_cvttpd_epi32(rt);
450 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
451 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
453 /* CUBIC SPLINE TABLE ELECTROSTATICS */
454 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
455 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
456 GMX_MM_TRANSPOSE2_PD(Y,F);
457 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
458 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
459 GMX_MM_TRANSPOSE2_PD(G,H);
460 Heps = _mm_mul_pd(vfeps,H);
461 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
462 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
463 velec = _mm_mul_pd(qq10,VV);
464 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
465 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
467 /* Update potential sum for this i atom from the interaction with this j atom. */
468 velecsum = _mm_add_pd(velecsum,velec);
472 /* Calculate temporary vectorial force */
473 tx = _mm_mul_pd(fscal,dx10);
474 ty = _mm_mul_pd(fscal,dy10);
475 tz = _mm_mul_pd(fscal,dz10);
477 /* Update vectorial force */
478 fix1 = _mm_add_pd(fix1,tx);
479 fiy1 = _mm_add_pd(fiy1,ty);
480 fiz1 = _mm_add_pd(fiz1,tz);
482 fjx0 = _mm_add_pd(fjx0,tx);
483 fjy0 = _mm_add_pd(fjy0,ty);
484 fjz0 = _mm_add_pd(fjz0,tz);
486 /**************************
487 * CALCULATE INTERACTIONS *
488 **************************/
490 r11 = _mm_mul_pd(rsq11,rinv11);
492 /* Calculate table index by multiplying r with table scale and truncate to integer */
493 rt = _mm_mul_pd(r11,vftabscale);
494 vfitab = _mm_cvttpd_epi32(rt);
495 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
496 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
498 /* CUBIC SPLINE TABLE ELECTROSTATICS */
499 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
500 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
501 GMX_MM_TRANSPOSE2_PD(Y,F);
502 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
503 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
504 GMX_MM_TRANSPOSE2_PD(G,H);
505 Heps = _mm_mul_pd(vfeps,H);
506 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
507 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
508 velec = _mm_mul_pd(qq11,VV);
509 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
510 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
512 /* Update potential sum for this i atom from the interaction with this j atom. */
513 velecsum = _mm_add_pd(velecsum,velec);
517 /* Calculate temporary vectorial force */
518 tx = _mm_mul_pd(fscal,dx11);
519 ty = _mm_mul_pd(fscal,dy11);
520 tz = _mm_mul_pd(fscal,dz11);
522 /* Update vectorial force */
523 fix1 = _mm_add_pd(fix1,tx);
524 fiy1 = _mm_add_pd(fiy1,ty);
525 fiz1 = _mm_add_pd(fiz1,tz);
527 fjx1 = _mm_add_pd(fjx1,tx);
528 fjy1 = _mm_add_pd(fjy1,ty);
529 fjz1 = _mm_add_pd(fjz1,tz);
531 /**************************
532 * CALCULATE INTERACTIONS *
533 **************************/
535 r12 = _mm_mul_pd(rsq12,rinv12);
537 /* Calculate table index by multiplying r with table scale and truncate to integer */
538 rt = _mm_mul_pd(r12,vftabscale);
539 vfitab = _mm_cvttpd_epi32(rt);
540 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
541 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
543 /* CUBIC SPLINE TABLE ELECTROSTATICS */
544 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
545 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
546 GMX_MM_TRANSPOSE2_PD(Y,F);
547 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
548 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
549 GMX_MM_TRANSPOSE2_PD(G,H);
550 Heps = _mm_mul_pd(vfeps,H);
551 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
552 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
553 velec = _mm_mul_pd(qq12,VV);
554 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
555 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
557 /* Update potential sum for this i atom from the interaction with this j atom. */
558 velecsum = _mm_add_pd(velecsum,velec);
562 /* Calculate temporary vectorial force */
563 tx = _mm_mul_pd(fscal,dx12);
564 ty = _mm_mul_pd(fscal,dy12);
565 tz = _mm_mul_pd(fscal,dz12);
567 /* Update vectorial force */
568 fix1 = _mm_add_pd(fix1,tx);
569 fiy1 = _mm_add_pd(fiy1,ty);
570 fiz1 = _mm_add_pd(fiz1,tz);
572 fjx2 = _mm_add_pd(fjx2,tx);
573 fjy2 = _mm_add_pd(fjy2,ty);
574 fjz2 = _mm_add_pd(fjz2,tz);
576 /**************************
577 * CALCULATE INTERACTIONS *
578 **************************/
580 r20 = _mm_mul_pd(rsq20,rinv20);
582 /* Calculate table index by multiplying r with table scale and truncate to integer */
583 rt = _mm_mul_pd(r20,vftabscale);
584 vfitab = _mm_cvttpd_epi32(rt);
585 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
586 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
588 /* CUBIC SPLINE TABLE ELECTROSTATICS */
589 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
590 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
591 GMX_MM_TRANSPOSE2_PD(Y,F);
592 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
593 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
594 GMX_MM_TRANSPOSE2_PD(G,H);
595 Heps = _mm_mul_pd(vfeps,H);
596 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
597 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
598 velec = _mm_mul_pd(qq20,VV);
599 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
600 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
602 /* Update potential sum for this i atom from the interaction with this j atom. */
603 velecsum = _mm_add_pd(velecsum,velec);
607 /* Calculate temporary vectorial force */
608 tx = _mm_mul_pd(fscal,dx20);
609 ty = _mm_mul_pd(fscal,dy20);
610 tz = _mm_mul_pd(fscal,dz20);
612 /* Update vectorial force */
613 fix2 = _mm_add_pd(fix2,tx);
614 fiy2 = _mm_add_pd(fiy2,ty);
615 fiz2 = _mm_add_pd(fiz2,tz);
617 fjx0 = _mm_add_pd(fjx0,tx);
618 fjy0 = _mm_add_pd(fjy0,ty);
619 fjz0 = _mm_add_pd(fjz0,tz);
621 /**************************
622 * CALCULATE INTERACTIONS *
623 **************************/
625 r21 = _mm_mul_pd(rsq21,rinv21);
627 /* Calculate table index by multiplying r with table scale and truncate to integer */
628 rt = _mm_mul_pd(r21,vftabscale);
629 vfitab = _mm_cvttpd_epi32(rt);
630 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
631 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
633 /* CUBIC SPLINE TABLE ELECTROSTATICS */
634 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
635 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
636 GMX_MM_TRANSPOSE2_PD(Y,F);
637 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
638 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
639 GMX_MM_TRANSPOSE2_PD(G,H);
640 Heps = _mm_mul_pd(vfeps,H);
641 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
642 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
643 velec = _mm_mul_pd(qq21,VV);
644 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
645 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
647 /* Update potential sum for this i atom from the interaction with this j atom. */
648 velecsum = _mm_add_pd(velecsum,velec);
652 /* Calculate temporary vectorial force */
653 tx = _mm_mul_pd(fscal,dx21);
654 ty = _mm_mul_pd(fscal,dy21);
655 tz = _mm_mul_pd(fscal,dz21);
657 /* Update vectorial force */
658 fix2 = _mm_add_pd(fix2,tx);
659 fiy2 = _mm_add_pd(fiy2,ty);
660 fiz2 = _mm_add_pd(fiz2,tz);
662 fjx1 = _mm_add_pd(fjx1,tx);
663 fjy1 = _mm_add_pd(fjy1,ty);
664 fjz1 = _mm_add_pd(fjz1,tz);
666 /**************************
667 * CALCULATE INTERACTIONS *
668 **************************/
670 r22 = _mm_mul_pd(rsq22,rinv22);
672 /* Calculate table index by multiplying r with table scale and truncate to integer */
673 rt = _mm_mul_pd(r22,vftabscale);
674 vfitab = _mm_cvttpd_epi32(rt);
675 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
676 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
678 /* CUBIC SPLINE TABLE ELECTROSTATICS */
679 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
680 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
681 GMX_MM_TRANSPOSE2_PD(Y,F);
682 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
683 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
684 GMX_MM_TRANSPOSE2_PD(G,H);
685 Heps = _mm_mul_pd(vfeps,H);
686 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
687 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
688 velec = _mm_mul_pd(qq22,VV);
689 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
690 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
692 /* Update potential sum for this i atom from the interaction with this j atom. */
693 velecsum = _mm_add_pd(velecsum,velec);
697 /* Calculate temporary vectorial force */
698 tx = _mm_mul_pd(fscal,dx22);
699 ty = _mm_mul_pd(fscal,dy22);
700 tz = _mm_mul_pd(fscal,dz22);
702 /* Update vectorial force */
703 fix2 = _mm_add_pd(fix2,tx);
704 fiy2 = _mm_add_pd(fiy2,ty);
705 fiz2 = _mm_add_pd(fiz2,tz);
707 fjx2 = _mm_add_pd(fjx2,tx);
708 fjy2 = _mm_add_pd(fjy2,ty);
709 fjz2 = _mm_add_pd(fjz2,tz);
711 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
713 /* Inner loop uses 417 flops */
720 j_coord_offsetA = DIM*jnrA;
722 /* load j atom coordinates */
723 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
724 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
726 /* Calculate displacement vector */
727 dx00 = _mm_sub_pd(ix0,jx0);
728 dy00 = _mm_sub_pd(iy0,jy0);
729 dz00 = _mm_sub_pd(iz0,jz0);
730 dx01 = _mm_sub_pd(ix0,jx1);
731 dy01 = _mm_sub_pd(iy0,jy1);
732 dz01 = _mm_sub_pd(iz0,jz1);
733 dx02 = _mm_sub_pd(ix0,jx2);
734 dy02 = _mm_sub_pd(iy0,jy2);
735 dz02 = _mm_sub_pd(iz0,jz2);
736 dx10 = _mm_sub_pd(ix1,jx0);
737 dy10 = _mm_sub_pd(iy1,jy0);
738 dz10 = _mm_sub_pd(iz1,jz0);
739 dx11 = _mm_sub_pd(ix1,jx1);
740 dy11 = _mm_sub_pd(iy1,jy1);
741 dz11 = _mm_sub_pd(iz1,jz1);
742 dx12 = _mm_sub_pd(ix1,jx2);
743 dy12 = _mm_sub_pd(iy1,jy2);
744 dz12 = _mm_sub_pd(iz1,jz2);
745 dx20 = _mm_sub_pd(ix2,jx0);
746 dy20 = _mm_sub_pd(iy2,jy0);
747 dz20 = _mm_sub_pd(iz2,jz0);
748 dx21 = _mm_sub_pd(ix2,jx1);
749 dy21 = _mm_sub_pd(iy2,jy1);
750 dz21 = _mm_sub_pd(iz2,jz1);
751 dx22 = _mm_sub_pd(ix2,jx2);
752 dy22 = _mm_sub_pd(iy2,jy2);
753 dz22 = _mm_sub_pd(iz2,jz2);
755 /* Calculate squared distance and things based on it */
756 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
757 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
758 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
759 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
760 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
761 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
762 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
763 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
764 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
766 rinv00 = sse2_invsqrt_d(rsq00);
767 rinv01 = sse2_invsqrt_d(rsq01);
768 rinv02 = sse2_invsqrt_d(rsq02);
769 rinv10 = sse2_invsqrt_d(rsq10);
770 rinv11 = sse2_invsqrt_d(rsq11);
771 rinv12 = sse2_invsqrt_d(rsq12);
772 rinv20 = sse2_invsqrt_d(rsq20);
773 rinv21 = sse2_invsqrt_d(rsq21);
774 rinv22 = sse2_invsqrt_d(rsq22);
776 fjx0 = _mm_setzero_pd();
777 fjy0 = _mm_setzero_pd();
778 fjz0 = _mm_setzero_pd();
779 fjx1 = _mm_setzero_pd();
780 fjy1 = _mm_setzero_pd();
781 fjz1 = _mm_setzero_pd();
782 fjx2 = _mm_setzero_pd();
783 fjy2 = _mm_setzero_pd();
784 fjz2 = _mm_setzero_pd();
786 /**************************
787 * CALCULATE INTERACTIONS *
788 **************************/
790 r00 = _mm_mul_pd(rsq00,rinv00);
792 /* Calculate table index by multiplying r with table scale and truncate to integer */
793 rt = _mm_mul_pd(r00,vftabscale);
794 vfitab = _mm_cvttpd_epi32(rt);
795 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
796 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
798 /* CUBIC SPLINE TABLE ELECTROSTATICS */
799 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
800 F = _mm_setzero_pd();
801 GMX_MM_TRANSPOSE2_PD(Y,F);
802 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
803 H = _mm_setzero_pd();
804 GMX_MM_TRANSPOSE2_PD(G,H);
805 Heps = _mm_mul_pd(vfeps,H);
806 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
807 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
808 velec = _mm_mul_pd(qq00,VV);
809 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
810 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
812 /* CUBIC SPLINE TABLE DISPERSION */
813 vfitab = _mm_add_epi32(vfitab,ifour);
814 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
815 F = _mm_setzero_pd();
816 GMX_MM_TRANSPOSE2_PD(Y,F);
817 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
818 H = _mm_setzero_pd();
819 GMX_MM_TRANSPOSE2_PD(G,H);
820 Heps = _mm_mul_pd(vfeps,H);
821 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
822 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
823 vvdw6 = _mm_mul_pd(c6_00,VV);
824 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
825 fvdw6 = _mm_mul_pd(c6_00,FF);
827 /* CUBIC SPLINE TABLE REPULSION */
828 vfitab = _mm_add_epi32(vfitab,ifour);
829 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
830 F = _mm_setzero_pd();
831 GMX_MM_TRANSPOSE2_PD(Y,F);
832 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
833 H = _mm_setzero_pd();
834 GMX_MM_TRANSPOSE2_PD(G,H);
835 Heps = _mm_mul_pd(vfeps,H);
836 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
837 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
838 vvdw12 = _mm_mul_pd(c12_00,VV);
839 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
840 fvdw12 = _mm_mul_pd(c12_00,FF);
841 vvdw = _mm_add_pd(vvdw12,vvdw6);
842 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
844 /* Update potential sum for this i atom from the interaction with this j atom. */
845 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
846 velecsum = _mm_add_pd(velecsum,velec);
847 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
848 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
850 fscal = _mm_add_pd(felec,fvdw);
852 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
854 /* Calculate temporary vectorial force */
855 tx = _mm_mul_pd(fscal,dx00);
856 ty = _mm_mul_pd(fscal,dy00);
857 tz = _mm_mul_pd(fscal,dz00);
859 /* Update vectorial force */
860 fix0 = _mm_add_pd(fix0,tx);
861 fiy0 = _mm_add_pd(fiy0,ty);
862 fiz0 = _mm_add_pd(fiz0,tz);
864 fjx0 = _mm_add_pd(fjx0,tx);
865 fjy0 = _mm_add_pd(fjy0,ty);
866 fjz0 = _mm_add_pd(fjz0,tz);
868 /**************************
869 * CALCULATE INTERACTIONS *
870 **************************/
872 r01 = _mm_mul_pd(rsq01,rinv01);
874 /* Calculate table index by multiplying r with table scale and truncate to integer */
875 rt = _mm_mul_pd(r01,vftabscale);
876 vfitab = _mm_cvttpd_epi32(rt);
877 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
878 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
880 /* CUBIC SPLINE TABLE ELECTROSTATICS */
881 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
882 F = _mm_setzero_pd();
883 GMX_MM_TRANSPOSE2_PD(Y,F);
884 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
885 H = _mm_setzero_pd();
886 GMX_MM_TRANSPOSE2_PD(G,H);
887 Heps = _mm_mul_pd(vfeps,H);
888 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
889 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
890 velec = _mm_mul_pd(qq01,VV);
891 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
892 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
894 /* Update potential sum for this i atom from the interaction with this j atom. */
895 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
896 velecsum = _mm_add_pd(velecsum,velec);
900 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
902 /* Calculate temporary vectorial force */
903 tx = _mm_mul_pd(fscal,dx01);
904 ty = _mm_mul_pd(fscal,dy01);
905 tz = _mm_mul_pd(fscal,dz01);
907 /* Update vectorial force */
908 fix0 = _mm_add_pd(fix0,tx);
909 fiy0 = _mm_add_pd(fiy0,ty);
910 fiz0 = _mm_add_pd(fiz0,tz);
912 fjx1 = _mm_add_pd(fjx1,tx);
913 fjy1 = _mm_add_pd(fjy1,ty);
914 fjz1 = _mm_add_pd(fjz1,tz);
916 /**************************
917 * CALCULATE INTERACTIONS *
918 **************************/
920 r02 = _mm_mul_pd(rsq02,rinv02);
922 /* Calculate table index by multiplying r with table scale and truncate to integer */
923 rt = _mm_mul_pd(r02,vftabscale);
924 vfitab = _mm_cvttpd_epi32(rt);
925 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
926 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
928 /* CUBIC SPLINE TABLE ELECTROSTATICS */
929 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
930 F = _mm_setzero_pd();
931 GMX_MM_TRANSPOSE2_PD(Y,F);
932 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
933 H = _mm_setzero_pd();
934 GMX_MM_TRANSPOSE2_PD(G,H);
935 Heps = _mm_mul_pd(vfeps,H);
936 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
937 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
938 velec = _mm_mul_pd(qq02,VV);
939 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
940 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
942 /* Update potential sum for this i atom from the interaction with this j atom. */
943 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
944 velecsum = _mm_add_pd(velecsum,velec);
948 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
950 /* Calculate temporary vectorial force */
951 tx = _mm_mul_pd(fscal,dx02);
952 ty = _mm_mul_pd(fscal,dy02);
953 tz = _mm_mul_pd(fscal,dz02);
955 /* Update vectorial force */
956 fix0 = _mm_add_pd(fix0,tx);
957 fiy0 = _mm_add_pd(fiy0,ty);
958 fiz0 = _mm_add_pd(fiz0,tz);
960 fjx2 = _mm_add_pd(fjx2,tx);
961 fjy2 = _mm_add_pd(fjy2,ty);
962 fjz2 = _mm_add_pd(fjz2,tz);
964 /**************************
965 * CALCULATE INTERACTIONS *
966 **************************/
968 r10 = _mm_mul_pd(rsq10,rinv10);
970 /* Calculate table index by multiplying r with table scale and truncate to integer */
971 rt = _mm_mul_pd(r10,vftabscale);
972 vfitab = _mm_cvttpd_epi32(rt);
973 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
974 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
976 /* CUBIC SPLINE TABLE ELECTROSTATICS */
977 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
978 F = _mm_setzero_pd();
979 GMX_MM_TRANSPOSE2_PD(Y,F);
980 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
981 H = _mm_setzero_pd();
982 GMX_MM_TRANSPOSE2_PD(G,H);
983 Heps = _mm_mul_pd(vfeps,H);
984 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
985 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
986 velec = _mm_mul_pd(qq10,VV);
987 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
988 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
990 /* Update potential sum for this i atom from the interaction with this j atom. */
991 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
992 velecsum = _mm_add_pd(velecsum,velec);
996 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
998 /* Calculate temporary vectorial force */
999 tx = _mm_mul_pd(fscal,dx10);
1000 ty = _mm_mul_pd(fscal,dy10);
1001 tz = _mm_mul_pd(fscal,dz10);
1003 /* Update vectorial force */
1004 fix1 = _mm_add_pd(fix1,tx);
1005 fiy1 = _mm_add_pd(fiy1,ty);
1006 fiz1 = _mm_add_pd(fiz1,tz);
1008 fjx0 = _mm_add_pd(fjx0,tx);
1009 fjy0 = _mm_add_pd(fjy0,ty);
1010 fjz0 = _mm_add_pd(fjz0,tz);
1012 /**************************
1013 * CALCULATE INTERACTIONS *
1014 **************************/
1016 r11 = _mm_mul_pd(rsq11,rinv11);
1018 /* Calculate table index by multiplying r with table scale and truncate to integer */
1019 rt = _mm_mul_pd(r11,vftabscale);
1020 vfitab = _mm_cvttpd_epi32(rt);
1021 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1022 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1024 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1025 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1026 F = _mm_setzero_pd();
1027 GMX_MM_TRANSPOSE2_PD(Y,F);
1028 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1029 H = _mm_setzero_pd();
1030 GMX_MM_TRANSPOSE2_PD(G,H);
1031 Heps = _mm_mul_pd(vfeps,H);
1032 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1033 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1034 velec = _mm_mul_pd(qq11,VV);
1035 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1036 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1038 /* Update potential sum for this i atom from the interaction with this j atom. */
1039 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1040 velecsum = _mm_add_pd(velecsum,velec);
1044 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1046 /* Calculate temporary vectorial force */
1047 tx = _mm_mul_pd(fscal,dx11);
1048 ty = _mm_mul_pd(fscal,dy11);
1049 tz = _mm_mul_pd(fscal,dz11);
1051 /* Update vectorial force */
1052 fix1 = _mm_add_pd(fix1,tx);
1053 fiy1 = _mm_add_pd(fiy1,ty);
1054 fiz1 = _mm_add_pd(fiz1,tz);
1056 fjx1 = _mm_add_pd(fjx1,tx);
1057 fjy1 = _mm_add_pd(fjy1,ty);
1058 fjz1 = _mm_add_pd(fjz1,tz);
1060 /**************************
1061 * CALCULATE INTERACTIONS *
1062 **************************/
1064 r12 = _mm_mul_pd(rsq12,rinv12);
1066 /* Calculate table index by multiplying r with table scale and truncate to integer */
1067 rt = _mm_mul_pd(r12,vftabscale);
1068 vfitab = _mm_cvttpd_epi32(rt);
1069 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1070 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1072 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1073 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1074 F = _mm_setzero_pd();
1075 GMX_MM_TRANSPOSE2_PD(Y,F);
1076 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1077 H = _mm_setzero_pd();
1078 GMX_MM_TRANSPOSE2_PD(G,H);
1079 Heps = _mm_mul_pd(vfeps,H);
1080 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1081 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1082 velec = _mm_mul_pd(qq12,VV);
1083 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1084 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1086 /* Update potential sum for this i atom from the interaction with this j atom. */
1087 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1088 velecsum = _mm_add_pd(velecsum,velec);
1092 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1094 /* Calculate temporary vectorial force */
1095 tx = _mm_mul_pd(fscal,dx12);
1096 ty = _mm_mul_pd(fscal,dy12);
1097 tz = _mm_mul_pd(fscal,dz12);
1099 /* Update vectorial force */
1100 fix1 = _mm_add_pd(fix1,tx);
1101 fiy1 = _mm_add_pd(fiy1,ty);
1102 fiz1 = _mm_add_pd(fiz1,tz);
1104 fjx2 = _mm_add_pd(fjx2,tx);
1105 fjy2 = _mm_add_pd(fjy2,ty);
1106 fjz2 = _mm_add_pd(fjz2,tz);
1108 /**************************
1109 * CALCULATE INTERACTIONS *
1110 **************************/
1112 r20 = _mm_mul_pd(rsq20,rinv20);
1114 /* Calculate table index by multiplying r with table scale and truncate to integer */
1115 rt = _mm_mul_pd(r20,vftabscale);
1116 vfitab = _mm_cvttpd_epi32(rt);
1117 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1118 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1120 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1121 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1122 F = _mm_setzero_pd();
1123 GMX_MM_TRANSPOSE2_PD(Y,F);
1124 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1125 H = _mm_setzero_pd();
1126 GMX_MM_TRANSPOSE2_PD(G,H);
1127 Heps = _mm_mul_pd(vfeps,H);
1128 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1129 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1130 velec = _mm_mul_pd(qq20,VV);
1131 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1132 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1134 /* Update potential sum for this i atom from the interaction with this j atom. */
1135 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1136 velecsum = _mm_add_pd(velecsum,velec);
1140 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1142 /* Calculate temporary vectorial force */
1143 tx = _mm_mul_pd(fscal,dx20);
1144 ty = _mm_mul_pd(fscal,dy20);
1145 tz = _mm_mul_pd(fscal,dz20);
1147 /* Update vectorial force */
1148 fix2 = _mm_add_pd(fix2,tx);
1149 fiy2 = _mm_add_pd(fiy2,ty);
1150 fiz2 = _mm_add_pd(fiz2,tz);
1152 fjx0 = _mm_add_pd(fjx0,tx);
1153 fjy0 = _mm_add_pd(fjy0,ty);
1154 fjz0 = _mm_add_pd(fjz0,tz);
1156 /**************************
1157 * CALCULATE INTERACTIONS *
1158 **************************/
1160 r21 = _mm_mul_pd(rsq21,rinv21);
1162 /* Calculate table index by multiplying r with table scale and truncate to integer */
1163 rt = _mm_mul_pd(r21,vftabscale);
1164 vfitab = _mm_cvttpd_epi32(rt);
1165 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1166 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1168 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1169 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1170 F = _mm_setzero_pd();
1171 GMX_MM_TRANSPOSE2_PD(Y,F);
1172 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1173 H = _mm_setzero_pd();
1174 GMX_MM_TRANSPOSE2_PD(G,H);
1175 Heps = _mm_mul_pd(vfeps,H);
1176 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1177 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1178 velec = _mm_mul_pd(qq21,VV);
1179 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1180 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1182 /* Update potential sum for this i atom from the interaction with this j atom. */
1183 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1184 velecsum = _mm_add_pd(velecsum,velec);
1188 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1190 /* Calculate temporary vectorial force */
1191 tx = _mm_mul_pd(fscal,dx21);
1192 ty = _mm_mul_pd(fscal,dy21);
1193 tz = _mm_mul_pd(fscal,dz21);
1195 /* Update vectorial force */
1196 fix2 = _mm_add_pd(fix2,tx);
1197 fiy2 = _mm_add_pd(fiy2,ty);
1198 fiz2 = _mm_add_pd(fiz2,tz);
1200 fjx1 = _mm_add_pd(fjx1,tx);
1201 fjy1 = _mm_add_pd(fjy1,ty);
1202 fjz1 = _mm_add_pd(fjz1,tz);
1204 /**************************
1205 * CALCULATE INTERACTIONS *
1206 **************************/
1208 r22 = _mm_mul_pd(rsq22,rinv22);
1210 /* Calculate table index by multiplying r with table scale and truncate to integer */
1211 rt = _mm_mul_pd(r22,vftabscale);
1212 vfitab = _mm_cvttpd_epi32(rt);
1213 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1214 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1216 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1217 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1218 F = _mm_setzero_pd();
1219 GMX_MM_TRANSPOSE2_PD(Y,F);
1220 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1221 H = _mm_setzero_pd();
1222 GMX_MM_TRANSPOSE2_PD(G,H);
1223 Heps = _mm_mul_pd(vfeps,H);
1224 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1225 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1226 velec = _mm_mul_pd(qq22,VV);
1227 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1228 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1230 /* Update potential sum for this i atom from the interaction with this j atom. */
1231 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1232 velecsum = _mm_add_pd(velecsum,velec);
1236 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1238 /* Calculate temporary vectorial force */
1239 tx = _mm_mul_pd(fscal,dx22);
1240 ty = _mm_mul_pd(fscal,dy22);
1241 tz = _mm_mul_pd(fscal,dz22);
1243 /* Update vectorial force */
1244 fix2 = _mm_add_pd(fix2,tx);
1245 fiy2 = _mm_add_pd(fiy2,ty);
1246 fiz2 = _mm_add_pd(fiz2,tz);
1248 fjx2 = _mm_add_pd(fjx2,tx);
1249 fjy2 = _mm_add_pd(fjy2,ty);
1250 fjz2 = _mm_add_pd(fjz2,tz);
1252 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1254 /* Inner loop uses 417 flops */
1257 /* End of innermost loop */
1259 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1260 f+i_coord_offset,fshift+i_shift_offset);
1263 /* Update potential energies */
1264 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1265 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1267 /* Increment number of inner iterations */
1268 inneriter += j_index_end - j_index_start;
1270 /* Outer loop uses 20 flops */
1273 /* Increment number of outer iterations */
1276 /* Update outer/inner flops */
1278 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*417);
1281 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double
1282 * Electrostatics interaction: CubicSplineTable
1283 * VdW interaction: CubicSplineTable
1284 * Geometry: Water3-Water3
1285 * Calculate force/pot: Force
1288 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double
1289 (t_nblist * gmx_restrict nlist,
1290 rvec * gmx_restrict xx,
1291 rvec * gmx_restrict ff,
1292 struct t_forcerec * gmx_restrict fr,
1293 t_mdatoms * gmx_restrict mdatoms,
1294 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1295 t_nrnb * gmx_restrict nrnb)
1297 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1298 * just 0 for non-waters.
1299 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1300 * jnr indices corresponding to data put in the two positions in the SIMD register.
1302 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1303 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1305 int j_coord_offsetA,j_coord_offsetB;
1306 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1307 real rcutoff_scalar;
1308 real *shiftvec,*fshift,*x,*f;
1309 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1311 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1313 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1315 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1316 int vdwjidx0A,vdwjidx0B;
1317 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1318 int vdwjidx1A,vdwjidx1B;
1319 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1320 int vdwjidx2A,vdwjidx2B;
1321 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1322 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1323 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1324 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1325 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1326 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1327 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1328 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1329 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1330 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1331 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1334 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1337 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1338 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1340 __m128i ifour = _mm_set1_epi32(4);
1341 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1343 __m128d dummy_mask,cutoff_mask;
1344 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1345 __m128d one = _mm_set1_pd(1.0);
1346 __m128d two = _mm_set1_pd(2.0);
1352 jindex = nlist->jindex;
1354 shiftidx = nlist->shift;
1356 shiftvec = fr->shift_vec[0];
1357 fshift = fr->fshift[0];
1358 facel = _mm_set1_pd(fr->ic->epsfac);
1359 charge = mdatoms->chargeA;
1360 nvdwtype = fr->ntype;
1361 vdwparam = fr->nbfp;
1362 vdwtype = mdatoms->typeA;
1364 vftab = kernel_data->table_elec_vdw->data;
1365 vftabscale = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1367 /* Setup water-specific parameters */
1368 inr = nlist->iinr[0];
1369 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1370 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1371 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1372 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1374 jq0 = _mm_set1_pd(charge[inr+0]);
1375 jq1 = _mm_set1_pd(charge[inr+1]);
1376 jq2 = _mm_set1_pd(charge[inr+2]);
1377 vdwjidx0A = 2*vdwtype[inr+0];
1378 qq00 = _mm_mul_pd(iq0,jq0);
1379 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1380 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1381 qq01 = _mm_mul_pd(iq0,jq1);
1382 qq02 = _mm_mul_pd(iq0,jq2);
1383 qq10 = _mm_mul_pd(iq1,jq0);
1384 qq11 = _mm_mul_pd(iq1,jq1);
1385 qq12 = _mm_mul_pd(iq1,jq2);
1386 qq20 = _mm_mul_pd(iq2,jq0);
1387 qq21 = _mm_mul_pd(iq2,jq1);
1388 qq22 = _mm_mul_pd(iq2,jq2);
1390 /* Avoid stupid compiler warnings */
1392 j_coord_offsetA = 0;
1393 j_coord_offsetB = 0;
1398 /* Start outer loop over neighborlists */
1399 for(iidx=0; iidx<nri; iidx++)
1401 /* Load shift vector for this list */
1402 i_shift_offset = DIM*shiftidx[iidx];
1404 /* Load limits for loop over neighbors */
1405 j_index_start = jindex[iidx];
1406 j_index_end = jindex[iidx+1];
1408 /* Get outer coordinate index */
1410 i_coord_offset = DIM*inr;
1412 /* Load i particle coords and add shift vector */
1413 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1414 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1416 fix0 = _mm_setzero_pd();
1417 fiy0 = _mm_setzero_pd();
1418 fiz0 = _mm_setzero_pd();
1419 fix1 = _mm_setzero_pd();
1420 fiy1 = _mm_setzero_pd();
1421 fiz1 = _mm_setzero_pd();
1422 fix2 = _mm_setzero_pd();
1423 fiy2 = _mm_setzero_pd();
1424 fiz2 = _mm_setzero_pd();
1426 /* Start inner kernel loop */
1427 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1430 /* Get j neighbor index, and coordinate index */
1432 jnrB = jjnr[jidx+1];
1433 j_coord_offsetA = DIM*jnrA;
1434 j_coord_offsetB = DIM*jnrB;
1436 /* load j atom coordinates */
1437 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1438 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1440 /* Calculate displacement vector */
1441 dx00 = _mm_sub_pd(ix0,jx0);
1442 dy00 = _mm_sub_pd(iy0,jy0);
1443 dz00 = _mm_sub_pd(iz0,jz0);
1444 dx01 = _mm_sub_pd(ix0,jx1);
1445 dy01 = _mm_sub_pd(iy0,jy1);
1446 dz01 = _mm_sub_pd(iz0,jz1);
1447 dx02 = _mm_sub_pd(ix0,jx2);
1448 dy02 = _mm_sub_pd(iy0,jy2);
1449 dz02 = _mm_sub_pd(iz0,jz2);
1450 dx10 = _mm_sub_pd(ix1,jx0);
1451 dy10 = _mm_sub_pd(iy1,jy0);
1452 dz10 = _mm_sub_pd(iz1,jz0);
1453 dx11 = _mm_sub_pd(ix1,jx1);
1454 dy11 = _mm_sub_pd(iy1,jy1);
1455 dz11 = _mm_sub_pd(iz1,jz1);
1456 dx12 = _mm_sub_pd(ix1,jx2);
1457 dy12 = _mm_sub_pd(iy1,jy2);
1458 dz12 = _mm_sub_pd(iz1,jz2);
1459 dx20 = _mm_sub_pd(ix2,jx0);
1460 dy20 = _mm_sub_pd(iy2,jy0);
1461 dz20 = _mm_sub_pd(iz2,jz0);
1462 dx21 = _mm_sub_pd(ix2,jx1);
1463 dy21 = _mm_sub_pd(iy2,jy1);
1464 dz21 = _mm_sub_pd(iz2,jz1);
1465 dx22 = _mm_sub_pd(ix2,jx2);
1466 dy22 = _mm_sub_pd(iy2,jy2);
1467 dz22 = _mm_sub_pd(iz2,jz2);
1469 /* Calculate squared distance and things based on it */
1470 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1471 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1472 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1473 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1474 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1475 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1476 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1477 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1478 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1480 rinv00 = sse2_invsqrt_d(rsq00);
1481 rinv01 = sse2_invsqrt_d(rsq01);
1482 rinv02 = sse2_invsqrt_d(rsq02);
1483 rinv10 = sse2_invsqrt_d(rsq10);
1484 rinv11 = sse2_invsqrt_d(rsq11);
1485 rinv12 = sse2_invsqrt_d(rsq12);
1486 rinv20 = sse2_invsqrt_d(rsq20);
1487 rinv21 = sse2_invsqrt_d(rsq21);
1488 rinv22 = sse2_invsqrt_d(rsq22);
1490 fjx0 = _mm_setzero_pd();
1491 fjy0 = _mm_setzero_pd();
1492 fjz0 = _mm_setzero_pd();
1493 fjx1 = _mm_setzero_pd();
1494 fjy1 = _mm_setzero_pd();
1495 fjz1 = _mm_setzero_pd();
1496 fjx2 = _mm_setzero_pd();
1497 fjy2 = _mm_setzero_pd();
1498 fjz2 = _mm_setzero_pd();
1500 /**************************
1501 * CALCULATE INTERACTIONS *
1502 **************************/
1504 r00 = _mm_mul_pd(rsq00,rinv00);
1506 /* Calculate table index by multiplying r with table scale and truncate to integer */
1507 rt = _mm_mul_pd(r00,vftabscale);
1508 vfitab = _mm_cvttpd_epi32(rt);
1509 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1510 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1512 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1513 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1514 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1515 GMX_MM_TRANSPOSE2_PD(Y,F);
1516 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1517 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1518 GMX_MM_TRANSPOSE2_PD(G,H);
1519 Heps = _mm_mul_pd(vfeps,H);
1520 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1521 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1522 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1524 /* CUBIC SPLINE TABLE DISPERSION */
1525 vfitab = _mm_add_epi32(vfitab,ifour);
1526 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1527 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1528 GMX_MM_TRANSPOSE2_PD(Y,F);
1529 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1530 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1531 GMX_MM_TRANSPOSE2_PD(G,H);
1532 Heps = _mm_mul_pd(vfeps,H);
1533 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1534 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1535 fvdw6 = _mm_mul_pd(c6_00,FF);
1537 /* CUBIC SPLINE TABLE REPULSION */
1538 vfitab = _mm_add_epi32(vfitab,ifour);
1539 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1540 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1541 GMX_MM_TRANSPOSE2_PD(Y,F);
1542 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1543 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1544 GMX_MM_TRANSPOSE2_PD(G,H);
1545 Heps = _mm_mul_pd(vfeps,H);
1546 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1547 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1548 fvdw12 = _mm_mul_pd(c12_00,FF);
1549 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1551 fscal = _mm_add_pd(felec,fvdw);
1553 /* Calculate temporary vectorial force */
1554 tx = _mm_mul_pd(fscal,dx00);
1555 ty = _mm_mul_pd(fscal,dy00);
1556 tz = _mm_mul_pd(fscal,dz00);
1558 /* Update vectorial force */
1559 fix0 = _mm_add_pd(fix0,tx);
1560 fiy0 = _mm_add_pd(fiy0,ty);
1561 fiz0 = _mm_add_pd(fiz0,tz);
1563 fjx0 = _mm_add_pd(fjx0,tx);
1564 fjy0 = _mm_add_pd(fjy0,ty);
1565 fjz0 = _mm_add_pd(fjz0,tz);
1567 /**************************
1568 * CALCULATE INTERACTIONS *
1569 **************************/
1571 r01 = _mm_mul_pd(rsq01,rinv01);
1573 /* Calculate table index by multiplying r with table scale and truncate to integer */
1574 rt = _mm_mul_pd(r01,vftabscale);
1575 vfitab = _mm_cvttpd_epi32(rt);
1576 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1577 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1579 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1580 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1581 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1582 GMX_MM_TRANSPOSE2_PD(Y,F);
1583 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1584 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1585 GMX_MM_TRANSPOSE2_PD(G,H);
1586 Heps = _mm_mul_pd(vfeps,H);
1587 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1588 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1589 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1593 /* Calculate temporary vectorial force */
1594 tx = _mm_mul_pd(fscal,dx01);
1595 ty = _mm_mul_pd(fscal,dy01);
1596 tz = _mm_mul_pd(fscal,dz01);
1598 /* Update vectorial force */
1599 fix0 = _mm_add_pd(fix0,tx);
1600 fiy0 = _mm_add_pd(fiy0,ty);
1601 fiz0 = _mm_add_pd(fiz0,tz);
1603 fjx1 = _mm_add_pd(fjx1,tx);
1604 fjy1 = _mm_add_pd(fjy1,ty);
1605 fjz1 = _mm_add_pd(fjz1,tz);
1607 /**************************
1608 * CALCULATE INTERACTIONS *
1609 **************************/
1611 r02 = _mm_mul_pd(rsq02,rinv02);
1613 /* Calculate table index by multiplying r with table scale and truncate to integer */
1614 rt = _mm_mul_pd(r02,vftabscale);
1615 vfitab = _mm_cvttpd_epi32(rt);
1616 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1617 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1619 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1620 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1621 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1622 GMX_MM_TRANSPOSE2_PD(Y,F);
1623 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1624 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1625 GMX_MM_TRANSPOSE2_PD(G,H);
1626 Heps = _mm_mul_pd(vfeps,H);
1627 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1628 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1629 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1633 /* Calculate temporary vectorial force */
1634 tx = _mm_mul_pd(fscal,dx02);
1635 ty = _mm_mul_pd(fscal,dy02);
1636 tz = _mm_mul_pd(fscal,dz02);
1638 /* Update vectorial force */
1639 fix0 = _mm_add_pd(fix0,tx);
1640 fiy0 = _mm_add_pd(fiy0,ty);
1641 fiz0 = _mm_add_pd(fiz0,tz);
1643 fjx2 = _mm_add_pd(fjx2,tx);
1644 fjy2 = _mm_add_pd(fjy2,ty);
1645 fjz2 = _mm_add_pd(fjz2,tz);
1647 /**************************
1648 * CALCULATE INTERACTIONS *
1649 **************************/
1651 r10 = _mm_mul_pd(rsq10,rinv10);
1653 /* Calculate table index by multiplying r with table scale and truncate to integer */
1654 rt = _mm_mul_pd(r10,vftabscale);
1655 vfitab = _mm_cvttpd_epi32(rt);
1656 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1657 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1659 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1660 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1661 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1662 GMX_MM_TRANSPOSE2_PD(Y,F);
1663 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1664 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1665 GMX_MM_TRANSPOSE2_PD(G,H);
1666 Heps = _mm_mul_pd(vfeps,H);
1667 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1668 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1669 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1673 /* Calculate temporary vectorial force */
1674 tx = _mm_mul_pd(fscal,dx10);
1675 ty = _mm_mul_pd(fscal,dy10);
1676 tz = _mm_mul_pd(fscal,dz10);
1678 /* Update vectorial force */
1679 fix1 = _mm_add_pd(fix1,tx);
1680 fiy1 = _mm_add_pd(fiy1,ty);
1681 fiz1 = _mm_add_pd(fiz1,tz);
1683 fjx0 = _mm_add_pd(fjx0,tx);
1684 fjy0 = _mm_add_pd(fjy0,ty);
1685 fjz0 = _mm_add_pd(fjz0,tz);
1687 /**************************
1688 * CALCULATE INTERACTIONS *
1689 **************************/
1691 r11 = _mm_mul_pd(rsq11,rinv11);
1693 /* Calculate table index by multiplying r with table scale and truncate to integer */
1694 rt = _mm_mul_pd(r11,vftabscale);
1695 vfitab = _mm_cvttpd_epi32(rt);
1696 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1697 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1699 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1700 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1701 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1702 GMX_MM_TRANSPOSE2_PD(Y,F);
1703 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1704 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1705 GMX_MM_TRANSPOSE2_PD(G,H);
1706 Heps = _mm_mul_pd(vfeps,H);
1707 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1708 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1709 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1713 /* Calculate temporary vectorial force */
1714 tx = _mm_mul_pd(fscal,dx11);
1715 ty = _mm_mul_pd(fscal,dy11);
1716 tz = _mm_mul_pd(fscal,dz11);
1718 /* Update vectorial force */
1719 fix1 = _mm_add_pd(fix1,tx);
1720 fiy1 = _mm_add_pd(fiy1,ty);
1721 fiz1 = _mm_add_pd(fiz1,tz);
1723 fjx1 = _mm_add_pd(fjx1,tx);
1724 fjy1 = _mm_add_pd(fjy1,ty);
1725 fjz1 = _mm_add_pd(fjz1,tz);
1727 /**************************
1728 * CALCULATE INTERACTIONS *
1729 **************************/
1731 r12 = _mm_mul_pd(rsq12,rinv12);
1733 /* Calculate table index by multiplying r with table scale and truncate to integer */
1734 rt = _mm_mul_pd(r12,vftabscale);
1735 vfitab = _mm_cvttpd_epi32(rt);
1736 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1737 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1739 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1740 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1741 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1742 GMX_MM_TRANSPOSE2_PD(Y,F);
1743 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1744 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1745 GMX_MM_TRANSPOSE2_PD(G,H);
1746 Heps = _mm_mul_pd(vfeps,H);
1747 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1748 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1749 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1753 /* Calculate temporary vectorial force */
1754 tx = _mm_mul_pd(fscal,dx12);
1755 ty = _mm_mul_pd(fscal,dy12);
1756 tz = _mm_mul_pd(fscal,dz12);
1758 /* Update vectorial force */
1759 fix1 = _mm_add_pd(fix1,tx);
1760 fiy1 = _mm_add_pd(fiy1,ty);
1761 fiz1 = _mm_add_pd(fiz1,tz);
1763 fjx2 = _mm_add_pd(fjx2,tx);
1764 fjy2 = _mm_add_pd(fjy2,ty);
1765 fjz2 = _mm_add_pd(fjz2,tz);
1767 /**************************
1768 * CALCULATE INTERACTIONS *
1769 **************************/
1771 r20 = _mm_mul_pd(rsq20,rinv20);
1773 /* Calculate table index by multiplying r with table scale and truncate to integer */
1774 rt = _mm_mul_pd(r20,vftabscale);
1775 vfitab = _mm_cvttpd_epi32(rt);
1776 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1777 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1779 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1780 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1781 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1782 GMX_MM_TRANSPOSE2_PD(Y,F);
1783 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1784 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1785 GMX_MM_TRANSPOSE2_PD(G,H);
1786 Heps = _mm_mul_pd(vfeps,H);
1787 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1788 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1789 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1793 /* Calculate temporary vectorial force */
1794 tx = _mm_mul_pd(fscal,dx20);
1795 ty = _mm_mul_pd(fscal,dy20);
1796 tz = _mm_mul_pd(fscal,dz20);
1798 /* Update vectorial force */
1799 fix2 = _mm_add_pd(fix2,tx);
1800 fiy2 = _mm_add_pd(fiy2,ty);
1801 fiz2 = _mm_add_pd(fiz2,tz);
1803 fjx0 = _mm_add_pd(fjx0,tx);
1804 fjy0 = _mm_add_pd(fjy0,ty);
1805 fjz0 = _mm_add_pd(fjz0,tz);
1807 /**************************
1808 * CALCULATE INTERACTIONS *
1809 **************************/
1811 r21 = _mm_mul_pd(rsq21,rinv21);
1813 /* Calculate table index by multiplying r with table scale and truncate to integer */
1814 rt = _mm_mul_pd(r21,vftabscale);
1815 vfitab = _mm_cvttpd_epi32(rt);
1816 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1817 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1819 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1820 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1821 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1822 GMX_MM_TRANSPOSE2_PD(Y,F);
1823 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1824 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1825 GMX_MM_TRANSPOSE2_PD(G,H);
1826 Heps = _mm_mul_pd(vfeps,H);
1827 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1828 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1829 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1833 /* Calculate temporary vectorial force */
1834 tx = _mm_mul_pd(fscal,dx21);
1835 ty = _mm_mul_pd(fscal,dy21);
1836 tz = _mm_mul_pd(fscal,dz21);
1838 /* Update vectorial force */
1839 fix2 = _mm_add_pd(fix2,tx);
1840 fiy2 = _mm_add_pd(fiy2,ty);
1841 fiz2 = _mm_add_pd(fiz2,tz);
1843 fjx1 = _mm_add_pd(fjx1,tx);
1844 fjy1 = _mm_add_pd(fjy1,ty);
1845 fjz1 = _mm_add_pd(fjz1,tz);
1847 /**************************
1848 * CALCULATE INTERACTIONS *
1849 **************************/
1851 r22 = _mm_mul_pd(rsq22,rinv22);
1853 /* Calculate table index by multiplying r with table scale and truncate to integer */
1854 rt = _mm_mul_pd(r22,vftabscale);
1855 vfitab = _mm_cvttpd_epi32(rt);
1856 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1857 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1859 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1860 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1861 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1862 GMX_MM_TRANSPOSE2_PD(Y,F);
1863 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1864 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1865 GMX_MM_TRANSPOSE2_PD(G,H);
1866 Heps = _mm_mul_pd(vfeps,H);
1867 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1868 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1869 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1873 /* Calculate temporary vectorial force */
1874 tx = _mm_mul_pd(fscal,dx22);
1875 ty = _mm_mul_pd(fscal,dy22);
1876 tz = _mm_mul_pd(fscal,dz22);
1878 /* Update vectorial force */
1879 fix2 = _mm_add_pd(fix2,tx);
1880 fiy2 = _mm_add_pd(fiy2,ty);
1881 fiz2 = _mm_add_pd(fiz2,tz);
1883 fjx2 = _mm_add_pd(fjx2,tx);
1884 fjy2 = _mm_add_pd(fjy2,ty);
1885 fjz2 = _mm_add_pd(fjz2,tz);
1887 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1889 /* Inner loop uses 373 flops */
1892 if(jidx<j_index_end)
1896 j_coord_offsetA = DIM*jnrA;
1898 /* load j atom coordinates */
1899 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1900 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1902 /* Calculate displacement vector */
1903 dx00 = _mm_sub_pd(ix0,jx0);
1904 dy00 = _mm_sub_pd(iy0,jy0);
1905 dz00 = _mm_sub_pd(iz0,jz0);
1906 dx01 = _mm_sub_pd(ix0,jx1);
1907 dy01 = _mm_sub_pd(iy0,jy1);
1908 dz01 = _mm_sub_pd(iz0,jz1);
1909 dx02 = _mm_sub_pd(ix0,jx2);
1910 dy02 = _mm_sub_pd(iy0,jy2);
1911 dz02 = _mm_sub_pd(iz0,jz2);
1912 dx10 = _mm_sub_pd(ix1,jx0);
1913 dy10 = _mm_sub_pd(iy1,jy0);
1914 dz10 = _mm_sub_pd(iz1,jz0);
1915 dx11 = _mm_sub_pd(ix1,jx1);
1916 dy11 = _mm_sub_pd(iy1,jy1);
1917 dz11 = _mm_sub_pd(iz1,jz1);
1918 dx12 = _mm_sub_pd(ix1,jx2);
1919 dy12 = _mm_sub_pd(iy1,jy2);
1920 dz12 = _mm_sub_pd(iz1,jz2);
1921 dx20 = _mm_sub_pd(ix2,jx0);
1922 dy20 = _mm_sub_pd(iy2,jy0);
1923 dz20 = _mm_sub_pd(iz2,jz0);
1924 dx21 = _mm_sub_pd(ix2,jx1);
1925 dy21 = _mm_sub_pd(iy2,jy1);
1926 dz21 = _mm_sub_pd(iz2,jz1);
1927 dx22 = _mm_sub_pd(ix2,jx2);
1928 dy22 = _mm_sub_pd(iy2,jy2);
1929 dz22 = _mm_sub_pd(iz2,jz2);
1931 /* Calculate squared distance and things based on it */
1932 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1933 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1934 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1935 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1936 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1937 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1938 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1939 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1940 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1942 rinv00 = sse2_invsqrt_d(rsq00);
1943 rinv01 = sse2_invsqrt_d(rsq01);
1944 rinv02 = sse2_invsqrt_d(rsq02);
1945 rinv10 = sse2_invsqrt_d(rsq10);
1946 rinv11 = sse2_invsqrt_d(rsq11);
1947 rinv12 = sse2_invsqrt_d(rsq12);
1948 rinv20 = sse2_invsqrt_d(rsq20);
1949 rinv21 = sse2_invsqrt_d(rsq21);
1950 rinv22 = sse2_invsqrt_d(rsq22);
1952 fjx0 = _mm_setzero_pd();
1953 fjy0 = _mm_setzero_pd();
1954 fjz0 = _mm_setzero_pd();
1955 fjx1 = _mm_setzero_pd();
1956 fjy1 = _mm_setzero_pd();
1957 fjz1 = _mm_setzero_pd();
1958 fjx2 = _mm_setzero_pd();
1959 fjy2 = _mm_setzero_pd();
1960 fjz2 = _mm_setzero_pd();
1962 /**************************
1963 * CALCULATE INTERACTIONS *
1964 **************************/
1966 r00 = _mm_mul_pd(rsq00,rinv00);
1968 /* Calculate table index by multiplying r with table scale and truncate to integer */
1969 rt = _mm_mul_pd(r00,vftabscale);
1970 vfitab = _mm_cvttpd_epi32(rt);
1971 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1972 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1974 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1975 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1976 F = _mm_setzero_pd();
1977 GMX_MM_TRANSPOSE2_PD(Y,F);
1978 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1979 H = _mm_setzero_pd();
1980 GMX_MM_TRANSPOSE2_PD(G,H);
1981 Heps = _mm_mul_pd(vfeps,H);
1982 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1983 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1984 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1986 /* CUBIC SPLINE TABLE DISPERSION */
1987 vfitab = _mm_add_epi32(vfitab,ifour);
1988 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1989 F = _mm_setzero_pd();
1990 GMX_MM_TRANSPOSE2_PD(Y,F);
1991 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1992 H = _mm_setzero_pd();
1993 GMX_MM_TRANSPOSE2_PD(G,H);
1994 Heps = _mm_mul_pd(vfeps,H);
1995 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1996 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1997 fvdw6 = _mm_mul_pd(c6_00,FF);
1999 /* CUBIC SPLINE TABLE REPULSION */
2000 vfitab = _mm_add_epi32(vfitab,ifour);
2001 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2002 F = _mm_setzero_pd();
2003 GMX_MM_TRANSPOSE2_PD(Y,F);
2004 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2005 H = _mm_setzero_pd();
2006 GMX_MM_TRANSPOSE2_PD(G,H);
2007 Heps = _mm_mul_pd(vfeps,H);
2008 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2009 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2010 fvdw12 = _mm_mul_pd(c12_00,FF);
2011 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
2013 fscal = _mm_add_pd(felec,fvdw);
2015 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2017 /* Calculate temporary vectorial force */
2018 tx = _mm_mul_pd(fscal,dx00);
2019 ty = _mm_mul_pd(fscal,dy00);
2020 tz = _mm_mul_pd(fscal,dz00);
2022 /* Update vectorial force */
2023 fix0 = _mm_add_pd(fix0,tx);
2024 fiy0 = _mm_add_pd(fiy0,ty);
2025 fiz0 = _mm_add_pd(fiz0,tz);
2027 fjx0 = _mm_add_pd(fjx0,tx);
2028 fjy0 = _mm_add_pd(fjy0,ty);
2029 fjz0 = _mm_add_pd(fjz0,tz);
2031 /**************************
2032 * CALCULATE INTERACTIONS *
2033 **************************/
2035 r01 = _mm_mul_pd(rsq01,rinv01);
2037 /* Calculate table index by multiplying r with table scale and truncate to integer */
2038 rt = _mm_mul_pd(r01,vftabscale);
2039 vfitab = _mm_cvttpd_epi32(rt);
2040 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2041 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2043 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2044 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2045 F = _mm_setzero_pd();
2046 GMX_MM_TRANSPOSE2_PD(Y,F);
2047 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2048 H = _mm_setzero_pd();
2049 GMX_MM_TRANSPOSE2_PD(G,H);
2050 Heps = _mm_mul_pd(vfeps,H);
2051 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2052 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2053 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
2057 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2059 /* Calculate temporary vectorial force */
2060 tx = _mm_mul_pd(fscal,dx01);
2061 ty = _mm_mul_pd(fscal,dy01);
2062 tz = _mm_mul_pd(fscal,dz01);
2064 /* Update vectorial force */
2065 fix0 = _mm_add_pd(fix0,tx);
2066 fiy0 = _mm_add_pd(fiy0,ty);
2067 fiz0 = _mm_add_pd(fiz0,tz);
2069 fjx1 = _mm_add_pd(fjx1,tx);
2070 fjy1 = _mm_add_pd(fjy1,ty);
2071 fjz1 = _mm_add_pd(fjz1,tz);
2073 /**************************
2074 * CALCULATE INTERACTIONS *
2075 **************************/
2077 r02 = _mm_mul_pd(rsq02,rinv02);
2079 /* Calculate table index by multiplying r with table scale and truncate to integer */
2080 rt = _mm_mul_pd(r02,vftabscale);
2081 vfitab = _mm_cvttpd_epi32(rt);
2082 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2083 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2085 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2086 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2087 F = _mm_setzero_pd();
2088 GMX_MM_TRANSPOSE2_PD(Y,F);
2089 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2090 H = _mm_setzero_pd();
2091 GMX_MM_TRANSPOSE2_PD(G,H);
2092 Heps = _mm_mul_pd(vfeps,H);
2093 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2094 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2095 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2099 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2101 /* Calculate temporary vectorial force */
2102 tx = _mm_mul_pd(fscal,dx02);
2103 ty = _mm_mul_pd(fscal,dy02);
2104 tz = _mm_mul_pd(fscal,dz02);
2106 /* Update vectorial force */
2107 fix0 = _mm_add_pd(fix0,tx);
2108 fiy0 = _mm_add_pd(fiy0,ty);
2109 fiz0 = _mm_add_pd(fiz0,tz);
2111 fjx2 = _mm_add_pd(fjx2,tx);
2112 fjy2 = _mm_add_pd(fjy2,ty);
2113 fjz2 = _mm_add_pd(fjz2,tz);
2115 /**************************
2116 * CALCULATE INTERACTIONS *
2117 **************************/
2119 r10 = _mm_mul_pd(rsq10,rinv10);
2121 /* Calculate table index by multiplying r with table scale and truncate to integer */
2122 rt = _mm_mul_pd(r10,vftabscale);
2123 vfitab = _mm_cvttpd_epi32(rt);
2124 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2125 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2127 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2128 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2129 F = _mm_setzero_pd();
2130 GMX_MM_TRANSPOSE2_PD(Y,F);
2131 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2132 H = _mm_setzero_pd();
2133 GMX_MM_TRANSPOSE2_PD(G,H);
2134 Heps = _mm_mul_pd(vfeps,H);
2135 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2136 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2137 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2141 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2143 /* Calculate temporary vectorial force */
2144 tx = _mm_mul_pd(fscal,dx10);
2145 ty = _mm_mul_pd(fscal,dy10);
2146 tz = _mm_mul_pd(fscal,dz10);
2148 /* Update vectorial force */
2149 fix1 = _mm_add_pd(fix1,tx);
2150 fiy1 = _mm_add_pd(fiy1,ty);
2151 fiz1 = _mm_add_pd(fiz1,tz);
2153 fjx0 = _mm_add_pd(fjx0,tx);
2154 fjy0 = _mm_add_pd(fjy0,ty);
2155 fjz0 = _mm_add_pd(fjz0,tz);
2157 /**************************
2158 * CALCULATE INTERACTIONS *
2159 **************************/
2161 r11 = _mm_mul_pd(rsq11,rinv11);
2163 /* Calculate table index by multiplying r with table scale and truncate to integer */
2164 rt = _mm_mul_pd(r11,vftabscale);
2165 vfitab = _mm_cvttpd_epi32(rt);
2166 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2167 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2169 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2170 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2171 F = _mm_setzero_pd();
2172 GMX_MM_TRANSPOSE2_PD(Y,F);
2173 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2174 H = _mm_setzero_pd();
2175 GMX_MM_TRANSPOSE2_PD(G,H);
2176 Heps = _mm_mul_pd(vfeps,H);
2177 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2178 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2179 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2183 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2185 /* Calculate temporary vectorial force */
2186 tx = _mm_mul_pd(fscal,dx11);
2187 ty = _mm_mul_pd(fscal,dy11);
2188 tz = _mm_mul_pd(fscal,dz11);
2190 /* Update vectorial force */
2191 fix1 = _mm_add_pd(fix1,tx);
2192 fiy1 = _mm_add_pd(fiy1,ty);
2193 fiz1 = _mm_add_pd(fiz1,tz);
2195 fjx1 = _mm_add_pd(fjx1,tx);
2196 fjy1 = _mm_add_pd(fjy1,ty);
2197 fjz1 = _mm_add_pd(fjz1,tz);
2199 /**************************
2200 * CALCULATE INTERACTIONS *
2201 **************************/
2203 r12 = _mm_mul_pd(rsq12,rinv12);
2205 /* Calculate table index by multiplying r with table scale and truncate to integer */
2206 rt = _mm_mul_pd(r12,vftabscale);
2207 vfitab = _mm_cvttpd_epi32(rt);
2208 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2209 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2211 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2212 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2213 F = _mm_setzero_pd();
2214 GMX_MM_TRANSPOSE2_PD(Y,F);
2215 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2216 H = _mm_setzero_pd();
2217 GMX_MM_TRANSPOSE2_PD(G,H);
2218 Heps = _mm_mul_pd(vfeps,H);
2219 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2220 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2221 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2225 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2227 /* Calculate temporary vectorial force */
2228 tx = _mm_mul_pd(fscal,dx12);
2229 ty = _mm_mul_pd(fscal,dy12);
2230 tz = _mm_mul_pd(fscal,dz12);
2232 /* Update vectorial force */
2233 fix1 = _mm_add_pd(fix1,tx);
2234 fiy1 = _mm_add_pd(fiy1,ty);
2235 fiz1 = _mm_add_pd(fiz1,tz);
2237 fjx2 = _mm_add_pd(fjx2,tx);
2238 fjy2 = _mm_add_pd(fjy2,ty);
2239 fjz2 = _mm_add_pd(fjz2,tz);
2241 /**************************
2242 * CALCULATE INTERACTIONS *
2243 **************************/
2245 r20 = _mm_mul_pd(rsq20,rinv20);
2247 /* Calculate table index by multiplying r with table scale and truncate to integer */
2248 rt = _mm_mul_pd(r20,vftabscale);
2249 vfitab = _mm_cvttpd_epi32(rt);
2250 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2251 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2253 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2254 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2255 F = _mm_setzero_pd();
2256 GMX_MM_TRANSPOSE2_PD(Y,F);
2257 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2258 H = _mm_setzero_pd();
2259 GMX_MM_TRANSPOSE2_PD(G,H);
2260 Heps = _mm_mul_pd(vfeps,H);
2261 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2262 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2263 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2267 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2269 /* Calculate temporary vectorial force */
2270 tx = _mm_mul_pd(fscal,dx20);
2271 ty = _mm_mul_pd(fscal,dy20);
2272 tz = _mm_mul_pd(fscal,dz20);
2274 /* Update vectorial force */
2275 fix2 = _mm_add_pd(fix2,tx);
2276 fiy2 = _mm_add_pd(fiy2,ty);
2277 fiz2 = _mm_add_pd(fiz2,tz);
2279 fjx0 = _mm_add_pd(fjx0,tx);
2280 fjy0 = _mm_add_pd(fjy0,ty);
2281 fjz0 = _mm_add_pd(fjz0,tz);
2283 /**************************
2284 * CALCULATE INTERACTIONS *
2285 **************************/
2287 r21 = _mm_mul_pd(rsq21,rinv21);
2289 /* Calculate table index by multiplying r with table scale and truncate to integer */
2290 rt = _mm_mul_pd(r21,vftabscale);
2291 vfitab = _mm_cvttpd_epi32(rt);
2292 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2293 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2295 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2296 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2297 F = _mm_setzero_pd();
2298 GMX_MM_TRANSPOSE2_PD(Y,F);
2299 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2300 H = _mm_setzero_pd();
2301 GMX_MM_TRANSPOSE2_PD(G,H);
2302 Heps = _mm_mul_pd(vfeps,H);
2303 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2304 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2305 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2309 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2311 /* Calculate temporary vectorial force */
2312 tx = _mm_mul_pd(fscal,dx21);
2313 ty = _mm_mul_pd(fscal,dy21);
2314 tz = _mm_mul_pd(fscal,dz21);
2316 /* Update vectorial force */
2317 fix2 = _mm_add_pd(fix2,tx);
2318 fiy2 = _mm_add_pd(fiy2,ty);
2319 fiz2 = _mm_add_pd(fiz2,tz);
2321 fjx1 = _mm_add_pd(fjx1,tx);
2322 fjy1 = _mm_add_pd(fjy1,ty);
2323 fjz1 = _mm_add_pd(fjz1,tz);
2325 /**************************
2326 * CALCULATE INTERACTIONS *
2327 **************************/
2329 r22 = _mm_mul_pd(rsq22,rinv22);
2331 /* Calculate table index by multiplying r with table scale and truncate to integer */
2332 rt = _mm_mul_pd(r22,vftabscale);
2333 vfitab = _mm_cvttpd_epi32(rt);
2334 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2335 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2337 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2338 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2339 F = _mm_setzero_pd();
2340 GMX_MM_TRANSPOSE2_PD(Y,F);
2341 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2342 H = _mm_setzero_pd();
2343 GMX_MM_TRANSPOSE2_PD(G,H);
2344 Heps = _mm_mul_pd(vfeps,H);
2345 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2346 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2347 felec = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2351 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2353 /* Calculate temporary vectorial force */
2354 tx = _mm_mul_pd(fscal,dx22);
2355 ty = _mm_mul_pd(fscal,dy22);
2356 tz = _mm_mul_pd(fscal,dz22);
2358 /* Update vectorial force */
2359 fix2 = _mm_add_pd(fix2,tx);
2360 fiy2 = _mm_add_pd(fiy2,ty);
2361 fiz2 = _mm_add_pd(fiz2,tz);
2363 fjx2 = _mm_add_pd(fjx2,tx);
2364 fjy2 = _mm_add_pd(fjy2,ty);
2365 fjz2 = _mm_add_pd(fjz2,tz);
2367 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2369 /* Inner loop uses 373 flops */
2372 /* End of innermost loop */
2374 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2375 f+i_coord_offset,fshift+i_shift_offset);
2377 /* Increment number of inner iterations */
2378 inneriter += j_index_end - j_index_start;
2380 /* Outer loop uses 18 flops */
2383 /* Increment number of outer iterations */
2386 /* Update outer/inner flops */
2388 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*373);