2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_sse2_double.h"
50 #include "kernelutil_x86_sse2_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_double
54 * Electrostatics interaction: Ewald
55 * VdW interaction: CubicSplineTable
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
110 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
112 __m128i ifour = _mm_set1_epi32(4);
113 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
116 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
118 __m128d dummy_mask,cutoff_mask;
119 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
120 __m128d one = _mm_set1_pd(1.0);
121 __m128d two = _mm_set1_pd(2.0);
127 jindex = nlist->jindex;
129 shiftidx = nlist->shift;
131 shiftvec = fr->shift_vec[0];
132 fshift = fr->fshift[0];
133 facel = _mm_set1_pd(fr->epsfac);
134 charge = mdatoms->chargeA;
135 nvdwtype = fr->ntype;
137 vdwtype = mdatoms->typeA;
139 vftab = kernel_data->table_vdw->data;
140 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
142 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
143 ewtab = fr->ic->tabq_coul_FDV0;
144 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
145 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
147 /* Setup water-specific parameters */
148 inr = nlist->iinr[0];
149 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
150 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
151 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
152 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
154 jq0 = _mm_set1_pd(charge[inr+0]);
155 jq1 = _mm_set1_pd(charge[inr+1]);
156 jq2 = _mm_set1_pd(charge[inr+2]);
157 vdwjidx0A = 2*vdwtype[inr+0];
158 qq00 = _mm_mul_pd(iq0,jq0);
159 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
160 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
161 qq01 = _mm_mul_pd(iq0,jq1);
162 qq02 = _mm_mul_pd(iq0,jq2);
163 qq10 = _mm_mul_pd(iq1,jq0);
164 qq11 = _mm_mul_pd(iq1,jq1);
165 qq12 = _mm_mul_pd(iq1,jq2);
166 qq20 = _mm_mul_pd(iq2,jq0);
167 qq21 = _mm_mul_pd(iq2,jq1);
168 qq22 = _mm_mul_pd(iq2,jq2);
170 /* Avoid stupid compiler warnings */
178 /* Start outer loop over neighborlists */
179 for(iidx=0; iidx<nri; iidx++)
181 /* Load shift vector for this list */
182 i_shift_offset = DIM*shiftidx[iidx];
184 /* Load limits for loop over neighbors */
185 j_index_start = jindex[iidx];
186 j_index_end = jindex[iidx+1];
188 /* Get outer coordinate index */
190 i_coord_offset = DIM*inr;
192 /* Load i particle coords and add shift vector */
193 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
194 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
196 fix0 = _mm_setzero_pd();
197 fiy0 = _mm_setzero_pd();
198 fiz0 = _mm_setzero_pd();
199 fix1 = _mm_setzero_pd();
200 fiy1 = _mm_setzero_pd();
201 fiz1 = _mm_setzero_pd();
202 fix2 = _mm_setzero_pd();
203 fiy2 = _mm_setzero_pd();
204 fiz2 = _mm_setzero_pd();
206 /* Reset potential sums */
207 velecsum = _mm_setzero_pd();
208 vvdwsum = _mm_setzero_pd();
210 /* Start inner kernel loop */
211 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
214 /* Get j neighbor index, and coordinate index */
217 j_coord_offsetA = DIM*jnrA;
218 j_coord_offsetB = DIM*jnrB;
220 /* load j atom coordinates */
221 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
222 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
224 /* Calculate displacement vector */
225 dx00 = _mm_sub_pd(ix0,jx0);
226 dy00 = _mm_sub_pd(iy0,jy0);
227 dz00 = _mm_sub_pd(iz0,jz0);
228 dx01 = _mm_sub_pd(ix0,jx1);
229 dy01 = _mm_sub_pd(iy0,jy1);
230 dz01 = _mm_sub_pd(iz0,jz1);
231 dx02 = _mm_sub_pd(ix0,jx2);
232 dy02 = _mm_sub_pd(iy0,jy2);
233 dz02 = _mm_sub_pd(iz0,jz2);
234 dx10 = _mm_sub_pd(ix1,jx0);
235 dy10 = _mm_sub_pd(iy1,jy0);
236 dz10 = _mm_sub_pd(iz1,jz0);
237 dx11 = _mm_sub_pd(ix1,jx1);
238 dy11 = _mm_sub_pd(iy1,jy1);
239 dz11 = _mm_sub_pd(iz1,jz1);
240 dx12 = _mm_sub_pd(ix1,jx2);
241 dy12 = _mm_sub_pd(iy1,jy2);
242 dz12 = _mm_sub_pd(iz1,jz2);
243 dx20 = _mm_sub_pd(ix2,jx0);
244 dy20 = _mm_sub_pd(iy2,jy0);
245 dz20 = _mm_sub_pd(iz2,jz0);
246 dx21 = _mm_sub_pd(ix2,jx1);
247 dy21 = _mm_sub_pd(iy2,jy1);
248 dz21 = _mm_sub_pd(iz2,jz1);
249 dx22 = _mm_sub_pd(ix2,jx2);
250 dy22 = _mm_sub_pd(iy2,jy2);
251 dz22 = _mm_sub_pd(iz2,jz2);
253 /* Calculate squared distance and things based on it */
254 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
255 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
256 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
257 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
258 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
259 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
260 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
261 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
262 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
264 rinv00 = gmx_mm_invsqrt_pd(rsq00);
265 rinv01 = gmx_mm_invsqrt_pd(rsq01);
266 rinv02 = gmx_mm_invsqrt_pd(rsq02);
267 rinv10 = gmx_mm_invsqrt_pd(rsq10);
268 rinv11 = gmx_mm_invsqrt_pd(rsq11);
269 rinv12 = gmx_mm_invsqrt_pd(rsq12);
270 rinv20 = gmx_mm_invsqrt_pd(rsq20);
271 rinv21 = gmx_mm_invsqrt_pd(rsq21);
272 rinv22 = gmx_mm_invsqrt_pd(rsq22);
274 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
275 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
276 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
277 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
278 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
279 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
280 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
281 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
282 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
284 fjx0 = _mm_setzero_pd();
285 fjy0 = _mm_setzero_pd();
286 fjz0 = _mm_setzero_pd();
287 fjx1 = _mm_setzero_pd();
288 fjy1 = _mm_setzero_pd();
289 fjz1 = _mm_setzero_pd();
290 fjx2 = _mm_setzero_pd();
291 fjy2 = _mm_setzero_pd();
292 fjz2 = _mm_setzero_pd();
294 /**************************
295 * CALCULATE INTERACTIONS *
296 **************************/
298 r00 = _mm_mul_pd(rsq00,rinv00);
300 /* Calculate table index by multiplying r with table scale and truncate to integer */
301 rt = _mm_mul_pd(r00,vftabscale);
302 vfitab = _mm_cvttpd_epi32(rt);
303 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
304 vfitab = _mm_slli_epi32(vfitab,3);
306 /* EWALD ELECTROSTATICS */
308 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
309 ewrt = _mm_mul_pd(r00,ewtabscale);
310 ewitab = _mm_cvttpd_epi32(ewrt);
311 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
312 ewitab = _mm_slli_epi32(ewitab,2);
313 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
314 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
315 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
316 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
317 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
318 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
319 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
320 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
321 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
322 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
324 /* CUBIC SPLINE TABLE DISPERSION */
325 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
326 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
327 GMX_MM_TRANSPOSE2_PD(Y,F);
328 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
329 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
330 GMX_MM_TRANSPOSE2_PD(G,H);
331 Heps = _mm_mul_pd(vfeps,H);
332 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
333 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
334 vvdw6 = _mm_mul_pd(c6_00,VV);
335 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
336 fvdw6 = _mm_mul_pd(c6_00,FF);
338 /* CUBIC SPLINE TABLE REPULSION */
339 vfitab = _mm_add_epi32(vfitab,ifour);
340 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
341 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
342 GMX_MM_TRANSPOSE2_PD(Y,F);
343 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
344 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
345 GMX_MM_TRANSPOSE2_PD(G,H);
346 Heps = _mm_mul_pd(vfeps,H);
347 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
348 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
349 vvdw12 = _mm_mul_pd(c12_00,VV);
350 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
351 fvdw12 = _mm_mul_pd(c12_00,FF);
352 vvdw = _mm_add_pd(vvdw12,vvdw6);
353 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
355 /* Update potential sum for this i atom from the interaction with this j atom. */
356 velecsum = _mm_add_pd(velecsum,velec);
357 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
359 fscal = _mm_add_pd(felec,fvdw);
361 /* Calculate temporary vectorial force */
362 tx = _mm_mul_pd(fscal,dx00);
363 ty = _mm_mul_pd(fscal,dy00);
364 tz = _mm_mul_pd(fscal,dz00);
366 /* Update vectorial force */
367 fix0 = _mm_add_pd(fix0,tx);
368 fiy0 = _mm_add_pd(fiy0,ty);
369 fiz0 = _mm_add_pd(fiz0,tz);
371 fjx0 = _mm_add_pd(fjx0,tx);
372 fjy0 = _mm_add_pd(fjy0,ty);
373 fjz0 = _mm_add_pd(fjz0,tz);
375 /**************************
376 * CALCULATE INTERACTIONS *
377 **************************/
379 r01 = _mm_mul_pd(rsq01,rinv01);
381 /* EWALD ELECTROSTATICS */
383 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
384 ewrt = _mm_mul_pd(r01,ewtabscale);
385 ewitab = _mm_cvttpd_epi32(ewrt);
386 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
387 ewitab = _mm_slli_epi32(ewitab,2);
388 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
389 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
390 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
391 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
392 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
393 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
394 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
395 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
396 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
397 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velecsum = _mm_add_pd(velecsum,velec);
404 /* Calculate temporary vectorial force */
405 tx = _mm_mul_pd(fscal,dx01);
406 ty = _mm_mul_pd(fscal,dy01);
407 tz = _mm_mul_pd(fscal,dz01);
409 /* Update vectorial force */
410 fix0 = _mm_add_pd(fix0,tx);
411 fiy0 = _mm_add_pd(fiy0,ty);
412 fiz0 = _mm_add_pd(fiz0,tz);
414 fjx1 = _mm_add_pd(fjx1,tx);
415 fjy1 = _mm_add_pd(fjy1,ty);
416 fjz1 = _mm_add_pd(fjz1,tz);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 r02 = _mm_mul_pd(rsq02,rinv02);
424 /* EWALD ELECTROSTATICS */
426 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
427 ewrt = _mm_mul_pd(r02,ewtabscale);
428 ewitab = _mm_cvttpd_epi32(ewrt);
429 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
430 ewitab = _mm_slli_epi32(ewitab,2);
431 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
432 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
433 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
434 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
435 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
436 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
437 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
438 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
439 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
440 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
442 /* Update potential sum for this i atom from the interaction with this j atom. */
443 velecsum = _mm_add_pd(velecsum,velec);
447 /* Calculate temporary vectorial force */
448 tx = _mm_mul_pd(fscal,dx02);
449 ty = _mm_mul_pd(fscal,dy02);
450 tz = _mm_mul_pd(fscal,dz02);
452 /* Update vectorial force */
453 fix0 = _mm_add_pd(fix0,tx);
454 fiy0 = _mm_add_pd(fiy0,ty);
455 fiz0 = _mm_add_pd(fiz0,tz);
457 fjx2 = _mm_add_pd(fjx2,tx);
458 fjy2 = _mm_add_pd(fjy2,ty);
459 fjz2 = _mm_add_pd(fjz2,tz);
461 /**************************
462 * CALCULATE INTERACTIONS *
463 **************************/
465 r10 = _mm_mul_pd(rsq10,rinv10);
467 /* EWALD ELECTROSTATICS */
469 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
470 ewrt = _mm_mul_pd(r10,ewtabscale);
471 ewitab = _mm_cvttpd_epi32(ewrt);
472 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
473 ewitab = _mm_slli_epi32(ewitab,2);
474 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
475 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
476 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
477 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
478 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
479 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
480 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
481 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
482 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
483 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velecsum = _mm_add_pd(velecsum,velec);
490 /* Calculate temporary vectorial force */
491 tx = _mm_mul_pd(fscal,dx10);
492 ty = _mm_mul_pd(fscal,dy10);
493 tz = _mm_mul_pd(fscal,dz10);
495 /* Update vectorial force */
496 fix1 = _mm_add_pd(fix1,tx);
497 fiy1 = _mm_add_pd(fiy1,ty);
498 fiz1 = _mm_add_pd(fiz1,tz);
500 fjx0 = _mm_add_pd(fjx0,tx);
501 fjy0 = _mm_add_pd(fjy0,ty);
502 fjz0 = _mm_add_pd(fjz0,tz);
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
508 r11 = _mm_mul_pd(rsq11,rinv11);
510 /* EWALD ELECTROSTATICS */
512 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
513 ewrt = _mm_mul_pd(r11,ewtabscale);
514 ewitab = _mm_cvttpd_epi32(ewrt);
515 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
516 ewitab = _mm_slli_epi32(ewitab,2);
517 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
518 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
519 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
520 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
521 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
522 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
523 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
524 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
525 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
526 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
528 /* Update potential sum for this i atom from the interaction with this j atom. */
529 velecsum = _mm_add_pd(velecsum,velec);
533 /* Calculate temporary vectorial force */
534 tx = _mm_mul_pd(fscal,dx11);
535 ty = _mm_mul_pd(fscal,dy11);
536 tz = _mm_mul_pd(fscal,dz11);
538 /* Update vectorial force */
539 fix1 = _mm_add_pd(fix1,tx);
540 fiy1 = _mm_add_pd(fiy1,ty);
541 fiz1 = _mm_add_pd(fiz1,tz);
543 fjx1 = _mm_add_pd(fjx1,tx);
544 fjy1 = _mm_add_pd(fjy1,ty);
545 fjz1 = _mm_add_pd(fjz1,tz);
547 /**************************
548 * CALCULATE INTERACTIONS *
549 **************************/
551 r12 = _mm_mul_pd(rsq12,rinv12);
553 /* EWALD ELECTROSTATICS */
555 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
556 ewrt = _mm_mul_pd(r12,ewtabscale);
557 ewitab = _mm_cvttpd_epi32(ewrt);
558 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
559 ewitab = _mm_slli_epi32(ewitab,2);
560 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
561 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
562 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
563 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
564 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
565 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
566 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
567 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
568 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
569 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
571 /* Update potential sum for this i atom from the interaction with this j atom. */
572 velecsum = _mm_add_pd(velecsum,velec);
576 /* Calculate temporary vectorial force */
577 tx = _mm_mul_pd(fscal,dx12);
578 ty = _mm_mul_pd(fscal,dy12);
579 tz = _mm_mul_pd(fscal,dz12);
581 /* Update vectorial force */
582 fix1 = _mm_add_pd(fix1,tx);
583 fiy1 = _mm_add_pd(fiy1,ty);
584 fiz1 = _mm_add_pd(fiz1,tz);
586 fjx2 = _mm_add_pd(fjx2,tx);
587 fjy2 = _mm_add_pd(fjy2,ty);
588 fjz2 = _mm_add_pd(fjz2,tz);
590 /**************************
591 * CALCULATE INTERACTIONS *
592 **************************/
594 r20 = _mm_mul_pd(rsq20,rinv20);
596 /* EWALD ELECTROSTATICS */
598 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
599 ewrt = _mm_mul_pd(r20,ewtabscale);
600 ewitab = _mm_cvttpd_epi32(ewrt);
601 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
602 ewitab = _mm_slli_epi32(ewitab,2);
603 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
604 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
605 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
606 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
607 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
608 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
609 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
610 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
611 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
612 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
614 /* Update potential sum for this i atom from the interaction with this j atom. */
615 velecsum = _mm_add_pd(velecsum,velec);
619 /* Calculate temporary vectorial force */
620 tx = _mm_mul_pd(fscal,dx20);
621 ty = _mm_mul_pd(fscal,dy20);
622 tz = _mm_mul_pd(fscal,dz20);
624 /* Update vectorial force */
625 fix2 = _mm_add_pd(fix2,tx);
626 fiy2 = _mm_add_pd(fiy2,ty);
627 fiz2 = _mm_add_pd(fiz2,tz);
629 fjx0 = _mm_add_pd(fjx0,tx);
630 fjy0 = _mm_add_pd(fjy0,ty);
631 fjz0 = _mm_add_pd(fjz0,tz);
633 /**************************
634 * CALCULATE INTERACTIONS *
635 **************************/
637 r21 = _mm_mul_pd(rsq21,rinv21);
639 /* EWALD ELECTROSTATICS */
641 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
642 ewrt = _mm_mul_pd(r21,ewtabscale);
643 ewitab = _mm_cvttpd_epi32(ewrt);
644 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
645 ewitab = _mm_slli_epi32(ewitab,2);
646 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
647 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
648 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
649 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
650 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
651 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
652 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
653 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
654 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
655 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
657 /* Update potential sum for this i atom from the interaction with this j atom. */
658 velecsum = _mm_add_pd(velecsum,velec);
662 /* Calculate temporary vectorial force */
663 tx = _mm_mul_pd(fscal,dx21);
664 ty = _mm_mul_pd(fscal,dy21);
665 tz = _mm_mul_pd(fscal,dz21);
667 /* Update vectorial force */
668 fix2 = _mm_add_pd(fix2,tx);
669 fiy2 = _mm_add_pd(fiy2,ty);
670 fiz2 = _mm_add_pd(fiz2,tz);
672 fjx1 = _mm_add_pd(fjx1,tx);
673 fjy1 = _mm_add_pd(fjy1,ty);
674 fjz1 = _mm_add_pd(fjz1,tz);
676 /**************************
677 * CALCULATE INTERACTIONS *
678 **************************/
680 r22 = _mm_mul_pd(rsq22,rinv22);
682 /* EWALD ELECTROSTATICS */
684 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
685 ewrt = _mm_mul_pd(r22,ewtabscale);
686 ewitab = _mm_cvttpd_epi32(ewrt);
687 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
688 ewitab = _mm_slli_epi32(ewitab,2);
689 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
690 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
691 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
692 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
693 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
694 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
695 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
696 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
697 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
698 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
700 /* Update potential sum for this i atom from the interaction with this j atom. */
701 velecsum = _mm_add_pd(velecsum,velec);
705 /* Calculate temporary vectorial force */
706 tx = _mm_mul_pd(fscal,dx22);
707 ty = _mm_mul_pd(fscal,dy22);
708 tz = _mm_mul_pd(fscal,dz22);
710 /* Update vectorial force */
711 fix2 = _mm_add_pd(fix2,tx);
712 fiy2 = _mm_add_pd(fiy2,ty);
713 fiz2 = _mm_add_pd(fiz2,tz);
715 fjx2 = _mm_add_pd(fjx2,tx);
716 fjy2 = _mm_add_pd(fjy2,ty);
717 fjz2 = _mm_add_pd(fjz2,tz);
719 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
721 /* Inner loop uses 403 flops */
728 j_coord_offsetA = DIM*jnrA;
730 /* load j atom coordinates */
731 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
732 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
734 /* Calculate displacement vector */
735 dx00 = _mm_sub_pd(ix0,jx0);
736 dy00 = _mm_sub_pd(iy0,jy0);
737 dz00 = _mm_sub_pd(iz0,jz0);
738 dx01 = _mm_sub_pd(ix0,jx1);
739 dy01 = _mm_sub_pd(iy0,jy1);
740 dz01 = _mm_sub_pd(iz0,jz1);
741 dx02 = _mm_sub_pd(ix0,jx2);
742 dy02 = _mm_sub_pd(iy0,jy2);
743 dz02 = _mm_sub_pd(iz0,jz2);
744 dx10 = _mm_sub_pd(ix1,jx0);
745 dy10 = _mm_sub_pd(iy1,jy0);
746 dz10 = _mm_sub_pd(iz1,jz0);
747 dx11 = _mm_sub_pd(ix1,jx1);
748 dy11 = _mm_sub_pd(iy1,jy1);
749 dz11 = _mm_sub_pd(iz1,jz1);
750 dx12 = _mm_sub_pd(ix1,jx2);
751 dy12 = _mm_sub_pd(iy1,jy2);
752 dz12 = _mm_sub_pd(iz1,jz2);
753 dx20 = _mm_sub_pd(ix2,jx0);
754 dy20 = _mm_sub_pd(iy2,jy0);
755 dz20 = _mm_sub_pd(iz2,jz0);
756 dx21 = _mm_sub_pd(ix2,jx1);
757 dy21 = _mm_sub_pd(iy2,jy1);
758 dz21 = _mm_sub_pd(iz2,jz1);
759 dx22 = _mm_sub_pd(ix2,jx2);
760 dy22 = _mm_sub_pd(iy2,jy2);
761 dz22 = _mm_sub_pd(iz2,jz2);
763 /* Calculate squared distance and things based on it */
764 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
765 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
766 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
767 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
768 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
769 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
770 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
771 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
772 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
774 rinv00 = gmx_mm_invsqrt_pd(rsq00);
775 rinv01 = gmx_mm_invsqrt_pd(rsq01);
776 rinv02 = gmx_mm_invsqrt_pd(rsq02);
777 rinv10 = gmx_mm_invsqrt_pd(rsq10);
778 rinv11 = gmx_mm_invsqrt_pd(rsq11);
779 rinv12 = gmx_mm_invsqrt_pd(rsq12);
780 rinv20 = gmx_mm_invsqrt_pd(rsq20);
781 rinv21 = gmx_mm_invsqrt_pd(rsq21);
782 rinv22 = gmx_mm_invsqrt_pd(rsq22);
784 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
785 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
786 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
787 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
788 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
789 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
790 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
791 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
792 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
794 fjx0 = _mm_setzero_pd();
795 fjy0 = _mm_setzero_pd();
796 fjz0 = _mm_setzero_pd();
797 fjx1 = _mm_setzero_pd();
798 fjy1 = _mm_setzero_pd();
799 fjz1 = _mm_setzero_pd();
800 fjx2 = _mm_setzero_pd();
801 fjy2 = _mm_setzero_pd();
802 fjz2 = _mm_setzero_pd();
804 /**************************
805 * CALCULATE INTERACTIONS *
806 **************************/
808 r00 = _mm_mul_pd(rsq00,rinv00);
810 /* Calculate table index by multiplying r with table scale and truncate to integer */
811 rt = _mm_mul_pd(r00,vftabscale);
812 vfitab = _mm_cvttpd_epi32(rt);
813 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
814 vfitab = _mm_slli_epi32(vfitab,3);
816 /* EWALD ELECTROSTATICS */
818 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
819 ewrt = _mm_mul_pd(r00,ewtabscale);
820 ewitab = _mm_cvttpd_epi32(ewrt);
821 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
822 ewitab = _mm_slli_epi32(ewitab,2);
823 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
824 ewtabD = _mm_setzero_pd();
825 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
826 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
827 ewtabFn = _mm_setzero_pd();
828 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
829 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
830 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
831 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
832 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
834 /* CUBIC SPLINE TABLE DISPERSION */
835 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
836 F = _mm_setzero_pd();
837 GMX_MM_TRANSPOSE2_PD(Y,F);
838 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
839 H = _mm_setzero_pd();
840 GMX_MM_TRANSPOSE2_PD(G,H);
841 Heps = _mm_mul_pd(vfeps,H);
842 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
843 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
844 vvdw6 = _mm_mul_pd(c6_00,VV);
845 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
846 fvdw6 = _mm_mul_pd(c6_00,FF);
848 /* CUBIC SPLINE TABLE REPULSION */
849 vfitab = _mm_add_epi32(vfitab,ifour);
850 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
851 F = _mm_setzero_pd();
852 GMX_MM_TRANSPOSE2_PD(Y,F);
853 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
854 H = _mm_setzero_pd();
855 GMX_MM_TRANSPOSE2_PD(G,H);
856 Heps = _mm_mul_pd(vfeps,H);
857 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
858 VV = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
859 vvdw12 = _mm_mul_pd(c12_00,VV);
860 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
861 fvdw12 = _mm_mul_pd(c12_00,FF);
862 vvdw = _mm_add_pd(vvdw12,vvdw6);
863 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
865 /* Update potential sum for this i atom from the interaction with this j atom. */
866 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
867 velecsum = _mm_add_pd(velecsum,velec);
868 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
869 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
871 fscal = _mm_add_pd(felec,fvdw);
873 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
875 /* Calculate temporary vectorial force */
876 tx = _mm_mul_pd(fscal,dx00);
877 ty = _mm_mul_pd(fscal,dy00);
878 tz = _mm_mul_pd(fscal,dz00);
880 /* Update vectorial force */
881 fix0 = _mm_add_pd(fix0,tx);
882 fiy0 = _mm_add_pd(fiy0,ty);
883 fiz0 = _mm_add_pd(fiz0,tz);
885 fjx0 = _mm_add_pd(fjx0,tx);
886 fjy0 = _mm_add_pd(fjy0,ty);
887 fjz0 = _mm_add_pd(fjz0,tz);
889 /**************************
890 * CALCULATE INTERACTIONS *
891 **************************/
893 r01 = _mm_mul_pd(rsq01,rinv01);
895 /* EWALD ELECTROSTATICS */
897 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
898 ewrt = _mm_mul_pd(r01,ewtabscale);
899 ewitab = _mm_cvttpd_epi32(ewrt);
900 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
901 ewitab = _mm_slli_epi32(ewitab,2);
902 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
903 ewtabD = _mm_setzero_pd();
904 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
905 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
906 ewtabFn = _mm_setzero_pd();
907 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
908 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
909 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
910 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
911 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
913 /* Update potential sum for this i atom from the interaction with this j atom. */
914 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
915 velecsum = _mm_add_pd(velecsum,velec);
919 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
921 /* Calculate temporary vectorial force */
922 tx = _mm_mul_pd(fscal,dx01);
923 ty = _mm_mul_pd(fscal,dy01);
924 tz = _mm_mul_pd(fscal,dz01);
926 /* Update vectorial force */
927 fix0 = _mm_add_pd(fix0,tx);
928 fiy0 = _mm_add_pd(fiy0,ty);
929 fiz0 = _mm_add_pd(fiz0,tz);
931 fjx1 = _mm_add_pd(fjx1,tx);
932 fjy1 = _mm_add_pd(fjy1,ty);
933 fjz1 = _mm_add_pd(fjz1,tz);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 r02 = _mm_mul_pd(rsq02,rinv02);
941 /* EWALD ELECTROSTATICS */
943 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
944 ewrt = _mm_mul_pd(r02,ewtabscale);
945 ewitab = _mm_cvttpd_epi32(ewrt);
946 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
947 ewitab = _mm_slli_epi32(ewitab,2);
948 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
949 ewtabD = _mm_setzero_pd();
950 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
951 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
952 ewtabFn = _mm_setzero_pd();
953 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
954 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
955 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
956 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
957 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
959 /* Update potential sum for this i atom from the interaction with this j atom. */
960 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
961 velecsum = _mm_add_pd(velecsum,velec);
965 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
967 /* Calculate temporary vectorial force */
968 tx = _mm_mul_pd(fscal,dx02);
969 ty = _mm_mul_pd(fscal,dy02);
970 tz = _mm_mul_pd(fscal,dz02);
972 /* Update vectorial force */
973 fix0 = _mm_add_pd(fix0,tx);
974 fiy0 = _mm_add_pd(fiy0,ty);
975 fiz0 = _mm_add_pd(fiz0,tz);
977 fjx2 = _mm_add_pd(fjx2,tx);
978 fjy2 = _mm_add_pd(fjy2,ty);
979 fjz2 = _mm_add_pd(fjz2,tz);
981 /**************************
982 * CALCULATE INTERACTIONS *
983 **************************/
985 r10 = _mm_mul_pd(rsq10,rinv10);
987 /* EWALD ELECTROSTATICS */
989 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
990 ewrt = _mm_mul_pd(r10,ewtabscale);
991 ewitab = _mm_cvttpd_epi32(ewrt);
992 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
993 ewitab = _mm_slli_epi32(ewitab,2);
994 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
995 ewtabD = _mm_setzero_pd();
996 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
997 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
998 ewtabFn = _mm_setzero_pd();
999 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1000 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1001 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1002 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1003 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1005 /* Update potential sum for this i atom from the interaction with this j atom. */
1006 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1007 velecsum = _mm_add_pd(velecsum,velec);
1011 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1013 /* Calculate temporary vectorial force */
1014 tx = _mm_mul_pd(fscal,dx10);
1015 ty = _mm_mul_pd(fscal,dy10);
1016 tz = _mm_mul_pd(fscal,dz10);
1018 /* Update vectorial force */
1019 fix1 = _mm_add_pd(fix1,tx);
1020 fiy1 = _mm_add_pd(fiy1,ty);
1021 fiz1 = _mm_add_pd(fiz1,tz);
1023 fjx0 = _mm_add_pd(fjx0,tx);
1024 fjy0 = _mm_add_pd(fjy0,ty);
1025 fjz0 = _mm_add_pd(fjz0,tz);
1027 /**************************
1028 * CALCULATE INTERACTIONS *
1029 **************************/
1031 r11 = _mm_mul_pd(rsq11,rinv11);
1033 /* EWALD ELECTROSTATICS */
1035 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1036 ewrt = _mm_mul_pd(r11,ewtabscale);
1037 ewitab = _mm_cvttpd_epi32(ewrt);
1038 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1039 ewitab = _mm_slli_epi32(ewitab,2);
1040 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1041 ewtabD = _mm_setzero_pd();
1042 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1043 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1044 ewtabFn = _mm_setzero_pd();
1045 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1046 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1047 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1048 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1049 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1051 /* Update potential sum for this i atom from the interaction with this j atom. */
1052 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1053 velecsum = _mm_add_pd(velecsum,velec);
1057 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1059 /* Calculate temporary vectorial force */
1060 tx = _mm_mul_pd(fscal,dx11);
1061 ty = _mm_mul_pd(fscal,dy11);
1062 tz = _mm_mul_pd(fscal,dz11);
1064 /* Update vectorial force */
1065 fix1 = _mm_add_pd(fix1,tx);
1066 fiy1 = _mm_add_pd(fiy1,ty);
1067 fiz1 = _mm_add_pd(fiz1,tz);
1069 fjx1 = _mm_add_pd(fjx1,tx);
1070 fjy1 = _mm_add_pd(fjy1,ty);
1071 fjz1 = _mm_add_pd(fjz1,tz);
1073 /**************************
1074 * CALCULATE INTERACTIONS *
1075 **************************/
1077 r12 = _mm_mul_pd(rsq12,rinv12);
1079 /* EWALD ELECTROSTATICS */
1081 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1082 ewrt = _mm_mul_pd(r12,ewtabscale);
1083 ewitab = _mm_cvttpd_epi32(ewrt);
1084 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1085 ewitab = _mm_slli_epi32(ewitab,2);
1086 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1087 ewtabD = _mm_setzero_pd();
1088 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1089 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1090 ewtabFn = _mm_setzero_pd();
1091 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1092 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1093 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1094 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1095 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1097 /* Update potential sum for this i atom from the interaction with this j atom. */
1098 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1099 velecsum = _mm_add_pd(velecsum,velec);
1103 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1105 /* Calculate temporary vectorial force */
1106 tx = _mm_mul_pd(fscal,dx12);
1107 ty = _mm_mul_pd(fscal,dy12);
1108 tz = _mm_mul_pd(fscal,dz12);
1110 /* Update vectorial force */
1111 fix1 = _mm_add_pd(fix1,tx);
1112 fiy1 = _mm_add_pd(fiy1,ty);
1113 fiz1 = _mm_add_pd(fiz1,tz);
1115 fjx2 = _mm_add_pd(fjx2,tx);
1116 fjy2 = _mm_add_pd(fjy2,ty);
1117 fjz2 = _mm_add_pd(fjz2,tz);
1119 /**************************
1120 * CALCULATE INTERACTIONS *
1121 **************************/
1123 r20 = _mm_mul_pd(rsq20,rinv20);
1125 /* EWALD ELECTROSTATICS */
1127 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1128 ewrt = _mm_mul_pd(r20,ewtabscale);
1129 ewitab = _mm_cvttpd_epi32(ewrt);
1130 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1131 ewitab = _mm_slli_epi32(ewitab,2);
1132 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1133 ewtabD = _mm_setzero_pd();
1134 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1135 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1136 ewtabFn = _mm_setzero_pd();
1137 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1138 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1139 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1140 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1141 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1143 /* Update potential sum for this i atom from the interaction with this j atom. */
1144 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1145 velecsum = _mm_add_pd(velecsum,velec);
1149 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1151 /* Calculate temporary vectorial force */
1152 tx = _mm_mul_pd(fscal,dx20);
1153 ty = _mm_mul_pd(fscal,dy20);
1154 tz = _mm_mul_pd(fscal,dz20);
1156 /* Update vectorial force */
1157 fix2 = _mm_add_pd(fix2,tx);
1158 fiy2 = _mm_add_pd(fiy2,ty);
1159 fiz2 = _mm_add_pd(fiz2,tz);
1161 fjx0 = _mm_add_pd(fjx0,tx);
1162 fjy0 = _mm_add_pd(fjy0,ty);
1163 fjz0 = _mm_add_pd(fjz0,tz);
1165 /**************************
1166 * CALCULATE INTERACTIONS *
1167 **************************/
1169 r21 = _mm_mul_pd(rsq21,rinv21);
1171 /* EWALD ELECTROSTATICS */
1173 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1174 ewrt = _mm_mul_pd(r21,ewtabscale);
1175 ewitab = _mm_cvttpd_epi32(ewrt);
1176 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1177 ewitab = _mm_slli_epi32(ewitab,2);
1178 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1179 ewtabD = _mm_setzero_pd();
1180 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1181 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1182 ewtabFn = _mm_setzero_pd();
1183 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1184 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1185 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1186 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1187 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1189 /* Update potential sum for this i atom from the interaction with this j atom. */
1190 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1191 velecsum = _mm_add_pd(velecsum,velec);
1195 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1197 /* Calculate temporary vectorial force */
1198 tx = _mm_mul_pd(fscal,dx21);
1199 ty = _mm_mul_pd(fscal,dy21);
1200 tz = _mm_mul_pd(fscal,dz21);
1202 /* Update vectorial force */
1203 fix2 = _mm_add_pd(fix2,tx);
1204 fiy2 = _mm_add_pd(fiy2,ty);
1205 fiz2 = _mm_add_pd(fiz2,tz);
1207 fjx1 = _mm_add_pd(fjx1,tx);
1208 fjy1 = _mm_add_pd(fjy1,ty);
1209 fjz1 = _mm_add_pd(fjz1,tz);
1211 /**************************
1212 * CALCULATE INTERACTIONS *
1213 **************************/
1215 r22 = _mm_mul_pd(rsq22,rinv22);
1217 /* EWALD ELECTROSTATICS */
1219 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1220 ewrt = _mm_mul_pd(r22,ewtabscale);
1221 ewitab = _mm_cvttpd_epi32(ewrt);
1222 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1223 ewitab = _mm_slli_epi32(ewitab,2);
1224 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1225 ewtabD = _mm_setzero_pd();
1226 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1227 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1228 ewtabFn = _mm_setzero_pd();
1229 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1230 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1231 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1232 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1233 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1235 /* Update potential sum for this i atom from the interaction with this j atom. */
1236 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1237 velecsum = _mm_add_pd(velecsum,velec);
1241 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1243 /* Calculate temporary vectorial force */
1244 tx = _mm_mul_pd(fscal,dx22);
1245 ty = _mm_mul_pd(fscal,dy22);
1246 tz = _mm_mul_pd(fscal,dz22);
1248 /* Update vectorial force */
1249 fix2 = _mm_add_pd(fix2,tx);
1250 fiy2 = _mm_add_pd(fiy2,ty);
1251 fiz2 = _mm_add_pd(fiz2,tz);
1253 fjx2 = _mm_add_pd(fjx2,tx);
1254 fjy2 = _mm_add_pd(fjy2,ty);
1255 fjz2 = _mm_add_pd(fjz2,tz);
1257 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1259 /* Inner loop uses 403 flops */
1262 /* End of innermost loop */
1264 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1265 f+i_coord_offset,fshift+i_shift_offset);
1268 /* Update potential energies */
1269 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1270 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1272 /* Increment number of inner iterations */
1273 inneriter += j_index_end - j_index_start;
1275 /* Outer loop uses 20 flops */
1278 /* Increment number of outer iterations */
1281 /* Update outer/inner flops */
1283 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*403);
1286 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_double
1287 * Electrostatics interaction: Ewald
1288 * VdW interaction: CubicSplineTable
1289 * Geometry: Water3-Water3
1290 * Calculate force/pot: Force
1293 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_double
1294 (t_nblist * gmx_restrict nlist,
1295 rvec * gmx_restrict xx,
1296 rvec * gmx_restrict ff,
1297 t_forcerec * gmx_restrict fr,
1298 t_mdatoms * gmx_restrict mdatoms,
1299 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1300 t_nrnb * gmx_restrict nrnb)
1302 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1303 * just 0 for non-waters.
1304 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1305 * jnr indices corresponding to data put in the four positions in the SIMD register.
1307 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1308 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1310 int j_coord_offsetA,j_coord_offsetB;
1311 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1312 real rcutoff_scalar;
1313 real *shiftvec,*fshift,*x,*f;
1314 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1316 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1318 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1320 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1321 int vdwjidx0A,vdwjidx0B;
1322 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1323 int vdwjidx1A,vdwjidx1B;
1324 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1325 int vdwjidx2A,vdwjidx2B;
1326 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1327 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1328 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1329 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1330 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1331 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1332 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1333 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1334 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1335 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1336 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1339 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1342 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1343 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1345 __m128i ifour = _mm_set1_epi32(4);
1346 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1349 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1351 __m128d dummy_mask,cutoff_mask;
1352 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1353 __m128d one = _mm_set1_pd(1.0);
1354 __m128d two = _mm_set1_pd(2.0);
1360 jindex = nlist->jindex;
1362 shiftidx = nlist->shift;
1364 shiftvec = fr->shift_vec[0];
1365 fshift = fr->fshift[0];
1366 facel = _mm_set1_pd(fr->epsfac);
1367 charge = mdatoms->chargeA;
1368 nvdwtype = fr->ntype;
1369 vdwparam = fr->nbfp;
1370 vdwtype = mdatoms->typeA;
1372 vftab = kernel_data->table_vdw->data;
1373 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1375 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1376 ewtab = fr->ic->tabq_coul_F;
1377 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1378 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1380 /* Setup water-specific parameters */
1381 inr = nlist->iinr[0];
1382 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1383 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1384 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1385 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1387 jq0 = _mm_set1_pd(charge[inr+0]);
1388 jq1 = _mm_set1_pd(charge[inr+1]);
1389 jq2 = _mm_set1_pd(charge[inr+2]);
1390 vdwjidx0A = 2*vdwtype[inr+0];
1391 qq00 = _mm_mul_pd(iq0,jq0);
1392 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1393 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1394 qq01 = _mm_mul_pd(iq0,jq1);
1395 qq02 = _mm_mul_pd(iq0,jq2);
1396 qq10 = _mm_mul_pd(iq1,jq0);
1397 qq11 = _mm_mul_pd(iq1,jq1);
1398 qq12 = _mm_mul_pd(iq1,jq2);
1399 qq20 = _mm_mul_pd(iq2,jq0);
1400 qq21 = _mm_mul_pd(iq2,jq1);
1401 qq22 = _mm_mul_pd(iq2,jq2);
1403 /* Avoid stupid compiler warnings */
1405 j_coord_offsetA = 0;
1406 j_coord_offsetB = 0;
1411 /* Start outer loop over neighborlists */
1412 for(iidx=0; iidx<nri; iidx++)
1414 /* Load shift vector for this list */
1415 i_shift_offset = DIM*shiftidx[iidx];
1417 /* Load limits for loop over neighbors */
1418 j_index_start = jindex[iidx];
1419 j_index_end = jindex[iidx+1];
1421 /* Get outer coordinate index */
1423 i_coord_offset = DIM*inr;
1425 /* Load i particle coords and add shift vector */
1426 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1427 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1429 fix0 = _mm_setzero_pd();
1430 fiy0 = _mm_setzero_pd();
1431 fiz0 = _mm_setzero_pd();
1432 fix1 = _mm_setzero_pd();
1433 fiy1 = _mm_setzero_pd();
1434 fiz1 = _mm_setzero_pd();
1435 fix2 = _mm_setzero_pd();
1436 fiy2 = _mm_setzero_pd();
1437 fiz2 = _mm_setzero_pd();
1439 /* Start inner kernel loop */
1440 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1443 /* Get j neighbor index, and coordinate index */
1445 jnrB = jjnr[jidx+1];
1446 j_coord_offsetA = DIM*jnrA;
1447 j_coord_offsetB = DIM*jnrB;
1449 /* load j atom coordinates */
1450 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1451 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1453 /* Calculate displacement vector */
1454 dx00 = _mm_sub_pd(ix0,jx0);
1455 dy00 = _mm_sub_pd(iy0,jy0);
1456 dz00 = _mm_sub_pd(iz0,jz0);
1457 dx01 = _mm_sub_pd(ix0,jx1);
1458 dy01 = _mm_sub_pd(iy0,jy1);
1459 dz01 = _mm_sub_pd(iz0,jz1);
1460 dx02 = _mm_sub_pd(ix0,jx2);
1461 dy02 = _mm_sub_pd(iy0,jy2);
1462 dz02 = _mm_sub_pd(iz0,jz2);
1463 dx10 = _mm_sub_pd(ix1,jx0);
1464 dy10 = _mm_sub_pd(iy1,jy0);
1465 dz10 = _mm_sub_pd(iz1,jz0);
1466 dx11 = _mm_sub_pd(ix1,jx1);
1467 dy11 = _mm_sub_pd(iy1,jy1);
1468 dz11 = _mm_sub_pd(iz1,jz1);
1469 dx12 = _mm_sub_pd(ix1,jx2);
1470 dy12 = _mm_sub_pd(iy1,jy2);
1471 dz12 = _mm_sub_pd(iz1,jz2);
1472 dx20 = _mm_sub_pd(ix2,jx0);
1473 dy20 = _mm_sub_pd(iy2,jy0);
1474 dz20 = _mm_sub_pd(iz2,jz0);
1475 dx21 = _mm_sub_pd(ix2,jx1);
1476 dy21 = _mm_sub_pd(iy2,jy1);
1477 dz21 = _mm_sub_pd(iz2,jz1);
1478 dx22 = _mm_sub_pd(ix2,jx2);
1479 dy22 = _mm_sub_pd(iy2,jy2);
1480 dz22 = _mm_sub_pd(iz2,jz2);
1482 /* Calculate squared distance and things based on it */
1483 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1484 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1485 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1486 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1487 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1488 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1489 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1490 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1491 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1493 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1494 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1495 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1496 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1497 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1498 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1499 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1500 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1501 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1503 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1504 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1505 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1506 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1507 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1508 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1509 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1510 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1511 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1513 fjx0 = _mm_setzero_pd();
1514 fjy0 = _mm_setzero_pd();
1515 fjz0 = _mm_setzero_pd();
1516 fjx1 = _mm_setzero_pd();
1517 fjy1 = _mm_setzero_pd();
1518 fjz1 = _mm_setzero_pd();
1519 fjx2 = _mm_setzero_pd();
1520 fjy2 = _mm_setzero_pd();
1521 fjz2 = _mm_setzero_pd();
1523 /**************************
1524 * CALCULATE INTERACTIONS *
1525 **************************/
1527 r00 = _mm_mul_pd(rsq00,rinv00);
1529 /* Calculate table index by multiplying r with table scale and truncate to integer */
1530 rt = _mm_mul_pd(r00,vftabscale);
1531 vfitab = _mm_cvttpd_epi32(rt);
1532 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1533 vfitab = _mm_slli_epi32(vfitab,3);
1535 /* EWALD ELECTROSTATICS */
1537 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1538 ewrt = _mm_mul_pd(r00,ewtabscale);
1539 ewitab = _mm_cvttpd_epi32(ewrt);
1540 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1541 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1543 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1544 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1546 /* CUBIC SPLINE TABLE DISPERSION */
1547 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1548 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1549 GMX_MM_TRANSPOSE2_PD(Y,F);
1550 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1551 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1552 GMX_MM_TRANSPOSE2_PD(G,H);
1553 Heps = _mm_mul_pd(vfeps,H);
1554 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1555 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1556 fvdw6 = _mm_mul_pd(c6_00,FF);
1558 /* CUBIC SPLINE TABLE REPULSION */
1559 vfitab = _mm_add_epi32(vfitab,ifour);
1560 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1561 F = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1562 GMX_MM_TRANSPOSE2_PD(Y,F);
1563 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1564 H = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1565 GMX_MM_TRANSPOSE2_PD(G,H);
1566 Heps = _mm_mul_pd(vfeps,H);
1567 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1568 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1569 fvdw12 = _mm_mul_pd(c12_00,FF);
1570 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1572 fscal = _mm_add_pd(felec,fvdw);
1574 /* Calculate temporary vectorial force */
1575 tx = _mm_mul_pd(fscal,dx00);
1576 ty = _mm_mul_pd(fscal,dy00);
1577 tz = _mm_mul_pd(fscal,dz00);
1579 /* Update vectorial force */
1580 fix0 = _mm_add_pd(fix0,tx);
1581 fiy0 = _mm_add_pd(fiy0,ty);
1582 fiz0 = _mm_add_pd(fiz0,tz);
1584 fjx0 = _mm_add_pd(fjx0,tx);
1585 fjy0 = _mm_add_pd(fjy0,ty);
1586 fjz0 = _mm_add_pd(fjz0,tz);
1588 /**************************
1589 * CALCULATE INTERACTIONS *
1590 **************************/
1592 r01 = _mm_mul_pd(rsq01,rinv01);
1594 /* EWALD ELECTROSTATICS */
1596 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1597 ewrt = _mm_mul_pd(r01,ewtabscale);
1598 ewitab = _mm_cvttpd_epi32(ewrt);
1599 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1600 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1602 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1603 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1607 /* Calculate temporary vectorial force */
1608 tx = _mm_mul_pd(fscal,dx01);
1609 ty = _mm_mul_pd(fscal,dy01);
1610 tz = _mm_mul_pd(fscal,dz01);
1612 /* Update vectorial force */
1613 fix0 = _mm_add_pd(fix0,tx);
1614 fiy0 = _mm_add_pd(fiy0,ty);
1615 fiz0 = _mm_add_pd(fiz0,tz);
1617 fjx1 = _mm_add_pd(fjx1,tx);
1618 fjy1 = _mm_add_pd(fjy1,ty);
1619 fjz1 = _mm_add_pd(fjz1,tz);
1621 /**************************
1622 * CALCULATE INTERACTIONS *
1623 **************************/
1625 r02 = _mm_mul_pd(rsq02,rinv02);
1627 /* EWALD ELECTROSTATICS */
1629 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1630 ewrt = _mm_mul_pd(r02,ewtabscale);
1631 ewitab = _mm_cvttpd_epi32(ewrt);
1632 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1633 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1635 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1636 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1640 /* Calculate temporary vectorial force */
1641 tx = _mm_mul_pd(fscal,dx02);
1642 ty = _mm_mul_pd(fscal,dy02);
1643 tz = _mm_mul_pd(fscal,dz02);
1645 /* Update vectorial force */
1646 fix0 = _mm_add_pd(fix0,tx);
1647 fiy0 = _mm_add_pd(fiy0,ty);
1648 fiz0 = _mm_add_pd(fiz0,tz);
1650 fjx2 = _mm_add_pd(fjx2,tx);
1651 fjy2 = _mm_add_pd(fjy2,ty);
1652 fjz2 = _mm_add_pd(fjz2,tz);
1654 /**************************
1655 * CALCULATE INTERACTIONS *
1656 **************************/
1658 r10 = _mm_mul_pd(rsq10,rinv10);
1660 /* EWALD ELECTROSTATICS */
1662 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1663 ewrt = _mm_mul_pd(r10,ewtabscale);
1664 ewitab = _mm_cvttpd_epi32(ewrt);
1665 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1666 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1668 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1669 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1673 /* Calculate temporary vectorial force */
1674 tx = _mm_mul_pd(fscal,dx10);
1675 ty = _mm_mul_pd(fscal,dy10);
1676 tz = _mm_mul_pd(fscal,dz10);
1678 /* Update vectorial force */
1679 fix1 = _mm_add_pd(fix1,tx);
1680 fiy1 = _mm_add_pd(fiy1,ty);
1681 fiz1 = _mm_add_pd(fiz1,tz);
1683 fjx0 = _mm_add_pd(fjx0,tx);
1684 fjy0 = _mm_add_pd(fjy0,ty);
1685 fjz0 = _mm_add_pd(fjz0,tz);
1687 /**************************
1688 * CALCULATE INTERACTIONS *
1689 **************************/
1691 r11 = _mm_mul_pd(rsq11,rinv11);
1693 /* EWALD ELECTROSTATICS */
1695 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1696 ewrt = _mm_mul_pd(r11,ewtabscale);
1697 ewitab = _mm_cvttpd_epi32(ewrt);
1698 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1699 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1701 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1702 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1706 /* Calculate temporary vectorial force */
1707 tx = _mm_mul_pd(fscal,dx11);
1708 ty = _mm_mul_pd(fscal,dy11);
1709 tz = _mm_mul_pd(fscal,dz11);
1711 /* Update vectorial force */
1712 fix1 = _mm_add_pd(fix1,tx);
1713 fiy1 = _mm_add_pd(fiy1,ty);
1714 fiz1 = _mm_add_pd(fiz1,tz);
1716 fjx1 = _mm_add_pd(fjx1,tx);
1717 fjy1 = _mm_add_pd(fjy1,ty);
1718 fjz1 = _mm_add_pd(fjz1,tz);
1720 /**************************
1721 * CALCULATE INTERACTIONS *
1722 **************************/
1724 r12 = _mm_mul_pd(rsq12,rinv12);
1726 /* EWALD ELECTROSTATICS */
1728 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1729 ewrt = _mm_mul_pd(r12,ewtabscale);
1730 ewitab = _mm_cvttpd_epi32(ewrt);
1731 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1732 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1734 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1735 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1739 /* Calculate temporary vectorial force */
1740 tx = _mm_mul_pd(fscal,dx12);
1741 ty = _mm_mul_pd(fscal,dy12);
1742 tz = _mm_mul_pd(fscal,dz12);
1744 /* Update vectorial force */
1745 fix1 = _mm_add_pd(fix1,tx);
1746 fiy1 = _mm_add_pd(fiy1,ty);
1747 fiz1 = _mm_add_pd(fiz1,tz);
1749 fjx2 = _mm_add_pd(fjx2,tx);
1750 fjy2 = _mm_add_pd(fjy2,ty);
1751 fjz2 = _mm_add_pd(fjz2,tz);
1753 /**************************
1754 * CALCULATE INTERACTIONS *
1755 **************************/
1757 r20 = _mm_mul_pd(rsq20,rinv20);
1759 /* EWALD ELECTROSTATICS */
1761 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1762 ewrt = _mm_mul_pd(r20,ewtabscale);
1763 ewitab = _mm_cvttpd_epi32(ewrt);
1764 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1765 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1767 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1768 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1772 /* Calculate temporary vectorial force */
1773 tx = _mm_mul_pd(fscal,dx20);
1774 ty = _mm_mul_pd(fscal,dy20);
1775 tz = _mm_mul_pd(fscal,dz20);
1777 /* Update vectorial force */
1778 fix2 = _mm_add_pd(fix2,tx);
1779 fiy2 = _mm_add_pd(fiy2,ty);
1780 fiz2 = _mm_add_pd(fiz2,tz);
1782 fjx0 = _mm_add_pd(fjx0,tx);
1783 fjy0 = _mm_add_pd(fjy0,ty);
1784 fjz0 = _mm_add_pd(fjz0,tz);
1786 /**************************
1787 * CALCULATE INTERACTIONS *
1788 **************************/
1790 r21 = _mm_mul_pd(rsq21,rinv21);
1792 /* EWALD ELECTROSTATICS */
1794 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1795 ewrt = _mm_mul_pd(r21,ewtabscale);
1796 ewitab = _mm_cvttpd_epi32(ewrt);
1797 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1798 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1800 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1801 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1805 /* Calculate temporary vectorial force */
1806 tx = _mm_mul_pd(fscal,dx21);
1807 ty = _mm_mul_pd(fscal,dy21);
1808 tz = _mm_mul_pd(fscal,dz21);
1810 /* Update vectorial force */
1811 fix2 = _mm_add_pd(fix2,tx);
1812 fiy2 = _mm_add_pd(fiy2,ty);
1813 fiz2 = _mm_add_pd(fiz2,tz);
1815 fjx1 = _mm_add_pd(fjx1,tx);
1816 fjy1 = _mm_add_pd(fjy1,ty);
1817 fjz1 = _mm_add_pd(fjz1,tz);
1819 /**************************
1820 * CALCULATE INTERACTIONS *
1821 **************************/
1823 r22 = _mm_mul_pd(rsq22,rinv22);
1825 /* EWALD ELECTROSTATICS */
1827 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1828 ewrt = _mm_mul_pd(r22,ewtabscale);
1829 ewitab = _mm_cvttpd_epi32(ewrt);
1830 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1831 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1833 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1834 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1838 /* Calculate temporary vectorial force */
1839 tx = _mm_mul_pd(fscal,dx22);
1840 ty = _mm_mul_pd(fscal,dy22);
1841 tz = _mm_mul_pd(fscal,dz22);
1843 /* Update vectorial force */
1844 fix2 = _mm_add_pd(fix2,tx);
1845 fiy2 = _mm_add_pd(fiy2,ty);
1846 fiz2 = _mm_add_pd(fiz2,tz);
1848 fjx2 = _mm_add_pd(fjx2,tx);
1849 fjy2 = _mm_add_pd(fjy2,ty);
1850 fjz2 = _mm_add_pd(fjz2,tz);
1852 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1854 /* Inner loop uses 350 flops */
1857 if(jidx<j_index_end)
1861 j_coord_offsetA = DIM*jnrA;
1863 /* load j atom coordinates */
1864 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1865 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1867 /* Calculate displacement vector */
1868 dx00 = _mm_sub_pd(ix0,jx0);
1869 dy00 = _mm_sub_pd(iy0,jy0);
1870 dz00 = _mm_sub_pd(iz0,jz0);
1871 dx01 = _mm_sub_pd(ix0,jx1);
1872 dy01 = _mm_sub_pd(iy0,jy1);
1873 dz01 = _mm_sub_pd(iz0,jz1);
1874 dx02 = _mm_sub_pd(ix0,jx2);
1875 dy02 = _mm_sub_pd(iy0,jy2);
1876 dz02 = _mm_sub_pd(iz0,jz2);
1877 dx10 = _mm_sub_pd(ix1,jx0);
1878 dy10 = _mm_sub_pd(iy1,jy0);
1879 dz10 = _mm_sub_pd(iz1,jz0);
1880 dx11 = _mm_sub_pd(ix1,jx1);
1881 dy11 = _mm_sub_pd(iy1,jy1);
1882 dz11 = _mm_sub_pd(iz1,jz1);
1883 dx12 = _mm_sub_pd(ix1,jx2);
1884 dy12 = _mm_sub_pd(iy1,jy2);
1885 dz12 = _mm_sub_pd(iz1,jz2);
1886 dx20 = _mm_sub_pd(ix2,jx0);
1887 dy20 = _mm_sub_pd(iy2,jy0);
1888 dz20 = _mm_sub_pd(iz2,jz0);
1889 dx21 = _mm_sub_pd(ix2,jx1);
1890 dy21 = _mm_sub_pd(iy2,jy1);
1891 dz21 = _mm_sub_pd(iz2,jz1);
1892 dx22 = _mm_sub_pd(ix2,jx2);
1893 dy22 = _mm_sub_pd(iy2,jy2);
1894 dz22 = _mm_sub_pd(iz2,jz2);
1896 /* Calculate squared distance and things based on it */
1897 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1898 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1899 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1900 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1901 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1902 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1903 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1904 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1905 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1907 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1908 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1909 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1910 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1911 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1912 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1913 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1914 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1915 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1917 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1918 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1919 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1920 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1921 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1922 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1923 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1924 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1925 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1927 fjx0 = _mm_setzero_pd();
1928 fjy0 = _mm_setzero_pd();
1929 fjz0 = _mm_setzero_pd();
1930 fjx1 = _mm_setzero_pd();
1931 fjy1 = _mm_setzero_pd();
1932 fjz1 = _mm_setzero_pd();
1933 fjx2 = _mm_setzero_pd();
1934 fjy2 = _mm_setzero_pd();
1935 fjz2 = _mm_setzero_pd();
1937 /**************************
1938 * CALCULATE INTERACTIONS *
1939 **************************/
1941 r00 = _mm_mul_pd(rsq00,rinv00);
1943 /* Calculate table index by multiplying r with table scale and truncate to integer */
1944 rt = _mm_mul_pd(r00,vftabscale);
1945 vfitab = _mm_cvttpd_epi32(rt);
1946 vfeps = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1947 vfitab = _mm_slli_epi32(vfitab,3);
1949 /* EWALD ELECTROSTATICS */
1951 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1952 ewrt = _mm_mul_pd(r00,ewtabscale);
1953 ewitab = _mm_cvttpd_epi32(ewrt);
1954 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1955 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1956 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1957 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1959 /* CUBIC SPLINE TABLE DISPERSION */
1960 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1961 F = _mm_setzero_pd();
1962 GMX_MM_TRANSPOSE2_PD(Y,F);
1963 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1964 H = _mm_setzero_pd();
1965 GMX_MM_TRANSPOSE2_PD(G,H);
1966 Heps = _mm_mul_pd(vfeps,H);
1967 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1968 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1969 fvdw6 = _mm_mul_pd(c6_00,FF);
1971 /* CUBIC SPLINE TABLE REPULSION */
1972 vfitab = _mm_add_epi32(vfitab,ifour);
1973 Y = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1974 F = _mm_setzero_pd();
1975 GMX_MM_TRANSPOSE2_PD(Y,F);
1976 G = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1977 H = _mm_setzero_pd();
1978 GMX_MM_TRANSPOSE2_PD(G,H);
1979 Heps = _mm_mul_pd(vfeps,H);
1980 Fp = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1981 FF = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1982 fvdw12 = _mm_mul_pd(c12_00,FF);
1983 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1985 fscal = _mm_add_pd(felec,fvdw);
1987 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1989 /* Calculate temporary vectorial force */
1990 tx = _mm_mul_pd(fscal,dx00);
1991 ty = _mm_mul_pd(fscal,dy00);
1992 tz = _mm_mul_pd(fscal,dz00);
1994 /* Update vectorial force */
1995 fix0 = _mm_add_pd(fix0,tx);
1996 fiy0 = _mm_add_pd(fiy0,ty);
1997 fiz0 = _mm_add_pd(fiz0,tz);
1999 fjx0 = _mm_add_pd(fjx0,tx);
2000 fjy0 = _mm_add_pd(fjy0,ty);
2001 fjz0 = _mm_add_pd(fjz0,tz);
2003 /**************************
2004 * CALCULATE INTERACTIONS *
2005 **************************/
2007 r01 = _mm_mul_pd(rsq01,rinv01);
2009 /* EWALD ELECTROSTATICS */
2011 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2012 ewrt = _mm_mul_pd(r01,ewtabscale);
2013 ewitab = _mm_cvttpd_epi32(ewrt);
2014 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2015 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2016 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2017 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2021 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2023 /* Calculate temporary vectorial force */
2024 tx = _mm_mul_pd(fscal,dx01);
2025 ty = _mm_mul_pd(fscal,dy01);
2026 tz = _mm_mul_pd(fscal,dz01);
2028 /* Update vectorial force */
2029 fix0 = _mm_add_pd(fix0,tx);
2030 fiy0 = _mm_add_pd(fiy0,ty);
2031 fiz0 = _mm_add_pd(fiz0,tz);
2033 fjx1 = _mm_add_pd(fjx1,tx);
2034 fjy1 = _mm_add_pd(fjy1,ty);
2035 fjz1 = _mm_add_pd(fjz1,tz);
2037 /**************************
2038 * CALCULATE INTERACTIONS *
2039 **************************/
2041 r02 = _mm_mul_pd(rsq02,rinv02);
2043 /* EWALD ELECTROSTATICS */
2045 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2046 ewrt = _mm_mul_pd(r02,ewtabscale);
2047 ewitab = _mm_cvttpd_epi32(ewrt);
2048 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2049 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2050 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2051 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2055 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2057 /* Calculate temporary vectorial force */
2058 tx = _mm_mul_pd(fscal,dx02);
2059 ty = _mm_mul_pd(fscal,dy02);
2060 tz = _mm_mul_pd(fscal,dz02);
2062 /* Update vectorial force */
2063 fix0 = _mm_add_pd(fix0,tx);
2064 fiy0 = _mm_add_pd(fiy0,ty);
2065 fiz0 = _mm_add_pd(fiz0,tz);
2067 fjx2 = _mm_add_pd(fjx2,tx);
2068 fjy2 = _mm_add_pd(fjy2,ty);
2069 fjz2 = _mm_add_pd(fjz2,tz);
2071 /**************************
2072 * CALCULATE INTERACTIONS *
2073 **************************/
2075 r10 = _mm_mul_pd(rsq10,rinv10);
2077 /* EWALD ELECTROSTATICS */
2079 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2080 ewrt = _mm_mul_pd(r10,ewtabscale);
2081 ewitab = _mm_cvttpd_epi32(ewrt);
2082 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2083 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2084 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2085 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2089 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2091 /* Calculate temporary vectorial force */
2092 tx = _mm_mul_pd(fscal,dx10);
2093 ty = _mm_mul_pd(fscal,dy10);
2094 tz = _mm_mul_pd(fscal,dz10);
2096 /* Update vectorial force */
2097 fix1 = _mm_add_pd(fix1,tx);
2098 fiy1 = _mm_add_pd(fiy1,ty);
2099 fiz1 = _mm_add_pd(fiz1,tz);
2101 fjx0 = _mm_add_pd(fjx0,tx);
2102 fjy0 = _mm_add_pd(fjy0,ty);
2103 fjz0 = _mm_add_pd(fjz0,tz);
2105 /**************************
2106 * CALCULATE INTERACTIONS *
2107 **************************/
2109 r11 = _mm_mul_pd(rsq11,rinv11);
2111 /* EWALD ELECTROSTATICS */
2113 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2114 ewrt = _mm_mul_pd(r11,ewtabscale);
2115 ewitab = _mm_cvttpd_epi32(ewrt);
2116 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2117 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2118 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2119 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2123 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2125 /* Calculate temporary vectorial force */
2126 tx = _mm_mul_pd(fscal,dx11);
2127 ty = _mm_mul_pd(fscal,dy11);
2128 tz = _mm_mul_pd(fscal,dz11);
2130 /* Update vectorial force */
2131 fix1 = _mm_add_pd(fix1,tx);
2132 fiy1 = _mm_add_pd(fiy1,ty);
2133 fiz1 = _mm_add_pd(fiz1,tz);
2135 fjx1 = _mm_add_pd(fjx1,tx);
2136 fjy1 = _mm_add_pd(fjy1,ty);
2137 fjz1 = _mm_add_pd(fjz1,tz);
2139 /**************************
2140 * CALCULATE INTERACTIONS *
2141 **************************/
2143 r12 = _mm_mul_pd(rsq12,rinv12);
2145 /* EWALD ELECTROSTATICS */
2147 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2148 ewrt = _mm_mul_pd(r12,ewtabscale);
2149 ewitab = _mm_cvttpd_epi32(ewrt);
2150 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2151 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2152 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2153 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2157 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2159 /* Calculate temporary vectorial force */
2160 tx = _mm_mul_pd(fscal,dx12);
2161 ty = _mm_mul_pd(fscal,dy12);
2162 tz = _mm_mul_pd(fscal,dz12);
2164 /* Update vectorial force */
2165 fix1 = _mm_add_pd(fix1,tx);
2166 fiy1 = _mm_add_pd(fiy1,ty);
2167 fiz1 = _mm_add_pd(fiz1,tz);
2169 fjx2 = _mm_add_pd(fjx2,tx);
2170 fjy2 = _mm_add_pd(fjy2,ty);
2171 fjz2 = _mm_add_pd(fjz2,tz);
2173 /**************************
2174 * CALCULATE INTERACTIONS *
2175 **************************/
2177 r20 = _mm_mul_pd(rsq20,rinv20);
2179 /* EWALD ELECTROSTATICS */
2181 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2182 ewrt = _mm_mul_pd(r20,ewtabscale);
2183 ewitab = _mm_cvttpd_epi32(ewrt);
2184 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2185 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2186 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2187 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2191 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2193 /* Calculate temporary vectorial force */
2194 tx = _mm_mul_pd(fscal,dx20);
2195 ty = _mm_mul_pd(fscal,dy20);
2196 tz = _mm_mul_pd(fscal,dz20);
2198 /* Update vectorial force */
2199 fix2 = _mm_add_pd(fix2,tx);
2200 fiy2 = _mm_add_pd(fiy2,ty);
2201 fiz2 = _mm_add_pd(fiz2,tz);
2203 fjx0 = _mm_add_pd(fjx0,tx);
2204 fjy0 = _mm_add_pd(fjy0,ty);
2205 fjz0 = _mm_add_pd(fjz0,tz);
2207 /**************************
2208 * CALCULATE INTERACTIONS *
2209 **************************/
2211 r21 = _mm_mul_pd(rsq21,rinv21);
2213 /* EWALD ELECTROSTATICS */
2215 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2216 ewrt = _mm_mul_pd(r21,ewtabscale);
2217 ewitab = _mm_cvttpd_epi32(ewrt);
2218 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2219 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2220 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2221 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2225 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2227 /* Calculate temporary vectorial force */
2228 tx = _mm_mul_pd(fscal,dx21);
2229 ty = _mm_mul_pd(fscal,dy21);
2230 tz = _mm_mul_pd(fscal,dz21);
2232 /* Update vectorial force */
2233 fix2 = _mm_add_pd(fix2,tx);
2234 fiy2 = _mm_add_pd(fiy2,ty);
2235 fiz2 = _mm_add_pd(fiz2,tz);
2237 fjx1 = _mm_add_pd(fjx1,tx);
2238 fjy1 = _mm_add_pd(fjy1,ty);
2239 fjz1 = _mm_add_pd(fjz1,tz);
2241 /**************************
2242 * CALCULATE INTERACTIONS *
2243 **************************/
2245 r22 = _mm_mul_pd(rsq22,rinv22);
2247 /* EWALD ELECTROSTATICS */
2249 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2250 ewrt = _mm_mul_pd(r22,ewtabscale);
2251 ewitab = _mm_cvttpd_epi32(ewrt);
2252 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
2253 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2254 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2255 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2259 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2261 /* Calculate temporary vectorial force */
2262 tx = _mm_mul_pd(fscal,dx22);
2263 ty = _mm_mul_pd(fscal,dy22);
2264 tz = _mm_mul_pd(fscal,dz22);
2266 /* Update vectorial force */
2267 fix2 = _mm_add_pd(fix2,tx);
2268 fiy2 = _mm_add_pd(fiy2,ty);
2269 fiz2 = _mm_add_pd(fiz2,tz);
2271 fjx2 = _mm_add_pd(fjx2,tx);
2272 fjy2 = _mm_add_pd(fjy2,ty);
2273 fjz2 = _mm_add_pd(fjz2,tz);
2275 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2277 /* Inner loop uses 350 flops */
2280 /* End of innermost loop */
2282 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2283 f+i_coord_offset,fshift+i_shift_offset);
2285 /* Increment number of inner iterations */
2286 inneriter += j_index_end - j_index_start;
2288 /* Outer loop uses 18 flops */
2291 /* Increment number of outer iterations */
2294 /* Update outer/inner flops */
2296 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*350);