2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LJEwald
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87 int vdwjidx0A,vdwjidx0B;
88 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89 int vdwjidx1A,vdwjidx1B;
90 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91 int vdwjidx2A,vdwjidx2B;
92 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93 int vdwjidx3A,vdwjidx3B;
94 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
99 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
102 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
103 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
104 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
105 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
108 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
111 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
112 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
123 __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
125 __m128d one_half = _mm_set1_pd(0.5);
126 __m128d minus_one = _mm_set1_pd(-1.0);
128 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
130 __m128d dummy_mask,cutoff_mask;
131 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
132 __m128d one = _mm_set1_pd(1.0);
133 __m128d two = _mm_set1_pd(2.0);
139 jindex = nlist->jindex;
141 shiftidx = nlist->shift;
143 shiftvec = fr->shift_vec[0];
144 fshift = fr->fshift[0];
145 facel = _mm_set1_pd(fr->ic->epsfac);
146 charge = mdatoms->chargeA;
147 nvdwtype = fr->ntype;
149 vdwtype = mdatoms->typeA;
150 vdwgridparam = fr->ljpme_c6grid;
151 sh_lj_ewald = _mm_set1_pd(fr->ic->sh_lj_ewald);
152 ewclj = _mm_set1_pd(fr->ic->ewaldcoeff_lj);
153 ewclj2 = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
155 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
156 ewtab = fr->ic->tabq_coul_FDV0;
157 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
158 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
160 /* Setup water-specific parameters */
161 inr = nlist->iinr[0];
162 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
163 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
164 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
165 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
167 jq1 = _mm_set1_pd(charge[inr+1]);
168 jq2 = _mm_set1_pd(charge[inr+2]);
169 jq3 = _mm_set1_pd(charge[inr+3]);
170 vdwjidx0A = 2*vdwtype[inr+0];
171 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
172 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
173 c6grid_00 = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
174 qq11 = _mm_mul_pd(iq1,jq1);
175 qq12 = _mm_mul_pd(iq1,jq2);
176 qq13 = _mm_mul_pd(iq1,jq3);
177 qq21 = _mm_mul_pd(iq2,jq1);
178 qq22 = _mm_mul_pd(iq2,jq2);
179 qq23 = _mm_mul_pd(iq2,jq3);
180 qq31 = _mm_mul_pd(iq3,jq1);
181 qq32 = _mm_mul_pd(iq3,jq2);
182 qq33 = _mm_mul_pd(iq3,jq3);
184 /* Avoid stupid compiler warnings */
192 /* Start outer loop over neighborlists */
193 for(iidx=0; iidx<nri; iidx++)
195 /* Load shift vector for this list */
196 i_shift_offset = DIM*shiftidx[iidx];
198 /* Load limits for loop over neighbors */
199 j_index_start = jindex[iidx];
200 j_index_end = jindex[iidx+1];
202 /* Get outer coordinate index */
204 i_coord_offset = DIM*inr;
206 /* Load i particle coords and add shift vector */
207 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
208 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
210 fix0 = _mm_setzero_pd();
211 fiy0 = _mm_setzero_pd();
212 fiz0 = _mm_setzero_pd();
213 fix1 = _mm_setzero_pd();
214 fiy1 = _mm_setzero_pd();
215 fiz1 = _mm_setzero_pd();
216 fix2 = _mm_setzero_pd();
217 fiy2 = _mm_setzero_pd();
218 fiz2 = _mm_setzero_pd();
219 fix3 = _mm_setzero_pd();
220 fiy3 = _mm_setzero_pd();
221 fiz3 = _mm_setzero_pd();
223 /* Reset potential sums */
224 velecsum = _mm_setzero_pd();
225 vvdwsum = _mm_setzero_pd();
227 /* Start inner kernel loop */
228 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
231 /* Get j neighbor index, and coordinate index */
234 j_coord_offsetA = DIM*jnrA;
235 j_coord_offsetB = DIM*jnrB;
237 /* load j atom coordinates */
238 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
239 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
240 &jy2,&jz2,&jx3,&jy3,&jz3);
242 /* Calculate displacement vector */
243 dx00 = _mm_sub_pd(ix0,jx0);
244 dy00 = _mm_sub_pd(iy0,jy0);
245 dz00 = _mm_sub_pd(iz0,jz0);
246 dx11 = _mm_sub_pd(ix1,jx1);
247 dy11 = _mm_sub_pd(iy1,jy1);
248 dz11 = _mm_sub_pd(iz1,jz1);
249 dx12 = _mm_sub_pd(ix1,jx2);
250 dy12 = _mm_sub_pd(iy1,jy2);
251 dz12 = _mm_sub_pd(iz1,jz2);
252 dx13 = _mm_sub_pd(ix1,jx3);
253 dy13 = _mm_sub_pd(iy1,jy3);
254 dz13 = _mm_sub_pd(iz1,jz3);
255 dx21 = _mm_sub_pd(ix2,jx1);
256 dy21 = _mm_sub_pd(iy2,jy1);
257 dz21 = _mm_sub_pd(iz2,jz1);
258 dx22 = _mm_sub_pd(ix2,jx2);
259 dy22 = _mm_sub_pd(iy2,jy2);
260 dz22 = _mm_sub_pd(iz2,jz2);
261 dx23 = _mm_sub_pd(ix2,jx3);
262 dy23 = _mm_sub_pd(iy2,jy3);
263 dz23 = _mm_sub_pd(iz2,jz3);
264 dx31 = _mm_sub_pd(ix3,jx1);
265 dy31 = _mm_sub_pd(iy3,jy1);
266 dz31 = _mm_sub_pd(iz3,jz1);
267 dx32 = _mm_sub_pd(ix3,jx2);
268 dy32 = _mm_sub_pd(iy3,jy2);
269 dz32 = _mm_sub_pd(iz3,jz2);
270 dx33 = _mm_sub_pd(ix3,jx3);
271 dy33 = _mm_sub_pd(iy3,jy3);
272 dz33 = _mm_sub_pd(iz3,jz3);
274 /* Calculate squared distance and things based on it */
275 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
276 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
277 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
278 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
279 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
280 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
281 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
282 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
283 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
284 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
286 rinv00 = sse41_invsqrt_d(rsq00);
287 rinv11 = sse41_invsqrt_d(rsq11);
288 rinv12 = sse41_invsqrt_d(rsq12);
289 rinv13 = sse41_invsqrt_d(rsq13);
290 rinv21 = sse41_invsqrt_d(rsq21);
291 rinv22 = sse41_invsqrt_d(rsq22);
292 rinv23 = sse41_invsqrt_d(rsq23);
293 rinv31 = sse41_invsqrt_d(rsq31);
294 rinv32 = sse41_invsqrt_d(rsq32);
295 rinv33 = sse41_invsqrt_d(rsq33);
297 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
298 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
299 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
300 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
301 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
302 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
303 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
304 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
305 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
306 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
308 fjx0 = _mm_setzero_pd();
309 fjy0 = _mm_setzero_pd();
310 fjz0 = _mm_setzero_pd();
311 fjx1 = _mm_setzero_pd();
312 fjy1 = _mm_setzero_pd();
313 fjz1 = _mm_setzero_pd();
314 fjx2 = _mm_setzero_pd();
315 fjy2 = _mm_setzero_pd();
316 fjz2 = _mm_setzero_pd();
317 fjx3 = _mm_setzero_pd();
318 fjy3 = _mm_setzero_pd();
319 fjz3 = _mm_setzero_pd();
321 /**************************
322 * CALCULATE INTERACTIONS *
323 **************************/
325 r00 = _mm_mul_pd(rsq00,rinv00);
327 /* Analytical LJ-PME */
328 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
329 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
330 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
331 exponent = sse41_exp_d(ewcljrsq);
332 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
333 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
334 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
335 vvdw6 = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
336 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
337 vvdw = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
338 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
339 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
341 /* Update potential sum for this i atom from the interaction with this j atom. */
342 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
346 /* Calculate temporary vectorial force */
347 tx = _mm_mul_pd(fscal,dx00);
348 ty = _mm_mul_pd(fscal,dy00);
349 tz = _mm_mul_pd(fscal,dz00);
351 /* Update vectorial force */
352 fix0 = _mm_add_pd(fix0,tx);
353 fiy0 = _mm_add_pd(fiy0,ty);
354 fiz0 = _mm_add_pd(fiz0,tz);
356 fjx0 = _mm_add_pd(fjx0,tx);
357 fjy0 = _mm_add_pd(fjy0,ty);
358 fjz0 = _mm_add_pd(fjz0,tz);
360 /**************************
361 * CALCULATE INTERACTIONS *
362 **************************/
364 r11 = _mm_mul_pd(rsq11,rinv11);
366 /* EWALD ELECTROSTATICS */
368 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
369 ewrt = _mm_mul_pd(r11,ewtabscale);
370 ewitab = _mm_cvttpd_epi32(ewrt);
371 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
372 ewitab = _mm_slli_epi32(ewitab,2);
373 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
374 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
375 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
376 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
377 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
378 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
379 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
380 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
381 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
382 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
384 /* Update potential sum for this i atom from the interaction with this j atom. */
385 velecsum = _mm_add_pd(velecsum,velec);
389 /* Calculate temporary vectorial force */
390 tx = _mm_mul_pd(fscal,dx11);
391 ty = _mm_mul_pd(fscal,dy11);
392 tz = _mm_mul_pd(fscal,dz11);
394 /* Update vectorial force */
395 fix1 = _mm_add_pd(fix1,tx);
396 fiy1 = _mm_add_pd(fiy1,ty);
397 fiz1 = _mm_add_pd(fiz1,tz);
399 fjx1 = _mm_add_pd(fjx1,tx);
400 fjy1 = _mm_add_pd(fjy1,ty);
401 fjz1 = _mm_add_pd(fjz1,tz);
403 /**************************
404 * CALCULATE INTERACTIONS *
405 **************************/
407 r12 = _mm_mul_pd(rsq12,rinv12);
409 /* EWALD ELECTROSTATICS */
411 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
412 ewrt = _mm_mul_pd(r12,ewtabscale);
413 ewitab = _mm_cvttpd_epi32(ewrt);
414 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
415 ewitab = _mm_slli_epi32(ewitab,2);
416 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
417 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
418 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
419 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
420 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
421 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
422 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
423 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
424 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
425 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
427 /* Update potential sum for this i atom from the interaction with this j atom. */
428 velecsum = _mm_add_pd(velecsum,velec);
432 /* Calculate temporary vectorial force */
433 tx = _mm_mul_pd(fscal,dx12);
434 ty = _mm_mul_pd(fscal,dy12);
435 tz = _mm_mul_pd(fscal,dz12);
437 /* Update vectorial force */
438 fix1 = _mm_add_pd(fix1,tx);
439 fiy1 = _mm_add_pd(fiy1,ty);
440 fiz1 = _mm_add_pd(fiz1,tz);
442 fjx2 = _mm_add_pd(fjx2,tx);
443 fjy2 = _mm_add_pd(fjy2,ty);
444 fjz2 = _mm_add_pd(fjz2,tz);
446 /**************************
447 * CALCULATE INTERACTIONS *
448 **************************/
450 r13 = _mm_mul_pd(rsq13,rinv13);
452 /* EWALD ELECTROSTATICS */
454 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
455 ewrt = _mm_mul_pd(r13,ewtabscale);
456 ewitab = _mm_cvttpd_epi32(ewrt);
457 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
458 ewitab = _mm_slli_epi32(ewitab,2);
459 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
460 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
461 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
462 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
463 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
464 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
465 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
466 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
467 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
468 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velecsum = _mm_add_pd(velecsum,velec);
475 /* Calculate temporary vectorial force */
476 tx = _mm_mul_pd(fscal,dx13);
477 ty = _mm_mul_pd(fscal,dy13);
478 tz = _mm_mul_pd(fscal,dz13);
480 /* Update vectorial force */
481 fix1 = _mm_add_pd(fix1,tx);
482 fiy1 = _mm_add_pd(fiy1,ty);
483 fiz1 = _mm_add_pd(fiz1,tz);
485 fjx3 = _mm_add_pd(fjx3,tx);
486 fjy3 = _mm_add_pd(fjy3,ty);
487 fjz3 = _mm_add_pd(fjz3,tz);
489 /**************************
490 * CALCULATE INTERACTIONS *
491 **************************/
493 r21 = _mm_mul_pd(rsq21,rinv21);
495 /* EWALD ELECTROSTATICS */
497 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
498 ewrt = _mm_mul_pd(r21,ewtabscale);
499 ewitab = _mm_cvttpd_epi32(ewrt);
500 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
501 ewitab = _mm_slli_epi32(ewitab,2);
502 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
503 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
504 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
505 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
506 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
507 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
508 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
509 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
510 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
511 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
513 /* Update potential sum for this i atom from the interaction with this j atom. */
514 velecsum = _mm_add_pd(velecsum,velec);
518 /* Calculate temporary vectorial force */
519 tx = _mm_mul_pd(fscal,dx21);
520 ty = _mm_mul_pd(fscal,dy21);
521 tz = _mm_mul_pd(fscal,dz21);
523 /* Update vectorial force */
524 fix2 = _mm_add_pd(fix2,tx);
525 fiy2 = _mm_add_pd(fiy2,ty);
526 fiz2 = _mm_add_pd(fiz2,tz);
528 fjx1 = _mm_add_pd(fjx1,tx);
529 fjy1 = _mm_add_pd(fjy1,ty);
530 fjz1 = _mm_add_pd(fjz1,tz);
532 /**************************
533 * CALCULATE INTERACTIONS *
534 **************************/
536 r22 = _mm_mul_pd(rsq22,rinv22);
538 /* EWALD ELECTROSTATICS */
540 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
541 ewrt = _mm_mul_pd(r22,ewtabscale);
542 ewitab = _mm_cvttpd_epi32(ewrt);
543 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
544 ewitab = _mm_slli_epi32(ewitab,2);
545 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
546 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
547 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
548 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
549 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
550 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
551 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
552 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
553 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
554 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
556 /* Update potential sum for this i atom from the interaction with this j atom. */
557 velecsum = _mm_add_pd(velecsum,velec);
561 /* Calculate temporary vectorial force */
562 tx = _mm_mul_pd(fscal,dx22);
563 ty = _mm_mul_pd(fscal,dy22);
564 tz = _mm_mul_pd(fscal,dz22);
566 /* Update vectorial force */
567 fix2 = _mm_add_pd(fix2,tx);
568 fiy2 = _mm_add_pd(fiy2,ty);
569 fiz2 = _mm_add_pd(fiz2,tz);
571 fjx2 = _mm_add_pd(fjx2,tx);
572 fjy2 = _mm_add_pd(fjy2,ty);
573 fjz2 = _mm_add_pd(fjz2,tz);
575 /**************************
576 * CALCULATE INTERACTIONS *
577 **************************/
579 r23 = _mm_mul_pd(rsq23,rinv23);
581 /* EWALD ELECTROSTATICS */
583 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
584 ewrt = _mm_mul_pd(r23,ewtabscale);
585 ewitab = _mm_cvttpd_epi32(ewrt);
586 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
587 ewitab = _mm_slli_epi32(ewitab,2);
588 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
589 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
590 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
591 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
592 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
593 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
594 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
595 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
596 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
597 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
599 /* Update potential sum for this i atom from the interaction with this j atom. */
600 velecsum = _mm_add_pd(velecsum,velec);
604 /* Calculate temporary vectorial force */
605 tx = _mm_mul_pd(fscal,dx23);
606 ty = _mm_mul_pd(fscal,dy23);
607 tz = _mm_mul_pd(fscal,dz23);
609 /* Update vectorial force */
610 fix2 = _mm_add_pd(fix2,tx);
611 fiy2 = _mm_add_pd(fiy2,ty);
612 fiz2 = _mm_add_pd(fiz2,tz);
614 fjx3 = _mm_add_pd(fjx3,tx);
615 fjy3 = _mm_add_pd(fjy3,ty);
616 fjz3 = _mm_add_pd(fjz3,tz);
618 /**************************
619 * CALCULATE INTERACTIONS *
620 **************************/
622 r31 = _mm_mul_pd(rsq31,rinv31);
624 /* EWALD ELECTROSTATICS */
626 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
627 ewrt = _mm_mul_pd(r31,ewtabscale);
628 ewitab = _mm_cvttpd_epi32(ewrt);
629 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
630 ewitab = _mm_slli_epi32(ewitab,2);
631 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
632 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
633 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
634 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
635 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
636 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
637 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
638 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
639 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
640 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
642 /* Update potential sum for this i atom from the interaction with this j atom. */
643 velecsum = _mm_add_pd(velecsum,velec);
647 /* Calculate temporary vectorial force */
648 tx = _mm_mul_pd(fscal,dx31);
649 ty = _mm_mul_pd(fscal,dy31);
650 tz = _mm_mul_pd(fscal,dz31);
652 /* Update vectorial force */
653 fix3 = _mm_add_pd(fix3,tx);
654 fiy3 = _mm_add_pd(fiy3,ty);
655 fiz3 = _mm_add_pd(fiz3,tz);
657 fjx1 = _mm_add_pd(fjx1,tx);
658 fjy1 = _mm_add_pd(fjy1,ty);
659 fjz1 = _mm_add_pd(fjz1,tz);
661 /**************************
662 * CALCULATE INTERACTIONS *
663 **************************/
665 r32 = _mm_mul_pd(rsq32,rinv32);
667 /* EWALD ELECTROSTATICS */
669 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
670 ewrt = _mm_mul_pd(r32,ewtabscale);
671 ewitab = _mm_cvttpd_epi32(ewrt);
672 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
673 ewitab = _mm_slli_epi32(ewitab,2);
674 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
675 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
676 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
677 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
678 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
679 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
680 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
681 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
682 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
683 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
685 /* Update potential sum for this i atom from the interaction with this j atom. */
686 velecsum = _mm_add_pd(velecsum,velec);
690 /* Calculate temporary vectorial force */
691 tx = _mm_mul_pd(fscal,dx32);
692 ty = _mm_mul_pd(fscal,dy32);
693 tz = _mm_mul_pd(fscal,dz32);
695 /* Update vectorial force */
696 fix3 = _mm_add_pd(fix3,tx);
697 fiy3 = _mm_add_pd(fiy3,ty);
698 fiz3 = _mm_add_pd(fiz3,tz);
700 fjx2 = _mm_add_pd(fjx2,tx);
701 fjy2 = _mm_add_pd(fjy2,ty);
702 fjz2 = _mm_add_pd(fjz2,tz);
704 /**************************
705 * CALCULATE INTERACTIONS *
706 **************************/
708 r33 = _mm_mul_pd(rsq33,rinv33);
710 /* EWALD ELECTROSTATICS */
712 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
713 ewrt = _mm_mul_pd(r33,ewtabscale);
714 ewitab = _mm_cvttpd_epi32(ewrt);
715 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
716 ewitab = _mm_slli_epi32(ewitab,2);
717 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
718 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
719 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
720 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
721 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
722 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
723 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
724 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
725 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
726 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
728 /* Update potential sum for this i atom from the interaction with this j atom. */
729 velecsum = _mm_add_pd(velecsum,velec);
733 /* Calculate temporary vectorial force */
734 tx = _mm_mul_pd(fscal,dx33);
735 ty = _mm_mul_pd(fscal,dy33);
736 tz = _mm_mul_pd(fscal,dz33);
738 /* Update vectorial force */
739 fix3 = _mm_add_pd(fix3,tx);
740 fiy3 = _mm_add_pd(fiy3,ty);
741 fiz3 = _mm_add_pd(fiz3,tz);
743 fjx3 = _mm_add_pd(fjx3,tx);
744 fjy3 = _mm_add_pd(fjy3,ty);
745 fjz3 = _mm_add_pd(fjz3,tz);
747 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
749 /* Inner loop uses 423 flops */
756 j_coord_offsetA = DIM*jnrA;
758 /* load j atom coordinates */
759 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
760 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
761 &jy2,&jz2,&jx3,&jy3,&jz3);
763 /* Calculate displacement vector */
764 dx00 = _mm_sub_pd(ix0,jx0);
765 dy00 = _mm_sub_pd(iy0,jy0);
766 dz00 = _mm_sub_pd(iz0,jz0);
767 dx11 = _mm_sub_pd(ix1,jx1);
768 dy11 = _mm_sub_pd(iy1,jy1);
769 dz11 = _mm_sub_pd(iz1,jz1);
770 dx12 = _mm_sub_pd(ix1,jx2);
771 dy12 = _mm_sub_pd(iy1,jy2);
772 dz12 = _mm_sub_pd(iz1,jz2);
773 dx13 = _mm_sub_pd(ix1,jx3);
774 dy13 = _mm_sub_pd(iy1,jy3);
775 dz13 = _mm_sub_pd(iz1,jz3);
776 dx21 = _mm_sub_pd(ix2,jx1);
777 dy21 = _mm_sub_pd(iy2,jy1);
778 dz21 = _mm_sub_pd(iz2,jz1);
779 dx22 = _mm_sub_pd(ix2,jx2);
780 dy22 = _mm_sub_pd(iy2,jy2);
781 dz22 = _mm_sub_pd(iz2,jz2);
782 dx23 = _mm_sub_pd(ix2,jx3);
783 dy23 = _mm_sub_pd(iy2,jy3);
784 dz23 = _mm_sub_pd(iz2,jz3);
785 dx31 = _mm_sub_pd(ix3,jx1);
786 dy31 = _mm_sub_pd(iy3,jy1);
787 dz31 = _mm_sub_pd(iz3,jz1);
788 dx32 = _mm_sub_pd(ix3,jx2);
789 dy32 = _mm_sub_pd(iy3,jy2);
790 dz32 = _mm_sub_pd(iz3,jz2);
791 dx33 = _mm_sub_pd(ix3,jx3);
792 dy33 = _mm_sub_pd(iy3,jy3);
793 dz33 = _mm_sub_pd(iz3,jz3);
795 /* Calculate squared distance and things based on it */
796 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
797 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
798 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
799 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
800 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
801 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
802 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
803 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
804 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
805 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
807 rinv00 = sse41_invsqrt_d(rsq00);
808 rinv11 = sse41_invsqrt_d(rsq11);
809 rinv12 = sse41_invsqrt_d(rsq12);
810 rinv13 = sse41_invsqrt_d(rsq13);
811 rinv21 = sse41_invsqrt_d(rsq21);
812 rinv22 = sse41_invsqrt_d(rsq22);
813 rinv23 = sse41_invsqrt_d(rsq23);
814 rinv31 = sse41_invsqrt_d(rsq31);
815 rinv32 = sse41_invsqrt_d(rsq32);
816 rinv33 = sse41_invsqrt_d(rsq33);
818 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
819 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
820 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
821 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
822 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
823 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
824 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
825 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
826 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
827 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
829 fjx0 = _mm_setzero_pd();
830 fjy0 = _mm_setzero_pd();
831 fjz0 = _mm_setzero_pd();
832 fjx1 = _mm_setzero_pd();
833 fjy1 = _mm_setzero_pd();
834 fjz1 = _mm_setzero_pd();
835 fjx2 = _mm_setzero_pd();
836 fjy2 = _mm_setzero_pd();
837 fjz2 = _mm_setzero_pd();
838 fjx3 = _mm_setzero_pd();
839 fjy3 = _mm_setzero_pd();
840 fjz3 = _mm_setzero_pd();
842 /**************************
843 * CALCULATE INTERACTIONS *
844 **************************/
846 r00 = _mm_mul_pd(rsq00,rinv00);
848 /* Analytical LJ-PME */
849 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
850 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
851 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
852 exponent = sse41_exp_d(ewcljrsq);
853 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
854 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
855 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
856 vvdw6 = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
857 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
858 vvdw = _mm_sub_pd(_mm_mul_pd(vvdw12,one_twelfth),_mm_mul_pd(vvdw6,one_sixth));
859 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
860 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
862 /* Update potential sum for this i atom from the interaction with this j atom. */
863 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
864 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
868 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
870 /* Calculate temporary vectorial force */
871 tx = _mm_mul_pd(fscal,dx00);
872 ty = _mm_mul_pd(fscal,dy00);
873 tz = _mm_mul_pd(fscal,dz00);
875 /* Update vectorial force */
876 fix0 = _mm_add_pd(fix0,tx);
877 fiy0 = _mm_add_pd(fiy0,ty);
878 fiz0 = _mm_add_pd(fiz0,tz);
880 fjx0 = _mm_add_pd(fjx0,tx);
881 fjy0 = _mm_add_pd(fjy0,ty);
882 fjz0 = _mm_add_pd(fjz0,tz);
884 /**************************
885 * CALCULATE INTERACTIONS *
886 **************************/
888 r11 = _mm_mul_pd(rsq11,rinv11);
890 /* EWALD ELECTROSTATICS */
892 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
893 ewrt = _mm_mul_pd(r11,ewtabscale);
894 ewitab = _mm_cvttpd_epi32(ewrt);
895 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
896 ewitab = _mm_slli_epi32(ewitab,2);
897 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
898 ewtabD = _mm_setzero_pd();
899 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
900 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
901 ewtabFn = _mm_setzero_pd();
902 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
903 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
904 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
905 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
906 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
908 /* Update potential sum for this i atom from the interaction with this j atom. */
909 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
910 velecsum = _mm_add_pd(velecsum,velec);
914 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
916 /* Calculate temporary vectorial force */
917 tx = _mm_mul_pd(fscal,dx11);
918 ty = _mm_mul_pd(fscal,dy11);
919 tz = _mm_mul_pd(fscal,dz11);
921 /* Update vectorial force */
922 fix1 = _mm_add_pd(fix1,tx);
923 fiy1 = _mm_add_pd(fiy1,ty);
924 fiz1 = _mm_add_pd(fiz1,tz);
926 fjx1 = _mm_add_pd(fjx1,tx);
927 fjy1 = _mm_add_pd(fjy1,ty);
928 fjz1 = _mm_add_pd(fjz1,tz);
930 /**************************
931 * CALCULATE INTERACTIONS *
932 **************************/
934 r12 = _mm_mul_pd(rsq12,rinv12);
936 /* EWALD ELECTROSTATICS */
938 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
939 ewrt = _mm_mul_pd(r12,ewtabscale);
940 ewitab = _mm_cvttpd_epi32(ewrt);
941 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
942 ewitab = _mm_slli_epi32(ewitab,2);
943 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
944 ewtabD = _mm_setzero_pd();
945 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
946 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
947 ewtabFn = _mm_setzero_pd();
948 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
949 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
950 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
951 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
952 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
954 /* Update potential sum for this i atom from the interaction with this j atom. */
955 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
956 velecsum = _mm_add_pd(velecsum,velec);
960 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
962 /* Calculate temporary vectorial force */
963 tx = _mm_mul_pd(fscal,dx12);
964 ty = _mm_mul_pd(fscal,dy12);
965 tz = _mm_mul_pd(fscal,dz12);
967 /* Update vectorial force */
968 fix1 = _mm_add_pd(fix1,tx);
969 fiy1 = _mm_add_pd(fiy1,ty);
970 fiz1 = _mm_add_pd(fiz1,tz);
972 fjx2 = _mm_add_pd(fjx2,tx);
973 fjy2 = _mm_add_pd(fjy2,ty);
974 fjz2 = _mm_add_pd(fjz2,tz);
976 /**************************
977 * CALCULATE INTERACTIONS *
978 **************************/
980 r13 = _mm_mul_pd(rsq13,rinv13);
982 /* EWALD ELECTROSTATICS */
984 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
985 ewrt = _mm_mul_pd(r13,ewtabscale);
986 ewitab = _mm_cvttpd_epi32(ewrt);
987 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
988 ewitab = _mm_slli_epi32(ewitab,2);
989 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
990 ewtabD = _mm_setzero_pd();
991 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
992 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
993 ewtabFn = _mm_setzero_pd();
994 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
995 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
996 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
997 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
998 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1000 /* Update potential sum for this i atom from the interaction with this j atom. */
1001 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1002 velecsum = _mm_add_pd(velecsum,velec);
1006 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1008 /* Calculate temporary vectorial force */
1009 tx = _mm_mul_pd(fscal,dx13);
1010 ty = _mm_mul_pd(fscal,dy13);
1011 tz = _mm_mul_pd(fscal,dz13);
1013 /* Update vectorial force */
1014 fix1 = _mm_add_pd(fix1,tx);
1015 fiy1 = _mm_add_pd(fiy1,ty);
1016 fiz1 = _mm_add_pd(fiz1,tz);
1018 fjx3 = _mm_add_pd(fjx3,tx);
1019 fjy3 = _mm_add_pd(fjy3,ty);
1020 fjz3 = _mm_add_pd(fjz3,tz);
1022 /**************************
1023 * CALCULATE INTERACTIONS *
1024 **************************/
1026 r21 = _mm_mul_pd(rsq21,rinv21);
1028 /* EWALD ELECTROSTATICS */
1030 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1031 ewrt = _mm_mul_pd(r21,ewtabscale);
1032 ewitab = _mm_cvttpd_epi32(ewrt);
1033 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1034 ewitab = _mm_slli_epi32(ewitab,2);
1035 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1036 ewtabD = _mm_setzero_pd();
1037 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1038 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1039 ewtabFn = _mm_setzero_pd();
1040 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1041 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1042 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1043 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1044 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1046 /* Update potential sum for this i atom from the interaction with this j atom. */
1047 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1048 velecsum = _mm_add_pd(velecsum,velec);
1052 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1054 /* Calculate temporary vectorial force */
1055 tx = _mm_mul_pd(fscal,dx21);
1056 ty = _mm_mul_pd(fscal,dy21);
1057 tz = _mm_mul_pd(fscal,dz21);
1059 /* Update vectorial force */
1060 fix2 = _mm_add_pd(fix2,tx);
1061 fiy2 = _mm_add_pd(fiy2,ty);
1062 fiz2 = _mm_add_pd(fiz2,tz);
1064 fjx1 = _mm_add_pd(fjx1,tx);
1065 fjy1 = _mm_add_pd(fjy1,ty);
1066 fjz1 = _mm_add_pd(fjz1,tz);
1068 /**************************
1069 * CALCULATE INTERACTIONS *
1070 **************************/
1072 r22 = _mm_mul_pd(rsq22,rinv22);
1074 /* EWALD ELECTROSTATICS */
1076 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1077 ewrt = _mm_mul_pd(r22,ewtabscale);
1078 ewitab = _mm_cvttpd_epi32(ewrt);
1079 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1080 ewitab = _mm_slli_epi32(ewitab,2);
1081 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1082 ewtabD = _mm_setzero_pd();
1083 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1084 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1085 ewtabFn = _mm_setzero_pd();
1086 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1087 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1088 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1089 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1090 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1092 /* Update potential sum for this i atom from the interaction with this j atom. */
1093 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1094 velecsum = _mm_add_pd(velecsum,velec);
1098 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1100 /* Calculate temporary vectorial force */
1101 tx = _mm_mul_pd(fscal,dx22);
1102 ty = _mm_mul_pd(fscal,dy22);
1103 tz = _mm_mul_pd(fscal,dz22);
1105 /* Update vectorial force */
1106 fix2 = _mm_add_pd(fix2,tx);
1107 fiy2 = _mm_add_pd(fiy2,ty);
1108 fiz2 = _mm_add_pd(fiz2,tz);
1110 fjx2 = _mm_add_pd(fjx2,tx);
1111 fjy2 = _mm_add_pd(fjy2,ty);
1112 fjz2 = _mm_add_pd(fjz2,tz);
1114 /**************************
1115 * CALCULATE INTERACTIONS *
1116 **************************/
1118 r23 = _mm_mul_pd(rsq23,rinv23);
1120 /* EWALD ELECTROSTATICS */
1122 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1123 ewrt = _mm_mul_pd(r23,ewtabscale);
1124 ewitab = _mm_cvttpd_epi32(ewrt);
1125 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1126 ewitab = _mm_slli_epi32(ewitab,2);
1127 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1128 ewtabD = _mm_setzero_pd();
1129 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1130 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1131 ewtabFn = _mm_setzero_pd();
1132 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1133 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1134 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1135 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1136 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1138 /* Update potential sum for this i atom from the interaction with this j atom. */
1139 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1140 velecsum = _mm_add_pd(velecsum,velec);
1144 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1146 /* Calculate temporary vectorial force */
1147 tx = _mm_mul_pd(fscal,dx23);
1148 ty = _mm_mul_pd(fscal,dy23);
1149 tz = _mm_mul_pd(fscal,dz23);
1151 /* Update vectorial force */
1152 fix2 = _mm_add_pd(fix2,tx);
1153 fiy2 = _mm_add_pd(fiy2,ty);
1154 fiz2 = _mm_add_pd(fiz2,tz);
1156 fjx3 = _mm_add_pd(fjx3,tx);
1157 fjy3 = _mm_add_pd(fjy3,ty);
1158 fjz3 = _mm_add_pd(fjz3,tz);
1160 /**************************
1161 * CALCULATE INTERACTIONS *
1162 **************************/
1164 r31 = _mm_mul_pd(rsq31,rinv31);
1166 /* EWALD ELECTROSTATICS */
1168 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1169 ewrt = _mm_mul_pd(r31,ewtabscale);
1170 ewitab = _mm_cvttpd_epi32(ewrt);
1171 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1172 ewitab = _mm_slli_epi32(ewitab,2);
1173 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1174 ewtabD = _mm_setzero_pd();
1175 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1176 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1177 ewtabFn = _mm_setzero_pd();
1178 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1179 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1180 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1181 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1182 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1184 /* Update potential sum for this i atom from the interaction with this j atom. */
1185 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1186 velecsum = _mm_add_pd(velecsum,velec);
1190 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1192 /* Calculate temporary vectorial force */
1193 tx = _mm_mul_pd(fscal,dx31);
1194 ty = _mm_mul_pd(fscal,dy31);
1195 tz = _mm_mul_pd(fscal,dz31);
1197 /* Update vectorial force */
1198 fix3 = _mm_add_pd(fix3,tx);
1199 fiy3 = _mm_add_pd(fiy3,ty);
1200 fiz3 = _mm_add_pd(fiz3,tz);
1202 fjx1 = _mm_add_pd(fjx1,tx);
1203 fjy1 = _mm_add_pd(fjy1,ty);
1204 fjz1 = _mm_add_pd(fjz1,tz);
1206 /**************************
1207 * CALCULATE INTERACTIONS *
1208 **************************/
1210 r32 = _mm_mul_pd(rsq32,rinv32);
1212 /* EWALD ELECTROSTATICS */
1214 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1215 ewrt = _mm_mul_pd(r32,ewtabscale);
1216 ewitab = _mm_cvttpd_epi32(ewrt);
1217 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1218 ewitab = _mm_slli_epi32(ewitab,2);
1219 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1220 ewtabD = _mm_setzero_pd();
1221 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1222 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1223 ewtabFn = _mm_setzero_pd();
1224 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1225 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1226 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1227 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1228 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1230 /* Update potential sum for this i atom from the interaction with this j atom. */
1231 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1232 velecsum = _mm_add_pd(velecsum,velec);
1236 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1238 /* Calculate temporary vectorial force */
1239 tx = _mm_mul_pd(fscal,dx32);
1240 ty = _mm_mul_pd(fscal,dy32);
1241 tz = _mm_mul_pd(fscal,dz32);
1243 /* Update vectorial force */
1244 fix3 = _mm_add_pd(fix3,tx);
1245 fiy3 = _mm_add_pd(fiy3,ty);
1246 fiz3 = _mm_add_pd(fiz3,tz);
1248 fjx2 = _mm_add_pd(fjx2,tx);
1249 fjy2 = _mm_add_pd(fjy2,ty);
1250 fjz2 = _mm_add_pd(fjz2,tz);
1252 /**************************
1253 * CALCULATE INTERACTIONS *
1254 **************************/
1256 r33 = _mm_mul_pd(rsq33,rinv33);
1258 /* EWALD ELECTROSTATICS */
1260 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1261 ewrt = _mm_mul_pd(r33,ewtabscale);
1262 ewitab = _mm_cvttpd_epi32(ewrt);
1263 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1264 ewitab = _mm_slli_epi32(ewitab,2);
1265 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1266 ewtabD = _mm_setzero_pd();
1267 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1268 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1269 ewtabFn = _mm_setzero_pd();
1270 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1271 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1272 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1273 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1274 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1276 /* Update potential sum for this i atom from the interaction with this j atom. */
1277 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1278 velecsum = _mm_add_pd(velecsum,velec);
1282 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1284 /* Calculate temporary vectorial force */
1285 tx = _mm_mul_pd(fscal,dx33);
1286 ty = _mm_mul_pd(fscal,dy33);
1287 tz = _mm_mul_pd(fscal,dz33);
1289 /* Update vectorial force */
1290 fix3 = _mm_add_pd(fix3,tx);
1291 fiy3 = _mm_add_pd(fiy3,ty);
1292 fiz3 = _mm_add_pd(fiz3,tz);
1294 fjx3 = _mm_add_pd(fjx3,tx);
1295 fjy3 = _mm_add_pd(fjy3,ty);
1296 fjz3 = _mm_add_pd(fjz3,tz);
1298 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1300 /* Inner loop uses 423 flops */
1303 /* End of innermost loop */
1305 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1306 f+i_coord_offset,fshift+i_shift_offset);
1309 /* Update potential energies */
1310 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1311 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1313 /* Increment number of inner iterations */
1314 inneriter += j_index_end - j_index_start;
1316 /* Outer loop uses 26 flops */
1319 /* Increment number of outer iterations */
1322 /* Update outer/inner flops */
1324 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*423);
1327 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_double
1328 * Electrostatics interaction: Ewald
1329 * VdW interaction: LJEwald
1330 * Geometry: Water4-Water4
1331 * Calculate force/pot: Force
1334 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_double
1335 (t_nblist * gmx_restrict nlist,
1336 rvec * gmx_restrict xx,
1337 rvec * gmx_restrict ff,
1338 struct t_forcerec * gmx_restrict fr,
1339 t_mdatoms * gmx_restrict mdatoms,
1340 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1341 t_nrnb * gmx_restrict nrnb)
1343 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1344 * just 0 for non-waters.
1345 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1346 * jnr indices corresponding to data put in the four positions in the SIMD register.
1348 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1349 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1351 int j_coord_offsetA,j_coord_offsetB;
1352 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1353 real rcutoff_scalar;
1354 real *shiftvec,*fshift,*x,*f;
1355 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1357 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1359 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1361 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1363 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1364 int vdwjidx0A,vdwjidx0B;
1365 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1366 int vdwjidx1A,vdwjidx1B;
1367 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1368 int vdwjidx2A,vdwjidx2B;
1369 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1370 int vdwjidx3A,vdwjidx3B;
1371 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1372 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1373 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1374 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1375 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1376 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1377 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1378 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1379 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1380 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1381 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1382 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1385 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1388 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1389 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1400 __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1402 __m128d one_half = _mm_set1_pd(0.5);
1403 __m128d minus_one = _mm_set1_pd(-1.0);
1405 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1407 __m128d dummy_mask,cutoff_mask;
1408 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1409 __m128d one = _mm_set1_pd(1.0);
1410 __m128d two = _mm_set1_pd(2.0);
1416 jindex = nlist->jindex;
1418 shiftidx = nlist->shift;
1420 shiftvec = fr->shift_vec[0];
1421 fshift = fr->fshift[0];
1422 facel = _mm_set1_pd(fr->ic->epsfac);
1423 charge = mdatoms->chargeA;
1424 nvdwtype = fr->ntype;
1425 vdwparam = fr->nbfp;
1426 vdwtype = mdatoms->typeA;
1427 vdwgridparam = fr->ljpme_c6grid;
1428 sh_lj_ewald = _mm_set1_pd(fr->ic->sh_lj_ewald);
1429 ewclj = _mm_set1_pd(fr->ic->ewaldcoeff_lj);
1430 ewclj2 = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
1432 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1433 ewtab = fr->ic->tabq_coul_F;
1434 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1435 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1437 /* Setup water-specific parameters */
1438 inr = nlist->iinr[0];
1439 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1440 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1441 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1442 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1444 jq1 = _mm_set1_pd(charge[inr+1]);
1445 jq2 = _mm_set1_pd(charge[inr+2]);
1446 jq3 = _mm_set1_pd(charge[inr+3]);
1447 vdwjidx0A = 2*vdwtype[inr+0];
1448 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1449 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1450 c6grid_00 = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
1451 qq11 = _mm_mul_pd(iq1,jq1);
1452 qq12 = _mm_mul_pd(iq1,jq2);
1453 qq13 = _mm_mul_pd(iq1,jq3);
1454 qq21 = _mm_mul_pd(iq2,jq1);
1455 qq22 = _mm_mul_pd(iq2,jq2);
1456 qq23 = _mm_mul_pd(iq2,jq3);
1457 qq31 = _mm_mul_pd(iq3,jq1);
1458 qq32 = _mm_mul_pd(iq3,jq2);
1459 qq33 = _mm_mul_pd(iq3,jq3);
1461 /* Avoid stupid compiler warnings */
1463 j_coord_offsetA = 0;
1464 j_coord_offsetB = 0;
1469 /* Start outer loop over neighborlists */
1470 for(iidx=0; iidx<nri; iidx++)
1472 /* Load shift vector for this list */
1473 i_shift_offset = DIM*shiftidx[iidx];
1475 /* Load limits for loop over neighbors */
1476 j_index_start = jindex[iidx];
1477 j_index_end = jindex[iidx+1];
1479 /* Get outer coordinate index */
1481 i_coord_offset = DIM*inr;
1483 /* Load i particle coords and add shift vector */
1484 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1485 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1487 fix0 = _mm_setzero_pd();
1488 fiy0 = _mm_setzero_pd();
1489 fiz0 = _mm_setzero_pd();
1490 fix1 = _mm_setzero_pd();
1491 fiy1 = _mm_setzero_pd();
1492 fiz1 = _mm_setzero_pd();
1493 fix2 = _mm_setzero_pd();
1494 fiy2 = _mm_setzero_pd();
1495 fiz2 = _mm_setzero_pd();
1496 fix3 = _mm_setzero_pd();
1497 fiy3 = _mm_setzero_pd();
1498 fiz3 = _mm_setzero_pd();
1500 /* Start inner kernel loop */
1501 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1504 /* Get j neighbor index, and coordinate index */
1506 jnrB = jjnr[jidx+1];
1507 j_coord_offsetA = DIM*jnrA;
1508 j_coord_offsetB = DIM*jnrB;
1510 /* load j atom coordinates */
1511 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1512 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1513 &jy2,&jz2,&jx3,&jy3,&jz3);
1515 /* Calculate displacement vector */
1516 dx00 = _mm_sub_pd(ix0,jx0);
1517 dy00 = _mm_sub_pd(iy0,jy0);
1518 dz00 = _mm_sub_pd(iz0,jz0);
1519 dx11 = _mm_sub_pd(ix1,jx1);
1520 dy11 = _mm_sub_pd(iy1,jy1);
1521 dz11 = _mm_sub_pd(iz1,jz1);
1522 dx12 = _mm_sub_pd(ix1,jx2);
1523 dy12 = _mm_sub_pd(iy1,jy2);
1524 dz12 = _mm_sub_pd(iz1,jz2);
1525 dx13 = _mm_sub_pd(ix1,jx3);
1526 dy13 = _mm_sub_pd(iy1,jy3);
1527 dz13 = _mm_sub_pd(iz1,jz3);
1528 dx21 = _mm_sub_pd(ix2,jx1);
1529 dy21 = _mm_sub_pd(iy2,jy1);
1530 dz21 = _mm_sub_pd(iz2,jz1);
1531 dx22 = _mm_sub_pd(ix2,jx2);
1532 dy22 = _mm_sub_pd(iy2,jy2);
1533 dz22 = _mm_sub_pd(iz2,jz2);
1534 dx23 = _mm_sub_pd(ix2,jx3);
1535 dy23 = _mm_sub_pd(iy2,jy3);
1536 dz23 = _mm_sub_pd(iz2,jz3);
1537 dx31 = _mm_sub_pd(ix3,jx1);
1538 dy31 = _mm_sub_pd(iy3,jy1);
1539 dz31 = _mm_sub_pd(iz3,jz1);
1540 dx32 = _mm_sub_pd(ix3,jx2);
1541 dy32 = _mm_sub_pd(iy3,jy2);
1542 dz32 = _mm_sub_pd(iz3,jz2);
1543 dx33 = _mm_sub_pd(ix3,jx3);
1544 dy33 = _mm_sub_pd(iy3,jy3);
1545 dz33 = _mm_sub_pd(iz3,jz3);
1547 /* Calculate squared distance and things based on it */
1548 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1549 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1550 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1551 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1552 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1553 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1554 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1555 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1556 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1557 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1559 rinv00 = sse41_invsqrt_d(rsq00);
1560 rinv11 = sse41_invsqrt_d(rsq11);
1561 rinv12 = sse41_invsqrt_d(rsq12);
1562 rinv13 = sse41_invsqrt_d(rsq13);
1563 rinv21 = sse41_invsqrt_d(rsq21);
1564 rinv22 = sse41_invsqrt_d(rsq22);
1565 rinv23 = sse41_invsqrt_d(rsq23);
1566 rinv31 = sse41_invsqrt_d(rsq31);
1567 rinv32 = sse41_invsqrt_d(rsq32);
1568 rinv33 = sse41_invsqrt_d(rsq33);
1570 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1571 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1572 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1573 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1574 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1575 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1576 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1577 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1578 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1579 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1581 fjx0 = _mm_setzero_pd();
1582 fjy0 = _mm_setzero_pd();
1583 fjz0 = _mm_setzero_pd();
1584 fjx1 = _mm_setzero_pd();
1585 fjy1 = _mm_setzero_pd();
1586 fjz1 = _mm_setzero_pd();
1587 fjx2 = _mm_setzero_pd();
1588 fjy2 = _mm_setzero_pd();
1589 fjz2 = _mm_setzero_pd();
1590 fjx3 = _mm_setzero_pd();
1591 fjy3 = _mm_setzero_pd();
1592 fjz3 = _mm_setzero_pd();
1594 /**************************
1595 * CALCULATE INTERACTIONS *
1596 **************************/
1598 r00 = _mm_mul_pd(rsq00,rinv00);
1600 /* Analytical LJ-PME */
1601 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1602 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
1603 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
1604 exponent = sse41_exp_d(ewcljrsq);
1605 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1606 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1607 /* f6A = 6 * C6grid * (1 - poly) */
1608 f6A = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
1609 /* f6B = C6grid * exponent * beta^6 */
1610 f6B = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
1611 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1612 fvdw = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1616 /* Calculate temporary vectorial force */
1617 tx = _mm_mul_pd(fscal,dx00);
1618 ty = _mm_mul_pd(fscal,dy00);
1619 tz = _mm_mul_pd(fscal,dz00);
1621 /* Update vectorial force */
1622 fix0 = _mm_add_pd(fix0,tx);
1623 fiy0 = _mm_add_pd(fiy0,ty);
1624 fiz0 = _mm_add_pd(fiz0,tz);
1626 fjx0 = _mm_add_pd(fjx0,tx);
1627 fjy0 = _mm_add_pd(fjy0,ty);
1628 fjz0 = _mm_add_pd(fjz0,tz);
1630 /**************************
1631 * CALCULATE INTERACTIONS *
1632 **************************/
1634 r11 = _mm_mul_pd(rsq11,rinv11);
1636 /* EWALD ELECTROSTATICS */
1638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1639 ewrt = _mm_mul_pd(r11,ewtabscale);
1640 ewitab = _mm_cvttpd_epi32(ewrt);
1641 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1642 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1644 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1645 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1649 /* Calculate temporary vectorial force */
1650 tx = _mm_mul_pd(fscal,dx11);
1651 ty = _mm_mul_pd(fscal,dy11);
1652 tz = _mm_mul_pd(fscal,dz11);
1654 /* Update vectorial force */
1655 fix1 = _mm_add_pd(fix1,tx);
1656 fiy1 = _mm_add_pd(fiy1,ty);
1657 fiz1 = _mm_add_pd(fiz1,tz);
1659 fjx1 = _mm_add_pd(fjx1,tx);
1660 fjy1 = _mm_add_pd(fjy1,ty);
1661 fjz1 = _mm_add_pd(fjz1,tz);
1663 /**************************
1664 * CALCULATE INTERACTIONS *
1665 **************************/
1667 r12 = _mm_mul_pd(rsq12,rinv12);
1669 /* EWALD ELECTROSTATICS */
1671 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1672 ewrt = _mm_mul_pd(r12,ewtabscale);
1673 ewitab = _mm_cvttpd_epi32(ewrt);
1674 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1675 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1677 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1678 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1682 /* Calculate temporary vectorial force */
1683 tx = _mm_mul_pd(fscal,dx12);
1684 ty = _mm_mul_pd(fscal,dy12);
1685 tz = _mm_mul_pd(fscal,dz12);
1687 /* Update vectorial force */
1688 fix1 = _mm_add_pd(fix1,tx);
1689 fiy1 = _mm_add_pd(fiy1,ty);
1690 fiz1 = _mm_add_pd(fiz1,tz);
1692 fjx2 = _mm_add_pd(fjx2,tx);
1693 fjy2 = _mm_add_pd(fjy2,ty);
1694 fjz2 = _mm_add_pd(fjz2,tz);
1696 /**************************
1697 * CALCULATE INTERACTIONS *
1698 **************************/
1700 r13 = _mm_mul_pd(rsq13,rinv13);
1702 /* EWALD ELECTROSTATICS */
1704 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1705 ewrt = _mm_mul_pd(r13,ewtabscale);
1706 ewitab = _mm_cvttpd_epi32(ewrt);
1707 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1708 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1710 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1711 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1715 /* Calculate temporary vectorial force */
1716 tx = _mm_mul_pd(fscal,dx13);
1717 ty = _mm_mul_pd(fscal,dy13);
1718 tz = _mm_mul_pd(fscal,dz13);
1720 /* Update vectorial force */
1721 fix1 = _mm_add_pd(fix1,tx);
1722 fiy1 = _mm_add_pd(fiy1,ty);
1723 fiz1 = _mm_add_pd(fiz1,tz);
1725 fjx3 = _mm_add_pd(fjx3,tx);
1726 fjy3 = _mm_add_pd(fjy3,ty);
1727 fjz3 = _mm_add_pd(fjz3,tz);
1729 /**************************
1730 * CALCULATE INTERACTIONS *
1731 **************************/
1733 r21 = _mm_mul_pd(rsq21,rinv21);
1735 /* EWALD ELECTROSTATICS */
1737 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1738 ewrt = _mm_mul_pd(r21,ewtabscale);
1739 ewitab = _mm_cvttpd_epi32(ewrt);
1740 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1741 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1743 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1744 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1748 /* Calculate temporary vectorial force */
1749 tx = _mm_mul_pd(fscal,dx21);
1750 ty = _mm_mul_pd(fscal,dy21);
1751 tz = _mm_mul_pd(fscal,dz21);
1753 /* Update vectorial force */
1754 fix2 = _mm_add_pd(fix2,tx);
1755 fiy2 = _mm_add_pd(fiy2,ty);
1756 fiz2 = _mm_add_pd(fiz2,tz);
1758 fjx1 = _mm_add_pd(fjx1,tx);
1759 fjy1 = _mm_add_pd(fjy1,ty);
1760 fjz1 = _mm_add_pd(fjz1,tz);
1762 /**************************
1763 * CALCULATE INTERACTIONS *
1764 **************************/
1766 r22 = _mm_mul_pd(rsq22,rinv22);
1768 /* EWALD ELECTROSTATICS */
1770 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1771 ewrt = _mm_mul_pd(r22,ewtabscale);
1772 ewitab = _mm_cvttpd_epi32(ewrt);
1773 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1774 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1776 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1777 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1781 /* Calculate temporary vectorial force */
1782 tx = _mm_mul_pd(fscal,dx22);
1783 ty = _mm_mul_pd(fscal,dy22);
1784 tz = _mm_mul_pd(fscal,dz22);
1786 /* Update vectorial force */
1787 fix2 = _mm_add_pd(fix2,tx);
1788 fiy2 = _mm_add_pd(fiy2,ty);
1789 fiz2 = _mm_add_pd(fiz2,tz);
1791 fjx2 = _mm_add_pd(fjx2,tx);
1792 fjy2 = _mm_add_pd(fjy2,ty);
1793 fjz2 = _mm_add_pd(fjz2,tz);
1795 /**************************
1796 * CALCULATE INTERACTIONS *
1797 **************************/
1799 r23 = _mm_mul_pd(rsq23,rinv23);
1801 /* EWALD ELECTROSTATICS */
1803 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1804 ewrt = _mm_mul_pd(r23,ewtabscale);
1805 ewitab = _mm_cvttpd_epi32(ewrt);
1806 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1807 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1809 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1810 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1814 /* Calculate temporary vectorial force */
1815 tx = _mm_mul_pd(fscal,dx23);
1816 ty = _mm_mul_pd(fscal,dy23);
1817 tz = _mm_mul_pd(fscal,dz23);
1819 /* Update vectorial force */
1820 fix2 = _mm_add_pd(fix2,tx);
1821 fiy2 = _mm_add_pd(fiy2,ty);
1822 fiz2 = _mm_add_pd(fiz2,tz);
1824 fjx3 = _mm_add_pd(fjx3,tx);
1825 fjy3 = _mm_add_pd(fjy3,ty);
1826 fjz3 = _mm_add_pd(fjz3,tz);
1828 /**************************
1829 * CALCULATE INTERACTIONS *
1830 **************************/
1832 r31 = _mm_mul_pd(rsq31,rinv31);
1834 /* EWALD ELECTROSTATICS */
1836 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1837 ewrt = _mm_mul_pd(r31,ewtabscale);
1838 ewitab = _mm_cvttpd_epi32(ewrt);
1839 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1840 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1842 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1843 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1847 /* Calculate temporary vectorial force */
1848 tx = _mm_mul_pd(fscal,dx31);
1849 ty = _mm_mul_pd(fscal,dy31);
1850 tz = _mm_mul_pd(fscal,dz31);
1852 /* Update vectorial force */
1853 fix3 = _mm_add_pd(fix3,tx);
1854 fiy3 = _mm_add_pd(fiy3,ty);
1855 fiz3 = _mm_add_pd(fiz3,tz);
1857 fjx1 = _mm_add_pd(fjx1,tx);
1858 fjy1 = _mm_add_pd(fjy1,ty);
1859 fjz1 = _mm_add_pd(fjz1,tz);
1861 /**************************
1862 * CALCULATE INTERACTIONS *
1863 **************************/
1865 r32 = _mm_mul_pd(rsq32,rinv32);
1867 /* EWALD ELECTROSTATICS */
1869 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1870 ewrt = _mm_mul_pd(r32,ewtabscale);
1871 ewitab = _mm_cvttpd_epi32(ewrt);
1872 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1873 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1875 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1876 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1880 /* Calculate temporary vectorial force */
1881 tx = _mm_mul_pd(fscal,dx32);
1882 ty = _mm_mul_pd(fscal,dy32);
1883 tz = _mm_mul_pd(fscal,dz32);
1885 /* Update vectorial force */
1886 fix3 = _mm_add_pd(fix3,tx);
1887 fiy3 = _mm_add_pd(fiy3,ty);
1888 fiz3 = _mm_add_pd(fiz3,tz);
1890 fjx2 = _mm_add_pd(fjx2,tx);
1891 fjy2 = _mm_add_pd(fjy2,ty);
1892 fjz2 = _mm_add_pd(fjz2,tz);
1894 /**************************
1895 * CALCULATE INTERACTIONS *
1896 **************************/
1898 r33 = _mm_mul_pd(rsq33,rinv33);
1900 /* EWALD ELECTROSTATICS */
1902 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1903 ewrt = _mm_mul_pd(r33,ewtabscale);
1904 ewitab = _mm_cvttpd_epi32(ewrt);
1905 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1906 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1908 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1909 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1913 /* Calculate temporary vectorial force */
1914 tx = _mm_mul_pd(fscal,dx33);
1915 ty = _mm_mul_pd(fscal,dy33);
1916 tz = _mm_mul_pd(fscal,dz33);
1918 /* Update vectorial force */
1919 fix3 = _mm_add_pd(fix3,tx);
1920 fiy3 = _mm_add_pd(fiy3,ty);
1921 fiz3 = _mm_add_pd(fiz3,tz);
1923 fjx3 = _mm_add_pd(fjx3,tx);
1924 fjy3 = _mm_add_pd(fjy3,ty);
1925 fjz3 = _mm_add_pd(fjz3,tz);
1927 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1929 /* Inner loop uses 373 flops */
1932 if(jidx<j_index_end)
1936 j_coord_offsetA = DIM*jnrA;
1938 /* load j atom coordinates */
1939 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1940 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1941 &jy2,&jz2,&jx3,&jy3,&jz3);
1943 /* Calculate displacement vector */
1944 dx00 = _mm_sub_pd(ix0,jx0);
1945 dy00 = _mm_sub_pd(iy0,jy0);
1946 dz00 = _mm_sub_pd(iz0,jz0);
1947 dx11 = _mm_sub_pd(ix1,jx1);
1948 dy11 = _mm_sub_pd(iy1,jy1);
1949 dz11 = _mm_sub_pd(iz1,jz1);
1950 dx12 = _mm_sub_pd(ix1,jx2);
1951 dy12 = _mm_sub_pd(iy1,jy2);
1952 dz12 = _mm_sub_pd(iz1,jz2);
1953 dx13 = _mm_sub_pd(ix1,jx3);
1954 dy13 = _mm_sub_pd(iy1,jy3);
1955 dz13 = _mm_sub_pd(iz1,jz3);
1956 dx21 = _mm_sub_pd(ix2,jx1);
1957 dy21 = _mm_sub_pd(iy2,jy1);
1958 dz21 = _mm_sub_pd(iz2,jz1);
1959 dx22 = _mm_sub_pd(ix2,jx2);
1960 dy22 = _mm_sub_pd(iy2,jy2);
1961 dz22 = _mm_sub_pd(iz2,jz2);
1962 dx23 = _mm_sub_pd(ix2,jx3);
1963 dy23 = _mm_sub_pd(iy2,jy3);
1964 dz23 = _mm_sub_pd(iz2,jz3);
1965 dx31 = _mm_sub_pd(ix3,jx1);
1966 dy31 = _mm_sub_pd(iy3,jy1);
1967 dz31 = _mm_sub_pd(iz3,jz1);
1968 dx32 = _mm_sub_pd(ix3,jx2);
1969 dy32 = _mm_sub_pd(iy3,jy2);
1970 dz32 = _mm_sub_pd(iz3,jz2);
1971 dx33 = _mm_sub_pd(ix3,jx3);
1972 dy33 = _mm_sub_pd(iy3,jy3);
1973 dz33 = _mm_sub_pd(iz3,jz3);
1975 /* Calculate squared distance and things based on it */
1976 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1977 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1978 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1979 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1980 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1981 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1982 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1983 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1984 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1985 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1987 rinv00 = sse41_invsqrt_d(rsq00);
1988 rinv11 = sse41_invsqrt_d(rsq11);
1989 rinv12 = sse41_invsqrt_d(rsq12);
1990 rinv13 = sse41_invsqrt_d(rsq13);
1991 rinv21 = sse41_invsqrt_d(rsq21);
1992 rinv22 = sse41_invsqrt_d(rsq22);
1993 rinv23 = sse41_invsqrt_d(rsq23);
1994 rinv31 = sse41_invsqrt_d(rsq31);
1995 rinv32 = sse41_invsqrt_d(rsq32);
1996 rinv33 = sse41_invsqrt_d(rsq33);
1998 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1999 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2000 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2001 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
2002 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2003 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2004 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
2005 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
2006 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
2007 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
2009 fjx0 = _mm_setzero_pd();
2010 fjy0 = _mm_setzero_pd();
2011 fjz0 = _mm_setzero_pd();
2012 fjx1 = _mm_setzero_pd();
2013 fjy1 = _mm_setzero_pd();
2014 fjz1 = _mm_setzero_pd();
2015 fjx2 = _mm_setzero_pd();
2016 fjy2 = _mm_setzero_pd();
2017 fjz2 = _mm_setzero_pd();
2018 fjx3 = _mm_setzero_pd();
2019 fjy3 = _mm_setzero_pd();
2020 fjz3 = _mm_setzero_pd();
2022 /**************************
2023 * CALCULATE INTERACTIONS *
2024 **************************/
2026 r00 = _mm_mul_pd(rsq00,rinv00);
2028 /* Analytical LJ-PME */
2029 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2030 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
2031 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
2032 exponent = sse41_exp_d(ewcljrsq);
2033 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2034 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
2035 /* f6A = 6 * C6grid * (1 - poly) */
2036 f6A = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
2037 /* f6B = C6grid * exponent * beta^6 */
2038 f6B = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
2039 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2040 fvdw = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2044 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2046 /* Calculate temporary vectorial force */
2047 tx = _mm_mul_pd(fscal,dx00);
2048 ty = _mm_mul_pd(fscal,dy00);
2049 tz = _mm_mul_pd(fscal,dz00);
2051 /* Update vectorial force */
2052 fix0 = _mm_add_pd(fix0,tx);
2053 fiy0 = _mm_add_pd(fiy0,ty);
2054 fiz0 = _mm_add_pd(fiz0,tz);
2056 fjx0 = _mm_add_pd(fjx0,tx);
2057 fjy0 = _mm_add_pd(fjy0,ty);
2058 fjz0 = _mm_add_pd(fjz0,tz);
2060 /**************************
2061 * CALCULATE INTERACTIONS *
2062 **************************/
2064 r11 = _mm_mul_pd(rsq11,rinv11);
2066 /* EWALD ELECTROSTATICS */
2068 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2069 ewrt = _mm_mul_pd(r11,ewtabscale);
2070 ewitab = _mm_cvttpd_epi32(ewrt);
2071 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2072 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2073 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2074 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2078 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2080 /* Calculate temporary vectorial force */
2081 tx = _mm_mul_pd(fscal,dx11);
2082 ty = _mm_mul_pd(fscal,dy11);
2083 tz = _mm_mul_pd(fscal,dz11);
2085 /* Update vectorial force */
2086 fix1 = _mm_add_pd(fix1,tx);
2087 fiy1 = _mm_add_pd(fiy1,ty);
2088 fiz1 = _mm_add_pd(fiz1,tz);
2090 fjx1 = _mm_add_pd(fjx1,tx);
2091 fjy1 = _mm_add_pd(fjy1,ty);
2092 fjz1 = _mm_add_pd(fjz1,tz);
2094 /**************************
2095 * CALCULATE INTERACTIONS *
2096 **************************/
2098 r12 = _mm_mul_pd(rsq12,rinv12);
2100 /* EWALD ELECTROSTATICS */
2102 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2103 ewrt = _mm_mul_pd(r12,ewtabscale);
2104 ewitab = _mm_cvttpd_epi32(ewrt);
2105 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2106 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2107 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2108 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2112 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2114 /* Calculate temporary vectorial force */
2115 tx = _mm_mul_pd(fscal,dx12);
2116 ty = _mm_mul_pd(fscal,dy12);
2117 tz = _mm_mul_pd(fscal,dz12);
2119 /* Update vectorial force */
2120 fix1 = _mm_add_pd(fix1,tx);
2121 fiy1 = _mm_add_pd(fiy1,ty);
2122 fiz1 = _mm_add_pd(fiz1,tz);
2124 fjx2 = _mm_add_pd(fjx2,tx);
2125 fjy2 = _mm_add_pd(fjy2,ty);
2126 fjz2 = _mm_add_pd(fjz2,tz);
2128 /**************************
2129 * CALCULATE INTERACTIONS *
2130 **************************/
2132 r13 = _mm_mul_pd(rsq13,rinv13);
2134 /* EWALD ELECTROSTATICS */
2136 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2137 ewrt = _mm_mul_pd(r13,ewtabscale);
2138 ewitab = _mm_cvttpd_epi32(ewrt);
2139 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2140 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2141 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2142 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2146 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2148 /* Calculate temporary vectorial force */
2149 tx = _mm_mul_pd(fscal,dx13);
2150 ty = _mm_mul_pd(fscal,dy13);
2151 tz = _mm_mul_pd(fscal,dz13);
2153 /* Update vectorial force */
2154 fix1 = _mm_add_pd(fix1,tx);
2155 fiy1 = _mm_add_pd(fiy1,ty);
2156 fiz1 = _mm_add_pd(fiz1,tz);
2158 fjx3 = _mm_add_pd(fjx3,tx);
2159 fjy3 = _mm_add_pd(fjy3,ty);
2160 fjz3 = _mm_add_pd(fjz3,tz);
2162 /**************************
2163 * CALCULATE INTERACTIONS *
2164 **************************/
2166 r21 = _mm_mul_pd(rsq21,rinv21);
2168 /* EWALD ELECTROSTATICS */
2170 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2171 ewrt = _mm_mul_pd(r21,ewtabscale);
2172 ewitab = _mm_cvttpd_epi32(ewrt);
2173 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2174 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2175 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2176 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2180 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2182 /* Calculate temporary vectorial force */
2183 tx = _mm_mul_pd(fscal,dx21);
2184 ty = _mm_mul_pd(fscal,dy21);
2185 tz = _mm_mul_pd(fscal,dz21);
2187 /* Update vectorial force */
2188 fix2 = _mm_add_pd(fix2,tx);
2189 fiy2 = _mm_add_pd(fiy2,ty);
2190 fiz2 = _mm_add_pd(fiz2,tz);
2192 fjx1 = _mm_add_pd(fjx1,tx);
2193 fjy1 = _mm_add_pd(fjy1,ty);
2194 fjz1 = _mm_add_pd(fjz1,tz);
2196 /**************************
2197 * CALCULATE INTERACTIONS *
2198 **************************/
2200 r22 = _mm_mul_pd(rsq22,rinv22);
2202 /* EWALD ELECTROSTATICS */
2204 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2205 ewrt = _mm_mul_pd(r22,ewtabscale);
2206 ewitab = _mm_cvttpd_epi32(ewrt);
2207 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2208 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2209 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2210 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2214 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2216 /* Calculate temporary vectorial force */
2217 tx = _mm_mul_pd(fscal,dx22);
2218 ty = _mm_mul_pd(fscal,dy22);
2219 tz = _mm_mul_pd(fscal,dz22);
2221 /* Update vectorial force */
2222 fix2 = _mm_add_pd(fix2,tx);
2223 fiy2 = _mm_add_pd(fiy2,ty);
2224 fiz2 = _mm_add_pd(fiz2,tz);
2226 fjx2 = _mm_add_pd(fjx2,tx);
2227 fjy2 = _mm_add_pd(fjy2,ty);
2228 fjz2 = _mm_add_pd(fjz2,tz);
2230 /**************************
2231 * CALCULATE INTERACTIONS *
2232 **************************/
2234 r23 = _mm_mul_pd(rsq23,rinv23);
2236 /* EWALD ELECTROSTATICS */
2238 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2239 ewrt = _mm_mul_pd(r23,ewtabscale);
2240 ewitab = _mm_cvttpd_epi32(ewrt);
2241 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2242 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2243 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2244 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2248 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2250 /* Calculate temporary vectorial force */
2251 tx = _mm_mul_pd(fscal,dx23);
2252 ty = _mm_mul_pd(fscal,dy23);
2253 tz = _mm_mul_pd(fscal,dz23);
2255 /* Update vectorial force */
2256 fix2 = _mm_add_pd(fix2,tx);
2257 fiy2 = _mm_add_pd(fiy2,ty);
2258 fiz2 = _mm_add_pd(fiz2,tz);
2260 fjx3 = _mm_add_pd(fjx3,tx);
2261 fjy3 = _mm_add_pd(fjy3,ty);
2262 fjz3 = _mm_add_pd(fjz3,tz);
2264 /**************************
2265 * CALCULATE INTERACTIONS *
2266 **************************/
2268 r31 = _mm_mul_pd(rsq31,rinv31);
2270 /* EWALD ELECTROSTATICS */
2272 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2273 ewrt = _mm_mul_pd(r31,ewtabscale);
2274 ewitab = _mm_cvttpd_epi32(ewrt);
2275 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2276 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2277 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2278 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2282 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2284 /* Calculate temporary vectorial force */
2285 tx = _mm_mul_pd(fscal,dx31);
2286 ty = _mm_mul_pd(fscal,dy31);
2287 tz = _mm_mul_pd(fscal,dz31);
2289 /* Update vectorial force */
2290 fix3 = _mm_add_pd(fix3,tx);
2291 fiy3 = _mm_add_pd(fiy3,ty);
2292 fiz3 = _mm_add_pd(fiz3,tz);
2294 fjx1 = _mm_add_pd(fjx1,tx);
2295 fjy1 = _mm_add_pd(fjy1,ty);
2296 fjz1 = _mm_add_pd(fjz1,tz);
2298 /**************************
2299 * CALCULATE INTERACTIONS *
2300 **************************/
2302 r32 = _mm_mul_pd(rsq32,rinv32);
2304 /* EWALD ELECTROSTATICS */
2306 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2307 ewrt = _mm_mul_pd(r32,ewtabscale);
2308 ewitab = _mm_cvttpd_epi32(ewrt);
2309 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2310 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2311 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2312 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2316 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2318 /* Calculate temporary vectorial force */
2319 tx = _mm_mul_pd(fscal,dx32);
2320 ty = _mm_mul_pd(fscal,dy32);
2321 tz = _mm_mul_pd(fscal,dz32);
2323 /* Update vectorial force */
2324 fix3 = _mm_add_pd(fix3,tx);
2325 fiy3 = _mm_add_pd(fiy3,ty);
2326 fiz3 = _mm_add_pd(fiz3,tz);
2328 fjx2 = _mm_add_pd(fjx2,tx);
2329 fjy2 = _mm_add_pd(fjy2,ty);
2330 fjz2 = _mm_add_pd(fjz2,tz);
2332 /**************************
2333 * CALCULATE INTERACTIONS *
2334 **************************/
2336 r33 = _mm_mul_pd(rsq33,rinv33);
2338 /* EWALD ELECTROSTATICS */
2340 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2341 ewrt = _mm_mul_pd(r33,ewtabscale);
2342 ewitab = _mm_cvttpd_epi32(ewrt);
2343 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2344 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2345 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2346 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2350 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2352 /* Calculate temporary vectorial force */
2353 tx = _mm_mul_pd(fscal,dx33);
2354 ty = _mm_mul_pd(fscal,dy33);
2355 tz = _mm_mul_pd(fscal,dz33);
2357 /* Update vectorial force */
2358 fix3 = _mm_add_pd(fix3,tx);
2359 fiy3 = _mm_add_pd(fiy3,ty);
2360 fiz3 = _mm_add_pd(fiz3,tz);
2362 fjx3 = _mm_add_pd(fjx3,tx);
2363 fjy3 = _mm_add_pd(fjy3,ty);
2364 fjz3 = _mm_add_pd(fjz3,tz);
2366 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2368 /* Inner loop uses 373 flops */
2371 /* End of innermost loop */
2373 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2374 f+i_coord_offset,fshift+i_shift_offset);
2376 /* Increment number of inner iterations */
2377 inneriter += j_index_end - j_index_start;
2379 /* Outer loop uses 24 flops */
2382 /* Increment number of outer iterations */
2385 /* Update outer/inner flops */
2387 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*373);