2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse4_1_double.h"
48 #include "kernelutil_x86_sse4_1_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LJEwald
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 int vdwjidx3A,vdwjidx3B;
95 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
96 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
109 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
113 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
124 __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
126 __m128d one_half = _mm_set1_pd(0.5);
127 __m128d minus_one = _mm_set1_pd(-1.0);
129 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
131 __m128d dummy_mask,cutoff_mask;
132 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
133 __m128d one = _mm_set1_pd(1.0);
134 __m128d two = _mm_set1_pd(2.0);
140 jindex = nlist->jindex;
142 shiftidx = nlist->shift;
144 shiftvec = fr->shift_vec[0];
145 fshift = fr->fshift[0];
146 facel = _mm_set1_pd(fr->epsfac);
147 charge = mdatoms->chargeA;
148 nvdwtype = fr->ntype;
150 vdwtype = mdatoms->typeA;
151 vdwgridparam = fr->ljpme_c6grid;
152 sh_lj_ewald = _mm_set1_pd(fr->ic->sh_lj_ewald);
153 ewclj = _mm_set1_pd(fr->ewaldcoeff_lj);
154 ewclj2 = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
156 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
157 ewtab = fr->ic->tabq_coul_FDV0;
158 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
159 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
161 /* Setup water-specific parameters */
162 inr = nlist->iinr[0];
163 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
164 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
165 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
166 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
168 jq1 = _mm_set1_pd(charge[inr+1]);
169 jq2 = _mm_set1_pd(charge[inr+2]);
170 jq3 = _mm_set1_pd(charge[inr+3]);
171 vdwjidx0A = 2*vdwtype[inr+0];
172 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
173 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
174 c6grid_00 = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
175 qq11 = _mm_mul_pd(iq1,jq1);
176 qq12 = _mm_mul_pd(iq1,jq2);
177 qq13 = _mm_mul_pd(iq1,jq3);
178 qq21 = _mm_mul_pd(iq2,jq1);
179 qq22 = _mm_mul_pd(iq2,jq2);
180 qq23 = _mm_mul_pd(iq2,jq3);
181 qq31 = _mm_mul_pd(iq3,jq1);
182 qq32 = _mm_mul_pd(iq3,jq2);
183 qq33 = _mm_mul_pd(iq3,jq3);
185 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
186 rcutoff_scalar = fr->rcoulomb;
187 rcutoff = _mm_set1_pd(rcutoff_scalar);
188 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
190 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
191 rvdw = _mm_set1_pd(fr->rvdw);
193 /* Avoid stupid compiler warnings */
201 /* Start outer loop over neighborlists */
202 for(iidx=0; iidx<nri; iidx++)
204 /* Load shift vector for this list */
205 i_shift_offset = DIM*shiftidx[iidx];
207 /* Load limits for loop over neighbors */
208 j_index_start = jindex[iidx];
209 j_index_end = jindex[iidx+1];
211 /* Get outer coordinate index */
213 i_coord_offset = DIM*inr;
215 /* Load i particle coords and add shift vector */
216 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
217 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
219 fix0 = _mm_setzero_pd();
220 fiy0 = _mm_setzero_pd();
221 fiz0 = _mm_setzero_pd();
222 fix1 = _mm_setzero_pd();
223 fiy1 = _mm_setzero_pd();
224 fiz1 = _mm_setzero_pd();
225 fix2 = _mm_setzero_pd();
226 fiy2 = _mm_setzero_pd();
227 fiz2 = _mm_setzero_pd();
228 fix3 = _mm_setzero_pd();
229 fiy3 = _mm_setzero_pd();
230 fiz3 = _mm_setzero_pd();
232 /* Reset potential sums */
233 velecsum = _mm_setzero_pd();
234 vvdwsum = _mm_setzero_pd();
236 /* Start inner kernel loop */
237 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
240 /* Get j neighbor index, and coordinate index */
243 j_coord_offsetA = DIM*jnrA;
244 j_coord_offsetB = DIM*jnrB;
246 /* load j atom coordinates */
247 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
248 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
249 &jy2,&jz2,&jx3,&jy3,&jz3);
251 /* Calculate displacement vector */
252 dx00 = _mm_sub_pd(ix0,jx0);
253 dy00 = _mm_sub_pd(iy0,jy0);
254 dz00 = _mm_sub_pd(iz0,jz0);
255 dx11 = _mm_sub_pd(ix1,jx1);
256 dy11 = _mm_sub_pd(iy1,jy1);
257 dz11 = _mm_sub_pd(iz1,jz1);
258 dx12 = _mm_sub_pd(ix1,jx2);
259 dy12 = _mm_sub_pd(iy1,jy2);
260 dz12 = _mm_sub_pd(iz1,jz2);
261 dx13 = _mm_sub_pd(ix1,jx3);
262 dy13 = _mm_sub_pd(iy1,jy3);
263 dz13 = _mm_sub_pd(iz1,jz3);
264 dx21 = _mm_sub_pd(ix2,jx1);
265 dy21 = _mm_sub_pd(iy2,jy1);
266 dz21 = _mm_sub_pd(iz2,jz1);
267 dx22 = _mm_sub_pd(ix2,jx2);
268 dy22 = _mm_sub_pd(iy2,jy2);
269 dz22 = _mm_sub_pd(iz2,jz2);
270 dx23 = _mm_sub_pd(ix2,jx3);
271 dy23 = _mm_sub_pd(iy2,jy3);
272 dz23 = _mm_sub_pd(iz2,jz3);
273 dx31 = _mm_sub_pd(ix3,jx1);
274 dy31 = _mm_sub_pd(iy3,jy1);
275 dz31 = _mm_sub_pd(iz3,jz1);
276 dx32 = _mm_sub_pd(ix3,jx2);
277 dy32 = _mm_sub_pd(iy3,jy2);
278 dz32 = _mm_sub_pd(iz3,jz2);
279 dx33 = _mm_sub_pd(ix3,jx3);
280 dy33 = _mm_sub_pd(iy3,jy3);
281 dz33 = _mm_sub_pd(iz3,jz3);
283 /* Calculate squared distance and things based on it */
284 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
285 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
286 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
287 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
288 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
289 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
290 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
291 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
292 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
293 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
295 rinv00 = gmx_mm_invsqrt_pd(rsq00);
296 rinv11 = gmx_mm_invsqrt_pd(rsq11);
297 rinv12 = gmx_mm_invsqrt_pd(rsq12);
298 rinv13 = gmx_mm_invsqrt_pd(rsq13);
299 rinv21 = gmx_mm_invsqrt_pd(rsq21);
300 rinv22 = gmx_mm_invsqrt_pd(rsq22);
301 rinv23 = gmx_mm_invsqrt_pd(rsq23);
302 rinv31 = gmx_mm_invsqrt_pd(rsq31);
303 rinv32 = gmx_mm_invsqrt_pd(rsq32);
304 rinv33 = gmx_mm_invsqrt_pd(rsq33);
306 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
307 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
308 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
309 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
310 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
311 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
312 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
313 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
314 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
315 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
317 fjx0 = _mm_setzero_pd();
318 fjy0 = _mm_setzero_pd();
319 fjz0 = _mm_setzero_pd();
320 fjx1 = _mm_setzero_pd();
321 fjy1 = _mm_setzero_pd();
322 fjz1 = _mm_setzero_pd();
323 fjx2 = _mm_setzero_pd();
324 fjy2 = _mm_setzero_pd();
325 fjz2 = _mm_setzero_pd();
326 fjx3 = _mm_setzero_pd();
327 fjy3 = _mm_setzero_pd();
328 fjz3 = _mm_setzero_pd();
330 /**************************
331 * CALCULATE INTERACTIONS *
332 **************************/
334 if (gmx_mm_any_lt(rsq00,rcutoff2))
337 r00 = _mm_mul_pd(rsq00,rinv00);
339 /* Analytical LJ-PME */
340 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
341 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
342 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
343 exponent = gmx_simd_exp_d(ewcljrsq);
344 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
345 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
346 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
347 vvdw6 = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
348 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
349 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
350 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
351 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
352 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
354 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
356 /* Update potential sum for this i atom from the interaction with this j atom. */
357 vvdw = _mm_and_pd(vvdw,cutoff_mask);
358 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
362 fscal = _mm_and_pd(fscal,cutoff_mask);
364 /* Calculate temporary vectorial force */
365 tx = _mm_mul_pd(fscal,dx00);
366 ty = _mm_mul_pd(fscal,dy00);
367 tz = _mm_mul_pd(fscal,dz00);
369 /* Update vectorial force */
370 fix0 = _mm_add_pd(fix0,tx);
371 fiy0 = _mm_add_pd(fiy0,ty);
372 fiz0 = _mm_add_pd(fiz0,tz);
374 fjx0 = _mm_add_pd(fjx0,tx);
375 fjy0 = _mm_add_pd(fjy0,ty);
376 fjz0 = _mm_add_pd(fjz0,tz);
380 /**************************
381 * CALCULATE INTERACTIONS *
382 **************************/
384 if (gmx_mm_any_lt(rsq11,rcutoff2))
387 r11 = _mm_mul_pd(rsq11,rinv11);
389 /* EWALD ELECTROSTATICS */
391 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
392 ewrt = _mm_mul_pd(r11,ewtabscale);
393 ewitab = _mm_cvttpd_epi32(ewrt);
394 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
395 ewitab = _mm_slli_epi32(ewitab,2);
396 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
397 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
398 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
399 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
400 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
401 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
402 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
403 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
404 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
405 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
407 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
409 /* Update potential sum for this i atom from the interaction with this j atom. */
410 velec = _mm_and_pd(velec,cutoff_mask);
411 velecsum = _mm_add_pd(velecsum,velec);
415 fscal = _mm_and_pd(fscal,cutoff_mask);
417 /* Calculate temporary vectorial force */
418 tx = _mm_mul_pd(fscal,dx11);
419 ty = _mm_mul_pd(fscal,dy11);
420 tz = _mm_mul_pd(fscal,dz11);
422 /* Update vectorial force */
423 fix1 = _mm_add_pd(fix1,tx);
424 fiy1 = _mm_add_pd(fiy1,ty);
425 fiz1 = _mm_add_pd(fiz1,tz);
427 fjx1 = _mm_add_pd(fjx1,tx);
428 fjy1 = _mm_add_pd(fjy1,ty);
429 fjz1 = _mm_add_pd(fjz1,tz);
433 /**************************
434 * CALCULATE INTERACTIONS *
435 **************************/
437 if (gmx_mm_any_lt(rsq12,rcutoff2))
440 r12 = _mm_mul_pd(rsq12,rinv12);
442 /* EWALD ELECTROSTATICS */
444 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
445 ewrt = _mm_mul_pd(r12,ewtabscale);
446 ewitab = _mm_cvttpd_epi32(ewrt);
447 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
448 ewitab = _mm_slli_epi32(ewitab,2);
449 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
450 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
451 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
452 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
453 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
454 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
455 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
456 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
457 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
458 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
460 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
462 /* Update potential sum for this i atom from the interaction with this j atom. */
463 velec = _mm_and_pd(velec,cutoff_mask);
464 velecsum = _mm_add_pd(velecsum,velec);
468 fscal = _mm_and_pd(fscal,cutoff_mask);
470 /* Calculate temporary vectorial force */
471 tx = _mm_mul_pd(fscal,dx12);
472 ty = _mm_mul_pd(fscal,dy12);
473 tz = _mm_mul_pd(fscal,dz12);
475 /* Update vectorial force */
476 fix1 = _mm_add_pd(fix1,tx);
477 fiy1 = _mm_add_pd(fiy1,ty);
478 fiz1 = _mm_add_pd(fiz1,tz);
480 fjx2 = _mm_add_pd(fjx2,tx);
481 fjy2 = _mm_add_pd(fjy2,ty);
482 fjz2 = _mm_add_pd(fjz2,tz);
486 /**************************
487 * CALCULATE INTERACTIONS *
488 **************************/
490 if (gmx_mm_any_lt(rsq13,rcutoff2))
493 r13 = _mm_mul_pd(rsq13,rinv13);
495 /* EWALD ELECTROSTATICS */
497 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
498 ewrt = _mm_mul_pd(r13,ewtabscale);
499 ewitab = _mm_cvttpd_epi32(ewrt);
500 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
501 ewitab = _mm_slli_epi32(ewitab,2);
502 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
503 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
504 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
505 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
506 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
507 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
508 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
509 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
510 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
511 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
513 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
515 /* Update potential sum for this i atom from the interaction with this j atom. */
516 velec = _mm_and_pd(velec,cutoff_mask);
517 velecsum = _mm_add_pd(velecsum,velec);
521 fscal = _mm_and_pd(fscal,cutoff_mask);
523 /* Calculate temporary vectorial force */
524 tx = _mm_mul_pd(fscal,dx13);
525 ty = _mm_mul_pd(fscal,dy13);
526 tz = _mm_mul_pd(fscal,dz13);
528 /* Update vectorial force */
529 fix1 = _mm_add_pd(fix1,tx);
530 fiy1 = _mm_add_pd(fiy1,ty);
531 fiz1 = _mm_add_pd(fiz1,tz);
533 fjx3 = _mm_add_pd(fjx3,tx);
534 fjy3 = _mm_add_pd(fjy3,ty);
535 fjz3 = _mm_add_pd(fjz3,tz);
539 /**************************
540 * CALCULATE INTERACTIONS *
541 **************************/
543 if (gmx_mm_any_lt(rsq21,rcutoff2))
546 r21 = _mm_mul_pd(rsq21,rinv21);
548 /* EWALD ELECTROSTATICS */
550 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
551 ewrt = _mm_mul_pd(r21,ewtabscale);
552 ewitab = _mm_cvttpd_epi32(ewrt);
553 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
554 ewitab = _mm_slli_epi32(ewitab,2);
555 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
556 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
557 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
558 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
559 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
560 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
561 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
562 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
563 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
564 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
566 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velec = _mm_and_pd(velec,cutoff_mask);
570 velecsum = _mm_add_pd(velecsum,velec);
574 fscal = _mm_and_pd(fscal,cutoff_mask);
576 /* Calculate temporary vectorial force */
577 tx = _mm_mul_pd(fscal,dx21);
578 ty = _mm_mul_pd(fscal,dy21);
579 tz = _mm_mul_pd(fscal,dz21);
581 /* Update vectorial force */
582 fix2 = _mm_add_pd(fix2,tx);
583 fiy2 = _mm_add_pd(fiy2,ty);
584 fiz2 = _mm_add_pd(fiz2,tz);
586 fjx1 = _mm_add_pd(fjx1,tx);
587 fjy1 = _mm_add_pd(fjy1,ty);
588 fjz1 = _mm_add_pd(fjz1,tz);
592 /**************************
593 * CALCULATE INTERACTIONS *
594 **************************/
596 if (gmx_mm_any_lt(rsq22,rcutoff2))
599 r22 = _mm_mul_pd(rsq22,rinv22);
601 /* EWALD ELECTROSTATICS */
603 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
604 ewrt = _mm_mul_pd(r22,ewtabscale);
605 ewitab = _mm_cvttpd_epi32(ewrt);
606 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
607 ewitab = _mm_slli_epi32(ewitab,2);
608 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
609 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
610 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
611 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
612 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
613 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
614 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
615 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
616 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
617 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
619 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
621 /* Update potential sum for this i atom from the interaction with this j atom. */
622 velec = _mm_and_pd(velec,cutoff_mask);
623 velecsum = _mm_add_pd(velecsum,velec);
627 fscal = _mm_and_pd(fscal,cutoff_mask);
629 /* Calculate temporary vectorial force */
630 tx = _mm_mul_pd(fscal,dx22);
631 ty = _mm_mul_pd(fscal,dy22);
632 tz = _mm_mul_pd(fscal,dz22);
634 /* Update vectorial force */
635 fix2 = _mm_add_pd(fix2,tx);
636 fiy2 = _mm_add_pd(fiy2,ty);
637 fiz2 = _mm_add_pd(fiz2,tz);
639 fjx2 = _mm_add_pd(fjx2,tx);
640 fjy2 = _mm_add_pd(fjy2,ty);
641 fjz2 = _mm_add_pd(fjz2,tz);
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
649 if (gmx_mm_any_lt(rsq23,rcutoff2))
652 r23 = _mm_mul_pd(rsq23,rinv23);
654 /* EWALD ELECTROSTATICS */
656 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
657 ewrt = _mm_mul_pd(r23,ewtabscale);
658 ewitab = _mm_cvttpd_epi32(ewrt);
659 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
660 ewitab = _mm_slli_epi32(ewitab,2);
661 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
662 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
663 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
664 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
665 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
666 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
667 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
668 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
669 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
670 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
672 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
674 /* Update potential sum for this i atom from the interaction with this j atom. */
675 velec = _mm_and_pd(velec,cutoff_mask);
676 velecsum = _mm_add_pd(velecsum,velec);
680 fscal = _mm_and_pd(fscal,cutoff_mask);
682 /* Calculate temporary vectorial force */
683 tx = _mm_mul_pd(fscal,dx23);
684 ty = _mm_mul_pd(fscal,dy23);
685 tz = _mm_mul_pd(fscal,dz23);
687 /* Update vectorial force */
688 fix2 = _mm_add_pd(fix2,tx);
689 fiy2 = _mm_add_pd(fiy2,ty);
690 fiz2 = _mm_add_pd(fiz2,tz);
692 fjx3 = _mm_add_pd(fjx3,tx);
693 fjy3 = _mm_add_pd(fjy3,ty);
694 fjz3 = _mm_add_pd(fjz3,tz);
698 /**************************
699 * CALCULATE INTERACTIONS *
700 **************************/
702 if (gmx_mm_any_lt(rsq31,rcutoff2))
705 r31 = _mm_mul_pd(rsq31,rinv31);
707 /* EWALD ELECTROSTATICS */
709 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
710 ewrt = _mm_mul_pd(r31,ewtabscale);
711 ewitab = _mm_cvttpd_epi32(ewrt);
712 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
713 ewitab = _mm_slli_epi32(ewitab,2);
714 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
715 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
716 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
717 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
718 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
719 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
720 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
721 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
722 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
723 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
725 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
727 /* Update potential sum for this i atom from the interaction with this j atom. */
728 velec = _mm_and_pd(velec,cutoff_mask);
729 velecsum = _mm_add_pd(velecsum,velec);
733 fscal = _mm_and_pd(fscal,cutoff_mask);
735 /* Calculate temporary vectorial force */
736 tx = _mm_mul_pd(fscal,dx31);
737 ty = _mm_mul_pd(fscal,dy31);
738 tz = _mm_mul_pd(fscal,dz31);
740 /* Update vectorial force */
741 fix3 = _mm_add_pd(fix3,tx);
742 fiy3 = _mm_add_pd(fiy3,ty);
743 fiz3 = _mm_add_pd(fiz3,tz);
745 fjx1 = _mm_add_pd(fjx1,tx);
746 fjy1 = _mm_add_pd(fjy1,ty);
747 fjz1 = _mm_add_pd(fjz1,tz);
751 /**************************
752 * CALCULATE INTERACTIONS *
753 **************************/
755 if (gmx_mm_any_lt(rsq32,rcutoff2))
758 r32 = _mm_mul_pd(rsq32,rinv32);
760 /* EWALD ELECTROSTATICS */
762 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
763 ewrt = _mm_mul_pd(r32,ewtabscale);
764 ewitab = _mm_cvttpd_epi32(ewrt);
765 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
766 ewitab = _mm_slli_epi32(ewitab,2);
767 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
768 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
769 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
770 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
771 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
772 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
773 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
774 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
775 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
776 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
778 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
780 /* Update potential sum for this i atom from the interaction with this j atom. */
781 velec = _mm_and_pd(velec,cutoff_mask);
782 velecsum = _mm_add_pd(velecsum,velec);
786 fscal = _mm_and_pd(fscal,cutoff_mask);
788 /* Calculate temporary vectorial force */
789 tx = _mm_mul_pd(fscal,dx32);
790 ty = _mm_mul_pd(fscal,dy32);
791 tz = _mm_mul_pd(fscal,dz32);
793 /* Update vectorial force */
794 fix3 = _mm_add_pd(fix3,tx);
795 fiy3 = _mm_add_pd(fiy3,ty);
796 fiz3 = _mm_add_pd(fiz3,tz);
798 fjx2 = _mm_add_pd(fjx2,tx);
799 fjy2 = _mm_add_pd(fjy2,ty);
800 fjz2 = _mm_add_pd(fjz2,tz);
804 /**************************
805 * CALCULATE INTERACTIONS *
806 **************************/
808 if (gmx_mm_any_lt(rsq33,rcutoff2))
811 r33 = _mm_mul_pd(rsq33,rinv33);
813 /* EWALD ELECTROSTATICS */
815 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
816 ewrt = _mm_mul_pd(r33,ewtabscale);
817 ewitab = _mm_cvttpd_epi32(ewrt);
818 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
819 ewitab = _mm_slli_epi32(ewitab,2);
820 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
821 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
822 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
823 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
824 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
825 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
826 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
827 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
828 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
829 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
831 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
833 /* Update potential sum for this i atom from the interaction with this j atom. */
834 velec = _mm_and_pd(velec,cutoff_mask);
835 velecsum = _mm_add_pd(velecsum,velec);
839 fscal = _mm_and_pd(fscal,cutoff_mask);
841 /* Calculate temporary vectorial force */
842 tx = _mm_mul_pd(fscal,dx33);
843 ty = _mm_mul_pd(fscal,dy33);
844 tz = _mm_mul_pd(fscal,dz33);
846 /* Update vectorial force */
847 fix3 = _mm_add_pd(fix3,tx);
848 fiy3 = _mm_add_pd(fiy3,ty);
849 fiz3 = _mm_add_pd(fiz3,tz);
851 fjx3 = _mm_add_pd(fjx3,tx);
852 fjy3 = _mm_add_pd(fjy3,ty);
853 fjz3 = _mm_add_pd(fjz3,tz);
857 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
859 /* Inner loop uses 478 flops */
866 j_coord_offsetA = DIM*jnrA;
868 /* load j atom coordinates */
869 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
870 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
871 &jy2,&jz2,&jx3,&jy3,&jz3);
873 /* Calculate displacement vector */
874 dx00 = _mm_sub_pd(ix0,jx0);
875 dy00 = _mm_sub_pd(iy0,jy0);
876 dz00 = _mm_sub_pd(iz0,jz0);
877 dx11 = _mm_sub_pd(ix1,jx1);
878 dy11 = _mm_sub_pd(iy1,jy1);
879 dz11 = _mm_sub_pd(iz1,jz1);
880 dx12 = _mm_sub_pd(ix1,jx2);
881 dy12 = _mm_sub_pd(iy1,jy2);
882 dz12 = _mm_sub_pd(iz1,jz2);
883 dx13 = _mm_sub_pd(ix1,jx3);
884 dy13 = _mm_sub_pd(iy1,jy3);
885 dz13 = _mm_sub_pd(iz1,jz3);
886 dx21 = _mm_sub_pd(ix2,jx1);
887 dy21 = _mm_sub_pd(iy2,jy1);
888 dz21 = _mm_sub_pd(iz2,jz1);
889 dx22 = _mm_sub_pd(ix2,jx2);
890 dy22 = _mm_sub_pd(iy2,jy2);
891 dz22 = _mm_sub_pd(iz2,jz2);
892 dx23 = _mm_sub_pd(ix2,jx3);
893 dy23 = _mm_sub_pd(iy2,jy3);
894 dz23 = _mm_sub_pd(iz2,jz3);
895 dx31 = _mm_sub_pd(ix3,jx1);
896 dy31 = _mm_sub_pd(iy3,jy1);
897 dz31 = _mm_sub_pd(iz3,jz1);
898 dx32 = _mm_sub_pd(ix3,jx2);
899 dy32 = _mm_sub_pd(iy3,jy2);
900 dz32 = _mm_sub_pd(iz3,jz2);
901 dx33 = _mm_sub_pd(ix3,jx3);
902 dy33 = _mm_sub_pd(iy3,jy3);
903 dz33 = _mm_sub_pd(iz3,jz3);
905 /* Calculate squared distance and things based on it */
906 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
907 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
908 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
909 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
910 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
911 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
912 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
913 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
914 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
915 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
917 rinv00 = gmx_mm_invsqrt_pd(rsq00);
918 rinv11 = gmx_mm_invsqrt_pd(rsq11);
919 rinv12 = gmx_mm_invsqrt_pd(rsq12);
920 rinv13 = gmx_mm_invsqrt_pd(rsq13);
921 rinv21 = gmx_mm_invsqrt_pd(rsq21);
922 rinv22 = gmx_mm_invsqrt_pd(rsq22);
923 rinv23 = gmx_mm_invsqrt_pd(rsq23);
924 rinv31 = gmx_mm_invsqrt_pd(rsq31);
925 rinv32 = gmx_mm_invsqrt_pd(rsq32);
926 rinv33 = gmx_mm_invsqrt_pd(rsq33);
928 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
929 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
930 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
931 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
932 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
933 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
934 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
935 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
936 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
937 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
939 fjx0 = _mm_setzero_pd();
940 fjy0 = _mm_setzero_pd();
941 fjz0 = _mm_setzero_pd();
942 fjx1 = _mm_setzero_pd();
943 fjy1 = _mm_setzero_pd();
944 fjz1 = _mm_setzero_pd();
945 fjx2 = _mm_setzero_pd();
946 fjy2 = _mm_setzero_pd();
947 fjz2 = _mm_setzero_pd();
948 fjx3 = _mm_setzero_pd();
949 fjy3 = _mm_setzero_pd();
950 fjz3 = _mm_setzero_pd();
952 /**************************
953 * CALCULATE INTERACTIONS *
954 **************************/
956 if (gmx_mm_any_lt(rsq00,rcutoff2))
959 r00 = _mm_mul_pd(rsq00,rinv00);
961 /* Analytical LJ-PME */
962 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
963 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
964 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
965 exponent = gmx_simd_exp_d(ewcljrsq);
966 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
967 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
968 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
969 vvdw6 = _mm_mul_pd(_mm_sub_pd(c6_00,_mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly))),rinvsix);
970 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
971 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
972 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_add_pd(_mm_mul_pd(c6_00,sh_vdw_invrcut6),_mm_mul_pd(c6grid_00,sh_lj_ewald))),one_sixth));
973 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
974 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,_mm_sub_pd(vvdw6,_mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6)))),rinvsq00);
976 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
978 /* Update potential sum for this i atom from the interaction with this j atom. */
979 vvdw = _mm_and_pd(vvdw,cutoff_mask);
980 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
981 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
985 fscal = _mm_and_pd(fscal,cutoff_mask);
987 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
989 /* Calculate temporary vectorial force */
990 tx = _mm_mul_pd(fscal,dx00);
991 ty = _mm_mul_pd(fscal,dy00);
992 tz = _mm_mul_pd(fscal,dz00);
994 /* Update vectorial force */
995 fix0 = _mm_add_pd(fix0,tx);
996 fiy0 = _mm_add_pd(fiy0,ty);
997 fiz0 = _mm_add_pd(fiz0,tz);
999 fjx0 = _mm_add_pd(fjx0,tx);
1000 fjy0 = _mm_add_pd(fjy0,ty);
1001 fjz0 = _mm_add_pd(fjz0,tz);
1005 /**************************
1006 * CALCULATE INTERACTIONS *
1007 **************************/
1009 if (gmx_mm_any_lt(rsq11,rcutoff2))
1012 r11 = _mm_mul_pd(rsq11,rinv11);
1014 /* EWALD ELECTROSTATICS */
1016 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1017 ewrt = _mm_mul_pd(r11,ewtabscale);
1018 ewitab = _mm_cvttpd_epi32(ewrt);
1019 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1020 ewitab = _mm_slli_epi32(ewitab,2);
1021 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1022 ewtabD = _mm_setzero_pd();
1023 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1024 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1025 ewtabFn = _mm_setzero_pd();
1026 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1027 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1028 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1029 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
1030 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1032 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1034 /* Update potential sum for this i atom from the interaction with this j atom. */
1035 velec = _mm_and_pd(velec,cutoff_mask);
1036 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1037 velecsum = _mm_add_pd(velecsum,velec);
1041 fscal = _mm_and_pd(fscal,cutoff_mask);
1043 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1045 /* Calculate temporary vectorial force */
1046 tx = _mm_mul_pd(fscal,dx11);
1047 ty = _mm_mul_pd(fscal,dy11);
1048 tz = _mm_mul_pd(fscal,dz11);
1050 /* Update vectorial force */
1051 fix1 = _mm_add_pd(fix1,tx);
1052 fiy1 = _mm_add_pd(fiy1,ty);
1053 fiz1 = _mm_add_pd(fiz1,tz);
1055 fjx1 = _mm_add_pd(fjx1,tx);
1056 fjy1 = _mm_add_pd(fjy1,ty);
1057 fjz1 = _mm_add_pd(fjz1,tz);
1061 /**************************
1062 * CALCULATE INTERACTIONS *
1063 **************************/
1065 if (gmx_mm_any_lt(rsq12,rcutoff2))
1068 r12 = _mm_mul_pd(rsq12,rinv12);
1070 /* EWALD ELECTROSTATICS */
1072 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1073 ewrt = _mm_mul_pd(r12,ewtabscale);
1074 ewitab = _mm_cvttpd_epi32(ewrt);
1075 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1076 ewitab = _mm_slli_epi32(ewitab,2);
1077 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1078 ewtabD = _mm_setzero_pd();
1079 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1080 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1081 ewtabFn = _mm_setzero_pd();
1082 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1083 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1084 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1085 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
1086 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1088 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1090 /* Update potential sum for this i atom from the interaction with this j atom. */
1091 velec = _mm_and_pd(velec,cutoff_mask);
1092 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1093 velecsum = _mm_add_pd(velecsum,velec);
1097 fscal = _mm_and_pd(fscal,cutoff_mask);
1099 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1101 /* Calculate temporary vectorial force */
1102 tx = _mm_mul_pd(fscal,dx12);
1103 ty = _mm_mul_pd(fscal,dy12);
1104 tz = _mm_mul_pd(fscal,dz12);
1106 /* Update vectorial force */
1107 fix1 = _mm_add_pd(fix1,tx);
1108 fiy1 = _mm_add_pd(fiy1,ty);
1109 fiz1 = _mm_add_pd(fiz1,tz);
1111 fjx2 = _mm_add_pd(fjx2,tx);
1112 fjy2 = _mm_add_pd(fjy2,ty);
1113 fjz2 = _mm_add_pd(fjz2,tz);
1117 /**************************
1118 * CALCULATE INTERACTIONS *
1119 **************************/
1121 if (gmx_mm_any_lt(rsq13,rcutoff2))
1124 r13 = _mm_mul_pd(rsq13,rinv13);
1126 /* EWALD ELECTROSTATICS */
1128 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1129 ewrt = _mm_mul_pd(r13,ewtabscale);
1130 ewitab = _mm_cvttpd_epi32(ewrt);
1131 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1132 ewitab = _mm_slli_epi32(ewitab,2);
1133 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1134 ewtabD = _mm_setzero_pd();
1135 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1136 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1137 ewtabFn = _mm_setzero_pd();
1138 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1139 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1140 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1141 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
1142 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1144 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1146 /* Update potential sum for this i atom from the interaction with this j atom. */
1147 velec = _mm_and_pd(velec,cutoff_mask);
1148 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1149 velecsum = _mm_add_pd(velecsum,velec);
1153 fscal = _mm_and_pd(fscal,cutoff_mask);
1155 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1157 /* Calculate temporary vectorial force */
1158 tx = _mm_mul_pd(fscal,dx13);
1159 ty = _mm_mul_pd(fscal,dy13);
1160 tz = _mm_mul_pd(fscal,dz13);
1162 /* Update vectorial force */
1163 fix1 = _mm_add_pd(fix1,tx);
1164 fiy1 = _mm_add_pd(fiy1,ty);
1165 fiz1 = _mm_add_pd(fiz1,tz);
1167 fjx3 = _mm_add_pd(fjx3,tx);
1168 fjy3 = _mm_add_pd(fjy3,ty);
1169 fjz3 = _mm_add_pd(fjz3,tz);
1173 /**************************
1174 * CALCULATE INTERACTIONS *
1175 **************************/
1177 if (gmx_mm_any_lt(rsq21,rcutoff2))
1180 r21 = _mm_mul_pd(rsq21,rinv21);
1182 /* EWALD ELECTROSTATICS */
1184 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1185 ewrt = _mm_mul_pd(r21,ewtabscale);
1186 ewitab = _mm_cvttpd_epi32(ewrt);
1187 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1188 ewitab = _mm_slli_epi32(ewitab,2);
1189 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1190 ewtabD = _mm_setzero_pd();
1191 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1192 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1193 ewtabFn = _mm_setzero_pd();
1194 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1195 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1196 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1197 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
1198 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1200 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1202 /* Update potential sum for this i atom from the interaction with this j atom. */
1203 velec = _mm_and_pd(velec,cutoff_mask);
1204 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1205 velecsum = _mm_add_pd(velecsum,velec);
1209 fscal = _mm_and_pd(fscal,cutoff_mask);
1211 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1213 /* Calculate temporary vectorial force */
1214 tx = _mm_mul_pd(fscal,dx21);
1215 ty = _mm_mul_pd(fscal,dy21);
1216 tz = _mm_mul_pd(fscal,dz21);
1218 /* Update vectorial force */
1219 fix2 = _mm_add_pd(fix2,tx);
1220 fiy2 = _mm_add_pd(fiy2,ty);
1221 fiz2 = _mm_add_pd(fiz2,tz);
1223 fjx1 = _mm_add_pd(fjx1,tx);
1224 fjy1 = _mm_add_pd(fjy1,ty);
1225 fjz1 = _mm_add_pd(fjz1,tz);
1229 /**************************
1230 * CALCULATE INTERACTIONS *
1231 **************************/
1233 if (gmx_mm_any_lt(rsq22,rcutoff2))
1236 r22 = _mm_mul_pd(rsq22,rinv22);
1238 /* EWALD ELECTROSTATICS */
1240 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1241 ewrt = _mm_mul_pd(r22,ewtabscale);
1242 ewitab = _mm_cvttpd_epi32(ewrt);
1243 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1244 ewitab = _mm_slli_epi32(ewitab,2);
1245 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1246 ewtabD = _mm_setzero_pd();
1247 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1248 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1249 ewtabFn = _mm_setzero_pd();
1250 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1251 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1252 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1253 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
1254 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1256 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1258 /* Update potential sum for this i atom from the interaction with this j atom. */
1259 velec = _mm_and_pd(velec,cutoff_mask);
1260 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1261 velecsum = _mm_add_pd(velecsum,velec);
1265 fscal = _mm_and_pd(fscal,cutoff_mask);
1267 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1269 /* Calculate temporary vectorial force */
1270 tx = _mm_mul_pd(fscal,dx22);
1271 ty = _mm_mul_pd(fscal,dy22);
1272 tz = _mm_mul_pd(fscal,dz22);
1274 /* Update vectorial force */
1275 fix2 = _mm_add_pd(fix2,tx);
1276 fiy2 = _mm_add_pd(fiy2,ty);
1277 fiz2 = _mm_add_pd(fiz2,tz);
1279 fjx2 = _mm_add_pd(fjx2,tx);
1280 fjy2 = _mm_add_pd(fjy2,ty);
1281 fjz2 = _mm_add_pd(fjz2,tz);
1285 /**************************
1286 * CALCULATE INTERACTIONS *
1287 **************************/
1289 if (gmx_mm_any_lt(rsq23,rcutoff2))
1292 r23 = _mm_mul_pd(rsq23,rinv23);
1294 /* EWALD ELECTROSTATICS */
1296 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1297 ewrt = _mm_mul_pd(r23,ewtabscale);
1298 ewitab = _mm_cvttpd_epi32(ewrt);
1299 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1300 ewitab = _mm_slli_epi32(ewitab,2);
1301 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1302 ewtabD = _mm_setzero_pd();
1303 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1304 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1305 ewtabFn = _mm_setzero_pd();
1306 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1307 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1308 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1309 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
1310 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1312 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1314 /* Update potential sum for this i atom from the interaction with this j atom. */
1315 velec = _mm_and_pd(velec,cutoff_mask);
1316 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1317 velecsum = _mm_add_pd(velecsum,velec);
1321 fscal = _mm_and_pd(fscal,cutoff_mask);
1323 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1325 /* Calculate temporary vectorial force */
1326 tx = _mm_mul_pd(fscal,dx23);
1327 ty = _mm_mul_pd(fscal,dy23);
1328 tz = _mm_mul_pd(fscal,dz23);
1330 /* Update vectorial force */
1331 fix2 = _mm_add_pd(fix2,tx);
1332 fiy2 = _mm_add_pd(fiy2,ty);
1333 fiz2 = _mm_add_pd(fiz2,tz);
1335 fjx3 = _mm_add_pd(fjx3,tx);
1336 fjy3 = _mm_add_pd(fjy3,ty);
1337 fjz3 = _mm_add_pd(fjz3,tz);
1341 /**************************
1342 * CALCULATE INTERACTIONS *
1343 **************************/
1345 if (gmx_mm_any_lt(rsq31,rcutoff2))
1348 r31 = _mm_mul_pd(rsq31,rinv31);
1350 /* EWALD ELECTROSTATICS */
1352 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1353 ewrt = _mm_mul_pd(r31,ewtabscale);
1354 ewitab = _mm_cvttpd_epi32(ewrt);
1355 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1356 ewitab = _mm_slli_epi32(ewitab,2);
1357 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1358 ewtabD = _mm_setzero_pd();
1359 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1360 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1361 ewtabFn = _mm_setzero_pd();
1362 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1363 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1364 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1365 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
1366 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1368 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1370 /* Update potential sum for this i atom from the interaction with this j atom. */
1371 velec = _mm_and_pd(velec,cutoff_mask);
1372 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1373 velecsum = _mm_add_pd(velecsum,velec);
1377 fscal = _mm_and_pd(fscal,cutoff_mask);
1379 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1381 /* Calculate temporary vectorial force */
1382 tx = _mm_mul_pd(fscal,dx31);
1383 ty = _mm_mul_pd(fscal,dy31);
1384 tz = _mm_mul_pd(fscal,dz31);
1386 /* Update vectorial force */
1387 fix3 = _mm_add_pd(fix3,tx);
1388 fiy3 = _mm_add_pd(fiy3,ty);
1389 fiz3 = _mm_add_pd(fiz3,tz);
1391 fjx1 = _mm_add_pd(fjx1,tx);
1392 fjy1 = _mm_add_pd(fjy1,ty);
1393 fjz1 = _mm_add_pd(fjz1,tz);
1397 /**************************
1398 * CALCULATE INTERACTIONS *
1399 **************************/
1401 if (gmx_mm_any_lt(rsq32,rcutoff2))
1404 r32 = _mm_mul_pd(rsq32,rinv32);
1406 /* EWALD ELECTROSTATICS */
1408 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1409 ewrt = _mm_mul_pd(r32,ewtabscale);
1410 ewitab = _mm_cvttpd_epi32(ewrt);
1411 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1412 ewitab = _mm_slli_epi32(ewitab,2);
1413 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1414 ewtabD = _mm_setzero_pd();
1415 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1416 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1417 ewtabFn = _mm_setzero_pd();
1418 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1419 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1420 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1421 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
1422 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1424 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1426 /* Update potential sum for this i atom from the interaction with this j atom. */
1427 velec = _mm_and_pd(velec,cutoff_mask);
1428 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1429 velecsum = _mm_add_pd(velecsum,velec);
1433 fscal = _mm_and_pd(fscal,cutoff_mask);
1435 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1437 /* Calculate temporary vectorial force */
1438 tx = _mm_mul_pd(fscal,dx32);
1439 ty = _mm_mul_pd(fscal,dy32);
1440 tz = _mm_mul_pd(fscal,dz32);
1442 /* Update vectorial force */
1443 fix3 = _mm_add_pd(fix3,tx);
1444 fiy3 = _mm_add_pd(fiy3,ty);
1445 fiz3 = _mm_add_pd(fiz3,tz);
1447 fjx2 = _mm_add_pd(fjx2,tx);
1448 fjy2 = _mm_add_pd(fjy2,ty);
1449 fjz2 = _mm_add_pd(fjz2,tz);
1453 /**************************
1454 * CALCULATE INTERACTIONS *
1455 **************************/
1457 if (gmx_mm_any_lt(rsq33,rcutoff2))
1460 r33 = _mm_mul_pd(rsq33,rinv33);
1462 /* EWALD ELECTROSTATICS */
1464 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1465 ewrt = _mm_mul_pd(r33,ewtabscale);
1466 ewitab = _mm_cvttpd_epi32(ewrt);
1467 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1468 ewitab = _mm_slli_epi32(ewitab,2);
1469 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1470 ewtabD = _mm_setzero_pd();
1471 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1472 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1473 ewtabFn = _mm_setzero_pd();
1474 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1475 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1476 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1477 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
1478 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1480 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1482 /* Update potential sum for this i atom from the interaction with this j atom. */
1483 velec = _mm_and_pd(velec,cutoff_mask);
1484 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1485 velecsum = _mm_add_pd(velecsum,velec);
1489 fscal = _mm_and_pd(fscal,cutoff_mask);
1491 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1493 /* Calculate temporary vectorial force */
1494 tx = _mm_mul_pd(fscal,dx33);
1495 ty = _mm_mul_pd(fscal,dy33);
1496 tz = _mm_mul_pd(fscal,dz33);
1498 /* Update vectorial force */
1499 fix3 = _mm_add_pd(fix3,tx);
1500 fiy3 = _mm_add_pd(fiy3,ty);
1501 fiz3 = _mm_add_pd(fiz3,tz);
1503 fjx3 = _mm_add_pd(fjx3,tx);
1504 fjy3 = _mm_add_pd(fjy3,ty);
1505 fjz3 = _mm_add_pd(fjz3,tz);
1509 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1511 /* Inner loop uses 478 flops */
1514 /* End of innermost loop */
1516 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1517 f+i_coord_offset,fshift+i_shift_offset);
1520 /* Update potential energies */
1521 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1522 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1524 /* Increment number of inner iterations */
1525 inneriter += j_index_end - j_index_start;
1527 /* Outer loop uses 26 flops */
1530 /* Increment number of outer iterations */
1533 /* Update outer/inner flops */
1535 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*478);
1538 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_double
1539 * Electrostatics interaction: Ewald
1540 * VdW interaction: LJEwald
1541 * Geometry: Water4-Water4
1542 * Calculate force/pot: Force
1545 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_double
1546 (t_nblist * gmx_restrict nlist,
1547 rvec * gmx_restrict xx,
1548 rvec * gmx_restrict ff,
1549 t_forcerec * gmx_restrict fr,
1550 t_mdatoms * gmx_restrict mdatoms,
1551 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1552 t_nrnb * gmx_restrict nrnb)
1554 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1555 * just 0 for non-waters.
1556 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1557 * jnr indices corresponding to data put in the four positions in the SIMD register.
1559 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1560 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1562 int j_coord_offsetA,j_coord_offsetB;
1563 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1564 real rcutoff_scalar;
1565 real *shiftvec,*fshift,*x,*f;
1566 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1568 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1570 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1572 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1574 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1575 int vdwjidx0A,vdwjidx0B;
1576 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1577 int vdwjidx1A,vdwjidx1B;
1578 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1579 int vdwjidx2A,vdwjidx2B;
1580 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1581 int vdwjidx3A,vdwjidx3B;
1582 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1583 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1584 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1585 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1586 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1587 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1588 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1589 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1590 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1591 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1592 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1593 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1596 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1599 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1600 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1611 __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1613 __m128d one_half = _mm_set1_pd(0.5);
1614 __m128d minus_one = _mm_set1_pd(-1.0);
1616 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1618 __m128d dummy_mask,cutoff_mask;
1619 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1620 __m128d one = _mm_set1_pd(1.0);
1621 __m128d two = _mm_set1_pd(2.0);
1627 jindex = nlist->jindex;
1629 shiftidx = nlist->shift;
1631 shiftvec = fr->shift_vec[0];
1632 fshift = fr->fshift[0];
1633 facel = _mm_set1_pd(fr->epsfac);
1634 charge = mdatoms->chargeA;
1635 nvdwtype = fr->ntype;
1636 vdwparam = fr->nbfp;
1637 vdwtype = mdatoms->typeA;
1638 vdwgridparam = fr->ljpme_c6grid;
1639 sh_lj_ewald = _mm_set1_pd(fr->ic->sh_lj_ewald);
1640 ewclj = _mm_set1_pd(fr->ewaldcoeff_lj);
1641 ewclj2 = _mm_mul_pd(minus_one,_mm_mul_pd(ewclj,ewclj));
1643 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1644 ewtab = fr->ic->tabq_coul_F;
1645 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1646 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1648 /* Setup water-specific parameters */
1649 inr = nlist->iinr[0];
1650 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1651 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1652 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1653 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1655 jq1 = _mm_set1_pd(charge[inr+1]);
1656 jq2 = _mm_set1_pd(charge[inr+2]);
1657 jq3 = _mm_set1_pd(charge[inr+3]);
1658 vdwjidx0A = 2*vdwtype[inr+0];
1659 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1660 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1661 c6grid_00 = _mm_set1_pd(vdwgridparam[vdwioffset0+vdwjidx0A]);
1662 qq11 = _mm_mul_pd(iq1,jq1);
1663 qq12 = _mm_mul_pd(iq1,jq2);
1664 qq13 = _mm_mul_pd(iq1,jq3);
1665 qq21 = _mm_mul_pd(iq2,jq1);
1666 qq22 = _mm_mul_pd(iq2,jq2);
1667 qq23 = _mm_mul_pd(iq2,jq3);
1668 qq31 = _mm_mul_pd(iq3,jq1);
1669 qq32 = _mm_mul_pd(iq3,jq2);
1670 qq33 = _mm_mul_pd(iq3,jq3);
1672 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1673 rcutoff_scalar = fr->rcoulomb;
1674 rcutoff = _mm_set1_pd(rcutoff_scalar);
1675 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1677 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
1678 rvdw = _mm_set1_pd(fr->rvdw);
1680 /* Avoid stupid compiler warnings */
1682 j_coord_offsetA = 0;
1683 j_coord_offsetB = 0;
1688 /* Start outer loop over neighborlists */
1689 for(iidx=0; iidx<nri; iidx++)
1691 /* Load shift vector for this list */
1692 i_shift_offset = DIM*shiftidx[iidx];
1694 /* Load limits for loop over neighbors */
1695 j_index_start = jindex[iidx];
1696 j_index_end = jindex[iidx+1];
1698 /* Get outer coordinate index */
1700 i_coord_offset = DIM*inr;
1702 /* Load i particle coords and add shift vector */
1703 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1704 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1706 fix0 = _mm_setzero_pd();
1707 fiy0 = _mm_setzero_pd();
1708 fiz0 = _mm_setzero_pd();
1709 fix1 = _mm_setzero_pd();
1710 fiy1 = _mm_setzero_pd();
1711 fiz1 = _mm_setzero_pd();
1712 fix2 = _mm_setzero_pd();
1713 fiy2 = _mm_setzero_pd();
1714 fiz2 = _mm_setzero_pd();
1715 fix3 = _mm_setzero_pd();
1716 fiy3 = _mm_setzero_pd();
1717 fiz3 = _mm_setzero_pd();
1719 /* Start inner kernel loop */
1720 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1723 /* Get j neighbor index, and coordinate index */
1725 jnrB = jjnr[jidx+1];
1726 j_coord_offsetA = DIM*jnrA;
1727 j_coord_offsetB = DIM*jnrB;
1729 /* load j atom coordinates */
1730 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1731 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1732 &jy2,&jz2,&jx3,&jy3,&jz3);
1734 /* Calculate displacement vector */
1735 dx00 = _mm_sub_pd(ix0,jx0);
1736 dy00 = _mm_sub_pd(iy0,jy0);
1737 dz00 = _mm_sub_pd(iz0,jz0);
1738 dx11 = _mm_sub_pd(ix1,jx1);
1739 dy11 = _mm_sub_pd(iy1,jy1);
1740 dz11 = _mm_sub_pd(iz1,jz1);
1741 dx12 = _mm_sub_pd(ix1,jx2);
1742 dy12 = _mm_sub_pd(iy1,jy2);
1743 dz12 = _mm_sub_pd(iz1,jz2);
1744 dx13 = _mm_sub_pd(ix1,jx3);
1745 dy13 = _mm_sub_pd(iy1,jy3);
1746 dz13 = _mm_sub_pd(iz1,jz3);
1747 dx21 = _mm_sub_pd(ix2,jx1);
1748 dy21 = _mm_sub_pd(iy2,jy1);
1749 dz21 = _mm_sub_pd(iz2,jz1);
1750 dx22 = _mm_sub_pd(ix2,jx2);
1751 dy22 = _mm_sub_pd(iy2,jy2);
1752 dz22 = _mm_sub_pd(iz2,jz2);
1753 dx23 = _mm_sub_pd(ix2,jx3);
1754 dy23 = _mm_sub_pd(iy2,jy3);
1755 dz23 = _mm_sub_pd(iz2,jz3);
1756 dx31 = _mm_sub_pd(ix3,jx1);
1757 dy31 = _mm_sub_pd(iy3,jy1);
1758 dz31 = _mm_sub_pd(iz3,jz1);
1759 dx32 = _mm_sub_pd(ix3,jx2);
1760 dy32 = _mm_sub_pd(iy3,jy2);
1761 dz32 = _mm_sub_pd(iz3,jz2);
1762 dx33 = _mm_sub_pd(ix3,jx3);
1763 dy33 = _mm_sub_pd(iy3,jy3);
1764 dz33 = _mm_sub_pd(iz3,jz3);
1766 /* Calculate squared distance and things based on it */
1767 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1768 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1769 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1770 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1771 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1772 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1773 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1774 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1775 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1776 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1778 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1779 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1780 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1781 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1782 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1783 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1784 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1785 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1786 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1787 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1789 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1790 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1791 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1792 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1793 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1794 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1795 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1796 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1797 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1798 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1800 fjx0 = _mm_setzero_pd();
1801 fjy0 = _mm_setzero_pd();
1802 fjz0 = _mm_setzero_pd();
1803 fjx1 = _mm_setzero_pd();
1804 fjy1 = _mm_setzero_pd();
1805 fjz1 = _mm_setzero_pd();
1806 fjx2 = _mm_setzero_pd();
1807 fjy2 = _mm_setzero_pd();
1808 fjz2 = _mm_setzero_pd();
1809 fjx3 = _mm_setzero_pd();
1810 fjy3 = _mm_setzero_pd();
1811 fjz3 = _mm_setzero_pd();
1813 /**************************
1814 * CALCULATE INTERACTIONS *
1815 **************************/
1817 if (gmx_mm_any_lt(rsq00,rcutoff2))
1820 r00 = _mm_mul_pd(rsq00,rinv00);
1822 /* Analytical LJ-PME */
1823 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1824 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
1825 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
1826 exponent = gmx_simd_exp_d(ewcljrsq);
1827 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1828 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
1829 /* f6A = 6 * C6grid * (1 - poly) */
1830 f6A = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
1831 /* f6B = C6grid * exponent * beta^6 */
1832 f6B = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
1833 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1834 fvdw = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1836 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1840 fscal = _mm_and_pd(fscal,cutoff_mask);
1842 /* Calculate temporary vectorial force */
1843 tx = _mm_mul_pd(fscal,dx00);
1844 ty = _mm_mul_pd(fscal,dy00);
1845 tz = _mm_mul_pd(fscal,dz00);
1847 /* Update vectorial force */
1848 fix0 = _mm_add_pd(fix0,tx);
1849 fiy0 = _mm_add_pd(fiy0,ty);
1850 fiz0 = _mm_add_pd(fiz0,tz);
1852 fjx0 = _mm_add_pd(fjx0,tx);
1853 fjy0 = _mm_add_pd(fjy0,ty);
1854 fjz0 = _mm_add_pd(fjz0,tz);
1858 /**************************
1859 * CALCULATE INTERACTIONS *
1860 **************************/
1862 if (gmx_mm_any_lt(rsq11,rcutoff2))
1865 r11 = _mm_mul_pd(rsq11,rinv11);
1867 /* EWALD ELECTROSTATICS */
1869 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1870 ewrt = _mm_mul_pd(r11,ewtabscale);
1871 ewitab = _mm_cvttpd_epi32(ewrt);
1872 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1873 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1875 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1876 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1878 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1882 fscal = _mm_and_pd(fscal,cutoff_mask);
1884 /* Calculate temporary vectorial force */
1885 tx = _mm_mul_pd(fscal,dx11);
1886 ty = _mm_mul_pd(fscal,dy11);
1887 tz = _mm_mul_pd(fscal,dz11);
1889 /* Update vectorial force */
1890 fix1 = _mm_add_pd(fix1,tx);
1891 fiy1 = _mm_add_pd(fiy1,ty);
1892 fiz1 = _mm_add_pd(fiz1,tz);
1894 fjx1 = _mm_add_pd(fjx1,tx);
1895 fjy1 = _mm_add_pd(fjy1,ty);
1896 fjz1 = _mm_add_pd(fjz1,tz);
1900 /**************************
1901 * CALCULATE INTERACTIONS *
1902 **************************/
1904 if (gmx_mm_any_lt(rsq12,rcutoff2))
1907 r12 = _mm_mul_pd(rsq12,rinv12);
1909 /* EWALD ELECTROSTATICS */
1911 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1912 ewrt = _mm_mul_pd(r12,ewtabscale);
1913 ewitab = _mm_cvttpd_epi32(ewrt);
1914 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1915 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1917 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1918 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1920 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1924 fscal = _mm_and_pd(fscal,cutoff_mask);
1926 /* Calculate temporary vectorial force */
1927 tx = _mm_mul_pd(fscal,dx12);
1928 ty = _mm_mul_pd(fscal,dy12);
1929 tz = _mm_mul_pd(fscal,dz12);
1931 /* Update vectorial force */
1932 fix1 = _mm_add_pd(fix1,tx);
1933 fiy1 = _mm_add_pd(fiy1,ty);
1934 fiz1 = _mm_add_pd(fiz1,tz);
1936 fjx2 = _mm_add_pd(fjx2,tx);
1937 fjy2 = _mm_add_pd(fjy2,ty);
1938 fjz2 = _mm_add_pd(fjz2,tz);
1942 /**************************
1943 * CALCULATE INTERACTIONS *
1944 **************************/
1946 if (gmx_mm_any_lt(rsq13,rcutoff2))
1949 r13 = _mm_mul_pd(rsq13,rinv13);
1951 /* EWALD ELECTROSTATICS */
1953 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1954 ewrt = _mm_mul_pd(r13,ewtabscale);
1955 ewitab = _mm_cvttpd_epi32(ewrt);
1956 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1957 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1959 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1960 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1962 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1966 fscal = _mm_and_pd(fscal,cutoff_mask);
1968 /* Calculate temporary vectorial force */
1969 tx = _mm_mul_pd(fscal,dx13);
1970 ty = _mm_mul_pd(fscal,dy13);
1971 tz = _mm_mul_pd(fscal,dz13);
1973 /* Update vectorial force */
1974 fix1 = _mm_add_pd(fix1,tx);
1975 fiy1 = _mm_add_pd(fiy1,ty);
1976 fiz1 = _mm_add_pd(fiz1,tz);
1978 fjx3 = _mm_add_pd(fjx3,tx);
1979 fjy3 = _mm_add_pd(fjy3,ty);
1980 fjz3 = _mm_add_pd(fjz3,tz);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 if (gmx_mm_any_lt(rsq21,rcutoff2))
1991 r21 = _mm_mul_pd(rsq21,rinv21);
1993 /* EWALD ELECTROSTATICS */
1995 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1996 ewrt = _mm_mul_pd(r21,ewtabscale);
1997 ewitab = _mm_cvttpd_epi32(ewrt);
1998 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1999 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2001 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2002 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2004 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2008 fscal = _mm_and_pd(fscal,cutoff_mask);
2010 /* Calculate temporary vectorial force */
2011 tx = _mm_mul_pd(fscal,dx21);
2012 ty = _mm_mul_pd(fscal,dy21);
2013 tz = _mm_mul_pd(fscal,dz21);
2015 /* Update vectorial force */
2016 fix2 = _mm_add_pd(fix2,tx);
2017 fiy2 = _mm_add_pd(fiy2,ty);
2018 fiz2 = _mm_add_pd(fiz2,tz);
2020 fjx1 = _mm_add_pd(fjx1,tx);
2021 fjy1 = _mm_add_pd(fjy1,ty);
2022 fjz1 = _mm_add_pd(fjz1,tz);
2026 /**************************
2027 * CALCULATE INTERACTIONS *
2028 **************************/
2030 if (gmx_mm_any_lt(rsq22,rcutoff2))
2033 r22 = _mm_mul_pd(rsq22,rinv22);
2035 /* EWALD ELECTROSTATICS */
2037 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2038 ewrt = _mm_mul_pd(r22,ewtabscale);
2039 ewitab = _mm_cvttpd_epi32(ewrt);
2040 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2041 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2043 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2044 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2046 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2050 fscal = _mm_and_pd(fscal,cutoff_mask);
2052 /* Calculate temporary vectorial force */
2053 tx = _mm_mul_pd(fscal,dx22);
2054 ty = _mm_mul_pd(fscal,dy22);
2055 tz = _mm_mul_pd(fscal,dz22);
2057 /* Update vectorial force */
2058 fix2 = _mm_add_pd(fix2,tx);
2059 fiy2 = _mm_add_pd(fiy2,ty);
2060 fiz2 = _mm_add_pd(fiz2,tz);
2062 fjx2 = _mm_add_pd(fjx2,tx);
2063 fjy2 = _mm_add_pd(fjy2,ty);
2064 fjz2 = _mm_add_pd(fjz2,tz);
2068 /**************************
2069 * CALCULATE INTERACTIONS *
2070 **************************/
2072 if (gmx_mm_any_lt(rsq23,rcutoff2))
2075 r23 = _mm_mul_pd(rsq23,rinv23);
2077 /* EWALD ELECTROSTATICS */
2079 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2080 ewrt = _mm_mul_pd(r23,ewtabscale);
2081 ewitab = _mm_cvttpd_epi32(ewrt);
2082 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2083 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2085 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2086 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2088 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2092 fscal = _mm_and_pd(fscal,cutoff_mask);
2094 /* Calculate temporary vectorial force */
2095 tx = _mm_mul_pd(fscal,dx23);
2096 ty = _mm_mul_pd(fscal,dy23);
2097 tz = _mm_mul_pd(fscal,dz23);
2099 /* Update vectorial force */
2100 fix2 = _mm_add_pd(fix2,tx);
2101 fiy2 = _mm_add_pd(fiy2,ty);
2102 fiz2 = _mm_add_pd(fiz2,tz);
2104 fjx3 = _mm_add_pd(fjx3,tx);
2105 fjy3 = _mm_add_pd(fjy3,ty);
2106 fjz3 = _mm_add_pd(fjz3,tz);
2110 /**************************
2111 * CALCULATE INTERACTIONS *
2112 **************************/
2114 if (gmx_mm_any_lt(rsq31,rcutoff2))
2117 r31 = _mm_mul_pd(rsq31,rinv31);
2119 /* EWALD ELECTROSTATICS */
2121 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2122 ewrt = _mm_mul_pd(r31,ewtabscale);
2123 ewitab = _mm_cvttpd_epi32(ewrt);
2124 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2125 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2127 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2128 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2130 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2134 fscal = _mm_and_pd(fscal,cutoff_mask);
2136 /* Calculate temporary vectorial force */
2137 tx = _mm_mul_pd(fscal,dx31);
2138 ty = _mm_mul_pd(fscal,dy31);
2139 tz = _mm_mul_pd(fscal,dz31);
2141 /* Update vectorial force */
2142 fix3 = _mm_add_pd(fix3,tx);
2143 fiy3 = _mm_add_pd(fiy3,ty);
2144 fiz3 = _mm_add_pd(fiz3,tz);
2146 fjx1 = _mm_add_pd(fjx1,tx);
2147 fjy1 = _mm_add_pd(fjy1,ty);
2148 fjz1 = _mm_add_pd(fjz1,tz);
2152 /**************************
2153 * CALCULATE INTERACTIONS *
2154 **************************/
2156 if (gmx_mm_any_lt(rsq32,rcutoff2))
2159 r32 = _mm_mul_pd(rsq32,rinv32);
2161 /* EWALD ELECTROSTATICS */
2163 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2164 ewrt = _mm_mul_pd(r32,ewtabscale);
2165 ewitab = _mm_cvttpd_epi32(ewrt);
2166 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2167 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2169 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2170 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2172 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2176 fscal = _mm_and_pd(fscal,cutoff_mask);
2178 /* Calculate temporary vectorial force */
2179 tx = _mm_mul_pd(fscal,dx32);
2180 ty = _mm_mul_pd(fscal,dy32);
2181 tz = _mm_mul_pd(fscal,dz32);
2183 /* Update vectorial force */
2184 fix3 = _mm_add_pd(fix3,tx);
2185 fiy3 = _mm_add_pd(fiy3,ty);
2186 fiz3 = _mm_add_pd(fiz3,tz);
2188 fjx2 = _mm_add_pd(fjx2,tx);
2189 fjy2 = _mm_add_pd(fjy2,ty);
2190 fjz2 = _mm_add_pd(fjz2,tz);
2194 /**************************
2195 * CALCULATE INTERACTIONS *
2196 **************************/
2198 if (gmx_mm_any_lt(rsq33,rcutoff2))
2201 r33 = _mm_mul_pd(rsq33,rinv33);
2203 /* EWALD ELECTROSTATICS */
2205 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2206 ewrt = _mm_mul_pd(r33,ewtabscale);
2207 ewitab = _mm_cvttpd_epi32(ewrt);
2208 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2209 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2211 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2212 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2214 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2218 fscal = _mm_and_pd(fscal,cutoff_mask);
2220 /* Calculate temporary vectorial force */
2221 tx = _mm_mul_pd(fscal,dx33);
2222 ty = _mm_mul_pd(fscal,dy33);
2223 tz = _mm_mul_pd(fscal,dz33);
2225 /* Update vectorial force */
2226 fix3 = _mm_add_pd(fix3,tx);
2227 fiy3 = _mm_add_pd(fiy3,ty);
2228 fiz3 = _mm_add_pd(fiz3,tz);
2230 fjx3 = _mm_add_pd(fjx3,tx);
2231 fjy3 = _mm_add_pd(fjy3,ty);
2232 fjz3 = _mm_add_pd(fjz3,tz);
2236 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2238 /* Inner loop uses 403 flops */
2241 if(jidx<j_index_end)
2245 j_coord_offsetA = DIM*jnrA;
2247 /* load j atom coordinates */
2248 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2249 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2250 &jy2,&jz2,&jx3,&jy3,&jz3);
2252 /* Calculate displacement vector */
2253 dx00 = _mm_sub_pd(ix0,jx0);
2254 dy00 = _mm_sub_pd(iy0,jy0);
2255 dz00 = _mm_sub_pd(iz0,jz0);
2256 dx11 = _mm_sub_pd(ix1,jx1);
2257 dy11 = _mm_sub_pd(iy1,jy1);
2258 dz11 = _mm_sub_pd(iz1,jz1);
2259 dx12 = _mm_sub_pd(ix1,jx2);
2260 dy12 = _mm_sub_pd(iy1,jy2);
2261 dz12 = _mm_sub_pd(iz1,jz2);
2262 dx13 = _mm_sub_pd(ix1,jx3);
2263 dy13 = _mm_sub_pd(iy1,jy3);
2264 dz13 = _mm_sub_pd(iz1,jz3);
2265 dx21 = _mm_sub_pd(ix2,jx1);
2266 dy21 = _mm_sub_pd(iy2,jy1);
2267 dz21 = _mm_sub_pd(iz2,jz1);
2268 dx22 = _mm_sub_pd(ix2,jx2);
2269 dy22 = _mm_sub_pd(iy2,jy2);
2270 dz22 = _mm_sub_pd(iz2,jz2);
2271 dx23 = _mm_sub_pd(ix2,jx3);
2272 dy23 = _mm_sub_pd(iy2,jy3);
2273 dz23 = _mm_sub_pd(iz2,jz3);
2274 dx31 = _mm_sub_pd(ix3,jx1);
2275 dy31 = _mm_sub_pd(iy3,jy1);
2276 dz31 = _mm_sub_pd(iz3,jz1);
2277 dx32 = _mm_sub_pd(ix3,jx2);
2278 dy32 = _mm_sub_pd(iy3,jy2);
2279 dz32 = _mm_sub_pd(iz3,jz2);
2280 dx33 = _mm_sub_pd(ix3,jx3);
2281 dy33 = _mm_sub_pd(iy3,jy3);
2282 dz33 = _mm_sub_pd(iz3,jz3);
2284 /* Calculate squared distance and things based on it */
2285 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2286 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2287 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2288 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2289 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2290 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2291 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2292 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2293 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2294 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2296 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2297 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2298 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2299 rinv13 = gmx_mm_invsqrt_pd(rsq13);
2300 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2301 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2302 rinv23 = gmx_mm_invsqrt_pd(rsq23);
2303 rinv31 = gmx_mm_invsqrt_pd(rsq31);
2304 rinv32 = gmx_mm_invsqrt_pd(rsq32);
2305 rinv33 = gmx_mm_invsqrt_pd(rsq33);
2307 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2308 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2309 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2310 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
2311 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2312 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2313 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
2314 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
2315 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
2316 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
2318 fjx0 = _mm_setzero_pd();
2319 fjy0 = _mm_setzero_pd();
2320 fjz0 = _mm_setzero_pd();
2321 fjx1 = _mm_setzero_pd();
2322 fjy1 = _mm_setzero_pd();
2323 fjz1 = _mm_setzero_pd();
2324 fjx2 = _mm_setzero_pd();
2325 fjy2 = _mm_setzero_pd();
2326 fjz2 = _mm_setzero_pd();
2327 fjx3 = _mm_setzero_pd();
2328 fjy3 = _mm_setzero_pd();
2329 fjz3 = _mm_setzero_pd();
2331 /**************************
2332 * CALCULATE INTERACTIONS *
2333 **************************/
2335 if (gmx_mm_any_lt(rsq00,rcutoff2))
2338 r00 = _mm_mul_pd(rsq00,rinv00);
2340 /* Analytical LJ-PME */
2341 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2342 ewcljrsq = _mm_mul_pd(ewclj2,rsq00);
2343 ewclj6 = _mm_mul_pd(ewclj2,_mm_mul_pd(ewclj2,ewclj2));
2344 exponent = gmx_simd_exp_d(ewcljrsq);
2345 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2346 poly = _mm_mul_pd(exponent,_mm_add_pd(_mm_sub_pd(one,ewcljrsq),_mm_mul_pd(_mm_mul_pd(ewcljrsq,ewcljrsq),one_half)));
2347 /* f6A = 6 * C6grid * (1 - poly) */
2348 f6A = _mm_mul_pd(c6grid_00,_mm_sub_pd(one,poly));
2349 /* f6B = C6grid * exponent * beta^6 */
2350 f6B = _mm_mul_pd(_mm_mul_pd(c6grid_00,one_sixth),_mm_mul_pd(exponent,ewclj6));
2351 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2352 fvdw = _mm_mul_pd(_mm_add_pd(_mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),_mm_sub_pd(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2354 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2358 fscal = _mm_and_pd(fscal,cutoff_mask);
2360 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2362 /* Calculate temporary vectorial force */
2363 tx = _mm_mul_pd(fscal,dx00);
2364 ty = _mm_mul_pd(fscal,dy00);
2365 tz = _mm_mul_pd(fscal,dz00);
2367 /* Update vectorial force */
2368 fix0 = _mm_add_pd(fix0,tx);
2369 fiy0 = _mm_add_pd(fiy0,ty);
2370 fiz0 = _mm_add_pd(fiz0,tz);
2372 fjx0 = _mm_add_pd(fjx0,tx);
2373 fjy0 = _mm_add_pd(fjy0,ty);
2374 fjz0 = _mm_add_pd(fjz0,tz);
2378 /**************************
2379 * CALCULATE INTERACTIONS *
2380 **************************/
2382 if (gmx_mm_any_lt(rsq11,rcutoff2))
2385 r11 = _mm_mul_pd(rsq11,rinv11);
2387 /* EWALD ELECTROSTATICS */
2389 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2390 ewrt = _mm_mul_pd(r11,ewtabscale);
2391 ewitab = _mm_cvttpd_epi32(ewrt);
2392 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2393 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2394 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2395 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2397 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2401 fscal = _mm_and_pd(fscal,cutoff_mask);
2403 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2405 /* Calculate temporary vectorial force */
2406 tx = _mm_mul_pd(fscal,dx11);
2407 ty = _mm_mul_pd(fscal,dy11);
2408 tz = _mm_mul_pd(fscal,dz11);
2410 /* Update vectorial force */
2411 fix1 = _mm_add_pd(fix1,tx);
2412 fiy1 = _mm_add_pd(fiy1,ty);
2413 fiz1 = _mm_add_pd(fiz1,tz);
2415 fjx1 = _mm_add_pd(fjx1,tx);
2416 fjy1 = _mm_add_pd(fjy1,ty);
2417 fjz1 = _mm_add_pd(fjz1,tz);
2421 /**************************
2422 * CALCULATE INTERACTIONS *
2423 **************************/
2425 if (gmx_mm_any_lt(rsq12,rcutoff2))
2428 r12 = _mm_mul_pd(rsq12,rinv12);
2430 /* EWALD ELECTROSTATICS */
2432 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2433 ewrt = _mm_mul_pd(r12,ewtabscale);
2434 ewitab = _mm_cvttpd_epi32(ewrt);
2435 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2436 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2437 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2438 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2440 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2444 fscal = _mm_and_pd(fscal,cutoff_mask);
2446 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2448 /* Calculate temporary vectorial force */
2449 tx = _mm_mul_pd(fscal,dx12);
2450 ty = _mm_mul_pd(fscal,dy12);
2451 tz = _mm_mul_pd(fscal,dz12);
2453 /* Update vectorial force */
2454 fix1 = _mm_add_pd(fix1,tx);
2455 fiy1 = _mm_add_pd(fiy1,ty);
2456 fiz1 = _mm_add_pd(fiz1,tz);
2458 fjx2 = _mm_add_pd(fjx2,tx);
2459 fjy2 = _mm_add_pd(fjy2,ty);
2460 fjz2 = _mm_add_pd(fjz2,tz);
2464 /**************************
2465 * CALCULATE INTERACTIONS *
2466 **************************/
2468 if (gmx_mm_any_lt(rsq13,rcutoff2))
2471 r13 = _mm_mul_pd(rsq13,rinv13);
2473 /* EWALD ELECTROSTATICS */
2475 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2476 ewrt = _mm_mul_pd(r13,ewtabscale);
2477 ewitab = _mm_cvttpd_epi32(ewrt);
2478 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2479 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2480 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2481 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2483 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2487 fscal = _mm_and_pd(fscal,cutoff_mask);
2489 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2491 /* Calculate temporary vectorial force */
2492 tx = _mm_mul_pd(fscal,dx13);
2493 ty = _mm_mul_pd(fscal,dy13);
2494 tz = _mm_mul_pd(fscal,dz13);
2496 /* Update vectorial force */
2497 fix1 = _mm_add_pd(fix1,tx);
2498 fiy1 = _mm_add_pd(fiy1,ty);
2499 fiz1 = _mm_add_pd(fiz1,tz);
2501 fjx3 = _mm_add_pd(fjx3,tx);
2502 fjy3 = _mm_add_pd(fjy3,ty);
2503 fjz3 = _mm_add_pd(fjz3,tz);
2507 /**************************
2508 * CALCULATE INTERACTIONS *
2509 **************************/
2511 if (gmx_mm_any_lt(rsq21,rcutoff2))
2514 r21 = _mm_mul_pd(rsq21,rinv21);
2516 /* EWALD ELECTROSTATICS */
2518 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2519 ewrt = _mm_mul_pd(r21,ewtabscale);
2520 ewitab = _mm_cvttpd_epi32(ewrt);
2521 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2522 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2523 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2524 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2526 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2530 fscal = _mm_and_pd(fscal,cutoff_mask);
2532 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2534 /* Calculate temporary vectorial force */
2535 tx = _mm_mul_pd(fscal,dx21);
2536 ty = _mm_mul_pd(fscal,dy21);
2537 tz = _mm_mul_pd(fscal,dz21);
2539 /* Update vectorial force */
2540 fix2 = _mm_add_pd(fix2,tx);
2541 fiy2 = _mm_add_pd(fiy2,ty);
2542 fiz2 = _mm_add_pd(fiz2,tz);
2544 fjx1 = _mm_add_pd(fjx1,tx);
2545 fjy1 = _mm_add_pd(fjy1,ty);
2546 fjz1 = _mm_add_pd(fjz1,tz);
2550 /**************************
2551 * CALCULATE INTERACTIONS *
2552 **************************/
2554 if (gmx_mm_any_lt(rsq22,rcutoff2))
2557 r22 = _mm_mul_pd(rsq22,rinv22);
2559 /* EWALD ELECTROSTATICS */
2561 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2562 ewrt = _mm_mul_pd(r22,ewtabscale);
2563 ewitab = _mm_cvttpd_epi32(ewrt);
2564 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2565 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2566 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2567 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2569 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2573 fscal = _mm_and_pd(fscal,cutoff_mask);
2575 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2577 /* Calculate temporary vectorial force */
2578 tx = _mm_mul_pd(fscal,dx22);
2579 ty = _mm_mul_pd(fscal,dy22);
2580 tz = _mm_mul_pd(fscal,dz22);
2582 /* Update vectorial force */
2583 fix2 = _mm_add_pd(fix2,tx);
2584 fiy2 = _mm_add_pd(fiy2,ty);
2585 fiz2 = _mm_add_pd(fiz2,tz);
2587 fjx2 = _mm_add_pd(fjx2,tx);
2588 fjy2 = _mm_add_pd(fjy2,ty);
2589 fjz2 = _mm_add_pd(fjz2,tz);
2593 /**************************
2594 * CALCULATE INTERACTIONS *
2595 **************************/
2597 if (gmx_mm_any_lt(rsq23,rcutoff2))
2600 r23 = _mm_mul_pd(rsq23,rinv23);
2602 /* EWALD ELECTROSTATICS */
2604 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2605 ewrt = _mm_mul_pd(r23,ewtabscale);
2606 ewitab = _mm_cvttpd_epi32(ewrt);
2607 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2608 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2609 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2610 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2612 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2616 fscal = _mm_and_pd(fscal,cutoff_mask);
2618 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2620 /* Calculate temporary vectorial force */
2621 tx = _mm_mul_pd(fscal,dx23);
2622 ty = _mm_mul_pd(fscal,dy23);
2623 tz = _mm_mul_pd(fscal,dz23);
2625 /* Update vectorial force */
2626 fix2 = _mm_add_pd(fix2,tx);
2627 fiy2 = _mm_add_pd(fiy2,ty);
2628 fiz2 = _mm_add_pd(fiz2,tz);
2630 fjx3 = _mm_add_pd(fjx3,tx);
2631 fjy3 = _mm_add_pd(fjy3,ty);
2632 fjz3 = _mm_add_pd(fjz3,tz);
2636 /**************************
2637 * CALCULATE INTERACTIONS *
2638 **************************/
2640 if (gmx_mm_any_lt(rsq31,rcutoff2))
2643 r31 = _mm_mul_pd(rsq31,rinv31);
2645 /* EWALD ELECTROSTATICS */
2647 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2648 ewrt = _mm_mul_pd(r31,ewtabscale);
2649 ewitab = _mm_cvttpd_epi32(ewrt);
2650 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2651 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2652 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2653 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2655 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2659 fscal = _mm_and_pd(fscal,cutoff_mask);
2661 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2663 /* Calculate temporary vectorial force */
2664 tx = _mm_mul_pd(fscal,dx31);
2665 ty = _mm_mul_pd(fscal,dy31);
2666 tz = _mm_mul_pd(fscal,dz31);
2668 /* Update vectorial force */
2669 fix3 = _mm_add_pd(fix3,tx);
2670 fiy3 = _mm_add_pd(fiy3,ty);
2671 fiz3 = _mm_add_pd(fiz3,tz);
2673 fjx1 = _mm_add_pd(fjx1,tx);
2674 fjy1 = _mm_add_pd(fjy1,ty);
2675 fjz1 = _mm_add_pd(fjz1,tz);
2679 /**************************
2680 * CALCULATE INTERACTIONS *
2681 **************************/
2683 if (gmx_mm_any_lt(rsq32,rcutoff2))
2686 r32 = _mm_mul_pd(rsq32,rinv32);
2688 /* EWALD ELECTROSTATICS */
2690 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2691 ewrt = _mm_mul_pd(r32,ewtabscale);
2692 ewitab = _mm_cvttpd_epi32(ewrt);
2693 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2694 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2695 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2696 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2698 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2702 fscal = _mm_and_pd(fscal,cutoff_mask);
2704 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2706 /* Calculate temporary vectorial force */
2707 tx = _mm_mul_pd(fscal,dx32);
2708 ty = _mm_mul_pd(fscal,dy32);
2709 tz = _mm_mul_pd(fscal,dz32);
2711 /* Update vectorial force */
2712 fix3 = _mm_add_pd(fix3,tx);
2713 fiy3 = _mm_add_pd(fiy3,ty);
2714 fiz3 = _mm_add_pd(fiz3,tz);
2716 fjx2 = _mm_add_pd(fjx2,tx);
2717 fjy2 = _mm_add_pd(fjy2,ty);
2718 fjz2 = _mm_add_pd(fjz2,tz);
2722 /**************************
2723 * CALCULATE INTERACTIONS *
2724 **************************/
2726 if (gmx_mm_any_lt(rsq33,rcutoff2))
2729 r33 = _mm_mul_pd(rsq33,rinv33);
2731 /* EWALD ELECTROSTATICS */
2733 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2734 ewrt = _mm_mul_pd(r33,ewtabscale);
2735 ewitab = _mm_cvttpd_epi32(ewrt);
2736 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2737 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2738 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2739 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2741 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2745 fscal = _mm_and_pd(fscal,cutoff_mask);
2747 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2749 /* Calculate temporary vectorial force */
2750 tx = _mm_mul_pd(fscal,dx33);
2751 ty = _mm_mul_pd(fscal,dy33);
2752 tz = _mm_mul_pd(fscal,dz33);
2754 /* Update vectorial force */
2755 fix3 = _mm_add_pd(fix3,tx);
2756 fiy3 = _mm_add_pd(fiy3,ty);
2757 fiz3 = _mm_add_pd(fiz3,tz);
2759 fjx3 = _mm_add_pd(fjx3,tx);
2760 fjy3 = _mm_add_pd(fjy3,ty);
2761 fjz3 = _mm_add_pd(fjz3,tz);
2765 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2767 /* Inner loop uses 403 flops */
2770 /* End of innermost loop */
2772 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2773 f+i_coord_offset,fshift+i_shift_offset);
2775 /* Increment number of inner iterations */
2776 inneriter += j_index_end - j_index_start;
2778 /* Outer loop uses 24 flops */
2781 /* Increment number of outer iterations */
2784 /* Update outer/inner flops */
2786 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*403);