2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
48 #include "kernelutil_x86_avx_128_fma_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 int vdwjidx3A,vdwjidx3B;
95 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
96 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
109 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
113 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
115 __m128i ifour = _mm_set1_epi32(4);
116 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
119 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
121 __m128d dummy_mask,cutoff_mask;
122 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
123 __m128d one = _mm_set1_pd(1.0);
124 __m128d two = _mm_set1_pd(2.0);
130 jindex = nlist->jindex;
132 shiftidx = nlist->shift;
134 shiftvec = fr->shift_vec[0];
135 fshift = fr->fshift[0];
136 facel = _mm_set1_pd(fr->epsfac);
137 charge = mdatoms->chargeA;
138 nvdwtype = fr->ntype;
140 vdwtype = mdatoms->typeA;
142 vftab = kernel_data->table_vdw->data;
143 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
145 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
146 ewtab = fr->ic->tabq_coul_FDV0;
147 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
148 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
150 /* Setup water-specific parameters */
151 inr = nlist->iinr[0];
152 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
153 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
154 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
155 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
157 jq1 = _mm_set1_pd(charge[inr+1]);
158 jq2 = _mm_set1_pd(charge[inr+2]);
159 jq3 = _mm_set1_pd(charge[inr+3]);
160 vdwjidx0A = 2*vdwtype[inr+0];
161 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
162 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
163 qq11 = _mm_mul_pd(iq1,jq1);
164 qq12 = _mm_mul_pd(iq1,jq2);
165 qq13 = _mm_mul_pd(iq1,jq3);
166 qq21 = _mm_mul_pd(iq2,jq1);
167 qq22 = _mm_mul_pd(iq2,jq2);
168 qq23 = _mm_mul_pd(iq2,jq3);
169 qq31 = _mm_mul_pd(iq3,jq1);
170 qq32 = _mm_mul_pd(iq3,jq2);
171 qq33 = _mm_mul_pd(iq3,jq3);
173 /* Avoid stupid compiler warnings */
181 /* Start outer loop over neighborlists */
182 for(iidx=0; iidx<nri; iidx++)
184 /* Load shift vector for this list */
185 i_shift_offset = DIM*shiftidx[iidx];
187 /* Load limits for loop over neighbors */
188 j_index_start = jindex[iidx];
189 j_index_end = jindex[iidx+1];
191 /* Get outer coordinate index */
193 i_coord_offset = DIM*inr;
195 /* Load i particle coords and add shift vector */
196 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
197 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
199 fix0 = _mm_setzero_pd();
200 fiy0 = _mm_setzero_pd();
201 fiz0 = _mm_setzero_pd();
202 fix1 = _mm_setzero_pd();
203 fiy1 = _mm_setzero_pd();
204 fiz1 = _mm_setzero_pd();
205 fix2 = _mm_setzero_pd();
206 fiy2 = _mm_setzero_pd();
207 fiz2 = _mm_setzero_pd();
208 fix3 = _mm_setzero_pd();
209 fiy3 = _mm_setzero_pd();
210 fiz3 = _mm_setzero_pd();
212 /* Reset potential sums */
213 velecsum = _mm_setzero_pd();
214 vvdwsum = _mm_setzero_pd();
216 /* Start inner kernel loop */
217 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
220 /* Get j neighbor index, and coordinate index */
223 j_coord_offsetA = DIM*jnrA;
224 j_coord_offsetB = DIM*jnrB;
226 /* load j atom coordinates */
227 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
228 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
229 &jy2,&jz2,&jx3,&jy3,&jz3);
231 /* Calculate displacement vector */
232 dx00 = _mm_sub_pd(ix0,jx0);
233 dy00 = _mm_sub_pd(iy0,jy0);
234 dz00 = _mm_sub_pd(iz0,jz0);
235 dx11 = _mm_sub_pd(ix1,jx1);
236 dy11 = _mm_sub_pd(iy1,jy1);
237 dz11 = _mm_sub_pd(iz1,jz1);
238 dx12 = _mm_sub_pd(ix1,jx2);
239 dy12 = _mm_sub_pd(iy1,jy2);
240 dz12 = _mm_sub_pd(iz1,jz2);
241 dx13 = _mm_sub_pd(ix1,jx3);
242 dy13 = _mm_sub_pd(iy1,jy3);
243 dz13 = _mm_sub_pd(iz1,jz3);
244 dx21 = _mm_sub_pd(ix2,jx1);
245 dy21 = _mm_sub_pd(iy2,jy1);
246 dz21 = _mm_sub_pd(iz2,jz1);
247 dx22 = _mm_sub_pd(ix2,jx2);
248 dy22 = _mm_sub_pd(iy2,jy2);
249 dz22 = _mm_sub_pd(iz2,jz2);
250 dx23 = _mm_sub_pd(ix2,jx3);
251 dy23 = _mm_sub_pd(iy2,jy3);
252 dz23 = _mm_sub_pd(iz2,jz3);
253 dx31 = _mm_sub_pd(ix3,jx1);
254 dy31 = _mm_sub_pd(iy3,jy1);
255 dz31 = _mm_sub_pd(iz3,jz1);
256 dx32 = _mm_sub_pd(ix3,jx2);
257 dy32 = _mm_sub_pd(iy3,jy2);
258 dz32 = _mm_sub_pd(iz3,jz2);
259 dx33 = _mm_sub_pd(ix3,jx3);
260 dy33 = _mm_sub_pd(iy3,jy3);
261 dz33 = _mm_sub_pd(iz3,jz3);
263 /* Calculate squared distance and things based on it */
264 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
265 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
266 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
267 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
268 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
269 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
270 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
271 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
272 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
273 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
275 rinv00 = gmx_mm_invsqrt_pd(rsq00);
276 rinv11 = gmx_mm_invsqrt_pd(rsq11);
277 rinv12 = gmx_mm_invsqrt_pd(rsq12);
278 rinv13 = gmx_mm_invsqrt_pd(rsq13);
279 rinv21 = gmx_mm_invsqrt_pd(rsq21);
280 rinv22 = gmx_mm_invsqrt_pd(rsq22);
281 rinv23 = gmx_mm_invsqrt_pd(rsq23);
282 rinv31 = gmx_mm_invsqrt_pd(rsq31);
283 rinv32 = gmx_mm_invsqrt_pd(rsq32);
284 rinv33 = gmx_mm_invsqrt_pd(rsq33);
286 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
287 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
288 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
289 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
290 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
291 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
292 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
293 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
294 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
296 fjx0 = _mm_setzero_pd();
297 fjy0 = _mm_setzero_pd();
298 fjz0 = _mm_setzero_pd();
299 fjx1 = _mm_setzero_pd();
300 fjy1 = _mm_setzero_pd();
301 fjz1 = _mm_setzero_pd();
302 fjx2 = _mm_setzero_pd();
303 fjy2 = _mm_setzero_pd();
304 fjz2 = _mm_setzero_pd();
305 fjx3 = _mm_setzero_pd();
306 fjy3 = _mm_setzero_pd();
307 fjz3 = _mm_setzero_pd();
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 r00 = _mm_mul_pd(rsq00,rinv00);
315 /* Calculate table index by multiplying r with table scale and truncate to integer */
316 rt = _mm_mul_pd(r00,vftabscale);
317 vfitab = _mm_cvttpd_epi32(rt);
319 vfeps = _mm_frcz_pd(rt);
321 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
323 twovfeps = _mm_add_pd(vfeps,vfeps);
324 vfitab = _mm_slli_epi32(vfitab,3);
326 /* CUBIC SPLINE TABLE DISPERSION */
327 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
328 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
329 GMX_MM_TRANSPOSE2_PD(Y,F);
330 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
331 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
332 GMX_MM_TRANSPOSE2_PD(G,H);
333 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
334 VV = _mm_macc_pd(vfeps,Fp,Y);
335 vvdw6 = _mm_mul_pd(c6_00,VV);
336 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
337 fvdw6 = _mm_mul_pd(c6_00,FF);
339 /* CUBIC SPLINE TABLE REPULSION */
340 vfitab = _mm_add_epi32(vfitab,ifour);
341 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
342 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
343 GMX_MM_TRANSPOSE2_PD(Y,F);
344 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
345 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
346 GMX_MM_TRANSPOSE2_PD(G,H);
347 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
348 VV = _mm_macc_pd(vfeps,Fp,Y);
349 vvdw12 = _mm_mul_pd(c12_00,VV);
350 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
351 fvdw12 = _mm_mul_pd(c12_00,FF);
352 vvdw = _mm_add_pd(vvdw12,vvdw6);
353 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
355 /* Update potential sum for this i atom from the interaction with this j atom. */
356 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
360 /* Update vectorial force */
361 fix0 = _mm_macc_pd(dx00,fscal,fix0);
362 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
363 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
365 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
366 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
367 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
369 /**************************
370 * CALCULATE INTERACTIONS *
371 **************************/
373 r11 = _mm_mul_pd(rsq11,rinv11);
375 /* EWALD ELECTROSTATICS */
377 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
378 ewrt = _mm_mul_pd(r11,ewtabscale);
379 ewitab = _mm_cvttpd_epi32(ewrt);
381 eweps = _mm_frcz_pd(ewrt);
383 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
385 twoeweps = _mm_add_pd(eweps,eweps);
386 ewitab = _mm_slli_epi32(ewitab,2);
387 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
388 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
389 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
390 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
391 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
392 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
393 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
394 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
395 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
396 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
398 /* Update potential sum for this i atom from the interaction with this j atom. */
399 velecsum = _mm_add_pd(velecsum,velec);
403 /* Update vectorial force */
404 fix1 = _mm_macc_pd(dx11,fscal,fix1);
405 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
406 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
408 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
409 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
410 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
412 /**************************
413 * CALCULATE INTERACTIONS *
414 **************************/
416 r12 = _mm_mul_pd(rsq12,rinv12);
418 /* EWALD ELECTROSTATICS */
420 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
421 ewrt = _mm_mul_pd(r12,ewtabscale);
422 ewitab = _mm_cvttpd_epi32(ewrt);
424 eweps = _mm_frcz_pd(ewrt);
426 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
428 twoeweps = _mm_add_pd(eweps,eweps);
429 ewitab = _mm_slli_epi32(ewitab,2);
430 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
431 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
432 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
433 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
434 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
435 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
436 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
437 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
438 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
439 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
441 /* Update potential sum for this i atom from the interaction with this j atom. */
442 velecsum = _mm_add_pd(velecsum,velec);
446 /* Update vectorial force */
447 fix1 = _mm_macc_pd(dx12,fscal,fix1);
448 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
449 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
451 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
452 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
453 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
455 /**************************
456 * CALCULATE INTERACTIONS *
457 **************************/
459 r13 = _mm_mul_pd(rsq13,rinv13);
461 /* EWALD ELECTROSTATICS */
463 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
464 ewrt = _mm_mul_pd(r13,ewtabscale);
465 ewitab = _mm_cvttpd_epi32(ewrt);
467 eweps = _mm_frcz_pd(ewrt);
469 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
471 twoeweps = _mm_add_pd(eweps,eweps);
472 ewitab = _mm_slli_epi32(ewitab,2);
473 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
474 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
475 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
476 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
477 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
478 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
479 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
480 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
481 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
482 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
484 /* Update potential sum for this i atom from the interaction with this j atom. */
485 velecsum = _mm_add_pd(velecsum,velec);
489 /* Update vectorial force */
490 fix1 = _mm_macc_pd(dx13,fscal,fix1);
491 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
492 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
494 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
495 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
496 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
498 /**************************
499 * CALCULATE INTERACTIONS *
500 **************************/
502 r21 = _mm_mul_pd(rsq21,rinv21);
504 /* EWALD ELECTROSTATICS */
506 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
507 ewrt = _mm_mul_pd(r21,ewtabscale);
508 ewitab = _mm_cvttpd_epi32(ewrt);
510 eweps = _mm_frcz_pd(ewrt);
512 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
514 twoeweps = _mm_add_pd(eweps,eweps);
515 ewitab = _mm_slli_epi32(ewitab,2);
516 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
517 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
518 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
519 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
520 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
521 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
522 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
523 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
524 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
525 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
527 /* Update potential sum for this i atom from the interaction with this j atom. */
528 velecsum = _mm_add_pd(velecsum,velec);
532 /* Update vectorial force */
533 fix2 = _mm_macc_pd(dx21,fscal,fix2);
534 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
535 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
537 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
538 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
539 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
541 /**************************
542 * CALCULATE INTERACTIONS *
543 **************************/
545 r22 = _mm_mul_pd(rsq22,rinv22);
547 /* EWALD ELECTROSTATICS */
549 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
550 ewrt = _mm_mul_pd(r22,ewtabscale);
551 ewitab = _mm_cvttpd_epi32(ewrt);
553 eweps = _mm_frcz_pd(ewrt);
555 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
557 twoeweps = _mm_add_pd(eweps,eweps);
558 ewitab = _mm_slli_epi32(ewitab,2);
559 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
560 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
561 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
562 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
563 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
564 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
565 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
566 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
567 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
568 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
570 /* Update potential sum for this i atom from the interaction with this j atom. */
571 velecsum = _mm_add_pd(velecsum,velec);
575 /* Update vectorial force */
576 fix2 = _mm_macc_pd(dx22,fscal,fix2);
577 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
578 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
580 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
581 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
582 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
584 /**************************
585 * CALCULATE INTERACTIONS *
586 **************************/
588 r23 = _mm_mul_pd(rsq23,rinv23);
590 /* EWALD ELECTROSTATICS */
592 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
593 ewrt = _mm_mul_pd(r23,ewtabscale);
594 ewitab = _mm_cvttpd_epi32(ewrt);
596 eweps = _mm_frcz_pd(ewrt);
598 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
600 twoeweps = _mm_add_pd(eweps,eweps);
601 ewitab = _mm_slli_epi32(ewitab,2);
602 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
603 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
604 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
605 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
606 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
607 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
608 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
609 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
610 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
611 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
613 /* Update potential sum for this i atom from the interaction with this j atom. */
614 velecsum = _mm_add_pd(velecsum,velec);
618 /* Update vectorial force */
619 fix2 = _mm_macc_pd(dx23,fscal,fix2);
620 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
621 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
623 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
624 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
625 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
627 /**************************
628 * CALCULATE INTERACTIONS *
629 **************************/
631 r31 = _mm_mul_pd(rsq31,rinv31);
633 /* EWALD ELECTROSTATICS */
635 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
636 ewrt = _mm_mul_pd(r31,ewtabscale);
637 ewitab = _mm_cvttpd_epi32(ewrt);
639 eweps = _mm_frcz_pd(ewrt);
641 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
643 twoeweps = _mm_add_pd(eweps,eweps);
644 ewitab = _mm_slli_epi32(ewitab,2);
645 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
646 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
647 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
648 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
649 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
650 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
651 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
652 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
653 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
654 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
656 /* Update potential sum for this i atom from the interaction with this j atom. */
657 velecsum = _mm_add_pd(velecsum,velec);
661 /* Update vectorial force */
662 fix3 = _mm_macc_pd(dx31,fscal,fix3);
663 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
664 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
666 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
667 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
668 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
670 /**************************
671 * CALCULATE INTERACTIONS *
672 **************************/
674 r32 = _mm_mul_pd(rsq32,rinv32);
676 /* EWALD ELECTROSTATICS */
678 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
679 ewrt = _mm_mul_pd(r32,ewtabscale);
680 ewitab = _mm_cvttpd_epi32(ewrt);
682 eweps = _mm_frcz_pd(ewrt);
684 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
686 twoeweps = _mm_add_pd(eweps,eweps);
687 ewitab = _mm_slli_epi32(ewitab,2);
688 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
689 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
690 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
691 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
692 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
693 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
694 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
695 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
696 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
697 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
699 /* Update potential sum for this i atom from the interaction with this j atom. */
700 velecsum = _mm_add_pd(velecsum,velec);
704 /* Update vectorial force */
705 fix3 = _mm_macc_pd(dx32,fscal,fix3);
706 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
707 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
709 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
710 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
711 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
713 /**************************
714 * CALCULATE INTERACTIONS *
715 **************************/
717 r33 = _mm_mul_pd(rsq33,rinv33);
719 /* EWALD ELECTROSTATICS */
721 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
722 ewrt = _mm_mul_pd(r33,ewtabscale);
723 ewitab = _mm_cvttpd_epi32(ewrt);
725 eweps = _mm_frcz_pd(ewrt);
727 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
729 twoeweps = _mm_add_pd(eweps,eweps);
730 ewitab = _mm_slli_epi32(ewitab,2);
731 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
732 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
733 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
734 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
735 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
736 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
737 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
738 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
739 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
740 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
742 /* Update potential sum for this i atom from the interaction with this j atom. */
743 velecsum = _mm_add_pd(velecsum,velec);
747 /* Update vectorial force */
748 fix3 = _mm_macc_pd(dx33,fscal,fix3);
749 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
750 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
752 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
753 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
754 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
756 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
758 /* Inner loop uses 458 flops */
765 j_coord_offsetA = DIM*jnrA;
767 /* load j atom coordinates */
768 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
769 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
770 &jy2,&jz2,&jx3,&jy3,&jz3);
772 /* Calculate displacement vector */
773 dx00 = _mm_sub_pd(ix0,jx0);
774 dy00 = _mm_sub_pd(iy0,jy0);
775 dz00 = _mm_sub_pd(iz0,jz0);
776 dx11 = _mm_sub_pd(ix1,jx1);
777 dy11 = _mm_sub_pd(iy1,jy1);
778 dz11 = _mm_sub_pd(iz1,jz1);
779 dx12 = _mm_sub_pd(ix1,jx2);
780 dy12 = _mm_sub_pd(iy1,jy2);
781 dz12 = _mm_sub_pd(iz1,jz2);
782 dx13 = _mm_sub_pd(ix1,jx3);
783 dy13 = _mm_sub_pd(iy1,jy3);
784 dz13 = _mm_sub_pd(iz1,jz3);
785 dx21 = _mm_sub_pd(ix2,jx1);
786 dy21 = _mm_sub_pd(iy2,jy1);
787 dz21 = _mm_sub_pd(iz2,jz1);
788 dx22 = _mm_sub_pd(ix2,jx2);
789 dy22 = _mm_sub_pd(iy2,jy2);
790 dz22 = _mm_sub_pd(iz2,jz2);
791 dx23 = _mm_sub_pd(ix2,jx3);
792 dy23 = _mm_sub_pd(iy2,jy3);
793 dz23 = _mm_sub_pd(iz2,jz3);
794 dx31 = _mm_sub_pd(ix3,jx1);
795 dy31 = _mm_sub_pd(iy3,jy1);
796 dz31 = _mm_sub_pd(iz3,jz1);
797 dx32 = _mm_sub_pd(ix3,jx2);
798 dy32 = _mm_sub_pd(iy3,jy2);
799 dz32 = _mm_sub_pd(iz3,jz2);
800 dx33 = _mm_sub_pd(ix3,jx3);
801 dy33 = _mm_sub_pd(iy3,jy3);
802 dz33 = _mm_sub_pd(iz3,jz3);
804 /* Calculate squared distance and things based on it */
805 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
806 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
807 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
808 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
809 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
810 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
811 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
812 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
813 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
814 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
816 rinv00 = gmx_mm_invsqrt_pd(rsq00);
817 rinv11 = gmx_mm_invsqrt_pd(rsq11);
818 rinv12 = gmx_mm_invsqrt_pd(rsq12);
819 rinv13 = gmx_mm_invsqrt_pd(rsq13);
820 rinv21 = gmx_mm_invsqrt_pd(rsq21);
821 rinv22 = gmx_mm_invsqrt_pd(rsq22);
822 rinv23 = gmx_mm_invsqrt_pd(rsq23);
823 rinv31 = gmx_mm_invsqrt_pd(rsq31);
824 rinv32 = gmx_mm_invsqrt_pd(rsq32);
825 rinv33 = gmx_mm_invsqrt_pd(rsq33);
827 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
828 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
829 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
830 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
831 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
832 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
833 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
834 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
835 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
837 fjx0 = _mm_setzero_pd();
838 fjy0 = _mm_setzero_pd();
839 fjz0 = _mm_setzero_pd();
840 fjx1 = _mm_setzero_pd();
841 fjy1 = _mm_setzero_pd();
842 fjz1 = _mm_setzero_pd();
843 fjx2 = _mm_setzero_pd();
844 fjy2 = _mm_setzero_pd();
845 fjz2 = _mm_setzero_pd();
846 fjx3 = _mm_setzero_pd();
847 fjy3 = _mm_setzero_pd();
848 fjz3 = _mm_setzero_pd();
850 /**************************
851 * CALCULATE INTERACTIONS *
852 **************************/
854 r00 = _mm_mul_pd(rsq00,rinv00);
856 /* Calculate table index by multiplying r with table scale and truncate to integer */
857 rt = _mm_mul_pd(r00,vftabscale);
858 vfitab = _mm_cvttpd_epi32(rt);
860 vfeps = _mm_frcz_pd(rt);
862 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
864 twovfeps = _mm_add_pd(vfeps,vfeps);
865 vfitab = _mm_slli_epi32(vfitab,3);
867 /* CUBIC SPLINE TABLE DISPERSION */
868 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
869 F = _mm_setzero_pd();
870 GMX_MM_TRANSPOSE2_PD(Y,F);
871 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
872 H = _mm_setzero_pd();
873 GMX_MM_TRANSPOSE2_PD(G,H);
874 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
875 VV = _mm_macc_pd(vfeps,Fp,Y);
876 vvdw6 = _mm_mul_pd(c6_00,VV);
877 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
878 fvdw6 = _mm_mul_pd(c6_00,FF);
880 /* CUBIC SPLINE TABLE REPULSION */
881 vfitab = _mm_add_epi32(vfitab,ifour);
882 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
883 F = _mm_setzero_pd();
884 GMX_MM_TRANSPOSE2_PD(Y,F);
885 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
886 H = _mm_setzero_pd();
887 GMX_MM_TRANSPOSE2_PD(G,H);
888 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
889 VV = _mm_macc_pd(vfeps,Fp,Y);
890 vvdw12 = _mm_mul_pd(c12_00,VV);
891 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
892 fvdw12 = _mm_mul_pd(c12_00,FF);
893 vvdw = _mm_add_pd(vvdw12,vvdw6);
894 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
896 /* Update potential sum for this i atom from the interaction with this j atom. */
897 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
898 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
902 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
904 /* Update vectorial force */
905 fix0 = _mm_macc_pd(dx00,fscal,fix0);
906 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
907 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
909 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
910 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
911 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
913 /**************************
914 * CALCULATE INTERACTIONS *
915 **************************/
917 r11 = _mm_mul_pd(rsq11,rinv11);
919 /* EWALD ELECTROSTATICS */
921 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
922 ewrt = _mm_mul_pd(r11,ewtabscale);
923 ewitab = _mm_cvttpd_epi32(ewrt);
925 eweps = _mm_frcz_pd(ewrt);
927 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
929 twoeweps = _mm_add_pd(eweps,eweps);
930 ewitab = _mm_slli_epi32(ewitab,2);
931 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
932 ewtabD = _mm_setzero_pd();
933 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
934 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
935 ewtabFn = _mm_setzero_pd();
936 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
937 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
938 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
939 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
940 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
942 /* Update potential sum for this i atom from the interaction with this j atom. */
943 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
944 velecsum = _mm_add_pd(velecsum,velec);
948 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
950 /* Update vectorial force */
951 fix1 = _mm_macc_pd(dx11,fscal,fix1);
952 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
953 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
955 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
956 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
957 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
959 /**************************
960 * CALCULATE INTERACTIONS *
961 **************************/
963 r12 = _mm_mul_pd(rsq12,rinv12);
965 /* EWALD ELECTROSTATICS */
967 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
968 ewrt = _mm_mul_pd(r12,ewtabscale);
969 ewitab = _mm_cvttpd_epi32(ewrt);
971 eweps = _mm_frcz_pd(ewrt);
973 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
975 twoeweps = _mm_add_pd(eweps,eweps);
976 ewitab = _mm_slli_epi32(ewitab,2);
977 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
978 ewtabD = _mm_setzero_pd();
979 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
980 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
981 ewtabFn = _mm_setzero_pd();
982 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
983 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
984 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
985 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
986 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
988 /* Update potential sum for this i atom from the interaction with this j atom. */
989 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
990 velecsum = _mm_add_pd(velecsum,velec);
994 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
996 /* Update vectorial force */
997 fix1 = _mm_macc_pd(dx12,fscal,fix1);
998 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
999 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1001 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1002 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1003 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1005 /**************************
1006 * CALCULATE INTERACTIONS *
1007 **************************/
1009 r13 = _mm_mul_pd(rsq13,rinv13);
1011 /* EWALD ELECTROSTATICS */
1013 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1014 ewrt = _mm_mul_pd(r13,ewtabscale);
1015 ewitab = _mm_cvttpd_epi32(ewrt);
1017 eweps = _mm_frcz_pd(ewrt);
1019 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1021 twoeweps = _mm_add_pd(eweps,eweps);
1022 ewitab = _mm_slli_epi32(ewitab,2);
1023 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1024 ewtabD = _mm_setzero_pd();
1025 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1026 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1027 ewtabFn = _mm_setzero_pd();
1028 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1029 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1030 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1031 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1032 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1034 /* Update potential sum for this i atom from the interaction with this j atom. */
1035 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1036 velecsum = _mm_add_pd(velecsum,velec);
1040 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1042 /* Update vectorial force */
1043 fix1 = _mm_macc_pd(dx13,fscal,fix1);
1044 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
1045 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
1047 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
1048 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
1049 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
1051 /**************************
1052 * CALCULATE INTERACTIONS *
1053 **************************/
1055 r21 = _mm_mul_pd(rsq21,rinv21);
1057 /* EWALD ELECTROSTATICS */
1059 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1060 ewrt = _mm_mul_pd(r21,ewtabscale);
1061 ewitab = _mm_cvttpd_epi32(ewrt);
1063 eweps = _mm_frcz_pd(ewrt);
1065 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1067 twoeweps = _mm_add_pd(eweps,eweps);
1068 ewitab = _mm_slli_epi32(ewitab,2);
1069 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1070 ewtabD = _mm_setzero_pd();
1071 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1072 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1073 ewtabFn = _mm_setzero_pd();
1074 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1075 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1076 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1077 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1078 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1080 /* Update potential sum for this i atom from the interaction with this j atom. */
1081 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1082 velecsum = _mm_add_pd(velecsum,velec);
1086 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1088 /* Update vectorial force */
1089 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1090 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1091 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1093 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1094 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1095 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1097 /**************************
1098 * CALCULATE INTERACTIONS *
1099 **************************/
1101 r22 = _mm_mul_pd(rsq22,rinv22);
1103 /* EWALD ELECTROSTATICS */
1105 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1106 ewrt = _mm_mul_pd(r22,ewtabscale);
1107 ewitab = _mm_cvttpd_epi32(ewrt);
1109 eweps = _mm_frcz_pd(ewrt);
1111 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1113 twoeweps = _mm_add_pd(eweps,eweps);
1114 ewitab = _mm_slli_epi32(ewitab,2);
1115 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1116 ewtabD = _mm_setzero_pd();
1117 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1118 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1119 ewtabFn = _mm_setzero_pd();
1120 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1121 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1122 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1123 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1124 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1126 /* Update potential sum for this i atom from the interaction with this j atom. */
1127 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1128 velecsum = _mm_add_pd(velecsum,velec);
1132 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1134 /* Update vectorial force */
1135 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1136 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1137 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1139 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1140 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1141 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1143 /**************************
1144 * CALCULATE INTERACTIONS *
1145 **************************/
1147 r23 = _mm_mul_pd(rsq23,rinv23);
1149 /* EWALD ELECTROSTATICS */
1151 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1152 ewrt = _mm_mul_pd(r23,ewtabscale);
1153 ewitab = _mm_cvttpd_epi32(ewrt);
1155 eweps = _mm_frcz_pd(ewrt);
1157 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1159 twoeweps = _mm_add_pd(eweps,eweps);
1160 ewitab = _mm_slli_epi32(ewitab,2);
1161 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1162 ewtabD = _mm_setzero_pd();
1163 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1164 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1165 ewtabFn = _mm_setzero_pd();
1166 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1167 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1168 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1169 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1170 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1172 /* Update potential sum for this i atom from the interaction with this j atom. */
1173 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1174 velecsum = _mm_add_pd(velecsum,velec);
1178 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1180 /* Update vectorial force */
1181 fix2 = _mm_macc_pd(dx23,fscal,fix2);
1182 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
1183 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
1185 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
1186 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
1187 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
1189 /**************************
1190 * CALCULATE INTERACTIONS *
1191 **************************/
1193 r31 = _mm_mul_pd(rsq31,rinv31);
1195 /* EWALD ELECTROSTATICS */
1197 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1198 ewrt = _mm_mul_pd(r31,ewtabscale);
1199 ewitab = _mm_cvttpd_epi32(ewrt);
1201 eweps = _mm_frcz_pd(ewrt);
1203 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1205 twoeweps = _mm_add_pd(eweps,eweps);
1206 ewitab = _mm_slli_epi32(ewitab,2);
1207 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1208 ewtabD = _mm_setzero_pd();
1209 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1210 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1211 ewtabFn = _mm_setzero_pd();
1212 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1213 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1214 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1215 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1216 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1218 /* Update potential sum for this i atom from the interaction with this j atom. */
1219 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1220 velecsum = _mm_add_pd(velecsum,velec);
1224 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1226 /* Update vectorial force */
1227 fix3 = _mm_macc_pd(dx31,fscal,fix3);
1228 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
1229 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
1231 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
1232 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
1233 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
1235 /**************************
1236 * CALCULATE INTERACTIONS *
1237 **************************/
1239 r32 = _mm_mul_pd(rsq32,rinv32);
1241 /* EWALD ELECTROSTATICS */
1243 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1244 ewrt = _mm_mul_pd(r32,ewtabscale);
1245 ewitab = _mm_cvttpd_epi32(ewrt);
1247 eweps = _mm_frcz_pd(ewrt);
1249 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1251 twoeweps = _mm_add_pd(eweps,eweps);
1252 ewitab = _mm_slli_epi32(ewitab,2);
1253 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1254 ewtabD = _mm_setzero_pd();
1255 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1256 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1257 ewtabFn = _mm_setzero_pd();
1258 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1259 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1260 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1261 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1262 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1264 /* Update potential sum for this i atom from the interaction with this j atom. */
1265 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1266 velecsum = _mm_add_pd(velecsum,velec);
1270 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1272 /* Update vectorial force */
1273 fix3 = _mm_macc_pd(dx32,fscal,fix3);
1274 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
1275 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
1277 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
1278 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
1279 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
1281 /**************************
1282 * CALCULATE INTERACTIONS *
1283 **************************/
1285 r33 = _mm_mul_pd(rsq33,rinv33);
1287 /* EWALD ELECTROSTATICS */
1289 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1290 ewrt = _mm_mul_pd(r33,ewtabscale);
1291 ewitab = _mm_cvttpd_epi32(ewrt);
1293 eweps = _mm_frcz_pd(ewrt);
1295 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1297 twoeweps = _mm_add_pd(eweps,eweps);
1298 ewitab = _mm_slli_epi32(ewitab,2);
1299 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1300 ewtabD = _mm_setzero_pd();
1301 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1302 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1303 ewtabFn = _mm_setzero_pd();
1304 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1305 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1306 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1307 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1308 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1310 /* Update potential sum for this i atom from the interaction with this j atom. */
1311 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1312 velecsum = _mm_add_pd(velecsum,velec);
1316 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1318 /* Update vectorial force */
1319 fix3 = _mm_macc_pd(dx33,fscal,fix3);
1320 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
1321 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
1323 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
1324 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
1325 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
1327 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1329 /* Inner loop uses 458 flops */
1332 /* End of innermost loop */
1334 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1335 f+i_coord_offset,fshift+i_shift_offset);
1338 /* Update potential energies */
1339 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1340 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1342 /* Increment number of inner iterations */
1343 inneriter += j_index_end - j_index_start;
1345 /* Outer loop uses 26 flops */
1348 /* Increment number of outer iterations */
1351 /* Update outer/inner flops */
1353 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*458);
1356 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_double
1357 * Electrostatics interaction: Ewald
1358 * VdW interaction: CubicSplineTable
1359 * Geometry: Water4-Water4
1360 * Calculate force/pot: Force
1363 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_double
1364 (t_nblist * gmx_restrict nlist,
1365 rvec * gmx_restrict xx,
1366 rvec * gmx_restrict ff,
1367 t_forcerec * gmx_restrict fr,
1368 t_mdatoms * gmx_restrict mdatoms,
1369 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1370 t_nrnb * gmx_restrict nrnb)
1372 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1373 * just 0 for non-waters.
1374 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1375 * jnr indices corresponding to data put in the four positions in the SIMD register.
1377 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1378 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1380 int j_coord_offsetA,j_coord_offsetB;
1381 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1382 real rcutoff_scalar;
1383 real *shiftvec,*fshift,*x,*f;
1384 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1386 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1388 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1390 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1392 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1393 int vdwjidx0A,vdwjidx0B;
1394 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1395 int vdwjidx1A,vdwjidx1B;
1396 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1397 int vdwjidx2A,vdwjidx2B;
1398 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1399 int vdwjidx3A,vdwjidx3B;
1400 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1401 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1402 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1403 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1404 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1405 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1406 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1407 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1408 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1409 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1410 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1411 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1414 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1417 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1418 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1420 __m128i ifour = _mm_set1_epi32(4);
1421 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1424 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1426 __m128d dummy_mask,cutoff_mask;
1427 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1428 __m128d one = _mm_set1_pd(1.0);
1429 __m128d two = _mm_set1_pd(2.0);
1435 jindex = nlist->jindex;
1437 shiftidx = nlist->shift;
1439 shiftvec = fr->shift_vec[0];
1440 fshift = fr->fshift[0];
1441 facel = _mm_set1_pd(fr->epsfac);
1442 charge = mdatoms->chargeA;
1443 nvdwtype = fr->ntype;
1444 vdwparam = fr->nbfp;
1445 vdwtype = mdatoms->typeA;
1447 vftab = kernel_data->table_vdw->data;
1448 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1450 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1451 ewtab = fr->ic->tabq_coul_F;
1452 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1453 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1455 /* Setup water-specific parameters */
1456 inr = nlist->iinr[0];
1457 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1458 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1459 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1460 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1462 jq1 = _mm_set1_pd(charge[inr+1]);
1463 jq2 = _mm_set1_pd(charge[inr+2]);
1464 jq3 = _mm_set1_pd(charge[inr+3]);
1465 vdwjidx0A = 2*vdwtype[inr+0];
1466 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1467 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1468 qq11 = _mm_mul_pd(iq1,jq1);
1469 qq12 = _mm_mul_pd(iq1,jq2);
1470 qq13 = _mm_mul_pd(iq1,jq3);
1471 qq21 = _mm_mul_pd(iq2,jq1);
1472 qq22 = _mm_mul_pd(iq2,jq2);
1473 qq23 = _mm_mul_pd(iq2,jq3);
1474 qq31 = _mm_mul_pd(iq3,jq1);
1475 qq32 = _mm_mul_pd(iq3,jq2);
1476 qq33 = _mm_mul_pd(iq3,jq3);
1478 /* Avoid stupid compiler warnings */
1480 j_coord_offsetA = 0;
1481 j_coord_offsetB = 0;
1486 /* Start outer loop over neighborlists */
1487 for(iidx=0; iidx<nri; iidx++)
1489 /* Load shift vector for this list */
1490 i_shift_offset = DIM*shiftidx[iidx];
1492 /* Load limits for loop over neighbors */
1493 j_index_start = jindex[iidx];
1494 j_index_end = jindex[iidx+1];
1496 /* Get outer coordinate index */
1498 i_coord_offset = DIM*inr;
1500 /* Load i particle coords and add shift vector */
1501 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1502 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1504 fix0 = _mm_setzero_pd();
1505 fiy0 = _mm_setzero_pd();
1506 fiz0 = _mm_setzero_pd();
1507 fix1 = _mm_setzero_pd();
1508 fiy1 = _mm_setzero_pd();
1509 fiz1 = _mm_setzero_pd();
1510 fix2 = _mm_setzero_pd();
1511 fiy2 = _mm_setzero_pd();
1512 fiz2 = _mm_setzero_pd();
1513 fix3 = _mm_setzero_pd();
1514 fiy3 = _mm_setzero_pd();
1515 fiz3 = _mm_setzero_pd();
1517 /* Start inner kernel loop */
1518 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1521 /* Get j neighbor index, and coordinate index */
1523 jnrB = jjnr[jidx+1];
1524 j_coord_offsetA = DIM*jnrA;
1525 j_coord_offsetB = DIM*jnrB;
1527 /* load j atom coordinates */
1528 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1529 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1530 &jy2,&jz2,&jx3,&jy3,&jz3);
1532 /* Calculate displacement vector */
1533 dx00 = _mm_sub_pd(ix0,jx0);
1534 dy00 = _mm_sub_pd(iy0,jy0);
1535 dz00 = _mm_sub_pd(iz0,jz0);
1536 dx11 = _mm_sub_pd(ix1,jx1);
1537 dy11 = _mm_sub_pd(iy1,jy1);
1538 dz11 = _mm_sub_pd(iz1,jz1);
1539 dx12 = _mm_sub_pd(ix1,jx2);
1540 dy12 = _mm_sub_pd(iy1,jy2);
1541 dz12 = _mm_sub_pd(iz1,jz2);
1542 dx13 = _mm_sub_pd(ix1,jx3);
1543 dy13 = _mm_sub_pd(iy1,jy3);
1544 dz13 = _mm_sub_pd(iz1,jz3);
1545 dx21 = _mm_sub_pd(ix2,jx1);
1546 dy21 = _mm_sub_pd(iy2,jy1);
1547 dz21 = _mm_sub_pd(iz2,jz1);
1548 dx22 = _mm_sub_pd(ix2,jx2);
1549 dy22 = _mm_sub_pd(iy2,jy2);
1550 dz22 = _mm_sub_pd(iz2,jz2);
1551 dx23 = _mm_sub_pd(ix2,jx3);
1552 dy23 = _mm_sub_pd(iy2,jy3);
1553 dz23 = _mm_sub_pd(iz2,jz3);
1554 dx31 = _mm_sub_pd(ix3,jx1);
1555 dy31 = _mm_sub_pd(iy3,jy1);
1556 dz31 = _mm_sub_pd(iz3,jz1);
1557 dx32 = _mm_sub_pd(ix3,jx2);
1558 dy32 = _mm_sub_pd(iy3,jy2);
1559 dz32 = _mm_sub_pd(iz3,jz2);
1560 dx33 = _mm_sub_pd(ix3,jx3);
1561 dy33 = _mm_sub_pd(iy3,jy3);
1562 dz33 = _mm_sub_pd(iz3,jz3);
1564 /* Calculate squared distance and things based on it */
1565 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1566 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1567 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1568 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1569 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1570 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1571 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1572 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1573 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1574 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1576 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1577 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1578 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1579 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1580 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1581 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1582 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1583 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1584 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1585 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1587 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1588 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1589 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1590 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1591 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1592 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1593 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1594 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1595 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1597 fjx0 = _mm_setzero_pd();
1598 fjy0 = _mm_setzero_pd();
1599 fjz0 = _mm_setzero_pd();
1600 fjx1 = _mm_setzero_pd();
1601 fjy1 = _mm_setzero_pd();
1602 fjz1 = _mm_setzero_pd();
1603 fjx2 = _mm_setzero_pd();
1604 fjy2 = _mm_setzero_pd();
1605 fjz2 = _mm_setzero_pd();
1606 fjx3 = _mm_setzero_pd();
1607 fjy3 = _mm_setzero_pd();
1608 fjz3 = _mm_setzero_pd();
1610 /**************************
1611 * CALCULATE INTERACTIONS *
1612 **************************/
1614 r00 = _mm_mul_pd(rsq00,rinv00);
1616 /* Calculate table index by multiplying r with table scale and truncate to integer */
1617 rt = _mm_mul_pd(r00,vftabscale);
1618 vfitab = _mm_cvttpd_epi32(rt);
1620 vfeps = _mm_frcz_pd(rt);
1622 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1624 twovfeps = _mm_add_pd(vfeps,vfeps);
1625 vfitab = _mm_slli_epi32(vfitab,3);
1627 /* CUBIC SPLINE TABLE DISPERSION */
1628 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1629 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1630 GMX_MM_TRANSPOSE2_PD(Y,F);
1631 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1632 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1633 GMX_MM_TRANSPOSE2_PD(G,H);
1634 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1635 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1636 fvdw6 = _mm_mul_pd(c6_00,FF);
1638 /* CUBIC SPLINE TABLE REPULSION */
1639 vfitab = _mm_add_epi32(vfitab,ifour);
1640 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1641 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1642 GMX_MM_TRANSPOSE2_PD(Y,F);
1643 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1644 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1645 GMX_MM_TRANSPOSE2_PD(G,H);
1646 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1647 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1648 fvdw12 = _mm_mul_pd(c12_00,FF);
1649 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1653 /* Update vectorial force */
1654 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1655 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1656 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1658 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1659 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1660 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1662 /**************************
1663 * CALCULATE INTERACTIONS *
1664 **************************/
1666 r11 = _mm_mul_pd(rsq11,rinv11);
1668 /* EWALD ELECTROSTATICS */
1670 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1671 ewrt = _mm_mul_pd(r11,ewtabscale);
1672 ewitab = _mm_cvttpd_epi32(ewrt);
1674 eweps = _mm_frcz_pd(ewrt);
1676 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1678 twoeweps = _mm_add_pd(eweps,eweps);
1679 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1681 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1682 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1686 /* Update vectorial force */
1687 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1688 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1689 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1691 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1692 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1693 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1695 /**************************
1696 * CALCULATE INTERACTIONS *
1697 **************************/
1699 r12 = _mm_mul_pd(rsq12,rinv12);
1701 /* EWALD ELECTROSTATICS */
1703 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1704 ewrt = _mm_mul_pd(r12,ewtabscale);
1705 ewitab = _mm_cvttpd_epi32(ewrt);
1707 eweps = _mm_frcz_pd(ewrt);
1709 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1711 twoeweps = _mm_add_pd(eweps,eweps);
1712 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1714 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1715 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1719 /* Update vectorial force */
1720 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1721 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1722 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1724 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1725 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1726 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1728 /**************************
1729 * CALCULATE INTERACTIONS *
1730 **************************/
1732 r13 = _mm_mul_pd(rsq13,rinv13);
1734 /* EWALD ELECTROSTATICS */
1736 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1737 ewrt = _mm_mul_pd(r13,ewtabscale);
1738 ewitab = _mm_cvttpd_epi32(ewrt);
1740 eweps = _mm_frcz_pd(ewrt);
1742 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1744 twoeweps = _mm_add_pd(eweps,eweps);
1745 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1747 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1748 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1752 /* Update vectorial force */
1753 fix1 = _mm_macc_pd(dx13,fscal,fix1);
1754 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
1755 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
1757 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
1758 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
1759 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
1761 /**************************
1762 * CALCULATE INTERACTIONS *
1763 **************************/
1765 r21 = _mm_mul_pd(rsq21,rinv21);
1767 /* EWALD ELECTROSTATICS */
1769 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1770 ewrt = _mm_mul_pd(r21,ewtabscale);
1771 ewitab = _mm_cvttpd_epi32(ewrt);
1773 eweps = _mm_frcz_pd(ewrt);
1775 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1777 twoeweps = _mm_add_pd(eweps,eweps);
1778 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1780 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1781 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1785 /* Update vectorial force */
1786 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1787 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1788 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1790 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1791 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1792 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1794 /**************************
1795 * CALCULATE INTERACTIONS *
1796 **************************/
1798 r22 = _mm_mul_pd(rsq22,rinv22);
1800 /* EWALD ELECTROSTATICS */
1802 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1803 ewrt = _mm_mul_pd(r22,ewtabscale);
1804 ewitab = _mm_cvttpd_epi32(ewrt);
1806 eweps = _mm_frcz_pd(ewrt);
1808 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1810 twoeweps = _mm_add_pd(eweps,eweps);
1811 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1813 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1814 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1818 /* Update vectorial force */
1819 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1820 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1821 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1823 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1824 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1825 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1827 /**************************
1828 * CALCULATE INTERACTIONS *
1829 **************************/
1831 r23 = _mm_mul_pd(rsq23,rinv23);
1833 /* EWALD ELECTROSTATICS */
1835 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1836 ewrt = _mm_mul_pd(r23,ewtabscale);
1837 ewitab = _mm_cvttpd_epi32(ewrt);
1839 eweps = _mm_frcz_pd(ewrt);
1841 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1843 twoeweps = _mm_add_pd(eweps,eweps);
1844 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1846 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1847 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1851 /* Update vectorial force */
1852 fix2 = _mm_macc_pd(dx23,fscal,fix2);
1853 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
1854 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
1856 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
1857 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
1858 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
1860 /**************************
1861 * CALCULATE INTERACTIONS *
1862 **************************/
1864 r31 = _mm_mul_pd(rsq31,rinv31);
1866 /* EWALD ELECTROSTATICS */
1868 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1869 ewrt = _mm_mul_pd(r31,ewtabscale);
1870 ewitab = _mm_cvttpd_epi32(ewrt);
1872 eweps = _mm_frcz_pd(ewrt);
1874 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1876 twoeweps = _mm_add_pd(eweps,eweps);
1877 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1879 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1880 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1884 /* Update vectorial force */
1885 fix3 = _mm_macc_pd(dx31,fscal,fix3);
1886 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
1887 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
1889 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
1890 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
1891 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
1893 /**************************
1894 * CALCULATE INTERACTIONS *
1895 **************************/
1897 r32 = _mm_mul_pd(rsq32,rinv32);
1899 /* EWALD ELECTROSTATICS */
1901 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1902 ewrt = _mm_mul_pd(r32,ewtabscale);
1903 ewitab = _mm_cvttpd_epi32(ewrt);
1905 eweps = _mm_frcz_pd(ewrt);
1907 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1909 twoeweps = _mm_add_pd(eweps,eweps);
1910 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1912 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1913 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1917 /* Update vectorial force */
1918 fix3 = _mm_macc_pd(dx32,fscal,fix3);
1919 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
1920 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
1922 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
1923 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
1924 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
1926 /**************************
1927 * CALCULATE INTERACTIONS *
1928 **************************/
1930 r33 = _mm_mul_pd(rsq33,rinv33);
1932 /* EWALD ELECTROSTATICS */
1934 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1935 ewrt = _mm_mul_pd(r33,ewtabscale);
1936 ewitab = _mm_cvttpd_epi32(ewrt);
1938 eweps = _mm_frcz_pd(ewrt);
1940 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1942 twoeweps = _mm_add_pd(eweps,eweps);
1943 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1945 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1946 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1950 /* Update vectorial force */
1951 fix3 = _mm_macc_pd(dx33,fscal,fix3);
1952 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
1953 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
1955 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
1956 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
1957 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
1959 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1961 /* Inner loop uses 405 flops */
1964 if(jidx<j_index_end)
1968 j_coord_offsetA = DIM*jnrA;
1970 /* load j atom coordinates */
1971 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1972 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1973 &jy2,&jz2,&jx3,&jy3,&jz3);
1975 /* Calculate displacement vector */
1976 dx00 = _mm_sub_pd(ix0,jx0);
1977 dy00 = _mm_sub_pd(iy0,jy0);
1978 dz00 = _mm_sub_pd(iz0,jz0);
1979 dx11 = _mm_sub_pd(ix1,jx1);
1980 dy11 = _mm_sub_pd(iy1,jy1);
1981 dz11 = _mm_sub_pd(iz1,jz1);
1982 dx12 = _mm_sub_pd(ix1,jx2);
1983 dy12 = _mm_sub_pd(iy1,jy2);
1984 dz12 = _mm_sub_pd(iz1,jz2);
1985 dx13 = _mm_sub_pd(ix1,jx3);
1986 dy13 = _mm_sub_pd(iy1,jy3);
1987 dz13 = _mm_sub_pd(iz1,jz3);
1988 dx21 = _mm_sub_pd(ix2,jx1);
1989 dy21 = _mm_sub_pd(iy2,jy1);
1990 dz21 = _mm_sub_pd(iz2,jz1);
1991 dx22 = _mm_sub_pd(ix2,jx2);
1992 dy22 = _mm_sub_pd(iy2,jy2);
1993 dz22 = _mm_sub_pd(iz2,jz2);
1994 dx23 = _mm_sub_pd(ix2,jx3);
1995 dy23 = _mm_sub_pd(iy2,jy3);
1996 dz23 = _mm_sub_pd(iz2,jz3);
1997 dx31 = _mm_sub_pd(ix3,jx1);
1998 dy31 = _mm_sub_pd(iy3,jy1);
1999 dz31 = _mm_sub_pd(iz3,jz1);
2000 dx32 = _mm_sub_pd(ix3,jx2);
2001 dy32 = _mm_sub_pd(iy3,jy2);
2002 dz32 = _mm_sub_pd(iz3,jz2);
2003 dx33 = _mm_sub_pd(ix3,jx3);
2004 dy33 = _mm_sub_pd(iy3,jy3);
2005 dz33 = _mm_sub_pd(iz3,jz3);
2007 /* Calculate squared distance and things based on it */
2008 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2009 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2010 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2011 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2012 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2013 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2014 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2015 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2016 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2017 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2019 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2020 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2021 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2022 rinv13 = gmx_mm_invsqrt_pd(rsq13);
2023 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2024 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2025 rinv23 = gmx_mm_invsqrt_pd(rsq23);
2026 rinv31 = gmx_mm_invsqrt_pd(rsq31);
2027 rinv32 = gmx_mm_invsqrt_pd(rsq32);
2028 rinv33 = gmx_mm_invsqrt_pd(rsq33);
2030 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2031 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2032 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
2033 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2034 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2035 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
2036 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
2037 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
2038 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
2040 fjx0 = _mm_setzero_pd();
2041 fjy0 = _mm_setzero_pd();
2042 fjz0 = _mm_setzero_pd();
2043 fjx1 = _mm_setzero_pd();
2044 fjy1 = _mm_setzero_pd();
2045 fjz1 = _mm_setzero_pd();
2046 fjx2 = _mm_setzero_pd();
2047 fjy2 = _mm_setzero_pd();
2048 fjz2 = _mm_setzero_pd();
2049 fjx3 = _mm_setzero_pd();
2050 fjy3 = _mm_setzero_pd();
2051 fjz3 = _mm_setzero_pd();
2053 /**************************
2054 * CALCULATE INTERACTIONS *
2055 **************************/
2057 r00 = _mm_mul_pd(rsq00,rinv00);
2059 /* Calculate table index by multiplying r with table scale and truncate to integer */
2060 rt = _mm_mul_pd(r00,vftabscale);
2061 vfitab = _mm_cvttpd_epi32(rt);
2063 vfeps = _mm_frcz_pd(rt);
2065 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2067 twovfeps = _mm_add_pd(vfeps,vfeps);
2068 vfitab = _mm_slli_epi32(vfitab,3);
2070 /* CUBIC SPLINE TABLE DISPERSION */
2071 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2072 F = _mm_setzero_pd();
2073 GMX_MM_TRANSPOSE2_PD(Y,F);
2074 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2075 H = _mm_setzero_pd();
2076 GMX_MM_TRANSPOSE2_PD(G,H);
2077 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
2078 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
2079 fvdw6 = _mm_mul_pd(c6_00,FF);
2081 /* CUBIC SPLINE TABLE REPULSION */
2082 vfitab = _mm_add_epi32(vfitab,ifour);
2083 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2084 F = _mm_setzero_pd();
2085 GMX_MM_TRANSPOSE2_PD(Y,F);
2086 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2087 H = _mm_setzero_pd();
2088 GMX_MM_TRANSPOSE2_PD(G,H);
2089 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
2090 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
2091 fvdw12 = _mm_mul_pd(c12_00,FF);
2092 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
2096 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2098 /* Update vectorial force */
2099 fix0 = _mm_macc_pd(dx00,fscal,fix0);
2100 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
2101 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
2103 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
2104 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
2105 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
2107 /**************************
2108 * CALCULATE INTERACTIONS *
2109 **************************/
2111 r11 = _mm_mul_pd(rsq11,rinv11);
2113 /* EWALD ELECTROSTATICS */
2115 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2116 ewrt = _mm_mul_pd(r11,ewtabscale);
2117 ewitab = _mm_cvttpd_epi32(ewrt);
2119 eweps = _mm_frcz_pd(ewrt);
2121 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2123 twoeweps = _mm_add_pd(eweps,eweps);
2124 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2125 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2126 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2130 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2132 /* Update vectorial force */
2133 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2134 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2135 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2137 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2138 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2139 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2141 /**************************
2142 * CALCULATE INTERACTIONS *
2143 **************************/
2145 r12 = _mm_mul_pd(rsq12,rinv12);
2147 /* EWALD ELECTROSTATICS */
2149 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2150 ewrt = _mm_mul_pd(r12,ewtabscale);
2151 ewitab = _mm_cvttpd_epi32(ewrt);
2153 eweps = _mm_frcz_pd(ewrt);
2155 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2157 twoeweps = _mm_add_pd(eweps,eweps);
2158 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2159 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2160 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2164 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2166 /* Update vectorial force */
2167 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2168 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2169 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2171 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2172 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2173 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2175 /**************************
2176 * CALCULATE INTERACTIONS *
2177 **************************/
2179 r13 = _mm_mul_pd(rsq13,rinv13);
2181 /* EWALD ELECTROSTATICS */
2183 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2184 ewrt = _mm_mul_pd(r13,ewtabscale);
2185 ewitab = _mm_cvttpd_epi32(ewrt);
2187 eweps = _mm_frcz_pd(ewrt);
2189 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2191 twoeweps = _mm_add_pd(eweps,eweps);
2192 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2193 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2194 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2198 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2200 /* Update vectorial force */
2201 fix1 = _mm_macc_pd(dx13,fscal,fix1);
2202 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
2203 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
2205 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
2206 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
2207 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
2209 /**************************
2210 * CALCULATE INTERACTIONS *
2211 **************************/
2213 r21 = _mm_mul_pd(rsq21,rinv21);
2215 /* EWALD ELECTROSTATICS */
2217 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2218 ewrt = _mm_mul_pd(r21,ewtabscale);
2219 ewitab = _mm_cvttpd_epi32(ewrt);
2221 eweps = _mm_frcz_pd(ewrt);
2223 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2225 twoeweps = _mm_add_pd(eweps,eweps);
2226 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2227 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2228 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2232 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2234 /* Update vectorial force */
2235 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2236 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2237 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2239 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2240 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2241 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2243 /**************************
2244 * CALCULATE INTERACTIONS *
2245 **************************/
2247 r22 = _mm_mul_pd(rsq22,rinv22);
2249 /* EWALD ELECTROSTATICS */
2251 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2252 ewrt = _mm_mul_pd(r22,ewtabscale);
2253 ewitab = _mm_cvttpd_epi32(ewrt);
2255 eweps = _mm_frcz_pd(ewrt);
2257 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2259 twoeweps = _mm_add_pd(eweps,eweps);
2260 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2261 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2262 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2266 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2268 /* Update vectorial force */
2269 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2270 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2271 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2273 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2274 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2275 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2277 /**************************
2278 * CALCULATE INTERACTIONS *
2279 **************************/
2281 r23 = _mm_mul_pd(rsq23,rinv23);
2283 /* EWALD ELECTROSTATICS */
2285 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2286 ewrt = _mm_mul_pd(r23,ewtabscale);
2287 ewitab = _mm_cvttpd_epi32(ewrt);
2289 eweps = _mm_frcz_pd(ewrt);
2291 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2293 twoeweps = _mm_add_pd(eweps,eweps);
2294 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2295 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2296 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2300 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2302 /* Update vectorial force */
2303 fix2 = _mm_macc_pd(dx23,fscal,fix2);
2304 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
2305 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
2307 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
2308 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
2309 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
2311 /**************************
2312 * CALCULATE INTERACTIONS *
2313 **************************/
2315 r31 = _mm_mul_pd(rsq31,rinv31);
2317 /* EWALD ELECTROSTATICS */
2319 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2320 ewrt = _mm_mul_pd(r31,ewtabscale);
2321 ewitab = _mm_cvttpd_epi32(ewrt);
2323 eweps = _mm_frcz_pd(ewrt);
2325 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2327 twoeweps = _mm_add_pd(eweps,eweps);
2328 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2329 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2330 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2334 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2336 /* Update vectorial force */
2337 fix3 = _mm_macc_pd(dx31,fscal,fix3);
2338 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
2339 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
2341 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
2342 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
2343 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
2345 /**************************
2346 * CALCULATE INTERACTIONS *
2347 **************************/
2349 r32 = _mm_mul_pd(rsq32,rinv32);
2351 /* EWALD ELECTROSTATICS */
2353 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2354 ewrt = _mm_mul_pd(r32,ewtabscale);
2355 ewitab = _mm_cvttpd_epi32(ewrt);
2357 eweps = _mm_frcz_pd(ewrt);
2359 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2361 twoeweps = _mm_add_pd(eweps,eweps);
2362 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2363 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2364 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2368 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2370 /* Update vectorial force */
2371 fix3 = _mm_macc_pd(dx32,fscal,fix3);
2372 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
2373 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
2375 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
2376 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
2377 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
2379 /**************************
2380 * CALCULATE INTERACTIONS *
2381 **************************/
2383 r33 = _mm_mul_pd(rsq33,rinv33);
2385 /* EWALD ELECTROSTATICS */
2387 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2388 ewrt = _mm_mul_pd(r33,ewtabscale);
2389 ewitab = _mm_cvttpd_epi32(ewrt);
2391 eweps = _mm_frcz_pd(ewrt);
2393 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2395 twoeweps = _mm_add_pd(eweps,eweps);
2396 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2397 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2398 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2402 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2404 /* Update vectorial force */
2405 fix3 = _mm_macc_pd(dx33,fscal,fix3);
2406 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
2407 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
2409 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
2410 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
2411 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
2413 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2415 /* Inner loop uses 405 flops */
2418 /* End of innermost loop */
2420 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2421 f+i_coord_offset,fshift+i_shift_offset);
2423 /* Increment number of inner iterations */
2424 inneriter += j_index_end - j_index_start;
2426 /* Outer loop uses 24 flops */
2429 /* Increment number of outer iterations */
2432 /* Update outer/inner flops */
2434 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*405);