2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
50 #include "kernelutil_x86_avx_128_fma_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
54 * Electrostatics interaction: Ewald
55 * VdW interaction: CubicSplineTable
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the two positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
110 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
112 __m128i ifour = _mm_set1_epi32(4);
113 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
116 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
118 __m128d dummy_mask,cutoff_mask;
119 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
120 __m128d one = _mm_set1_pd(1.0);
121 __m128d two = _mm_set1_pd(2.0);
127 jindex = nlist->jindex;
129 shiftidx = nlist->shift;
131 shiftvec = fr->shift_vec[0];
132 fshift = fr->fshift[0];
133 facel = _mm_set1_pd(fr->epsfac);
134 charge = mdatoms->chargeA;
135 nvdwtype = fr->ntype;
137 vdwtype = mdatoms->typeA;
139 vftab = kernel_data->table_vdw->data;
140 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
142 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
143 ewtab = fr->ic->tabq_coul_FDV0;
144 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
145 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
147 /* Setup water-specific parameters */
148 inr = nlist->iinr[0];
149 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
150 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
151 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
152 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
154 jq0 = _mm_set1_pd(charge[inr+0]);
155 jq1 = _mm_set1_pd(charge[inr+1]);
156 jq2 = _mm_set1_pd(charge[inr+2]);
157 vdwjidx0A = 2*vdwtype[inr+0];
158 qq00 = _mm_mul_pd(iq0,jq0);
159 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
160 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
161 qq01 = _mm_mul_pd(iq0,jq1);
162 qq02 = _mm_mul_pd(iq0,jq2);
163 qq10 = _mm_mul_pd(iq1,jq0);
164 qq11 = _mm_mul_pd(iq1,jq1);
165 qq12 = _mm_mul_pd(iq1,jq2);
166 qq20 = _mm_mul_pd(iq2,jq0);
167 qq21 = _mm_mul_pd(iq2,jq1);
168 qq22 = _mm_mul_pd(iq2,jq2);
170 /* Avoid stupid compiler warnings */
178 /* Start outer loop over neighborlists */
179 for(iidx=0; iidx<nri; iidx++)
181 /* Load shift vector for this list */
182 i_shift_offset = DIM*shiftidx[iidx];
184 /* Load limits for loop over neighbors */
185 j_index_start = jindex[iidx];
186 j_index_end = jindex[iidx+1];
188 /* Get outer coordinate index */
190 i_coord_offset = DIM*inr;
192 /* Load i particle coords and add shift vector */
193 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
194 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
196 fix0 = _mm_setzero_pd();
197 fiy0 = _mm_setzero_pd();
198 fiz0 = _mm_setzero_pd();
199 fix1 = _mm_setzero_pd();
200 fiy1 = _mm_setzero_pd();
201 fiz1 = _mm_setzero_pd();
202 fix2 = _mm_setzero_pd();
203 fiy2 = _mm_setzero_pd();
204 fiz2 = _mm_setzero_pd();
206 /* Reset potential sums */
207 velecsum = _mm_setzero_pd();
208 vvdwsum = _mm_setzero_pd();
210 /* Start inner kernel loop */
211 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
214 /* Get j neighbor index, and coordinate index */
217 j_coord_offsetA = DIM*jnrA;
218 j_coord_offsetB = DIM*jnrB;
220 /* load j atom coordinates */
221 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
222 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
224 /* Calculate displacement vector */
225 dx00 = _mm_sub_pd(ix0,jx0);
226 dy00 = _mm_sub_pd(iy0,jy0);
227 dz00 = _mm_sub_pd(iz0,jz0);
228 dx01 = _mm_sub_pd(ix0,jx1);
229 dy01 = _mm_sub_pd(iy0,jy1);
230 dz01 = _mm_sub_pd(iz0,jz1);
231 dx02 = _mm_sub_pd(ix0,jx2);
232 dy02 = _mm_sub_pd(iy0,jy2);
233 dz02 = _mm_sub_pd(iz0,jz2);
234 dx10 = _mm_sub_pd(ix1,jx0);
235 dy10 = _mm_sub_pd(iy1,jy0);
236 dz10 = _mm_sub_pd(iz1,jz0);
237 dx11 = _mm_sub_pd(ix1,jx1);
238 dy11 = _mm_sub_pd(iy1,jy1);
239 dz11 = _mm_sub_pd(iz1,jz1);
240 dx12 = _mm_sub_pd(ix1,jx2);
241 dy12 = _mm_sub_pd(iy1,jy2);
242 dz12 = _mm_sub_pd(iz1,jz2);
243 dx20 = _mm_sub_pd(ix2,jx0);
244 dy20 = _mm_sub_pd(iy2,jy0);
245 dz20 = _mm_sub_pd(iz2,jz0);
246 dx21 = _mm_sub_pd(ix2,jx1);
247 dy21 = _mm_sub_pd(iy2,jy1);
248 dz21 = _mm_sub_pd(iz2,jz1);
249 dx22 = _mm_sub_pd(ix2,jx2);
250 dy22 = _mm_sub_pd(iy2,jy2);
251 dz22 = _mm_sub_pd(iz2,jz2);
253 /* Calculate squared distance and things based on it */
254 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
255 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
256 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
257 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
258 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
259 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
260 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
261 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
262 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
264 rinv00 = gmx_mm_invsqrt_pd(rsq00);
265 rinv01 = gmx_mm_invsqrt_pd(rsq01);
266 rinv02 = gmx_mm_invsqrt_pd(rsq02);
267 rinv10 = gmx_mm_invsqrt_pd(rsq10);
268 rinv11 = gmx_mm_invsqrt_pd(rsq11);
269 rinv12 = gmx_mm_invsqrt_pd(rsq12);
270 rinv20 = gmx_mm_invsqrt_pd(rsq20);
271 rinv21 = gmx_mm_invsqrt_pd(rsq21);
272 rinv22 = gmx_mm_invsqrt_pd(rsq22);
274 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
275 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
276 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
277 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
278 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
279 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
280 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
281 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
282 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
284 fjx0 = _mm_setzero_pd();
285 fjy0 = _mm_setzero_pd();
286 fjz0 = _mm_setzero_pd();
287 fjx1 = _mm_setzero_pd();
288 fjy1 = _mm_setzero_pd();
289 fjz1 = _mm_setzero_pd();
290 fjx2 = _mm_setzero_pd();
291 fjy2 = _mm_setzero_pd();
292 fjz2 = _mm_setzero_pd();
294 /**************************
295 * CALCULATE INTERACTIONS *
296 **************************/
298 r00 = _mm_mul_pd(rsq00,rinv00);
300 /* Calculate table index by multiplying r with table scale and truncate to integer */
301 rt = _mm_mul_pd(r00,vftabscale);
302 vfitab = _mm_cvttpd_epi32(rt);
304 vfeps = _mm_frcz_pd(rt);
306 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
308 twovfeps = _mm_add_pd(vfeps,vfeps);
309 vfitab = _mm_slli_epi32(vfitab,3);
311 /* EWALD ELECTROSTATICS */
313 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
314 ewrt = _mm_mul_pd(r00,ewtabscale);
315 ewitab = _mm_cvttpd_epi32(ewrt);
317 eweps = _mm_frcz_pd(ewrt);
319 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
321 twoeweps = _mm_add_pd(eweps,eweps);
322 ewitab = _mm_slli_epi32(ewitab,2);
323 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
324 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
325 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
326 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
327 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
328 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
329 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
330 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
331 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
332 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
334 /* CUBIC SPLINE TABLE DISPERSION */
335 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
336 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
337 GMX_MM_TRANSPOSE2_PD(Y,F);
338 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
339 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
340 GMX_MM_TRANSPOSE2_PD(G,H);
341 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
342 VV = _mm_macc_pd(vfeps,Fp,Y);
343 vvdw6 = _mm_mul_pd(c6_00,VV);
344 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
345 fvdw6 = _mm_mul_pd(c6_00,FF);
347 /* CUBIC SPLINE TABLE REPULSION */
348 vfitab = _mm_add_epi32(vfitab,ifour);
349 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
350 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
351 GMX_MM_TRANSPOSE2_PD(Y,F);
352 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
353 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
354 GMX_MM_TRANSPOSE2_PD(G,H);
355 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
356 VV = _mm_macc_pd(vfeps,Fp,Y);
357 vvdw12 = _mm_mul_pd(c12_00,VV);
358 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
359 fvdw12 = _mm_mul_pd(c12_00,FF);
360 vvdw = _mm_add_pd(vvdw12,vvdw6);
361 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
363 /* Update potential sum for this i atom from the interaction with this j atom. */
364 velecsum = _mm_add_pd(velecsum,velec);
365 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
367 fscal = _mm_add_pd(felec,fvdw);
369 /* Update vectorial force */
370 fix0 = _mm_macc_pd(dx00,fscal,fix0);
371 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
372 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
374 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
375 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
376 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
382 r01 = _mm_mul_pd(rsq01,rinv01);
384 /* EWALD ELECTROSTATICS */
386 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
387 ewrt = _mm_mul_pd(r01,ewtabscale);
388 ewitab = _mm_cvttpd_epi32(ewrt);
390 eweps = _mm_frcz_pd(ewrt);
392 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
394 twoeweps = _mm_add_pd(eweps,eweps);
395 ewitab = _mm_slli_epi32(ewitab,2);
396 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
397 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
398 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
399 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
400 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
401 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
402 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
403 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
404 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
405 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
407 /* Update potential sum for this i atom from the interaction with this j atom. */
408 velecsum = _mm_add_pd(velecsum,velec);
412 /* Update vectorial force */
413 fix0 = _mm_macc_pd(dx01,fscal,fix0);
414 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
415 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
417 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
418 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
419 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
421 /**************************
422 * CALCULATE INTERACTIONS *
423 **************************/
425 r02 = _mm_mul_pd(rsq02,rinv02);
427 /* EWALD ELECTROSTATICS */
429 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
430 ewrt = _mm_mul_pd(r02,ewtabscale);
431 ewitab = _mm_cvttpd_epi32(ewrt);
433 eweps = _mm_frcz_pd(ewrt);
435 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
437 twoeweps = _mm_add_pd(eweps,eweps);
438 ewitab = _mm_slli_epi32(ewitab,2);
439 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
440 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
441 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
442 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
443 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
444 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
445 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
446 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
447 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
448 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
450 /* Update potential sum for this i atom from the interaction with this j atom. */
451 velecsum = _mm_add_pd(velecsum,velec);
455 /* Update vectorial force */
456 fix0 = _mm_macc_pd(dx02,fscal,fix0);
457 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
458 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
460 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
461 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
462 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
464 /**************************
465 * CALCULATE INTERACTIONS *
466 **************************/
468 r10 = _mm_mul_pd(rsq10,rinv10);
470 /* EWALD ELECTROSTATICS */
472 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
473 ewrt = _mm_mul_pd(r10,ewtabscale);
474 ewitab = _mm_cvttpd_epi32(ewrt);
476 eweps = _mm_frcz_pd(ewrt);
478 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
480 twoeweps = _mm_add_pd(eweps,eweps);
481 ewitab = _mm_slli_epi32(ewitab,2);
482 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
483 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
484 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
485 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
486 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
487 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
488 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
489 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
490 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
491 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
493 /* Update potential sum for this i atom from the interaction with this j atom. */
494 velecsum = _mm_add_pd(velecsum,velec);
498 /* Update vectorial force */
499 fix1 = _mm_macc_pd(dx10,fscal,fix1);
500 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
501 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
503 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
504 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
505 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
507 /**************************
508 * CALCULATE INTERACTIONS *
509 **************************/
511 r11 = _mm_mul_pd(rsq11,rinv11);
513 /* EWALD ELECTROSTATICS */
515 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
516 ewrt = _mm_mul_pd(r11,ewtabscale);
517 ewitab = _mm_cvttpd_epi32(ewrt);
519 eweps = _mm_frcz_pd(ewrt);
521 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
523 twoeweps = _mm_add_pd(eweps,eweps);
524 ewitab = _mm_slli_epi32(ewitab,2);
525 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
526 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
527 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
528 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
529 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
530 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
531 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
532 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
533 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
534 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
536 /* Update potential sum for this i atom from the interaction with this j atom. */
537 velecsum = _mm_add_pd(velecsum,velec);
541 /* Update vectorial force */
542 fix1 = _mm_macc_pd(dx11,fscal,fix1);
543 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
544 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
546 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
547 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
548 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
550 /**************************
551 * CALCULATE INTERACTIONS *
552 **************************/
554 r12 = _mm_mul_pd(rsq12,rinv12);
556 /* EWALD ELECTROSTATICS */
558 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
559 ewrt = _mm_mul_pd(r12,ewtabscale);
560 ewitab = _mm_cvttpd_epi32(ewrt);
562 eweps = _mm_frcz_pd(ewrt);
564 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
566 twoeweps = _mm_add_pd(eweps,eweps);
567 ewitab = _mm_slli_epi32(ewitab,2);
568 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
569 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
570 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
571 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
572 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
573 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
574 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
575 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
576 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
577 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
579 /* Update potential sum for this i atom from the interaction with this j atom. */
580 velecsum = _mm_add_pd(velecsum,velec);
584 /* Update vectorial force */
585 fix1 = _mm_macc_pd(dx12,fscal,fix1);
586 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
587 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
589 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
590 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
591 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
593 /**************************
594 * CALCULATE INTERACTIONS *
595 **************************/
597 r20 = _mm_mul_pd(rsq20,rinv20);
599 /* EWALD ELECTROSTATICS */
601 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
602 ewrt = _mm_mul_pd(r20,ewtabscale);
603 ewitab = _mm_cvttpd_epi32(ewrt);
605 eweps = _mm_frcz_pd(ewrt);
607 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
609 twoeweps = _mm_add_pd(eweps,eweps);
610 ewitab = _mm_slli_epi32(ewitab,2);
611 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
612 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
613 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
614 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
615 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
616 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
617 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
618 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
619 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
620 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
622 /* Update potential sum for this i atom from the interaction with this j atom. */
623 velecsum = _mm_add_pd(velecsum,velec);
627 /* Update vectorial force */
628 fix2 = _mm_macc_pd(dx20,fscal,fix2);
629 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
630 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
632 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
633 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
634 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
636 /**************************
637 * CALCULATE INTERACTIONS *
638 **************************/
640 r21 = _mm_mul_pd(rsq21,rinv21);
642 /* EWALD ELECTROSTATICS */
644 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
645 ewrt = _mm_mul_pd(r21,ewtabscale);
646 ewitab = _mm_cvttpd_epi32(ewrt);
648 eweps = _mm_frcz_pd(ewrt);
650 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
652 twoeweps = _mm_add_pd(eweps,eweps);
653 ewitab = _mm_slli_epi32(ewitab,2);
654 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
655 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
656 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
657 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
658 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
659 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
660 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
661 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
662 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
663 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
665 /* Update potential sum for this i atom from the interaction with this j atom. */
666 velecsum = _mm_add_pd(velecsum,velec);
670 /* Update vectorial force */
671 fix2 = _mm_macc_pd(dx21,fscal,fix2);
672 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
673 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
675 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
676 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
677 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
679 /**************************
680 * CALCULATE INTERACTIONS *
681 **************************/
683 r22 = _mm_mul_pd(rsq22,rinv22);
685 /* EWALD ELECTROSTATICS */
687 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
688 ewrt = _mm_mul_pd(r22,ewtabscale);
689 ewitab = _mm_cvttpd_epi32(ewrt);
691 eweps = _mm_frcz_pd(ewrt);
693 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
695 twoeweps = _mm_add_pd(eweps,eweps);
696 ewitab = _mm_slli_epi32(ewitab,2);
697 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
698 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
699 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
700 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
701 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
702 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
703 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
704 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
705 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
706 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
708 /* Update potential sum for this i atom from the interaction with this j atom. */
709 velecsum = _mm_add_pd(velecsum,velec);
713 /* Update vectorial force */
714 fix2 = _mm_macc_pd(dx22,fscal,fix2);
715 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
716 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
718 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
719 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
720 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
722 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
724 /* Inner loop uses 430 flops */
731 j_coord_offsetA = DIM*jnrA;
733 /* load j atom coordinates */
734 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
735 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
737 /* Calculate displacement vector */
738 dx00 = _mm_sub_pd(ix0,jx0);
739 dy00 = _mm_sub_pd(iy0,jy0);
740 dz00 = _mm_sub_pd(iz0,jz0);
741 dx01 = _mm_sub_pd(ix0,jx1);
742 dy01 = _mm_sub_pd(iy0,jy1);
743 dz01 = _mm_sub_pd(iz0,jz1);
744 dx02 = _mm_sub_pd(ix0,jx2);
745 dy02 = _mm_sub_pd(iy0,jy2);
746 dz02 = _mm_sub_pd(iz0,jz2);
747 dx10 = _mm_sub_pd(ix1,jx0);
748 dy10 = _mm_sub_pd(iy1,jy0);
749 dz10 = _mm_sub_pd(iz1,jz0);
750 dx11 = _mm_sub_pd(ix1,jx1);
751 dy11 = _mm_sub_pd(iy1,jy1);
752 dz11 = _mm_sub_pd(iz1,jz1);
753 dx12 = _mm_sub_pd(ix1,jx2);
754 dy12 = _mm_sub_pd(iy1,jy2);
755 dz12 = _mm_sub_pd(iz1,jz2);
756 dx20 = _mm_sub_pd(ix2,jx0);
757 dy20 = _mm_sub_pd(iy2,jy0);
758 dz20 = _mm_sub_pd(iz2,jz0);
759 dx21 = _mm_sub_pd(ix2,jx1);
760 dy21 = _mm_sub_pd(iy2,jy1);
761 dz21 = _mm_sub_pd(iz2,jz1);
762 dx22 = _mm_sub_pd(ix2,jx2);
763 dy22 = _mm_sub_pd(iy2,jy2);
764 dz22 = _mm_sub_pd(iz2,jz2);
766 /* Calculate squared distance and things based on it */
767 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
768 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
769 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
770 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
771 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
772 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
773 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
774 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
775 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
777 rinv00 = gmx_mm_invsqrt_pd(rsq00);
778 rinv01 = gmx_mm_invsqrt_pd(rsq01);
779 rinv02 = gmx_mm_invsqrt_pd(rsq02);
780 rinv10 = gmx_mm_invsqrt_pd(rsq10);
781 rinv11 = gmx_mm_invsqrt_pd(rsq11);
782 rinv12 = gmx_mm_invsqrt_pd(rsq12);
783 rinv20 = gmx_mm_invsqrt_pd(rsq20);
784 rinv21 = gmx_mm_invsqrt_pd(rsq21);
785 rinv22 = gmx_mm_invsqrt_pd(rsq22);
787 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
788 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
789 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
790 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
791 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
792 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
793 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
794 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
795 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
797 fjx0 = _mm_setzero_pd();
798 fjy0 = _mm_setzero_pd();
799 fjz0 = _mm_setzero_pd();
800 fjx1 = _mm_setzero_pd();
801 fjy1 = _mm_setzero_pd();
802 fjz1 = _mm_setzero_pd();
803 fjx2 = _mm_setzero_pd();
804 fjy2 = _mm_setzero_pd();
805 fjz2 = _mm_setzero_pd();
807 /**************************
808 * CALCULATE INTERACTIONS *
809 **************************/
811 r00 = _mm_mul_pd(rsq00,rinv00);
813 /* Calculate table index by multiplying r with table scale and truncate to integer */
814 rt = _mm_mul_pd(r00,vftabscale);
815 vfitab = _mm_cvttpd_epi32(rt);
817 vfeps = _mm_frcz_pd(rt);
819 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
821 twovfeps = _mm_add_pd(vfeps,vfeps);
822 vfitab = _mm_slli_epi32(vfitab,3);
824 /* EWALD ELECTROSTATICS */
826 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
827 ewrt = _mm_mul_pd(r00,ewtabscale);
828 ewitab = _mm_cvttpd_epi32(ewrt);
830 eweps = _mm_frcz_pd(ewrt);
832 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
834 twoeweps = _mm_add_pd(eweps,eweps);
835 ewitab = _mm_slli_epi32(ewitab,2);
836 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
837 ewtabD = _mm_setzero_pd();
838 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
839 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
840 ewtabFn = _mm_setzero_pd();
841 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
842 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
843 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
844 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
845 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
847 /* CUBIC SPLINE TABLE DISPERSION */
848 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
849 F = _mm_setzero_pd();
850 GMX_MM_TRANSPOSE2_PD(Y,F);
851 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
852 H = _mm_setzero_pd();
853 GMX_MM_TRANSPOSE2_PD(G,H);
854 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
855 VV = _mm_macc_pd(vfeps,Fp,Y);
856 vvdw6 = _mm_mul_pd(c6_00,VV);
857 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
858 fvdw6 = _mm_mul_pd(c6_00,FF);
860 /* CUBIC SPLINE TABLE REPULSION */
861 vfitab = _mm_add_epi32(vfitab,ifour);
862 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
863 F = _mm_setzero_pd();
864 GMX_MM_TRANSPOSE2_PD(Y,F);
865 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
866 H = _mm_setzero_pd();
867 GMX_MM_TRANSPOSE2_PD(G,H);
868 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
869 VV = _mm_macc_pd(vfeps,Fp,Y);
870 vvdw12 = _mm_mul_pd(c12_00,VV);
871 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
872 fvdw12 = _mm_mul_pd(c12_00,FF);
873 vvdw = _mm_add_pd(vvdw12,vvdw6);
874 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
876 /* Update potential sum for this i atom from the interaction with this j atom. */
877 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
878 velecsum = _mm_add_pd(velecsum,velec);
879 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
880 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
882 fscal = _mm_add_pd(felec,fvdw);
884 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
886 /* Update vectorial force */
887 fix0 = _mm_macc_pd(dx00,fscal,fix0);
888 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
889 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
891 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
892 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
893 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 r01 = _mm_mul_pd(rsq01,rinv01);
901 /* EWALD ELECTROSTATICS */
903 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
904 ewrt = _mm_mul_pd(r01,ewtabscale);
905 ewitab = _mm_cvttpd_epi32(ewrt);
907 eweps = _mm_frcz_pd(ewrt);
909 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
911 twoeweps = _mm_add_pd(eweps,eweps);
912 ewitab = _mm_slli_epi32(ewitab,2);
913 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
914 ewtabD = _mm_setzero_pd();
915 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
916 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
917 ewtabFn = _mm_setzero_pd();
918 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
919 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
920 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
921 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
922 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
924 /* Update potential sum for this i atom from the interaction with this j atom. */
925 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
926 velecsum = _mm_add_pd(velecsum,velec);
930 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
932 /* Update vectorial force */
933 fix0 = _mm_macc_pd(dx01,fscal,fix0);
934 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
935 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
937 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
938 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
939 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
941 /**************************
942 * CALCULATE INTERACTIONS *
943 **************************/
945 r02 = _mm_mul_pd(rsq02,rinv02);
947 /* EWALD ELECTROSTATICS */
949 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
950 ewrt = _mm_mul_pd(r02,ewtabscale);
951 ewitab = _mm_cvttpd_epi32(ewrt);
953 eweps = _mm_frcz_pd(ewrt);
955 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
957 twoeweps = _mm_add_pd(eweps,eweps);
958 ewitab = _mm_slli_epi32(ewitab,2);
959 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
960 ewtabD = _mm_setzero_pd();
961 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
962 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
963 ewtabFn = _mm_setzero_pd();
964 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
965 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
966 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
967 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
968 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
970 /* Update potential sum for this i atom from the interaction with this j atom. */
971 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
972 velecsum = _mm_add_pd(velecsum,velec);
976 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
978 /* Update vectorial force */
979 fix0 = _mm_macc_pd(dx02,fscal,fix0);
980 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
981 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
983 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
984 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
985 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
987 /**************************
988 * CALCULATE INTERACTIONS *
989 **************************/
991 r10 = _mm_mul_pd(rsq10,rinv10);
993 /* EWALD ELECTROSTATICS */
995 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
996 ewrt = _mm_mul_pd(r10,ewtabscale);
997 ewitab = _mm_cvttpd_epi32(ewrt);
999 eweps = _mm_frcz_pd(ewrt);
1001 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1003 twoeweps = _mm_add_pd(eweps,eweps);
1004 ewitab = _mm_slli_epi32(ewitab,2);
1005 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1006 ewtabD = _mm_setzero_pd();
1007 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1008 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1009 ewtabFn = _mm_setzero_pd();
1010 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1011 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1012 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1013 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1014 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1016 /* Update potential sum for this i atom from the interaction with this j atom. */
1017 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1018 velecsum = _mm_add_pd(velecsum,velec);
1022 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1024 /* Update vectorial force */
1025 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1026 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1027 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1029 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1030 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1031 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1033 /**************************
1034 * CALCULATE INTERACTIONS *
1035 **************************/
1037 r11 = _mm_mul_pd(rsq11,rinv11);
1039 /* EWALD ELECTROSTATICS */
1041 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1042 ewrt = _mm_mul_pd(r11,ewtabscale);
1043 ewitab = _mm_cvttpd_epi32(ewrt);
1045 eweps = _mm_frcz_pd(ewrt);
1047 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1049 twoeweps = _mm_add_pd(eweps,eweps);
1050 ewitab = _mm_slli_epi32(ewitab,2);
1051 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1052 ewtabD = _mm_setzero_pd();
1053 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1054 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1055 ewtabFn = _mm_setzero_pd();
1056 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1057 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1058 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1059 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1060 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1062 /* Update potential sum for this i atom from the interaction with this j atom. */
1063 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1064 velecsum = _mm_add_pd(velecsum,velec);
1068 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1070 /* Update vectorial force */
1071 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1072 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1073 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1075 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1076 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1077 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1079 /**************************
1080 * CALCULATE INTERACTIONS *
1081 **************************/
1083 r12 = _mm_mul_pd(rsq12,rinv12);
1085 /* EWALD ELECTROSTATICS */
1087 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1088 ewrt = _mm_mul_pd(r12,ewtabscale);
1089 ewitab = _mm_cvttpd_epi32(ewrt);
1091 eweps = _mm_frcz_pd(ewrt);
1093 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1095 twoeweps = _mm_add_pd(eweps,eweps);
1096 ewitab = _mm_slli_epi32(ewitab,2);
1097 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1098 ewtabD = _mm_setzero_pd();
1099 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1100 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1101 ewtabFn = _mm_setzero_pd();
1102 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1103 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1104 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1105 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1106 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1108 /* Update potential sum for this i atom from the interaction with this j atom. */
1109 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1110 velecsum = _mm_add_pd(velecsum,velec);
1114 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1116 /* Update vectorial force */
1117 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1118 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1119 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1121 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1122 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1123 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1125 /**************************
1126 * CALCULATE INTERACTIONS *
1127 **************************/
1129 r20 = _mm_mul_pd(rsq20,rinv20);
1131 /* EWALD ELECTROSTATICS */
1133 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1134 ewrt = _mm_mul_pd(r20,ewtabscale);
1135 ewitab = _mm_cvttpd_epi32(ewrt);
1137 eweps = _mm_frcz_pd(ewrt);
1139 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1141 twoeweps = _mm_add_pd(eweps,eweps);
1142 ewitab = _mm_slli_epi32(ewitab,2);
1143 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1144 ewtabD = _mm_setzero_pd();
1145 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1146 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1147 ewtabFn = _mm_setzero_pd();
1148 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1149 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1150 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1151 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1152 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1154 /* Update potential sum for this i atom from the interaction with this j atom. */
1155 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1156 velecsum = _mm_add_pd(velecsum,velec);
1160 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1162 /* Update vectorial force */
1163 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1164 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1165 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1167 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1168 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1169 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1171 /**************************
1172 * CALCULATE INTERACTIONS *
1173 **************************/
1175 r21 = _mm_mul_pd(rsq21,rinv21);
1177 /* EWALD ELECTROSTATICS */
1179 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1180 ewrt = _mm_mul_pd(r21,ewtabscale);
1181 ewitab = _mm_cvttpd_epi32(ewrt);
1183 eweps = _mm_frcz_pd(ewrt);
1185 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1187 twoeweps = _mm_add_pd(eweps,eweps);
1188 ewitab = _mm_slli_epi32(ewitab,2);
1189 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1190 ewtabD = _mm_setzero_pd();
1191 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1192 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1193 ewtabFn = _mm_setzero_pd();
1194 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1195 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1196 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1197 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1198 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1200 /* Update potential sum for this i atom from the interaction with this j atom. */
1201 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1202 velecsum = _mm_add_pd(velecsum,velec);
1206 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1208 /* Update vectorial force */
1209 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1210 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1211 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1213 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1214 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1215 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1217 /**************************
1218 * CALCULATE INTERACTIONS *
1219 **************************/
1221 r22 = _mm_mul_pd(rsq22,rinv22);
1223 /* EWALD ELECTROSTATICS */
1225 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1226 ewrt = _mm_mul_pd(r22,ewtabscale);
1227 ewitab = _mm_cvttpd_epi32(ewrt);
1229 eweps = _mm_frcz_pd(ewrt);
1231 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1233 twoeweps = _mm_add_pd(eweps,eweps);
1234 ewitab = _mm_slli_epi32(ewitab,2);
1235 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1236 ewtabD = _mm_setzero_pd();
1237 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1238 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1239 ewtabFn = _mm_setzero_pd();
1240 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1241 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1242 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1243 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1244 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1246 /* Update potential sum for this i atom from the interaction with this j atom. */
1247 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1248 velecsum = _mm_add_pd(velecsum,velec);
1252 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1254 /* Update vectorial force */
1255 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1256 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1257 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1259 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1260 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1261 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1263 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1265 /* Inner loop uses 430 flops */
1268 /* End of innermost loop */
1270 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1271 f+i_coord_offset,fshift+i_shift_offset);
1274 /* Update potential energies */
1275 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1276 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1278 /* Increment number of inner iterations */
1279 inneriter += j_index_end - j_index_start;
1281 /* Outer loop uses 20 flops */
1284 /* Increment number of outer iterations */
1287 /* Update outer/inner flops */
1289 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*430);
1292 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1293 * Electrostatics interaction: Ewald
1294 * VdW interaction: CubicSplineTable
1295 * Geometry: Water3-Water3
1296 * Calculate force/pot: Force
1299 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1300 (t_nblist * gmx_restrict nlist,
1301 rvec * gmx_restrict xx,
1302 rvec * gmx_restrict ff,
1303 t_forcerec * gmx_restrict fr,
1304 t_mdatoms * gmx_restrict mdatoms,
1305 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1306 t_nrnb * gmx_restrict nrnb)
1308 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1309 * just 0 for non-waters.
1310 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1311 * jnr indices corresponding to data put in the two positions in the SIMD register.
1313 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1314 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1316 int j_coord_offsetA,j_coord_offsetB;
1317 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1318 real rcutoff_scalar;
1319 real *shiftvec,*fshift,*x,*f;
1320 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1322 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1324 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1326 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1327 int vdwjidx0A,vdwjidx0B;
1328 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1329 int vdwjidx1A,vdwjidx1B;
1330 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1331 int vdwjidx2A,vdwjidx2B;
1332 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1333 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1334 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1335 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1336 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1337 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1338 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1339 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1340 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1341 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1342 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1345 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1348 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1349 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1351 __m128i ifour = _mm_set1_epi32(4);
1352 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1355 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1357 __m128d dummy_mask,cutoff_mask;
1358 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1359 __m128d one = _mm_set1_pd(1.0);
1360 __m128d two = _mm_set1_pd(2.0);
1366 jindex = nlist->jindex;
1368 shiftidx = nlist->shift;
1370 shiftvec = fr->shift_vec[0];
1371 fshift = fr->fshift[0];
1372 facel = _mm_set1_pd(fr->epsfac);
1373 charge = mdatoms->chargeA;
1374 nvdwtype = fr->ntype;
1375 vdwparam = fr->nbfp;
1376 vdwtype = mdatoms->typeA;
1378 vftab = kernel_data->table_vdw->data;
1379 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1381 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1382 ewtab = fr->ic->tabq_coul_F;
1383 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1384 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1386 /* Setup water-specific parameters */
1387 inr = nlist->iinr[0];
1388 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1389 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1390 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1391 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1393 jq0 = _mm_set1_pd(charge[inr+0]);
1394 jq1 = _mm_set1_pd(charge[inr+1]);
1395 jq2 = _mm_set1_pd(charge[inr+2]);
1396 vdwjidx0A = 2*vdwtype[inr+0];
1397 qq00 = _mm_mul_pd(iq0,jq0);
1398 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1399 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1400 qq01 = _mm_mul_pd(iq0,jq1);
1401 qq02 = _mm_mul_pd(iq0,jq2);
1402 qq10 = _mm_mul_pd(iq1,jq0);
1403 qq11 = _mm_mul_pd(iq1,jq1);
1404 qq12 = _mm_mul_pd(iq1,jq2);
1405 qq20 = _mm_mul_pd(iq2,jq0);
1406 qq21 = _mm_mul_pd(iq2,jq1);
1407 qq22 = _mm_mul_pd(iq2,jq2);
1409 /* Avoid stupid compiler warnings */
1411 j_coord_offsetA = 0;
1412 j_coord_offsetB = 0;
1417 /* Start outer loop over neighborlists */
1418 for(iidx=0; iidx<nri; iidx++)
1420 /* Load shift vector for this list */
1421 i_shift_offset = DIM*shiftidx[iidx];
1423 /* Load limits for loop over neighbors */
1424 j_index_start = jindex[iidx];
1425 j_index_end = jindex[iidx+1];
1427 /* Get outer coordinate index */
1429 i_coord_offset = DIM*inr;
1431 /* Load i particle coords and add shift vector */
1432 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1433 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1435 fix0 = _mm_setzero_pd();
1436 fiy0 = _mm_setzero_pd();
1437 fiz0 = _mm_setzero_pd();
1438 fix1 = _mm_setzero_pd();
1439 fiy1 = _mm_setzero_pd();
1440 fiz1 = _mm_setzero_pd();
1441 fix2 = _mm_setzero_pd();
1442 fiy2 = _mm_setzero_pd();
1443 fiz2 = _mm_setzero_pd();
1445 /* Start inner kernel loop */
1446 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1449 /* Get j neighbor index, and coordinate index */
1451 jnrB = jjnr[jidx+1];
1452 j_coord_offsetA = DIM*jnrA;
1453 j_coord_offsetB = DIM*jnrB;
1455 /* load j atom coordinates */
1456 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1457 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1459 /* Calculate displacement vector */
1460 dx00 = _mm_sub_pd(ix0,jx0);
1461 dy00 = _mm_sub_pd(iy0,jy0);
1462 dz00 = _mm_sub_pd(iz0,jz0);
1463 dx01 = _mm_sub_pd(ix0,jx1);
1464 dy01 = _mm_sub_pd(iy0,jy1);
1465 dz01 = _mm_sub_pd(iz0,jz1);
1466 dx02 = _mm_sub_pd(ix0,jx2);
1467 dy02 = _mm_sub_pd(iy0,jy2);
1468 dz02 = _mm_sub_pd(iz0,jz2);
1469 dx10 = _mm_sub_pd(ix1,jx0);
1470 dy10 = _mm_sub_pd(iy1,jy0);
1471 dz10 = _mm_sub_pd(iz1,jz0);
1472 dx11 = _mm_sub_pd(ix1,jx1);
1473 dy11 = _mm_sub_pd(iy1,jy1);
1474 dz11 = _mm_sub_pd(iz1,jz1);
1475 dx12 = _mm_sub_pd(ix1,jx2);
1476 dy12 = _mm_sub_pd(iy1,jy2);
1477 dz12 = _mm_sub_pd(iz1,jz2);
1478 dx20 = _mm_sub_pd(ix2,jx0);
1479 dy20 = _mm_sub_pd(iy2,jy0);
1480 dz20 = _mm_sub_pd(iz2,jz0);
1481 dx21 = _mm_sub_pd(ix2,jx1);
1482 dy21 = _mm_sub_pd(iy2,jy1);
1483 dz21 = _mm_sub_pd(iz2,jz1);
1484 dx22 = _mm_sub_pd(ix2,jx2);
1485 dy22 = _mm_sub_pd(iy2,jy2);
1486 dz22 = _mm_sub_pd(iz2,jz2);
1488 /* Calculate squared distance and things based on it */
1489 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1490 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1491 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1492 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1493 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1494 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1495 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1496 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1497 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1499 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1500 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1501 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1502 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1503 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1504 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1505 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1506 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1507 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1509 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1510 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1511 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1512 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1513 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1514 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1515 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1516 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1517 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1519 fjx0 = _mm_setzero_pd();
1520 fjy0 = _mm_setzero_pd();
1521 fjz0 = _mm_setzero_pd();
1522 fjx1 = _mm_setzero_pd();
1523 fjy1 = _mm_setzero_pd();
1524 fjz1 = _mm_setzero_pd();
1525 fjx2 = _mm_setzero_pd();
1526 fjy2 = _mm_setzero_pd();
1527 fjz2 = _mm_setzero_pd();
1529 /**************************
1530 * CALCULATE INTERACTIONS *
1531 **************************/
1533 r00 = _mm_mul_pd(rsq00,rinv00);
1535 /* Calculate table index by multiplying r with table scale and truncate to integer */
1536 rt = _mm_mul_pd(r00,vftabscale);
1537 vfitab = _mm_cvttpd_epi32(rt);
1539 vfeps = _mm_frcz_pd(rt);
1541 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1543 twovfeps = _mm_add_pd(vfeps,vfeps);
1544 vfitab = _mm_slli_epi32(vfitab,3);
1546 /* EWALD ELECTROSTATICS */
1548 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1549 ewrt = _mm_mul_pd(r00,ewtabscale);
1550 ewitab = _mm_cvttpd_epi32(ewrt);
1552 eweps = _mm_frcz_pd(ewrt);
1554 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1556 twoeweps = _mm_add_pd(eweps,eweps);
1557 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1559 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1560 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1562 /* CUBIC SPLINE TABLE DISPERSION */
1563 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1564 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1565 GMX_MM_TRANSPOSE2_PD(Y,F);
1566 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1567 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1568 GMX_MM_TRANSPOSE2_PD(G,H);
1569 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1570 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1571 fvdw6 = _mm_mul_pd(c6_00,FF);
1573 /* CUBIC SPLINE TABLE REPULSION */
1574 vfitab = _mm_add_epi32(vfitab,ifour);
1575 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1576 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1577 GMX_MM_TRANSPOSE2_PD(Y,F);
1578 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1579 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1580 GMX_MM_TRANSPOSE2_PD(G,H);
1581 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1582 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1583 fvdw12 = _mm_mul_pd(c12_00,FF);
1584 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1586 fscal = _mm_add_pd(felec,fvdw);
1588 /* Update vectorial force */
1589 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1590 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1591 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1593 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1594 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1595 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1597 /**************************
1598 * CALCULATE INTERACTIONS *
1599 **************************/
1601 r01 = _mm_mul_pd(rsq01,rinv01);
1603 /* EWALD ELECTROSTATICS */
1605 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1606 ewrt = _mm_mul_pd(r01,ewtabscale);
1607 ewitab = _mm_cvttpd_epi32(ewrt);
1609 eweps = _mm_frcz_pd(ewrt);
1611 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1613 twoeweps = _mm_add_pd(eweps,eweps);
1614 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1616 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1617 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1621 /* Update vectorial force */
1622 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1623 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1624 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1626 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1627 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1628 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1630 /**************************
1631 * CALCULATE INTERACTIONS *
1632 **************************/
1634 r02 = _mm_mul_pd(rsq02,rinv02);
1636 /* EWALD ELECTROSTATICS */
1638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1639 ewrt = _mm_mul_pd(r02,ewtabscale);
1640 ewitab = _mm_cvttpd_epi32(ewrt);
1642 eweps = _mm_frcz_pd(ewrt);
1644 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1646 twoeweps = _mm_add_pd(eweps,eweps);
1647 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1649 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1650 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1654 /* Update vectorial force */
1655 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1656 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1657 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1659 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1660 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1661 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1663 /**************************
1664 * CALCULATE INTERACTIONS *
1665 **************************/
1667 r10 = _mm_mul_pd(rsq10,rinv10);
1669 /* EWALD ELECTROSTATICS */
1671 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1672 ewrt = _mm_mul_pd(r10,ewtabscale);
1673 ewitab = _mm_cvttpd_epi32(ewrt);
1675 eweps = _mm_frcz_pd(ewrt);
1677 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1679 twoeweps = _mm_add_pd(eweps,eweps);
1680 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1682 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1683 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1687 /* Update vectorial force */
1688 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1689 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1690 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1692 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1693 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1694 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1696 /**************************
1697 * CALCULATE INTERACTIONS *
1698 **************************/
1700 r11 = _mm_mul_pd(rsq11,rinv11);
1702 /* EWALD ELECTROSTATICS */
1704 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1705 ewrt = _mm_mul_pd(r11,ewtabscale);
1706 ewitab = _mm_cvttpd_epi32(ewrt);
1708 eweps = _mm_frcz_pd(ewrt);
1710 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1712 twoeweps = _mm_add_pd(eweps,eweps);
1713 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1715 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1716 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1720 /* Update vectorial force */
1721 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1722 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1723 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1725 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1726 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1727 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1729 /**************************
1730 * CALCULATE INTERACTIONS *
1731 **************************/
1733 r12 = _mm_mul_pd(rsq12,rinv12);
1735 /* EWALD ELECTROSTATICS */
1737 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1738 ewrt = _mm_mul_pd(r12,ewtabscale);
1739 ewitab = _mm_cvttpd_epi32(ewrt);
1741 eweps = _mm_frcz_pd(ewrt);
1743 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1745 twoeweps = _mm_add_pd(eweps,eweps);
1746 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1748 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1749 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1753 /* Update vectorial force */
1754 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1755 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1756 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1758 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1759 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1760 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1762 /**************************
1763 * CALCULATE INTERACTIONS *
1764 **************************/
1766 r20 = _mm_mul_pd(rsq20,rinv20);
1768 /* EWALD ELECTROSTATICS */
1770 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1771 ewrt = _mm_mul_pd(r20,ewtabscale);
1772 ewitab = _mm_cvttpd_epi32(ewrt);
1774 eweps = _mm_frcz_pd(ewrt);
1776 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1778 twoeweps = _mm_add_pd(eweps,eweps);
1779 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1781 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1782 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1786 /* Update vectorial force */
1787 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1788 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1789 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1791 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1792 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1793 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1795 /**************************
1796 * CALCULATE INTERACTIONS *
1797 **************************/
1799 r21 = _mm_mul_pd(rsq21,rinv21);
1801 /* EWALD ELECTROSTATICS */
1803 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1804 ewrt = _mm_mul_pd(r21,ewtabscale);
1805 ewitab = _mm_cvttpd_epi32(ewrt);
1807 eweps = _mm_frcz_pd(ewrt);
1809 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1811 twoeweps = _mm_add_pd(eweps,eweps);
1812 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1814 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1815 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1819 /* Update vectorial force */
1820 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1821 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1822 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1824 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1825 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1826 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1828 /**************************
1829 * CALCULATE INTERACTIONS *
1830 **************************/
1832 r22 = _mm_mul_pd(rsq22,rinv22);
1834 /* EWALD ELECTROSTATICS */
1836 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1837 ewrt = _mm_mul_pd(r22,ewtabscale);
1838 ewitab = _mm_cvttpd_epi32(ewrt);
1840 eweps = _mm_frcz_pd(ewrt);
1842 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1844 twoeweps = _mm_add_pd(eweps,eweps);
1845 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1847 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1848 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1852 /* Update vectorial force */
1853 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1854 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1855 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1857 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1858 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1859 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1861 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1863 /* Inner loop uses 377 flops */
1866 if(jidx<j_index_end)
1870 j_coord_offsetA = DIM*jnrA;
1872 /* load j atom coordinates */
1873 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1874 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1876 /* Calculate displacement vector */
1877 dx00 = _mm_sub_pd(ix0,jx0);
1878 dy00 = _mm_sub_pd(iy0,jy0);
1879 dz00 = _mm_sub_pd(iz0,jz0);
1880 dx01 = _mm_sub_pd(ix0,jx1);
1881 dy01 = _mm_sub_pd(iy0,jy1);
1882 dz01 = _mm_sub_pd(iz0,jz1);
1883 dx02 = _mm_sub_pd(ix0,jx2);
1884 dy02 = _mm_sub_pd(iy0,jy2);
1885 dz02 = _mm_sub_pd(iz0,jz2);
1886 dx10 = _mm_sub_pd(ix1,jx0);
1887 dy10 = _mm_sub_pd(iy1,jy0);
1888 dz10 = _mm_sub_pd(iz1,jz0);
1889 dx11 = _mm_sub_pd(ix1,jx1);
1890 dy11 = _mm_sub_pd(iy1,jy1);
1891 dz11 = _mm_sub_pd(iz1,jz1);
1892 dx12 = _mm_sub_pd(ix1,jx2);
1893 dy12 = _mm_sub_pd(iy1,jy2);
1894 dz12 = _mm_sub_pd(iz1,jz2);
1895 dx20 = _mm_sub_pd(ix2,jx0);
1896 dy20 = _mm_sub_pd(iy2,jy0);
1897 dz20 = _mm_sub_pd(iz2,jz0);
1898 dx21 = _mm_sub_pd(ix2,jx1);
1899 dy21 = _mm_sub_pd(iy2,jy1);
1900 dz21 = _mm_sub_pd(iz2,jz1);
1901 dx22 = _mm_sub_pd(ix2,jx2);
1902 dy22 = _mm_sub_pd(iy2,jy2);
1903 dz22 = _mm_sub_pd(iz2,jz2);
1905 /* Calculate squared distance and things based on it */
1906 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1907 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1908 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1909 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1910 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1911 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1912 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1913 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1914 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1916 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1917 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1918 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1919 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1920 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1921 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1922 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1923 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1924 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1926 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1927 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1928 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1929 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1930 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1931 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1932 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1933 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1934 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1936 fjx0 = _mm_setzero_pd();
1937 fjy0 = _mm_setzero_pd();
1938 fjz0 = _mm_setzero_pd();
1939 fjx1 = _mm_setzero_pd();
1940 fjy1 = _mm_setzero_pd();
1941 fjz1 = _mm_setzero_pd();
1942 fjx2 = _mm_setzero_pd();
1943 fjy2 = _mm_setzero_pd();
1944 fjz2 = _mm_setzero_pd();
1946 /**************************
1947 * CALCULATE INTERACTIONS *
1948 **************************/
1950 r00 = _mm_mul_pd(rsq00,rinv00);
1952 /* Calculate table index by multiplying r with table scale and truncate to integer */
1953 rt = _mm_mul_pd(r00,vftabscale);
1954 vfitab = _mm_cvttpd_epi32(rt);
1956 vfeps = _mm_frcz_pd(rt);
1958 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1960 twovfeps = _mm_add_pd(vfeps,vfeps);
1961 vfitab = _mm_slli_epi32(vfitab,3);
1963 /* EWALD ELECTROSTATICS */
1965 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1966 ewrt = _mm_mul_pd(r00,ewtabscale);
1967 ewitab = _mm_cvttpd_epi32(ewrt);
1969 eweps = _mm_frcz_pd(ewrt);
1971 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1973 twoeweps = _mm_add_pd(eweps,eweps);
1974 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1975 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1976 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1978 /* CUBIC SPLINE TABLE DISPERSION */
1979 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1980 F = _mm_setzero_pd();
1981 GMX_MM_TRANSPOSE2_PD(Y,F);
1982 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1983 H = _mm_setzero_pd();
1984 GMX_MM_TRANSPOSE2_PD(G,H);
1985 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1986 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1987 fvdw6 = _mm_mul_pd(c6_00,FF);
1989 /* CUBIC SPLINE TABLE REPULSION */
1990 vfitab = _mm_add_epi32(vfitab,ifour);
1991 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1992 F = _mm_setzero_pd();
1993 GMX_MM_TRANSPOSE2_PD(Y,F);
1994 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1995 H = _mm_setzero_pd();
1996 GMX_MM_TRANSPOSE2_PD(G,H);
1997 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1998 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1999 fvdw12 = _mm_mul_pd(c12_00,FF);
2000 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
2002 fscal = _mm_add_pd(felec,fvdw);
2004 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2006 /* Update vectorial force */
2007 fix0 = _mm_macc_pd(dx00,fscal,fix0);
2008 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
2009 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
2011 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
2012 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
2013 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
2015 /**************************
2016 * CALCULATE INTERACTIONS *
2017 **************************/
2019 r01 = _mm_mul_pd(rsq01,rinv01);
2021 /* EWALD ELECTROSTATICS */
2023 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2024 ewrt = _mm_mul_pd(r01,ewtabscale);
2025 ewitab = _mm_cvttpd_epi32(ewrt);
2027 eweps = _mm_frcz_pd(ewrt);
2029 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2031 twoeweps = _mm_add_pd(eweps,eweps);
2032 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2033 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2034 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2038 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2040 /* Update vectorial force */
2041 fix0 = _mm_macc_pd(dx01,fscal,fix0);
2042 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
2043 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
2045 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
2046 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
2047 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
2049 /**************************
2050 * CALCULATE INTERACTIONS *
2051 **************************/
2053 r02 = _mm_mul_pd(rsq02,rinv02);
2055 /* EWALD ELECTROSTATICS */
2057 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2058 ewrt = _mm_mul_pd(r02,ewtabscale);
2059 ewitab = _mm_cvttpd_epi32(ewrt);
2061 eweps = _mm_frcz_pd(ewrt);
2063 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2065 twoeweps = _mm_add_pd(eweps,eweps);
2066 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2067 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2068 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2072 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2074 /* Update vectorial force */
2075 fix0 = _mm_macc_pd(dx02,fscal,fix0);
2076 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
2077 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
2079 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
2080 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
2081 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
2083 /**************************
2084 * CALCULATE INTERACTIONS *
2085 **************************/
2087 r10 = _mm_mul_pd(rsq10,rinv10);
2089 /* EWALD ELECTROSTATICS */
2091 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2092 ewrt = _mm_mul_pd(r10,ewtabscale);
2093 ewitab = _mm_cvttpd_epi32(ewrt);
2095 eweps = _mm_frcz_pd(ewrt);
2097 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2099 twoeweps = _mm_add_pd(eweps,eweps);
2100 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2101 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2102 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2106 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2108 /* Update vectorial force */
2109 fix1 = _mm_macc_pd(dx10,fscal,fix1);
2110 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
2111 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
2113 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
2114 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
2115 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
2117 /**************************
2118 * CALCULATE INTERACTIONS *
2119 **************************/
2121 r11 = _mm_mul_pd(rsq11,rinv11);
2123 /* EWALD ELECTROSTATICS */
2125 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2126 ewrt = _mm_mul_pd(r11,ewtabscale);
2127 ewitab = _mm_cvttpd_epi32(ewrt);
2129 eweps = _mm_frcz_pd(ewrt);
2131 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2133 twoeweps = _mm_add_pd(eweps,eweps);
2134 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2135 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2136 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2140 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2142 /* Update vectorial force */
2143 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2144 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2145 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2147 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2148 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2149 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2151 /**************************
2152 * CALCULATE INTERACTIONS *
2153 **************************/
2155 r12 = _mm_mul_pd(rsq12,rinv12);
2157 /* EWALD ELECTROSTATICS */
2159 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2160 ewrt = _mm_mul_pd(r12,ewtabscale);
2161 ewitab = _mm_cvttpd_epi32(ewrt);
2163 eweps = _mm_frcz_pd(ewrt);
2165 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2167 twoeweps = _mm_add_pd(eweps,eweps);
2168 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2169 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2170 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2174 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2176 /* Update vectorial force */
2177 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2178 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2179 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2181 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2182 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2183 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2185 /**************************
2186 * CALCULATE INTERACTIONS *
2187 **************************/
2189 r20 = _mm_mul_pd(rsq20,rinv20);
2191 /* EWALD ELECTROSTATICS */
2193 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2194 ewrt = _mm_mul_pd(r20,ewtabscale);
2195 ewitab = _mm_cvttpd_epi32(ewrt);
2197 eweps = _mm_frcz_pd(ewrt);
2199 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2201 twoeweps = _mm_add_pd(eweps,eweps);
2202 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2203 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2204 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2208 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2210 /* Update vectorial force */
2211 fix2 = _mm_macc_pd(dx20,fscal,fix2);
2212 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
2213 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
2215 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
2216 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
2217 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
2219 /**************************
2220 * CALCULATE INTERACTIONS *
2221 **************************/
2223 r21 = _mm_mul_pd(rsq21,rinv21);
2225 /* EWALD ELECTROSTATICS */
2227 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2228 ewrt = _mm_mul_pd(r21,ewtabscale);
2229 ewitab = _mm_cvttpd_epi32(ewrt);
2231 eweps = _mm_frcz_pd(ewrt);
2233 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2235 twoeweps = _mm_add_pd(eweps,eweps);
2236 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2237 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2238 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2242 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2244 /* Update vectorial force */
2245 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2246 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2247 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2249 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2250 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2251 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2253 /**************************
2254 * CALCULATE INTERACTIONS *
2255 **************************/
2257 r22 = _mm_mul_pd(rsq22,rinv22);
2259 /* EWALD ELECTROSTATICS */
2261 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2262 ewrt = _mm_mul_pd(r22,ewtabscale);
2263 ewitab = _mm_cvttpd_epi32(ewrt);
2265 eweps = _mm_frcz_pd(ewrt);
2267 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2269 twoeweps = _mm_add_pd(eweps,eweps);
2270 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2271 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2272 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2276 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2278 /* Update vectorial force */
2279 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2280 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2281 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2283 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2284 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2285 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2287 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2289 /* Inner loop uses 377 flops */
2292 /* End of innermost loop */
2294 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2295 f+i_coord_offset,fshift+i_shift_offset);
2297 /* Increment number of inner iterations */
2298 inneriter += j_index_end - j_index_start;
2300 /* Outer loop uses 18 flops */
2303 /* Increment number of outer iterations */
2306 /* Update outer/inner flops */
2308 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*377);