2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
48 #include "kernelutil_x86_avx_128_fma_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 int vdwjidx0A,vdwjidx0B;
87 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
88 int vdwjidx1A,vdwjidx1B;
89 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90 int vdwjidx2A,vdwjidx2B;
91 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
94 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
95 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
96 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
99 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
104 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
108 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
110 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
112 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
113 real rswitch_scalar,d_scalar;
114 __m128d dummy_mask,cutoff_mask;
115 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
116 __m128d one = _mm_set1_pd(1.0);
117 __m128d two = _mm_set1_pd(2.0);
123 jindex = nlist->jindex;
125 shiftidx = nlist->shift;
127 shiftvec = fr->shift_vec[0];
128 fshift = fr->fshift[0];
129 facel = _mm_set1_pd(fr->epsfac);
130 charge = mdatoms->chargeA;
131 nvdwtype = fr->ntype;
133 vdwtype = mdatoms->typeA;
135 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
136 ewtab = fr->ic->tabq_coul_FDV0;
137 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
138 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
140 /* Setup water-specific parameters */
141 inr = nlist->iinr[0];
142 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
143 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
144 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
145 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
147 jq0 = _mm_set1_pd(charge[inr+0]);
148 jq1 = _mm_set1_pd(charge[inr+1]);
149 jq2 = _mm_set1_pd(charge[inr+2]);
150 vdwjidx0A = 2*vdwtype[inr+0];
151 qq00 = _mm_mul_pd(iq0,jq0);
152 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq01 = _mm_mul_pd(iq0,jq1);
155 qq02 = _mm_mul_pd(iq0,jq2);
156 qq10 = _mm_mul_pd(iq1,jq0);
157 qq11 = _mm_mul_pd(iq1,jq1);
158 qq12 = _mm_mul_pd(iq1,jq2);
159 qq20 = _mm_mul_pd(iq2,jq0);
160 qq21 = _mm_mul_pd(iq2,jq1);
161 qq22 = _mm_mul_pd(iq2,jq2);
163 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
164 rcutoff_scalar = fr->rcoulomb;
165 rcutoff = _mm_set1_pd(rcutoff_scalar);
166 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
168 rswitch_scalar = fr->rcoulomb_switch;
169 rswitch = _mm_set1_pd(rswitch_scalar);
170 /* Setup switch parameters */
171 d_scalar = rcutoff_scalar-rswitch_scalar;
172 d = _mm_set1_pd(d_scalar);
173 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
174 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
175 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
176 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
177 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
178 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
180 /* Avoid stupid compiler warnings */
188 /* Start outer loop over neighborlists */
189 for(iidx=0; iidx<nri; iidx++)
191 /* Load shift vector for this list */
192 i_shift_offset = DIM*shiftidx[iidx];
194 /* Load limits for loop over neighbors */
195 j_index_start = jindex[iidx];
196 j_index_end = jindex[iidx+1];
198 /* Get outer coordinate index */
200 i_coord_offset = DIM*inr;
202 /* Load i particle coords and add shift vector */
203 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
204 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
206 fix0 = _mm_setzero_pd();
207 fiy0 = _mm_setzero_pd();
208 fiz0 = _mm_setzero_pd();
209 fix1 = _mm_setzero_pd();
210 fiy1 = _mm_setzero_pd();
211 fiz1 = _mm_setzero_pd();
212 fix2 = _mm_setzero_pd();
213 fiy2 = _mm_setzero_pd();
214 fiz2 = _mm_setzero_pd();
216 /* Reset potential sums */
217 velecsum = _mm_setzero_pd();
218 vvdwsum = _mm_setzero_pd();
220 /* Start inner kernel loop */
221 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
224 /* Get j neighbor index, and coordinate index */
227 j_coord_offsetA = DIM*jnrA;
228 j_coord_offsetB = DIM*jnrB;
230 /* load j atom coordinates */
231 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
232 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
234 /* Calculate displacement vector */
235 dx00 = _mm_sub_pd(ix0,jx0);
236 dy00 = _mm_sub_pd(iy0,jy0);
237 dz00 = _mm_sub_pd(iz0,jz0);
238 dx01 = _mm_sub_pd(ix0,jx1);
239 dy01 = _mm_sub_pd(iy0,jy1);
240 dz01 = _mm_sub_pd(iz0,jz1);
241 dx02 = _mm_sub_pd(ix0,jx2);
242 dy02 = _mm_sub_pd(iy0,jy2);
243 dz02 = _mm_sub_pd(iz0,jz2);
244 dx10 = _mm_sub_pd(ix1,jx0);
245 dy10 = _mm_sub_pd(iy1,jy0);
246 dz10 = _mm_sub_pd(iz1,jz0);
247 dx11 = _mm_sub_pd(ix1,jx1);
248 dy11 = _mm_sub_pd(iy1,jy1);
249 dz11 = _mm_sub_pd(iz1,jz1);
250 dx12 = _mm_sub_pd(ix1,jx2);
251 dy12 = _mm_sub_pd(iy1,jy2);
252 dz12 = _mm_sub_pd(iz1,jz2);
253 dx20 = _mm_sub_pd(ix2,jx0);
254 dy20 = _mm_sub_pd(iy2,jy0);
255 dz20 = _mm_sub_pd(iz2,jz0);
256 dx21 = _mm_sub_pd(ix2,jx1);
257 dy21 = _mm_sub_pd(iy2,jy1);
258 dz21 = _mm_sub_pd(iz2,jz1);
259 dx22 = _mm_sub_pd(ix2,jx2);
260 dy22 = _mm_sub_pd(iy2,jy2);
261 dz22 = _mm_sub_pd(iz2,jz2);
263 /* Calculate squared distance and things based on it */
264 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
265 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
266 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
267 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
268 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
269 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
270 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
271 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
272 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
274 rinv00 = gmx_mm_invsqrt_pd(rsq00);
275 rinv01 = gmx_mm_invsqrt_pd(rsq01);
276 rinv02 = gmx_mm_invsqrt_pd(rsq02);
277 rinv10 = gmx_mm_invsqrt_pd(rsq10);
278 rinv11 = gmx_mm_invsqrt_pd(rsq11);
279 rinv12 = gmx_mm_invsqrt_pd(rsq12);
280 rinv20 = gmx_mm_invsqrt_pd(rsq20);
281 rinv21 = gmx_mm_invsqrt_pd(rsq21);
282 rinv22 = gmx_mm_invsqrt_pd(rsq22);
284 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
285 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
286 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
287 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
288 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
289 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
290 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
291 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
292 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
294 fjx0 = _mm_setzero_pd();
295 fjy0 = _mm_setzero_pd();
296 fjz0 = _mm_setzero_pd();
297 fjx1 = _mm_setzero_pd();
298 fjy1 = _mm_setzero_pd();
299 fjz1 = _mm_setzero_pd();
300 fjx2 = _mm_setzero_pd();
301 fjy2 = _mm_setzero_pd();
302 fjz2 = _mm_setzero_pd();
304 /**************************
305 * CALCULATE INTERACTIONS *
306 **************************/
308 if (gmx_mm_any_lt(rsq00,rcutoff2))
311 r00 = _mm_mul_pd(rsq00,rinv00);
313 /* EWALD ELECTROSTATICS */
315 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
316 ewrt = _mm_mul_pd(r00,ewtabscale);
317 ewitab = _mm_cvttpd_epi32(ewrt);
319 eweps = _mm_frcz_pd(ewrt);
321 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
323 twoeweps = _mm_add_pd(eweps,eweps);
324 ewitab = _mm_slli_epi32(ewitab,2);
325 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
326 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
327 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
328 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
329 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
330 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
331 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
332 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
333 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
334 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
336 /* LENNARD-JONES DISPERSION/REPULSION */
338 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
339 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
340 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
341 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
342 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
344 d = _mm_sub_pd(r00,rswitch);
345 d = _mm_max_pd(d,_mm_setzero_pd());
346 d2 = _mm_mul_pd(d,d);
347 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
349 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
351 /* Evaluate switch function */
352 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
353 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
354 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
355 velec = _mm_mul_pd(velec,sw);
356 vvdw = _mm_mul_pd(vvdw,sw);
357 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
359 /* Update potential sum for this i atom from the interaction with this j atom. */
360 velec = _mm_and_pd(velec,cutoff_mask);
361 velecsum = _mm_add_pd(velecsum,velec);
362 vvdw = _mm_and_pd(vvdw,cutoff_mask);
363 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
365 fscal = _mm_add_pd(felec,fvdw);
367 fscal = _mm_and_pd(fscal,cutoff_mask);
369 /* Update vectorial force */
370 fix0 = _mm_macc_pd(dx00,fscal,fix0);
371 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
372 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
374 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
375 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
376 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
380 /**************************
381 * CALCULATE INTERACTIONS *
382 **************************/
384 if (gmx_mm_any_lt(rsq01,rcutoff2))
387 r01 = _mm_mul_pd(rsq01,rinv01);
389 /* EWALD ELECTROSTATICS */
391 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
392 ewrt = _mm_mul_pd(r01,ewtabscale);
393 ewitab = _mm_cvttpd_epi32(ewrt);
395 eweps = _mm_frcz_pd(ewrt);
397 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
399 twoeweps = _mm_add_pd(eweps,eweps);
400 ewitab = _mm_slli_epi32(ewitab,2);
401 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
402 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
403 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
404 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
405 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
406 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
407 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
408 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
409 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
410 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
412 d = _mm_sub_pd(r01,rswitch);
413 d = _mm_max_pd(d,_mm_setzero_pd());
414 d2 = _mm_mul_pd(d,d);
415 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
417 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
419 /* Evaluate switch function */
420 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
421 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
422 velec = _mm_mul_pd(velec,sw);
423 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
425 /* Update potential sum for this i atom from the interaction with this j atom. */
426 velec = _mm_and_pd(velec,cutoff_mask);
427 velecsum = _mm_add_pd(velecsum,velec);
431 fscal = _mm_and_pd(fscal,cutoff_mask);
433 /* Update vectorial force */
434 fix0 = _mm_macc_pd(dx01,fscal,fix0);
435 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
436 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
438 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
439 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
440 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
444 /**************************
445 * CALCULATE INTERACTIONS *
446 **************************/
448 if (gmx_mm_any_lt(rsq02,rcutoff2))
451 r02 = _mm_mul_pd(rsq02,rinv02);
453 /* EWALD ELECTROSTATICS */
455 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
456 ewrt = _mm_mul_pd(r02,ewtabscale);
457 ewitab = _mm_cvttpd_epi32(ewrt);
459 eweps = _mm_frcz_pd(ewrt);
461 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
463 twoeweps = _mm_add_pd(eweps,eweps);
464 ewitab = _mm_slli_epi32(ewitab,2);
465 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
466 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
467 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
468 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
469 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
470 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
471 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
472 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
473 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
474 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
476 d = _mm_sub_pd(r02,rswitch);
477 d = _mm_max_pd(d,_mm_setzero_pd());
478 d2 = _mm_mul_pd(d,d);
479 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
481 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
483 /* Evaluate switch function */
484 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
485 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
486 velec = _mm_mul_pd(velec,sw);
487 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
489 /* Update potential sum for this i atom from the interaction with this j atom. */
490 velec = _mm_and_pd(velec,cutoff_mask);
491 velecsum = _mm_add_pd(velecsum,velec);
495 fscal = _mm_and_pd(fscal,cutoff_mask);
497 /* Update vectorial force */
498 fix0 = _mm_macc_pd(dx02,fscal,fix0);
499 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
500 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
502 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
503 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
504 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
508 /**************************
509 * CALCULATE INTERACTIONS *
510 **************************/
512 if (gmx_mm_any_lt(rsq10,rcutoff2))
515 r10 = _mm_mul_pd(rsq10,rinv10);
517 /* EWALD ELECTROSTATICS */
519 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
520 ewrt = _mm_mul_pd(r10,ewtabscale);
521 ewitab = _mm_cvttpd_epi32(ewrt);
523 eweps = _mm_frcz_pd(ewrt);
525 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
527 twoeweps = _mm_add_pd(eweps,eweps);
528 ewitab = _mm_slli_epi32(ewitab,2);
529 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
530 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
531 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
532 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
533 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
534 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
535 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
536 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
537 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
538 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
540 d = _mm_sub_pd(r10,rswitch);
541 d = _mm_max_pd(d,_mm_setzero_pd());
542 d2 = _mm_mul_pd(d,d);
543 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
545 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
547 /* Evaluate switch function */
548 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
549 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
550 velec = _mm_mul_pd(velec,sw);
551 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
553 /* Update potential sum for this i atom from the interaction with this j atom. */
554 velec = _mm_and_pd(velec,cutoff_mask);
555 velecsum = _mm_add_pd(velecsum,velec);
559 fscal = _mm_and_pd(fscal,cutoff_mask);
561 /* Update vectorial force */
562 fix1 = _mm_macc_pd(dx10,fscal,fix1);
563 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
564 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
566 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
567 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
568 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
572 /**************************
573 * CALCULATE INTERACTIONS *
574 **************************/
576 if (gmx_mm_any_lt(rsq11,rcutoff2))
579 r11 = _mm_mul_pd(rsq11,rinv11);
581 /* EWALD ELECTROSTATICS */
583 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
584 ewrt = _mm_mul_pd(r11,ewtabscale);
585 ewitab = _mm_cvttpd_epi32(ewrt);
587 eweps = _mm_frcz_pd(ewrt);
589 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
591 twoeweps = _mm_add_pd(eweps,eweps);
592 ewitab = _mm_slli_epi32(ewitab,2);
593 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
594 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
595 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
596 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
597 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
598 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
599 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
600 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
601 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
602 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
604 d = _mm_sub_pd(r11,rswitch);
605 d = _mm_max_pd(d,_mm_setzero_pd());
606 d2 = _mm_mul_pd(d,d);
607 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
609 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
611 /* Evaluate switch function */
612 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
613 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
614 velec = _mm_mul_pd(velec,sw);
615 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
617 /* Update potential sum for this i atom from the interaction with this j atom. */
618 velec = _mm_and_pd(velec,cutoff_mask);
619 velecsum = _mm_add_pd(velecsum,velec);
623 fscal = _mm_and_pd(fscal,cutoff_mask);
625 /* Update vectorial force */
626 fix1 = _mm_macc_pd(dx11,fscal,fix1);
627 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
628 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
630 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
631 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
632 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
636 /**************************
637 * CALCULATE INTERACTIONS *
638 **************************/
640 if (gmx_mm_any_lt(rsq12,rcutoff2))
643 r12 = _mm_mul_pd(rsq12,rinv12);
645 /* EWALD ELECTROSTATICS */
647 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
648 ewrt = _mm_mul_pd(r12,ewtabscale);
649 ewitab = _mm_cvttpd_epi32(ewrt);
651 eweps = _mm_frcz_pd(ewrt);
653 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
655 twoeweps = _mm_add_pd(eweps,eweps);
656 ewitab = _mm_slli_epi32(ewitab,2);
657 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
658 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
659 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
660 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
661 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
662 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
663 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
664 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
665 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
666 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
668 d = _mm_sub_pd(r12,rswitch);
669 d = _mm_max_pd(d,_mm_setzero_pd());
670 d2 = _mm_mul_pd(d,d);
671 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
673 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
675 /* Evaluate switch function */
676 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
677 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
678 velec = _mm_mul_pd(velec,sw);
679 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
681 /* Update potential sum for this i atom from the interaction with this j atom. */
682 velec = _mm_and_pd(velec,cutoff_mask);
683 velecsum = _mm_add_pd(velecsum,velec);
687 fscal = _mm_and_pd(fscal,cutoff_mask);
689 /* Update vectorial force */
690 fix1 = _mm_macc_pd(dx12,fscal,fix1);
691 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
692 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
694 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
695 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
696 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
700 /**************************
701 * CALCULATE INTERACTIONS *
702 **************************/
704 if (gmx_mm_any_lt(rsq20,rcutoff2))
707 r20 = _mm_mul_pd(rsq20,rinv20);
709 /* EWALD ELECTROSTATICS */
711 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
712 ewrt = _mm_mul_pd(r20,ewtabscale);
713 ewitab = _mm_cvttpd_epi32(ewrt);
715 eweps = _mm_frcz_pd(ewrt);
717 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
719 twoeweps = _mm_add_pd(eweps,eweps);
720 ewitab = _mm_slli_epi32(ewitab,2);
721 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
722 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
723 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
724 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
725 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
726 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
727 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
728 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
729 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
730 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
732 d = _mm_sub_pd(r20,rswitch);
733 d = _mm_max_pd(d,_mm_setzero_pd());
734 d2 = _mm_mul_pd(d,d);
735 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
737 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
739 /* Evaluate switch function */
740 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
741 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
742 velec = _mm_mul_pd(velec,sw);
743 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
745 /* Update potential sum for this i atom from the interaction with this j atom. */
746 velec = _mm_and_pd(velec,cutoff_mask);
747 velecsum = _mm_add_pd(velecsum,velec);
751 fscal = _mm_and_pd(fscal,cutoff_mask);
753 /* Update vectorial force */
754 fix2 = _mm_macc_pd(dx20,fscal,fix2);
755 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
756 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
758 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
759 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
760 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
764 /**************************
765 * CALCULATE INTERACTIONS *
766 **************************/
768 if (gmx_mm_any_lt(rsq21,rcutoff2))
771 r21 = _mm_mul_pd(rsq21,rinv21);
773 /* EWALD ELECTROSTATICS */
775 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
776 ewrt = _mm_mul_pd(r21,ewtabscale);
777 ewitab = _mm_cvttpd_epi32(ewrt);
779 eweps = _mm_frcz_pd(ewrt);
781 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
783 twoeweps = _mm_add_pd(eweps,eweps);
784 ewitab = _mm_slli_epi32(ewitab,2);
785 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
786 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
787 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
788 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
789 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
790 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
791 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
792 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
793 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
794 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
796 d = _mm_sub_pd(r21,rswitch);
797 d = _mm_max_pd(d,_mm_setzero_pd());
798 d2 = _mm_mul_pd(d,d);
799 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
801 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
803 /* Evaluate switch function */
804 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
805 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
806 velec = _mm_mul_pd(velec,sw);
807 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
809 /* Update potential sum for this i atom from the interaction with this j atom. */
810 velec = _mm_and_pd(velec,cutoff_mask);
811 velecsum = _mm_add_pd(velecsum,velec);
815 fscal = _mm_and_pd(fscal,cutoff_mask);
817 /* Update vectorial force */
818 fix2 = _mm_macc_pd(dx21,fscal,fix2);
819 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
820 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
822 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
823 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
824 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
828 /**************************
829 * CALCULATE INTERACTIONS *
830 **************************/
832 if (gmx_mm_any_lt(rsq22,rcutoff2))
835 r22 = _mm_mul_pd(rsq22,rinv22);
837 /* EWALD ELECTROSTATICS */
839 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
840 ewrt = _mm_mul_pd(r22,ewtabscale);
841 ewitab = _mm_cvttpd_epi32(ewrt);
843 eweps = _mm_frcz_pd(ewrt);
845 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
847 twoeweps = _mm_add_pd(eweps,eweps);
848 ewitab = _mm_slli_epi32(ewitab,2);
849 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
850 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
851 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
852 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
853 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
854 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
855 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
856 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
857 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
858 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
860 d = _mm_sub_pd(r22,rswitch);
861 d = _mm_max_pd(d,_mm_setzero_pd());
862 d2 = _mm_mul_pd(d,d);
863 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
865 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
867 /* Evaluate switch function */
868 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
869 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
870 velec = _mm_mul_pd(velec,sw);
871 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 velec = _mm_and_pd(velec,cutoff_mask);
875 velecsum = _mm_add_pd(velecsum,velec);
879 fscal = _mm_and_pd(fscal,cutoff_mask);
881 /* Update vectorial force */
882 fix2 = _mm_macc_pd(dx22,fscal,fix2);
883 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
884 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
886 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
887 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
888 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
892 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
894 /* Inner loop uses 630 flops */
901 j_coord_offsetA = DIM*jnrA;
903 /* load j atom coordinates */
904 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
905 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
907 /* Calculate displacement vector */
908 dx00 = _mm_sub_pd(ix0,jx0);
909 dy00 = _mm_sub_pd(iy0,jy0);
910 dz00 = _mm_sub_pd(iz0,jz0);
911 dx01 = _mm_sub_pd(ix0,jx1);
912 dy01 = _mm_sub_pd(iy0,jy1);
913 dz01 = _mm_sub_pd(iz0,jz1);
914 dx02 = _mm_sub_pd(ix0,jx2);
915 dy02 = _mm_sub_pd(iy0,jy2);
916 dz02 = _mm_sub_pd(iz0,jz2);
917 dx10 = _mm_sub_pd(ix1,jx0);
918 dy10 = _mm_sub_pd(iy1,jy0);
919 dz10 = _mm_sub_pd(iz1,jz0);
920 dx11 = _mm_sub_pd(ix1,jx1);
921 dy11 = _mm_sub_pd(iy1,jy1);
922 dz11 = _mm_sub_pd(iz1,jz1);
923 dx12 = _mm_sub_pd(ix1,jx2);
924 dy12 = _mm_sub_pd(iy1,jy2);
925 dz12 = _mm_sub_pd(iz1,jz2);
926 dx20 = _mm_sub_pd(ix2,jx0);
927 dy20 = _mm_sub_pd(iy2,jy0);
928 dz20 = _mm_sub_pd(iz2,jz0);
929 dx21 = _mm_sub_pd(ix2,jx1);
930 dy21 = _mm_sub_pd(iy2,jy1);
931 dz21 = _mm_sub_pd(iz2,jz1);
932 dx22 = _mm_sub_pd(ix2,jx2);
933 dy22 = _mm_sub_pd(iy2,jy2);
934 dz22 = _mm_sub_pd(iz2,jz2);
936 /* Calculate squared distance and things based on it */
937 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
938 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
939 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
940 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
941 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
942 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
943 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
944 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
945 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
947 rinv00 = gmx_mm_invsqrt_pd(rsq00);
948 rinv01 = gmx_mm_invsqrt_pd(rsq01);
949 rinv02 = gmx_mm_invsqrt_pd(rsq02);
950 rinv10 = gmx_mm_invsqrt_pd(rsq10);
951 rinv11 = gmx_mm_invsqrt_pd(rsq11);
952 rinv12 = gmx_mm_invsqrt_pd(rsq12);
953 rinv20 = gmx_mm_invsqrt_pd(rsq20);
954 rinv21 = gmx_mm_invsqrt_pd(rsq21);
955 rinv22 = gmx_mm_invsqrt_pd(rsq22);
957 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
958 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
959 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
960 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
961 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
962 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
963 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
964 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
965 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
967 fjx0 = _mm_setzero_pd();
968 fjy0 = _mm_setzero_pd();
969 fjz0 = _mm_setzero_pd();
970 fjx1 = _mm_setzero_pd();
971 fjy1 = _mm_setzero_pd();
972 fjz1 = _mm_setzero_pd();
973 fjx2 = _mm_setzero_pd();
974 fjy2 = _mm_setzero_pd();
975 fjz2 = _mm_setzero_pd();
977 /**************************
978 * CALCULATE INTERACTIONS *
979 **************************/
981 if (gmx_mm_any_lt(rsq00,rcutoff2))
984 r00 = _mm_mul_pd(rsq00,rinv00);
986 /* EWALD ELECTROSTATICS */
988 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
989 ewrt = _mm_mul_pd(r00,ewtabscale);
990 ewitab = _mm_cvttpd_epi32(ewrt);
992 eweps = _mm_frcz_pd(ewrt);
994 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
996 twoeweps = _mm_add_pd(eweps,eweps);
997 ewitab = _mm_slli_epi32(ewitab,2);
998 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
999 ewtabD = _mm_setzero_pd();
1000 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1001 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1002 ewtabFn = _mm_setzero_pd();
1003 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1004 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1005 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1006 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
1007 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1009 /* LENNARD-JONES DISPERSION/REPULSION */
1011 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1012 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1013 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1014 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
1015 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1017 d = _mm_sub_pd(r00,rswitch);
1018 d = _mm_max_pd(d,_mm_setzero_pd());
1019 d2 = _mm_mul_pd(d,d);
1020 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1022 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1024 /* Evaluate switch function */
1025 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1026 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1027 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1028 velec = _mm_mul_pd(velec,sw);
1029 vvdw = _mm_mul_pd(vvdw,sw);
1030 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1032 /* Update potential sum for this i atom from the interaction with this j atom. */
1033 velec = _mm_and_pd(velec,cutoff_mask);
1034 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1035 velecsum = _mm_add_pd(velecsum,velec);
1036 vvdw = _mm_and_pd(vvdw,cutoff_mask);
1037 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1038 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
1040 fscal = _mm_add_pd(felec,fvdw);
1042 fscal = _mm_and_pd(fscal,cutoff_mask);
1044 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1046 /* Update vectorial force */
1047 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1048 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1049 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1051 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1052 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1053 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1057 /**************************
1058 * CALCULATE INTERACTIONS *
1059 **************************/
1061 if (gmx_mm_any_lt(rsq01,rcutoff2))
1064 r01 = _mm_mul_pd(rsq01,rinv01);
1066 /* EWALD ELECTROSTATICS */
1068 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1069 ewrt = _mm_mul_pd(r01,ewtabscale);
1070 ewitab = _mm_cvttpd_epi32(ewrt);
1072 eweps = _mm_frcz_pd(ewrt);
1074 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1076 twoeweps = _mm_add_pd(eweps,eweps);
1077 ewitab = _mm_slli_epi32(ewitab,2);
1078 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1079 ewtabD = _mm_setzero_pd();
1080 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1081 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1082 ewtabFn = _mm_setzero_pd();
1083 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1084 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1085 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1086 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1087 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1089 d = _mm_sub_pd(r01,rswitch);
1090 d = _mm_max_pd(d,_mm_setzero_pd());
1091 d2 = _mm_mul_pd(d,d);
1092 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1094 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1096 /* Evaluate switch function */
1097 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1098 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1099 velec = _mm_mul_pd(velec,sw);
1100 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1102 /* Update potential sum for this i atom from the interaction with this j atom. */
1103 velec = _mm_and_pd(velec,cutoff_mask);
1104 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1105 velecsum = _mm_add_pd(velecsum,velec);
1109 fscal = _mm_and_pd(fscal,cutoff_mask);
1111 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1113 /* Update vectorial force */
1114 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1115 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1116 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1118 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1119 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1120 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1124 /**************************
1125 * CALCULATE INTERACTIONS *
1126 **************************/
1128 if (gmx_mm_any_lt(rsq02,rcutoff2))
1131 r02 = _mm_mul_pd(rsq02,rinv02);
1133 /* EWALD ELECTROSTATICS */
1135 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1136 ewrt = _mm_mul_pd(r02,ewtabscale);
1137 ewitab = _mm_cvttpd_epi32(ewrt);
1139 eweps = _mm_frcz_pd(ewrt);
1141 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1143 twoeweps = _mm_add_pd(eweps,eweps);
1144 ewitab = _mm_slli_epi32(ewitab,2);
1145 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1146 ewtabD = _mm_setzero_pd();
1147 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1148 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1149 ewtabFn = _mm_setzero_pd();
1150 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1151 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1152 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1153 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
1154 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1156 d = _mm_sub_pd(r02,rswitch);
1157 d = _mm_max_pd(d,_mm_setzero_pd());
1158 d2 = _mm_mul_pd(d,d);
1159 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1161 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1163 /* Evaluate switch function */
1164 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1165 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
1166 velec = _mm_mul_pd(velec,sw);
1167 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1169 /* Update potential sum for this i atom from the interaction with this j atom. */
1170 velec = _mm_and_pd(velec,cutoff_mask);
1171 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1172 velecsum = _mm_add_pd(velecsum,velec);
1176 fscal = _mm_and_pd(fscal,cutoff_mask);
1178 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1180 /* Update vectorial force */
1181 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1182 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1183 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1185 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1186 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1187 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1191 /**************************
1192 * CALCULATE INTERACTIONS *
1193 **************************/
1195 if (gmx_mm_any_lt(rsq10,rcutoff2))
1198 r10 = _mm_mul_pd(rsq10,rinv10);
1200 /* EWALD ELECTROSTATICS */
1202 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1203 ewrt = _mm_mul_pd(r10,ewtabscale);
1204 ewitab = _mm_cvttpd_epi32(ewrt);
1206 eweps = _mm_frcz_pd(ewrt);
1208 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1210 twoeweps = _mm_add_pd(eweps,eweps);
1211 ewitab = _mm_slli_epi32(ewitab,2);
1212 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1213 ewtabD = _mm_setzero_pd();
1214 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1215 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1216 ewtabFn = _mm_setzero_pd();
1217 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1218 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1219 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1220 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1221 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1223 d = _mm_sub_pd(r10,rswitch);
1224 d = _mm_max_pd(d,_mm_setzero_pd());
1225 d2 = _mm_mul_pd(d,d);
1226 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1228 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1230 /* Evaluate switch function */
1231 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1232 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1233 velec = _mm_mul_pd(velec,sw);
1234 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1236 /* Update potential sum for this i atom from the interaction with this j atom. */
1237 velec = _mm_and_pd(velec,cutoff_mask);
1238 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1239 velecsum = _mm_add_pd(velecsum,velec);
1243 fscal = _mm_and_pd(fscal,cutoff_mask);
1245 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1247 /* Update vectorial force */
1248 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1249 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1250 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1252 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1253 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1254 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1258 /**************************
1259 * CALCULATE INTERACTIONS *
1260 **************************/
1262 if (gmx_mm_any_lt(rsq11,rcutoff2))
1265 r11 = _mm_mul_pd(rsq11,rinv11);
1267 /* EWALD ELECTROSTATICS */
1269 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1270 ewrt = _mm_mul_pd(r11,ewtabscale);
1271 ewitab = _mm_cvttpd_epi32(ewrt);
1273 eweps = _mm_frcz_pd(ewrt);
1275 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1277 twoeweps = _mm_add_pd(eweps,eweps);
1278 ewitab = _mm_slli_epi32(ewitab,2);
1279 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1280 ewtabD = _mm_setzero_pd();
1281 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1282 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1283 ewtabFn = _mm_setzero_pd();
1284 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1285 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1286 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1287 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1288 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1290 d = _mm_sub_pd(r11,rswitch);
1291 d = _mm_max_pd(d,_mm_setzero_pd());
1292 d2 = _mm_mul_pd(d,d);
1293 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1295 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1297 /* Evaluate switch function */
1298 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1299 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1300 velec = _mm_mul_pd(velec,sw);
1301 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1303 /* Update potential sum for this i atom from the interaction with this j atom. */
1304 velec = _mm_and_pd(velec,cutoff_mask);
1305 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1306 velecsum = _mm_add_pd(velecsum,velec);
1310 fscal = _mm_and_pd(fscal,cutoff_mask);
1312 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1314 /* Update vectorial force */
1315 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1316 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1317 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1319 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1320 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1321 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1325 /**************************
1326 * CALCULATE INTERACTIONS *
1327 **************************/
1329 if (gmx_mm_any_lt(rsq12,rcutoff2))
1332 r12 = _mm_mul_pd(rsq12,rinv12);
1334 /* EWALD ELECTROSTATICS */
1336 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1337 ewrt = _mm_mul_pd(r12,ewtabscale);
1338 ewitab = _mm_cvttpd_epi32(ewrt);
1340 eweps = _mm_frcz_pd(ewrt);
1342 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1344 twoeweps = _mm_add_pd(eweps,eweps);
1345 ewitab = _mm_slli_epi32(ewitab,2);
1346 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1347 ewtabD = _mm_setzero_pd();
1348 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1349 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1350 ewtabFn = _mm_setzero_pd();
1351 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1352 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1353 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1354 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1355 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1357 d = _mm_sub_pd(r12,rswitch);
1358 d = _mm_max_pd(d,_mm_setzero_pd());
1359 d2 = _mm_mul_pd(d,d);
1360 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1362 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1364 /* Evaluate switch function */
1365 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1366 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1367 velec = _mm_mul_pd(velec,sw);
1368 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1370 /* Update potential sum for this i atom from the interaction with this j atom. */
1371 velec = _mm_and_pd(velec,cutoff_mask);
1372 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1373 velecsum = _mm_add_pd(velecsum,velec);
1377 fscal = _mm_and_pd(fscal,cutoff_mask);
1379 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1381 /* Update vectorial force */
1382 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1383 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1384 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1386 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1387 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1388 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1392 /**************************
1393 * CALCULATE INTERACTIONS *
1394 **************************/
1396 if (gmx_mm_any_lt(rsq20,rcutoff2))
1399 r20 = _mm_mul_pd(rsq20,rinv20);
1401 /* EWALD ELECTROSTATICS */
1403 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1404 ewrt = _mm_mul_pd(r20,ewtabscale);
1405 ewitab = _mm_cvttpd_epi32(ewrt);
1407 eweps = _mm_frcz_pd(ewrt);
1409 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1411 twoeweps = _mm_add_pd(eweps,eweps);
1412 ewitab = _mm_slli_epi32(ewitab,2);
1413 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1414 ewtabD = _mm_setzero_pd();
1415 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1416 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1417 ewtabFn = _mm_setzero_pd();
1418 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1419 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1420 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1421 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1422 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1424 d = _mm_sub_pd(r20,rswitch);
1425 d = _mm_max_pd(d,_mm_setzero_pd());
1426 d2 = _mm_mul_pd(d,d);
1427 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1429 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1431 /* Evaluate switch function */
1432 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1433 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1434 velec = _mm_mul_pd(velec,sw);
1435 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1437 /* Update potential sum for this i atom from the interaction with this j atom. */
1438 velec = _mm_and_pd(velec,cutoff_mask);
1439 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1440 velecsum = _mm_add_pd(velecsum,velec);
1444 fscal = _mm_and_pd(fscal,cutoff_mask);
1446 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1448 /* Update vectorial force */
1449 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1450 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1451 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1453 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1454 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1455 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1459 /**************************
1460 * CALCULATE INTERACTIONS *
1461 **************************/
1463 if (gmx_mm_any_lt(rsq21,rcutoff2))
1466 r21 = _mm_mul_pd(rsq21,rinv21);
1468 /* EWALD ELECTROSTATICS */
1470 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1471 ewrt = _mm_mul_pd(r21,ewtabscale);
1472 ewitab = _mm_cvttpd_epi32(ewrt);
1474 eweps = _mm_frcz_pd(ewrt);
1476 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1478 twoeweps = _mm_add_pd(eweps,eweps);
1479 ewitab = _mm_slli_epi32(ewitab,2);
1480 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1481 ewtabD = _mm_setzero_pd();
1482 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1483 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1484 ewtabFn = _mm_setzero_pd();
1485 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1486 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1487 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1488 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1489 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1491 d = _mm_sub_pd(r21,rswitch);
1492 d = _mm_max_pd(d,_mm_setzero_pd());
1493 d2 = _mm_mul_pd(d,d);
1494 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1496 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1498 /* Evaluate switch function */
1499 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1500 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1501 velec = _mm_mul_pd(velec,sw);
1502 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1504 /* Update potential sum for this i atom from the interaction with this j atom. */
1505 velec = _mm_and_pd(velec,cutoff_mask);
1506 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1507 velecsum = _mm_add_pd(velecsum,velec);
1511 fscal = _mm_and_pd(fscal,cutoff_mask);
1513 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1515 /* Update vectorial force */
1516 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1517 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1518 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1520 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1521 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1522 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1526 /**************************
1527 * CALCULATE INTERACTIONS *
1528 **************************/
1530 if (gmx_mm_any_lt(rsq22,rcutoff2))
1533 r22 = _mm_mul_pd(rsq22,rinv22);
1535 /* EWALD ELECTROSTATICS */
1537 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1538 ewrt = _mm_mul_pd(r22,ewtabscale);
1539 ewitab = _mm_cvttpd_epi32(ewrt);
1541 eweps = _mm_frcz_pd(ewrt);
1543 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1545 twoeweps = _mm_add_pd(eweps,eweps);
1546 ewitab = _mm_slli_epi32(ewitab,2);
1547 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1548 ewtabD = _mm_setzero_pd();
1549 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1550 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1551 ewtabFn = _mm_setzero_pd();
1552 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1553 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1554 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1555 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1556 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1558 d = _mm_sub_pd(r22,rswitch);
1559 d = _mm_max_pd(d,_mm_setzero_pd());
1560 d2 = _mm_mul_pd(d,d);
1561 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1563 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1565 /* Evaluate switch function */
1566 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1567 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1568 velec = _mm_mul_pd(velec,sw);
1569 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1571 /* Update potential sum for this i atom from the interaction with this j atom. */
1572 velec = _mm_and_pd(velec,cutoff_mask);
1573 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1574 velecsum = _mm_add_pd(velecsum,velec);
1578 fscal = _mm_and_pd(fscal,cutoff_mask);
1580 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1582 /* Update vectorial force */
1583 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1584 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1585 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1587 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1588 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1589 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1593 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1595 /* Inner loop uses 630 flops */
1598 /* End of innermost loop */
1600 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1601 f+i_coord_offset,fshift+i_shift_offset);
1604 /* Update potential energies */
1605 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1606 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1608 /* Increment number of inner iterations */
1609 inneriter += j_index_end - j_index_start;
1611 /* Outer loop uses 20 flops */
1614 /* Increment number of outer iterations */
1617 /* Update outer/inner flops */
1619 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*630);
1622 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_double
1623 * Electrostatics interaction: Ewald
1624 * VdW interaction: LennardJones
1625 * Geometry: Water3-Water3
1626 * Calculate force/pot: Force
1629 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_double
1630 (t_nblist * gmx_restrict nlist,
1631 rvec * gmx_restrict xx,
1632 rvec * gmx_restrict ff,
1633 t_forcerec * gmx_restrict fr,
1634 t_mdatoms * gmx_restrict mdatoms,
1635 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1636 t_nrnb * gmx_restrict nrnb)
1638 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1639 * just 0 for non-waters.
1640 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1641 * jnr indices corresponding to data put in the four positions in the SIMD register.
1643 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1644 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1646 int j_coord_offsetA,j_coord_offsetB;
1647 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1648 real rcutoff_scalar;
1649 real *shiftvec,*fshift,*x,*f;
1650 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1652 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1654 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1656 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1657 int vdwjidx0A,vdwjidx0B;
1658 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1659 int vdwjidx1A,vdwjidx1B;
1660 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1661 int vdwjidx2A,vdwjidx2B;
1662 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1663 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1664 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1665 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1666 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1667 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1668 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1669 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1670 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1671 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1672 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1675 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1678 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1679 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1681 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1683 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1684 real rswitch_scalar,d_scalar;
1685 __m128d dummy_mask,cutoff_mask;
1686 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1687 __m128d one = _mm_set1_pd(1.0);
1688 __m128d two = _mm_set1_pd(2.0);
1694 jindex = nlist->jindex;
1696 shiftidx = nlist->shift;
1698 shiftvec = fr->shift_vec[0];
1699 fshift = fr->fshift[0];
1700 facel = _mm_set1_pd(fr->epsfac);
1701 charge = mdatoms->chargeA;
1702 nvdwtype = fr->ntype;
1703 vdwparam = fr->nbfp;
1704 vdwtype = mdatoms->typeA;
1706 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1707 ewtab = fr->ic->tabq_coul_FDV0;
1708 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1709 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1711 /* Setup water-specific parameters */
1712 inr = nlist->iinr[0];
1713 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1714 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1715 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1716 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1718 jq0 = _mm_set1_pd(charge[inr+0]);
1719 jq1 = _mm_set1_pd(charge[inr+1]);
1720 jq2 = _mm_set1_pd(charge[inr+2]);
1721 vdwjidx0A = 2*vdwtype[inr+0];
1722 qq00 = _mm_mul_pd(iq0,jq0);
1723 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1724 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1725 qq01 = _mm_mul_pd(iq0,jq1);
1726 qq02 = _mm_mul_pd(iq0,jq2);
1727 qq10 = _mm_mul_pd(iq1,jq0);
1728 qq11 = _mm_mul_pd(iq1,jq1);
1729 qq12 = _mm_mul_pd(iq1,jq2);
1730 qq20 = _mm_mul_pd(iq2,jq0);
1731 qq21 = _mm_mul_pd(iq2,jq1);
1732 qq22 = _mm_mul_pd(iq2,jq2);
1734 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1735 rcutoff_scalar = fr->rcoulomb;
1736 rcutoff = _mm_set1_pd(rcutoff_scalar);
1737 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1739 rswitch_scalar = fr->rcoulomb_switch;
1740 rswitch = _mm_set1_pd(rswitch_scalar);
1741 /* Setup switch parameters */
1742 d_scalar = rcutoff_scalar-rswitch_scalar;
1743 d = _mm_set1_pd(d_scalar);
1744 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1745 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1746 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1747 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1748 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1749 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1751 /* Avoid stupid compiler warnings */
1753 j_coord_offsetA = 0;
1754 j_coord_offsetB = 0;
1759 /* Start outer loop over neighborlists */
1760 for(iidx=0; iidx<nri; iidx++)
1762 /* Load shift vector for this list */
1763 i_shift_offset = DIM*shiftidx[iidx];
1765 /* Load limits for loop over neighbors */
1766 j_index_start = jindex[iidx];
1767 j_index_end = jindex[iidx+1];
1769 /* Get outer coordinate index */
1771 i_coord_offset = DIM*inr;
1773 /* Load i particle coords and add shift vector */
1774 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1775 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1777 fix0 = _mm_setzero_pd();
1778 fiy0 = _mm_setzero_pd();
1779 fiz0 = _mm_setzero_pd();
1780 fix1 = _mm_setzero_pd();
1781 fiy1 = _mm_setzero_pd();
1782 fiz1 = _mm_setzero_pd();
1783 fix2 = _mm_setzero_pd();
1784 fiy2 = _mm_setzero_pd();
1785 fiz2 = _mm_setzero_pd();
1787 /* Start inner kernel loop */
1788 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1791 /* Get j neighbor index, and coordinate index */
1793 jnrB = jjnr[jidx+1];
1794 j_coord_offsetA = DIM*jnrA;
1795 j_coord_offsetB = DIM*jnrB;
1797 /* load j atom coordinates */
1798 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1799 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1801 /* Calculate displacement vector */
1802 dx00 = _mm_sub_pd(ix0,jx0);
1803 dy00 = _mm_sub_pd(iy0,jy0);
1804 dz00 = _mm_sub_pd(iz0,jz0);
1805 dx01 = _mm_sub_pd(ix0,jx1);
1806 dy01 = _mm_sub_pd(iy0,jy1);
1807 dz01 = _mm_sub_pd(iz0,jz1);
1808 dx02 = _mm_sub_pd(ix0,jx2);
1809 dy02 = _mm_sub_pd(iy0,jy2);
1810 dz02 = _mm_sub_pd(iz0,jz2);
1811 dx10 = _mm_sub_pd(ix1,jx0);
1812 dy10 = _mm_sub_pd(iy1,jy0);
1813 dz10 = _mm_sub_pd(iz1,jz0);
1814 dx11 = _mm_sub_pd(ix1,jx1);
1815 dy11 = _mm_sub_pd(iy1,jy1);
1816 dz11 = _mm_sub_pd(iz1,jz1);
1817 dx12 = _mm_sub_pd(ix1,jx2);
1818 dy12 = _mm_sub_pd(iy1,jy2);
1819 dz12 = _mm_sub_pd(iz1,jz2);
1820 dx20 = _mm_sub_pd(ix2,jx0);
1821 dy20 = _mm_sub_pd(iy2,jy0);
1822 dz20 = _mm_sub_pd(iz2,jz0);
1823 dx21 = _mm_sub_pd(ix2,jx1);
1824 dy21 = _mm_sub_pd(iy2,jy1);
1825 dz21 = _mm_sub_pd(iz2,jz1);
1826 dx22 = _mm_sub_pd(ix2,jx2);
1827 dy22 = _mm_sub_pd(iy2,jy2);
1828 dz22 = _mm_sub_pd(iz2,jz2);
1830 /* Calculate squared distance and things based on it */
1831 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1832 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1833 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1834 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1835 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1836 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1837 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1838 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1839 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1841 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1842 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1843 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1844 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1845 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1846 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1847 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1848 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1849 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1851 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1852 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1853 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1854 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1855 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1856 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1857 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1858 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1859 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1861 fjx0 = _mm_setzero_pd();
1862 fjy0 = _mm_setzero_pd();
1863 fjz0 = _mm_setzero_pd();
1864 fjx1 = _mm_setzero_pd();
1865 fjy1 = _mm_setzero_pd();
1866 fjz1 = _mm_setzero_pd();
1867 fjx2 = _mm_setzero_pd();
1868 fjy2 = _mm_setzero_pd();
1869 fjz2 = _mm_setzero_pd();
1871 /**************************
1872 * CALCULATE INTERACTIONS *
1873 **************************/
1875 if (gmx_mm_any_lt(rsq00,rcutoff2))
1878 r00 = _mm_mul_pd(rsq00,rinv00);
1880 /* EWALD ELECTROSTATICS */
1882 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1883 ewrt = _mm_mul_pd(r00,ewtabscale);
1884 ewitab = _mm_cvttpd_epi32(ewrt);
1886 eweps = _mm_frcz_pd(ewrt);
1888 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1890 twoeweps = _mm_add_pd(eweps,eweps);
1891 ewitab = _mm_slli_epi32(ewitab,2);
1892 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1893 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1894 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1895 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1896 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1897 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1898 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1899 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1900 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
1901 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1903 /* LENNARD-JONES DISPERSION/REPULSION */
1905 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1906 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1907 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1908 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
1909 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1911 d = _mm_sub_pd(r00,rswitch);
1912 d = _mm_max_pd(d,_mm_setzero_pd());
1913 d2 = _mm_mul_pd(d,d);
1914 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1916 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1918 /* Evaluate switch function */
1919 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1920 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1921 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1922 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1924 fscal = _mm_add_pd(felec,fvdw);
1926 fscal = _mm_and_pd(fscal,cutoff_mask);
1928 /* Update vectorial force */
1929 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1930 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1931 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1933 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1934 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1935 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1939 /**************************
1940 * CALCULATE INTERACTIONS *
1941 **************************/
1943 if (gmx_mm_any_lt(rsq01,rcutoff2))
1946 r01 = _mm_mul_pd(rsq01,rinv01);
1948 /* EWALD ELECTROSTATICS */
1950 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1951 ewrt = _mm_mul_pd(r01,ewtabscale);
1952 ewitab = _mm_cvttpd_epi32(ewrt);
1954 eweps = _mm_frcz_pd(ewrt);
1956 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1958 twoeweps = _mm_add_pd(eweps,eweps);
1959 ewitab = _mm_slli_epi32(ewitab,2);
1960 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1961 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1962 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1963 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1964 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1965 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1966 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1967 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1968 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1969 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1971 d = _mm_sub_pd(r01,rswitch);
1972 d = _mm_max_pd(d,_mm_setzero_pd());
1973 d2 = _mm_mul_pd(d,d);
1974 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1976 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1978 /* Evaluate switch function */
1979 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1980 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1981 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1985 fscal = _mm_and_pd(fscal,cutoff_mask);
1987 /* Update vectorial force */
1988 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1989 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1990 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1992 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1993 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1994 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1998 /**************************
1999 * CALCULATE INTERACTIONS *
2000 **************************/
2002 if (gmx_mm_any_lt(rsq02,rcutoff2))
2005 r02 = _mm_mul_pd(rsq02,rinv02);
2007 /* EWALD ELECTROSTATICS */
2009 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2010 ewrt = _mm_mul_pd(r02,ewtabscale);
2011 ewitab = _mm_cvttpd_epi32(ewrt);
2013 eweps = _mm_frcz_pd(ewrt);
2015 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2017 twoeweps = _mm_add_pd(eweps,eweps);
2018 ewitab = _mm_slli_epi32(ewitab,2);
2019 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2020 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2021 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2022 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2023 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2024 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2025 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2026 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2027 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2028 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2030 d = _mm_sub_pd(r02,rswitch);
2031 d = _mm_max_pd(d,_mm_setzero_pd());
2032 d2 = _mm_mul_pd(d,d);
2033 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2035 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2037 /* Evaluate switch function */
2038 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2039 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2040 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2044 fscal = _mm_and_pd(fscal,cutoff_mask);
2046 /* Update vectorial force */
2047 fix0 = _mm_macc_pd(dx02,fscal,fix0);
2048 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
2049 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
2051 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
2052 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
2053 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
2057 /**************************
2058 * CALCULATE INTERACTIONS *
2059 **************************/
2061 if (gmx_mm_any_lt(rsq10,rcutoff2))
2064 r10 = _mm_mul_pd(rsq10,rinv10);
2066 /* EWALD ELECTROSTATICS */
2068 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2069 ewrt = _mm_mul_pd(r10,ewtabscale);
2070 ewitab = _mm_cvttpd_epi32(ewrt);
2072 eweps = _mm_frcz_pd(ewrt);
2074 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2076 twoeweps = _mm_add_pd(eweps,eweps);
2077 ewitab = _mm_slli_epi32(ewitab,2);
2078 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2079 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2080 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2081 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2082 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2083 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2084 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2085 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2086 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2087 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2089 d = _mm_sub_pd(r10,rswitch);
2090 d = _mm_max_pd(d,_mm_setzero_pd());
2091 d2 = _mm_mul_pd(d,d);
2092 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2094 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2096 /* Evaluate switch function */
2097 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2098 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2099 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2103 fscal = _mm_and_pd(fscal,cutoff_mask);
2105 /* Update vectorial force */
2106 fix1 = _mm_macc_pd(dx10,fscal,fix1);
2107 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
2108 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
2110 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
2111 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
2112 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
2116 /**************************
2117 * CALCULATE INTERACTIONS *
2118 **************************/
2120 if (gmx_mm_any_lt(rsq11,rcutoff2))
2123 r11 = _mm_mul_pd(rsq11,rinv11);
2125 /* EWALD ELECTROSTATICS */
2127 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2128 ewrt = _mm_mul_pd(r11,ewtabscale);
2129 ewitab = _mm_cvttpd_epi32(ewrt);
2131 eweps = _mm_frcz_pd(ewrt);
2133 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2135 twoeweps = _mm_add_pd(eweps,eweps);
2136 ewitab = _mm_slli_epi32(ewitab,2);
2137 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2138 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2139 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2140 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2141 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2142 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2143 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2144 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2145 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2146 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2148 d = _mm_sub_pd(r11,rswitch);
2149 d = _mm_max_pd(d,_mm_setzero_pd());
2150 d2 = _mm_mul_pd(d,d);
2151 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2153 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2155 /* Evaluate switch function */
2156 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2157 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2158 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2162 fscal = _mm_and_pd(fscal,cutoff_mask);
2164 /* Update vectorial force */
2165 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2166 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2167 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2169 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2170 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2171 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2175 /**************************
2176 * CALCULATE INTERACTIONS *
2177 **************************/
2179 if (gmx_mm_any_lt(rsq12,rcutoff2))
2182 r12 = _mm_mul_pd(rsq12,rinv12);
2184 /* EWALD ELECTROSTATICS */
2186 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2187 ewrt = _mm_mul_pd(r12,ewtabscale);
2188 ewitab = _mm_cvttpd_epi32(ewrt);
2190 eweps = _mm_frcz_pd(ewrt);
2192 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2194 twoeweps = _mm_add_pd(eweps,eweps);
2195 ewitab = _mm_slli_epi32(ewitab,2);
2196 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2197 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2198 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2199 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2200 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2201 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2202 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2203 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2204 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2205 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2207 d = _mm_sub_pd(r12,rswitch);
2208 d = _mm_max_pd(d,_mm_setzero_pd());
2209 d2 = _mm_mul_pd(d,d);
2210 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2212 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2214 /* Evaluate switch function */
2215 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2216 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2217 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2221 fscal = _mm_and_pd(fscal,cutoff_mask);
2223 /* Update vectorial force */
2224 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2225 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2226 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2228 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2229 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2230 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2234 /**************************
2235 * CALCULATE INTERACTIONS *
2236 **************************/
2238 if (gmx_mm_any_lt(rsq20,rcutoff2))
2241 r20 = _mm_mul_pd(rsq20,rinv20);
2243 /* EWALD ELECTROSTATICS */
2245 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2246 ewrt = _mm_mul_pd(r20,ewtabscale);
2247 ewitab = _mm_cvttpd_epi32(ewrt);
2249 eweps = _mm_frcz_pd(ewrt);
2251 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2253 twoeweps = _mm_add_pd(eweps,eweps);
2254 ewitab = _mm_slli_epi32(ewitab,2);
2255 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2256 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2257 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2258 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2259 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2260 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2261 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2262 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2263 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2264 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2266 d = _mm_sub_pd(r20,rswitch);
2267 d = _mm_max_pd(d,_mm_setzero_pd());
2268 d2 = _mm_mul_pd(d,d);
2269 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2271 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2273 /* Evaluate switch function */
2274 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2275 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2276 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2280 fscal = _mm_and_pd(fscal,cutoff_mask);
2282 /* Update vectorial force */
2283 fix2 = _mm_macc_pd(dx20,fscal,fix2);
2284 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
2285 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
2287 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
2288 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
2289 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
2293 /**************************
2294 * CALCULATE INTERACTIONS *
2295 **************************/
2297 if (gmx_mm_any_lt(rsq21,rcutoff2))
2300 r21 = _mm_mul_pd(rsq21,rinv21);
2302 /* EWALD ELECTROSTATICS */
2304 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2305 ewrt = _mm_mul_pd(r21,ewtabscale);
2306 ewitab = _mm_cvttpd_epi32(ewrt);
2308 eweps = _mm_frcz_pd(ewrt);
2310 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2312 twoeweps = _mm_add_pd(eweps,eweps);
2313 ewitab = _mm_slli_epi32(ewitab,2);
2314 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2315 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2316 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2317 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2318 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2319 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2320 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2321 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2322 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2323 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2325 d = _mm_sub_pd(r21,rswitch);
2326 d = _mm_max_pd(d,_mm_setzero_pd());
2327 d2 = _mm_mul_pd(d,d);
2328 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2330 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2332 /* Evaluate switch function */
2333 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2334 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2335 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2339 fscal = _mm_and_pd(fscal,cutoff_mask);
2341 /* Update vectorial force */
2342 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2343 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2344 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2346 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2347 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2348 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2352 /**************************
2353 * CALCULATE INTERACTIONS *
2354 **************************/
2356 if (gmx_mm_any_lt(rsq22,rcutoff2))
2359 r22 = _mm_mul_pd(rsq22,rinv22);
2361 /* EWALD ELECTROSTATICS */
2363 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2364 ewrt = _mm_mul_pd(r22,ewtabscale);
2365 ewitab = _mm_cvttpd_epi32(ewrt);
2367 eweps = _mm_frcz_pd(ewrt);
2369 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2371 twoeweps = _mm_add_pd(eweps,eweps);
2372 ewitab = _mm_slli_epi32(ewitab,2);
2373 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2374 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
2375 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2376 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2377 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
2378 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2379 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2380 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2381 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2382 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2384 d = _mm_sub_pd(r22,rswitch);
2385 d = _mm_max_pd(d,_mm_setzero_pd());
2386 d2 = _mm_mul_pd(d,d);
2387 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2389 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2391 /* Evaluate switch function */
2392 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2393 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2394 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2398 fscal = _mm_and_pd(fscal,cutoff_mask);
2400 /* Update vectorial force */
2401 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2402 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2403 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2405 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2406 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2407 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2411 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2413 /* Inner loop uses 600 flops */
2416 if(jidx<j_index_end)
2420 j_coord_offsetA = DIM*jnrA;
2422 /* load j atom coordinates */
2423 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2424 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2426 /* Calculate displacement vector */
2427 dx00 = _mm_sub_pd(ix0,jx0);
2428 dy00 = _mm_sub_pd(iy0,jy0);
2429 dz00 = _mm_sub_pd(iz0,jz0);
2430 dx01 = _mm_sub_pd(ix0,jx1);
2431 dy01 = _mm_sub_pd(iy0,jy1);
2432 dz01 = _mm_sub_pd(iz0,jz1);
2433 dx02 = _mm_sub_pd(ix0,jx2);
2434 dy02 = _mm_sub_pd(iy0,jy2);
2435 dz02 = _mm_sub_pd(iz0,jz2);
2436 dx10 = _mm_sub_pd(ix1,jx0);
2437 dy10 = _mm_sub_pd(iy1,jy0);
2438 dz10 = _mm_sub_pd(iz1,jz0);
2439 dx11 = _mm_sub_pd(ix1,jx1);
2440 dy11 = _mm_sub_pd(iy1,jy1);
2441 dz11 = _mm_sub_pd(iz1,jz1);
2442 dx12 = _mm_sub_pd(ix1,jx2);
2443 dy12 = _mm_sub_pd(iy1,jy2);
2444 dz12 = _mm_sub_pd(iz1,jz2);
2445 dx20 = _mm_sub_pd(ix2,jx0);
2446 dy20 = _mm_sub_pd(iy2,jy0);
2447 dz20 = _mm_sub_pd(iz2,jz0);
2448 dx21 = _mm_sub_pd(ix2,jx1);
2449 dy21 = _mm_sub_pd(iy2,jy1);
2450 dz21 = _mm_sub_pd(iz2,jz1);
2451 dx22 = _mm_sub_pd(ix2,jx2);
2452 dy22 = _mm_sub_pd(iy2,jy2);
2453 dz22 = _mm_sub_pd(iz2,jz2);
2455 /* Calculate squared distance and things based on it */
2456 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2457 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2458 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2459 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2460 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2461 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2462 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2463 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2464 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2466 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2467 rinv01 = gmx_mm_invsqrt_pd(rsq01);
2468 rinv02 = gmx_mm_invsqrt_pd(rsq02);
2469 rinv10 = gmx_mm_invsqrt_pd(rsq10);
2470 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2471 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2472 rinv20 = gmx_mm_invsqrt_pd(rsq20);
2473 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2474 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2476 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2477 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
2478 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
2479 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
2480 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2481 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2482 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
2483 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2484 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2486 fjx0 = _mm_setzero_pd();
2487 fjy0 = _mm_setzero_pd();
2488 fjz0 = _mm_setzero_pd();
2489 fjx1 = _mm_setzero_pd();
2490 fjy1 = _mm_setzero_pd();
2491 fjz1 = _mm_setzero_pd();
2492 fjx2 = _mm_setzero_pd();
2493 fjy2 = _mm_setzero_pd();
2494 fjz2 = _mm_setzero_pd();
2496 /**************************
2497 * CALCULATE INTERACTIONS *
2498 **************************/
2500 if (gmx_mm_any_lt(rsq00,rcutoff2))
2503 r00 = _mm_mul_pd(rsq00,rinv00);
2505 /* EWALD ELECTROSTATICS */
2507 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2508 ewrt = _mm_mul_pd(r00,ewtabscale);
2509 ewitab = _mm_cvttpd_epi32(ewrt);
2511 eweps = _mm_frcz_pd(ewrt);
2513 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2515 twoeweps = _mm_add_pd(eweps,eweps);
2516 ewitab = _mm_slli_epi32(ewitab,2);
2517 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2518 ewtabD = _mm_setzero_pd();
2519 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2520 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2521 ewtabFn = _mm_setzero_pd();
2522 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2523 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2524 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2525 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
2526 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2528 /* LENNARD-JONES DISPERSION/REPULSION */
2530 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2531 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2532 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2533 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
2534 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2536 d = _mm_sub_pd(r00,rswitch);
2537 d = _mm_max_pd(d,_mm_setzero_pd());
2538 d2 = _mm_mul_pd(d,d);
2539 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2541 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2543 /* Evaluate switch function */
2544 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2545 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
2546 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2547 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2549 fscal = _mm_add_pd(felec,fvdw);
2551 fscal = _mm_and_pd(fscal,cutoff_mask);
2553 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2555 /* Update vectorial force */
2556 fix0 = _mm_macc_pd(dx00,fscal,fix0);
2557 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
2558 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
2560 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
2561 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
2562 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
2566 /**************************
2567 * CALCULATE INTERACTIONS *
2568 **************************/
2570 if (gmx_mm_any_lt(rsq01,rcutoff2))
2573 r01 = _mm_mul_pd(rsq01,rinv01);
2575 /* EWALD ELECTROSTATICS */
2577 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2578 ewrt = _mm_mul_pd(r01,ewtabscale);
2579 ewitab = _mm_cvttpd_epi32(ewrt);
2581 eweps = _mm_frcz_pd(ewrt);
2583 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2585 twoeweps = _mm_add_pd(eweps,eweps);
2586 ewitab = _mm_slli_epi32(ewitab,2);
2587 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2588 ewtabD = _mm_setzero_pd();
2589 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2590 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2591 ewtabFn = _mm_setzero_pd();
2592 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2593 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2594 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2595 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
2596 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2598 d = _mm_sub_pd(r01,rswitch);
2599 d = _mm_max_pd(d,_mm_setzero_pd());
2600 d2 = _mm_mul_pd(d,d);
2601 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2603 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2605 /* Evaluate switch function */
2606 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2607 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
2608 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
2612 fscal = _mm_and_pd(fscal,cutoff_mask);
2614 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2616 /* Update vectorial force */
2617 fix0 = _mm_macc_pd(dx01,fscal,fix0);
2618 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
2619 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
2621 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
2622 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
2623 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
2627 /**************************
2628 * CALCULATE INTERACTIONS *
2629 **************************/
2631 if (gmx_mm_any_lt(rsq02,rcutoff2))
2634 r02 = _mm_mul_pd(rsq02,rinv02);
2636 /* EWALD ELECTROSTATICS */
2638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2639 ewrt = _mm_mul_pd(r02,ewtabscale);
2640 ewitab = _mm_cvttpd_epi32(ewrt);
2642 eweps = _mm_frcz_pd(ewrt);
2644 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2646 twoeweps = _mm_add_pd(eweps,eweps);
2647 ewitab = _mm_slli_epi32(ewitab,2);
2648 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2649 ewtabD = _mm_setzero_pd();
2650 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2651 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2652 ewtabFn = _mm_setzero_pd();
2653 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2654 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2655 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2656 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2657 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2659 d = _mm_sub_pd(r02,rswitch);
2660 d = _mm_max_pd(d,_mm_setzero_pd());
2661 d2 = _mm_mul_pd(d,d);
2662 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2664 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2666 /* Evaluate switch function */
2667 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2668 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2669 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2673 fscal = _mm_and_pd(fscal,cutoff_mask);
2675 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2677 /* Update vectorial force */
2678 fix0 = _mm_macc_pd(dx02,fscal,fix0);
2679 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
2680 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
2682 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
2683 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
2684 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
2688 /**************************
2689 * CALCULATE INTERACTIONS *
2690 **************************/
2692 if (gmx_mm_any_lt(rsq10,rcutoff2))
2695 r10 = _mm_mul_pd(rsq10,rinv10);
2697 /* EWALD ELECTROSTATICS */
2699 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2700 ewrt = _mm_mul_pd(r10,ewtabscale);
2701 ewitab = _mm_cvttpd_epi32(ewrt);
2703 eweps = _mm_frcz_pd(ewrt);
2705 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2707 twoeweps = _mm_add_pd(eweps,eweps);
2708 ewitab = _mm_slli_epi32(ewitab,2);
2709 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2710 ewtabD = _mm_setzero_pd();
2711 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2712 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2713 ewtabFn = _mm_setzero_pd();
2714 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2715 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2716 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2717 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2718 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2720 d = _mm_sub_pd(r10,rswitch);
2721 d = _mm_max_pd(d,_mm_setzero_pd());
2722 d2 = _mm_mul_pd(d,d);
2723 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2725 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2727 /* Evaluate switch function */
2728 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2729 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2730 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2734 fscal = _mm_and_pd(fscal,cutoff_mask);
2736 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2738 /* Update vectorial force */
2739 fix1 = _mm_macc_pd(dx10,fscal,fix1);
2740 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
2741 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
2743 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
2744 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
2745 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
2749 /**************************
2750 * CALCULATE INTERACTIONS *
2751 **************************/
2753 if (gmx_mm_any_lt(rsq11,rcutoff2))
2756 r11 = _mm_mul_pd(rsq11,rinv11);
2758 /* EWALD ELECTROSTATICS */
2760 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2761 ewrt = _mm_mul_pd(r11,ewtabscale);
2762 ewitab = _mm_cvttpd_epi32(ewrt);
2764 eweps = _mm_frcz_pd(ewrt);
2766 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2768 twoeweps = _mm_add_pd(eweps,eweps);
2769 ewitab = _mm_slli_epi32(ewitab,2);
2770 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2771 ewtabD = _mm_setzero_pd();
2772 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2773 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2774 ewtabFn = _mm_setzero_pd();
2775 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2776 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2777 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2778 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2779 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2781 d = _mm_sub_pd(r11,rswitch);
2782 d = _mm_max_pd(d,_mm_setzero_pd());
2783 d2 = _mm_mul_pd(d,d);
2784 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2786 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2788 /* Evaluate switch function */
2789 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2790 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2791 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2795 fscal = _mm_and_pd(fscal,cutoff_mask);
2797 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2799 /* Update vectorial force */
2800 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2801 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2802 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2804 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2805 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2806 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2810 /**************************
2811 * CALCULATE INTERACTIONS *
2812 **************************/
2814 if (gmx_mm_any_lt(rsq12,rcutoff2))
2817 r12 = _mm_mul_pd(rsq12,rinv12);
2819 /* EWALD ELECTROSTATICS */
2821 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2822 ewrt = _mm_mul_pd(r12,ewtabscale);
2823 ewitab = _mm_cvttpd_epi32(ewrt);
2825 eweps = _mm_frcz_pd(ewrt);
2827 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2829 twoeweps = _mm_add_pd(eweps,eweps);
2830 ewitab = _mm_slli_epi32(ewitab,2);
2831 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2832 ewtabD = _mm_setzero_pd();
2833 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2834 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2835 ewtabFn = _mm_setzero_pd();
2836 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2837 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2838 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2839 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2840 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2842 d = _mm_sub_pd(r12,rswitch);
2843 d = _mm_max_pd(d,_mm_setzero_pd());
2844 d2 = _mm_mul_pd(d,d);
2845 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2847 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2849 /* Evaluate switch function */
2850 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2851 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2852 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2856 fscal = _mm_and_pd(fscal,cutoff_mask);
2858 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2860 /* Update vectorial force */
2861 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2862 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2863 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2865 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2866 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2867 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2871 /**************************
2872 * CALCULATE INTERACTIONS *
2873 **************************/
2875 if (gmx_mm_any_lt(rsq20,rcutoff2))
2878 r20 = _mm_mul_pd(rsq20,rinv20);
2880 /* EWALD ELECTROSTATICS */
2882 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2883 ewrt = _mm_mul_pd(r20,ewtabscale);
2884 ewitab = _mm_cvttpd_epi32(ewrt);
2886 eweps = _mm_frcz_pd(ewrt);
2888 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2890 twoeweps = _mm_add_pd(eweps,eweps);
2891 ewitab = _mm_slli_epi32(ewitab,2);
2892 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2893 ewtabD = _mm_setzero_pd();
2894 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2895 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2896 ewtabFn = _mm_setzero_pd();
2897 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2898 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2899 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2900 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2901 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2903 d = _mm_sub_pd(r20,rswitch);
2904 d = _mm_max_pd(d,_mm_setzero_pd());
2905 d2 = _mm_mul_pd(d,d);
2906 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2908 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2910 /* Evaluate switch function */
2911 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2912 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2913 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2917 fscal = _mm_and_pd(fscal,cutoff_mask);
2919 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2921 /* Update vectorial force */
2922 fix2 = _mm_macc_pd(dx20,fscal,fix2);
2923 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
2924 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
2926 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
2927 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
2928 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
2932 /**************************
2933 * CALCULATE INTERACTIONS *
2934 **************************/
2936 if (gmx_mm_any_lt(rsq21,rcutoff2))
2939 r21 = _mm_mul_pd(rsq21,rinv21);
2941 /* EWALD ELECTROSTATICS */
2943 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2944 ewrt = _mm_mul_pd(r21,ewtabscale);
2945 ewitab = _mm_cvttpd_epi32(ewrt);
2947 eweps = _mm_frcz_pd(ewrt);
2949 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2951 twoeweps = _mm_add_pd(eweps,eweps);
2952 ewitab = _mm_slli_epi32(ewitab,2);
2953 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
2954 ewtabD = _mm_setzero_pd();
2955 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2956 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
2957 ewtabFn = _mm_setzero_pd();
2958 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2959 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
2960 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
2961 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2962 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2964 d = _mm_sub_pd(r21,rswitch);
2965 d = _mm_max_pd(d,_mm_setzero_pd());
2966 d2 = _mm_mul_pd(d,d);
2967 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
2969 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
2971 /* Evaluate switch function */
2972 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2973 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2974 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2978 fscal = _mm_and_pd(fscal,cutoff_mask);
2980 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2982 /* Update vectorial force */
2983 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2984 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2985 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2987 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2988 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2989 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2993 /**************************
2994 * CALCULATE INTERACTIONS *
2995 **************************/
2997 if (gmx_mm_any_lt(rsq22,rcutoff2))
3000 r22 = _mm_mul_pd(rsq22,rinv22);
3002 /* EWALD ELECTROSTATICS */
3004 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3005 ewrt = _mm_mul_pd(r22,ewtabscale);
3006 ewitab = _mm_cvttpd_epi32(ewrt);
3008 eweps = _mm_frcz_pd(ewrt);
3010 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3012 twoeweps = _mm_add_pd(eweps,eweps);
3013 ewitab = _mm_slli_epi32(ewitab,2);
3014 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
3015 ewtabD = _mm_setzero_pd();
3016 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3017 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
3018 ewtabFn = _mm_setzero_pd();
3019 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3020 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
3021 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
3022 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
3023 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
3025 d = _mm_sub_pd(r22,rswitch);
3026 d = _mm_max_pd(d,_mm_setzero_pd());
3027 d2 = _mm_mul_pd(d,d);
3028 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
3030 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
3032 /* Evaluate switch function */
3033 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3034 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
3035 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
3039 fscal = _mm_and_pd(fscal,cutoff_mask);
3041 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3043 /* Update vectorial force */
3044 fix2 = _mm_macc_pd(dx22,fscal,fix2);
3045 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
3046 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
3048 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
3049 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
3050 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
3054 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3056 /* Inner loop uses 600 flops */
3059 /* End of innermost loop */
3061 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3062 f+i_coord_offset,fshift+i_shift_offset);
3064 /* Increment number of inner iterations */
3065 inneriter += j_index_end - j_index_start;
3067 /* Outer loop uses 18 flops */
3070 /* Increment number of outer iterations */
3073 /* Update outer/inner flops */
3075 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*600);