2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
50 #include "kernelutil_x86_avx_128_fma_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_double
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Particle
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_avx_128_fma_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90 int vdwjidx0A,vdwjidx0B;
91 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
94 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
95 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
96 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
103 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
105 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
107 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
108 real rswitch_scalar,d_scalar;
109 __m128d dummy_mask,cutoff_mask;
110 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
111 __m128d one = _mm_set1_pd(1.0);
112 __m128d two = _mm_set1_pd(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm_set1_pd(fr->epsfac);
125 charge = mdatoms->chargeA;
126 nvdwtype = fr->ntype;
128 vdwtype = mdatoms->typeA;
130 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
131 ewtab = fr->ic->tabq_coul_FDV0;
132 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
133 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
135 /* Setup water-specific parameters */
136 inr = nlist->iinr[0];
137 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
138 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
139 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
140 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
142 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
143 rcutoff_scalar = fr->rcoulomb;
144 rcutoff = _mm_set1_pd(rcutoff_scalar);
145 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
147 rswitch_scalar = fr->rcoulomb_switch;
148 rswitch = _mm_set1_pd(rswitch_scalar);
149 /* Setup switch parameters */
150 d_scalar = rcutoff_scalar-rswitch_scalar;
151 d = _mm_set1_pd(d_scalar);
152 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
153 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
154 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
155 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
156 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
157 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
159 /* Avoid stupid compiler warnings */
167 /* Start outer loop over neighborlists */
168 for(iidx=0; iidx<nri; iidx++)
170 /* Load shift vector for this list */
171 i_shift_offset = DIM*shiftidx[iidx];
173 /* Load limits for loop over neighbors */
174 j_index_start = jindex[iidx];
175 j_index_end = jindex[iidx+1];
177 /* Get outer coordinate index */
179 i_coord_offset = DIM*inr;
181 /* Load i particle coords and add shift vector */
182 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
183 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
185 fix0 = _mm_setzero_pd();
186 fiy0 = _mm_setzero_pd();
187 fiz0 = _mm_setzero_pd();
188 fix1 = _mm_setzero_pd();
189 fiy1 = _mm_setzero_pd();
190 fiz1 = _mm_setzero_pd();
191 fix2 = _mm_setzero_pd();
192 fiy2 = _mm_setzero_pd();
193 fiz2 = _mm_setzero_pd();
194 fix3 = _mm_setzero_pd();
195 fiy3 = _mm_setzero_pd();
196 fiz3 = _mm_setzero_pd();
198 /* Reset potential sums */
199 velecsum = _mm_setzero_pd();
200 vvdwsum = _mm_setzero_pd();
202 /* Start inner kernel loop */
203 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
206 /* Get j neighbor index, and coordinate index */
209 j_coord_offsetA = DIM*jnrA;
210 j_coord_offsetB = DIM*jnrB;
212 /* load j atom coordinates */
213 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
216 /* Calculate displacement vector */
217 dx00 = _mm_sub_pd(ix0,jx0);
218 dy00 = _mm_sub_pd(iy0,jy0);
219 dz00 = _mm_sub_pd(iz0,jz0);
220 dx10 = _mm_sub_pd(ix1,jx0);
221 dy10 = _mm_sub_pd(iy1,jy0);
222 dz10 = _mm_sub_pd(iz1,jz0);
223 dx20 = _mm_sub_pd(ix2,jx0);
224 dy20 = _mm_sub_pd(iy2,jy0);
225 dz20 = _mm_sub_pd(iz2,jz0);
226 dx30 = _mm_sub_pd(ix3,jx0);
227 dy30 = _mm_sub_pd(iy3,jy0);
228 dz30 = _mm_sub_pd(iz3,jz0);
230 /* Calculate squared distance and things based on it */
231 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
232 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
233 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
234 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
236 rinv00 = gmx_mm_invsqrt_pd(rsq00);
237 rinv10 = gmx_mm_invsqrt_pd(rsq10);
238 rinv20 = gmx_mm_invsqrt_pd(rsq20);
239 rinv30 = gmx_mm_invsqrt_pd(rsq30);
241 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
242 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
243 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
244 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
246 /* Load parameters for j particles */
247 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
248 vdwjidx0A = 2*vdwtype[jnrA+0];
249 vdwjidx0B = 2*vdwtype[jnrB+0];
251 fjx0 = _mm_setzero_pd();
252 fjy0 = _mm_setzero_pd();
253 fjz0 = _mm_setzero_pd();
255 /**************************
256 * CALCULATE INTERACTIONS *
257 **************************/
259 if (gmx_mm_any_lt(rsq00,rcutoff2))
262 r00 = _mm_mul_pd(rsq00,rinv00);
264 /* Compute parameters for interactions between i and j atoms */
265 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
266 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
268 /* LENNARD-JONES DISPERSION/REPULSION */
270 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
271 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
272 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
273 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
274 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
276 d = _mm_sub_pd(r00,rswitch);
277 d = _mm_max_pd(d,_mm_setzero_pd());
278 d2 = _mm_mul_pd(d,d);
279 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
281 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
283 /* Evaluate switch function */
284 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
285 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
286 vvdw = _mm_mul_pd(vvdw,sw);
287 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
289 /* Update potential sum for this i atom from the interaction with this j atom. */
290 vvdw = _mm_and_pd(vvdw,cutoff_mask);
291 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
295 fscal = _mm_and_pd(fscal,cutoff_mask);
297 /* Update vectorial force */
298 fix0 = _mm_macc_pd(dx00,fscal,fix0);
299 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
300 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
302 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
303 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
304 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 if (gmx_mm_any_lt(rsq10,rcutoff2))
315 r10 = _mm_mul_pd(rsq10,rinv10);
317 /* Compute parameters for interactions between i and j atoms */
318 qq10 = _mm_mul_pd(iq1,jq0);
320 /* EWALD ELECTROSTATICS */
322 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
323 ewrt = _mm_mul_pd(r10,ewtabscale);
324 ewitab = _mm_cvttpd_epi32(ewrt);
326 eweps = _mm_frcz_pd(ewrt);
328 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
330 twoeweps = _mm_add_pd(eweps,eweps);
331 ewitab = _mm_slli_epi32(ewitab,2);
332 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
333 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
334 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
335 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
336 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
337 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
338 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
339 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
340 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
341 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
343 d = _mm_sub_pd(r10,rswitch);
344 d = _mm_max_pd(d,_mm_setzero_pd());
345 d2 = _mm_mul_pd(d,d);
346 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
348 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
350 /* Evaluate switch function */
351 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
352 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
353 velec = _mm_mul_pd(velec,sw);
354 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
356 /* Update potential sum for this i atom from the interaction with this j atom. */
357 velec = _mm_and_pd(velec,cutoff_mask);
358 velecsum = _mm_add_pd(velecsum,velec);
362 fscal = _mm_and_pd(fscal,cutoff_mask);
364 /* Update vectorial force */
365 fix1 = _mm_macc_pd(dx10,fscal,fix1);
366 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
367 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
369 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
370 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
371 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
375 /**************************
376 * CALCULATE INTERACTIONS *
377 **************************/
379 if (gmx_mm_any_lt(rsq20,rcutoff2))
382 r20 = _mm_mul_pd(rsq20,rinv20);
384 /* Compute parameters for interactions between i and j atoms */
385 qq20 = _mm_mul_pd(iq2,jq0);
387 /* EWALD ELECTROSTATICS */
389 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
390 ewrt = _mm_mul_pd(r20,ewtabscale);
391 ewitab = _mm_cvttpd_epi32(ewrt);
393 eweps = _mm_frcz_pd(ewrt);
395 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
397 twoeweps = _mm_add_pd(eweps,eweps);
398 ewitab = _mm_slli_epi32(ewitab,2);
399 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
400 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
401 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
402 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
403 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
404 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
405 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
406 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
407 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
408 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
410 d = _mm_sub_pd(r20,rswitch);
411 d = _mm_max_pd(d,_mm_setzero_pd());
412 d2 = _mm_mul_pd(d,d);
413 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
415 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
417 /* Evaluate switch function */
418 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
419 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
420 velec = _mm_mul_pd(velec,sw);
421 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
423 /* Update potential sum for this i atom from the interaction with this j atom. */
424 velec = _mm_and_pd(velec,cutoff_mask);
425 velecsum = _mm_add_pd(velecsum,velec);
429 fscal = _mm_and_pd(fscal,cutoff_mask);
431 /* Update vectorial force */
432 fix2 = _mm_macc_pd(dx20,fscal,fix2);
433 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
434 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
436 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
437 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
438 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
442 /**************************
443 * CALCULATE INTERACTIONS *
444 **************************/
446 if (gmx_mm_any_lt(rsq30,rcutoff2))
449 r30 = _mm_mul_pd(rsq30,rinv30);
451 /* Compute parameters for interactions between i and j atoms */
452 qq30 = _mm_mul_pd(iq3,jq0);
454 /* EWALD ELECTROSTATICS */
456 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
457 ewrt = _mm_mul_pd(r30,ewtabscale);
458 ewitab = _mm_cvttpd_epi32(ewrt);
460 eweps = _mm_frcz_pd(ewrt);
462 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
464 twoeweps = _mm_add_pd(eweps,eweps);
465 ewitab = _mm_slli_epi32(ewitab,2);
466 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
467 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
468 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
469 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
470 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
471 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
472 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
473 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
474 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
475 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
477 d = _mm_sub_pd(r30,rswitch);
478 d = _mm_max_pd(d,_mm_setzero_pd());
479 d2 = _mm_mul_pd(d,d);
480 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
482 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
484 /* Evaluate switch function */
485 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
486 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
487 velec = _mm_mul_pd(velec,sw);
488 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
490 /* Update potential sum for this i atom from the interaction with this j atom. */
491 velec = _mm_and_pd(velec,cutoff_mask);
492 velecsum = _mm_add_pd(velecsum,velec);
496 fscal = _mm_and_pd(fscal,cutoff_mask);
498 /* Update vectorial force */
499 fix3 = _mm_macc_pd(dx30,fscal,fix3);
500 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
501 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
503 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
504 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
505 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
509 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
511 /* Inner loop uses 269 flops */
518 j_coord_offsetA = DIM*jnrA;
520 /* load j atom coordinates */
521 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
524 /* Calculate displacement vector */
525 dx00 = _mm_sub_pd(ix0,jx0);
526 dy00 = _mm_sub_pd(iy0,jy0);
527 dz00 = _mm_sub_pd(iz0,jz0);
528 dx10 = _mm_sub_pd(ix1,jx0);
529 dy10 = _mm_sub_pd(iy1,jy0);
530 dz10 = _mm_sub_pd(iz1,jz0);
531 dx20 = _mm_sub_pd(ix2,jx0);
532 dy20 = _mm_sub_pd(iy2,jy0);
533 dz20 = _mm_sub_pd(iz2,jz0);
534 dx30 = _mm_sub_pd(ix3,jx0);
535 dy30 = _mm_sub_pd(iy3,jy0);
536 dz30 = _mm_sub_pd(iz3,jz0);
538 /* Calculate squared distance and things based on it */
539 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
540 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
541 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
542 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
544 rinv00 = gmx_mm_invsqrt_pd(rsq00);
545 rinv10 = gmx_mm_invsqrt_pd(rsq10);
546 rinv20 = gmx_mm_invsqrt_pd(rsq20);
547 rinv30 = gmx_mm_invsqrt_pd(rsq30);
549 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
550 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
551 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
552 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
554 /* Load parameters for j particles */
555 jq0 = _mm_load_sd(charge+jnrA+0);
556 vdwjidx0A = 2*vdwtype[jnrA+0];
558 fjx0 = _mm_setzero_pd();
559 fjy0 = _mm_setzero_pd();
560 fjz0 = _mm_setzero_pd();
562 /**************************
563 * CALCULATE INTERACTIONS *
564 **************************/
566 if (gmx_mm_any_lt(rsq00,rcutoff2))
569 r00 = _mm_mul_pd(rsq00,rinv00);
571 /* Compute parameters for interactions between i and j atoms */
572 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
574 /* LENNARD-JONES DISPERSION/REPULSION */
576 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
577 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
578 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
579 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
580 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
582 d = _mm_sub_pd(r00,rswitch);
583 d = _mm_max_pd(d,_mm_setzero_pd());
584 d2 = _mm_mul_pd(d,d);
585 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
587 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
589 /* Evaluate switch function */
590 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
591 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
592 vvdw = _mm_mul_pd(vvdw,sw);
593 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
595 /* Update potential sum for this i atom from the interaction with this j atom. */
596 vvdw = _mm_and_pd(vvdw,cutoff_mask);
597 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
598 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
602 fscal = _mm_and_pd(fscal,cutoff_mask);
604 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
606 /* Update vectorial force */
607 fix0 = _mm_macc_pd(dx00,fscal,fix0);
608 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
609 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
611 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
612 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
613 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
617 /**************************
618 * CALCULATE INTERACTIONS *
619 **************************/
621 if (gmx_mm_any_lt(rsq10,rcutoff2))
624 r10 = _mm_mul_pd(rsq10,rinv10);
626 /* Compute parameters for interactions between i and j atoms */
627 qq10 = _mm_mul_pd(iq1,jq0);
629 /* EWALD ELECTROSTATICS */
631 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
632 ewrt = _mm_mul_pd(r10,ewtabscale);
633 ewitab = _mm_cvttpd_epi32(ewrt);
635 eweps = _mm_frcz_pd(ewrt);
637 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
639 twoeweps = _mm_add_pd(eweps,eweps);
640 ewitab = _mm_slli_epi32(ewitab,2);
641 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
642 ewtabD = _mm_setzero_pd();
643 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
644 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
645 ewtabFn = _mm_setzero_pd();
646 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
647 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
648 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
649 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
650 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
652 d = _mm_sub_pd(r10,rswitch);
653 d = _mm_max_pd(d,_mm_setzero_pd());
654 d2 = _mm_mul_pd(d,d);
655 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
657 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
659 /* Evaluate switch function */
660 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
661 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
662 velec = _mm_mul_pd(velec,sw);
663 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
665 /* Update potential sum for this i atom from the interaction with this j atom. */
666 velec = _mm_and_pd(velec,cutoff_mask);
667 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
668 velecsum = _mm_add_pd(velecsum,velec);
672 fscal = _mm_and_pd(fscal,cutoff_mask);
674 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
676 /* Update vectorial force */
677 fix1 = _mm_macc_pd(dx10,fscal,fix1);
678 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
679 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
681 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
682 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
683 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
687 /**************************
688 * CALCULATE INTERACTIONS *
689 **************************/
691 if (gmx_mm_any_lt(rsq20,rcutoff2))
694 r20 = _mm_mul_pd(rsq20,rinv20);
696 /* Compute parameters for interactions between i and j atoms */
697 qq20 = _mm_mul_pd(iq2,jq0);
699 /* EWALD ELECTROSTATICS */
701 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
702 ewrt = _mm_mul_pd(r20,ewtabscale);
703 ewitab = _mm_cvttpd_epi32(ewrt);
705 eweps = _mm_frcz_pd(ewrt);
707 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
709 twoeweps = _mm_add_pd(eweps,eweps);
710 ewitab = _mm_slli_epi32(ewitab,2);
711 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
712 ewtabD = _mm_setzero_pd();
713 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
714 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
715 ewtabFn = _mm_setzero_pd();
716 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
717 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
718 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
719 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
720 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
722 d = _mm_sub_pd(r20,rswitch);
723 d = _mm_max_pd(d,_mm_setzero_pd());
724 d2 = _mm_mul_pd(d,d);
725 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
727 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
729 /* Evaluate switch function */
730 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
731 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
732 velec = _mm_mul_pd(velec,sw);
733 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
735 /* Update potential sum for this i atom from the interaction with this j atom. */
736 velec = _mm_and_pd(velec,cutoff_mask);
737 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
738 velecsum = _mm_add_pd(velecsum,velec);
742 fscal = _mm_and_pd(fscal,cutoff_mask);
744 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
746 /* Update vectorial force */
747 fix2 = _mm_macc_pd(dx20,fscal,fix2);
748 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
749 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
751 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
752 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
753 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
757 /**************************
758 * CALCULATE INTERACTIONS *
759 **************************/
761 if (gmx_mm_any_lt(rsq30,rcutoff2))
764 r30 = _mm_mul_pd(rsq30,rinv30);
766 /* Compute parameters for interactions between i and j atoms */
767 qq30 = _mm_mul_pd(iq3,jq0);
769 /* EWALD ELECTROSTATICS */
771 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
772 ewrt = _mm_mul_pd(r30,ewtabscale);
773 ewitab = _mm_cvttpd_epi32(ewrt);
775 eweps = _mm_frcz_pd(ewrt);
777 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
779 twoeweps = _mm_add_pd(eweps,eweps);
780 ewitab = _mm_slli_epi32(ewitab,2);
781 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
782 ewtabD = _mm_setzero_pd();
783 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
784 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
785 ewtabFn = _mm_setzero_pd();
786 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
787 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
788 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
789 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
790 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
792 d = _mm_sub_pd(r30,rswitch);
793 d = _mm_max_pd(d,_mm_setzero_pd());
794 d2 = _mm_mul_pd(d,d);
795 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
797 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
799 /* Evaluate switch function */
800 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
801 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
802 velec = _mm_mul_pd(velec,sw);
803 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
805 /* Update potential sum for this i atom from the interaction with this j atom. */
806 velec = _mm_and_pd(velec,cutoff_mask);
807 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
808 velecsum = _mm_add_pd(velecsum,velec);
812 fscal = _mm_and_pd(fscal,cutoff_mask);
814 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
816 /* Update vectorial force */
817 fix3 = _mm_macc_pd(dx30,fscal,fix3);
818 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
819 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
821 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
822 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
823 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
827 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
829 /* Inner loop uses 269 flops */
832 /* End of innermost loop */
834 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
835 f+i_coord_offset,fshift+i_shift_offset);
838 /* Update potential energies */
839 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
840 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
842 /* Increment number of inner iterations */
843 inneriter += j_index_end - j_index_start;
845 /* Outer loop uses 26 flops */
848 /* Increment number of outer iterations */
851 /* Update outer/inner flops */
853 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*269);
856 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_double
857 * Electrostatics interaction: Ewald
858 * VdW interaction: LennardJones
859 * Geometry: Water4-Particle
860 * Calculate force/pot: Force
863 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_avx_128_fma_double
864 (t_nblist * gmx_restrict nlist,
865 rvec * gmx_restrict xx,
866 rvec * gmx_restrict ff,
867 t_forcerec * gmx_restrict fr,
868 t_mdatoms * gmx_restrict mdatoms,
869 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
870 t_nrnb * gmx_restrict nrnb)
872 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
873 * just 0 for non-waters.
874 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
875 * jnr indices corresponding to data put in the four positions in the SIMD register.
877 int i_shift_offset,i_coord_offset,outeriter,inneriter;
878 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
880 int j_coord_offsetA,j_coord_offsetB;
881 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
883 real *shiftvec,*fshift,*x,*f;
884 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
886 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
888 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
890 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
892 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
893 int vdwjidx0A,vdwjidx0B;
894 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
895 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
896 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
897 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
898 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
899 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
902 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
905 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
906 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
908 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
910 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
911 real rswitch_scalar,d_scalar;
912 __m128d dummy_mask,cutoff_mask;
913 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
914 __m128d one = _mm_set1_pd(1.0);
915 __m128d two = _mm_set1_pd(2.0);
921 jindex = nlist->jindex;
923 shiftidx = nlist->shift;
925 shiftvec = fr->shift_vec[0];
926 fshift = fr->fshift[0];
927 facel = _mm_set1_pd(fr->epsfac);
928 charge = mdatoms->chargeA;
929 nvdwtype = fr->ntype;
931 vdwtype = mdatoms->typeA;
933 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
934 ewtab = fr->ic->tabq_coul_FDV0;
935 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
936 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
938 /* Setup water-specific parameters */
939 inr = nlist->iinr[0];
940 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
941 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
942 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
943 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
945 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
946 rcutoff_scalar = fr->rcoulomb;
947 rcutoff = _mm_set1_pd(rcutoff_scalar);
948 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
950 rswitch_scalar = fr->rcoulomb_switch;
951 rswitch = _mm_set1_pd(rswitch_scalar);
952 /* Setup switch parameters */
953 d_scalar = rcutoff_scalar-rswitch_scalar;
954 d = _mm_set1_pd(d_scalar);
955 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
956 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
957 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
958 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
959 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
960 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
962 /* Avoid stupid compiler warnings */
970 /* Start outer loop over neighborlists */
971 for(iidx=0; iidx<nri; iidx++)
973 /* Load shift vector for this list */
974 i_shift_offset = DIM*shiftidx[iidx];
976 /* Load limits for loop over neighbors */
977 j_index_start = jindex[iidx];
978 j_index_end = jindex[iidx+1];
980 /* Get outer coordinate index */
982 i_coord_offset = DIM*inr;
984 /* Load i particle coords and add shift vector */
985 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
986 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
988 fix0 = _mm_setzero_pd();
989 fiy0 = _mm_setzero_pd();
990 fiz0 = _mm_setzero_pd();
991 fix1 = _mm_setzero_pd();
992 fiy1 = _mm_setzero_pd();
993 fiz1 = _mm_setzero_pd();
994 fix2 = _mm_setzero_pd();
995 fiy2 = _mm_setzero_pd();
996 fiz2 = _mm_setzero_pd();
997 fix3 = _mm_setzero_pd();
998 fiy3 = _mm_setzero_pd();
999 fiz3 = _mm_setzero_pd();
1001 /* Start inner kernel loop */
1002 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1005 /* Get j neighbor index, and coordinate index */
1007 jnrB = jjnr[jidx+1];
1008 j_coord_offsetA = DIM*jnrA;
1009 j_coord_offsetB = DIM*jnrB;
1011 /* load j atom coordinates */
1012 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1015 /* Calculate displacement vector */
1016 dx00 = _mm_sub_pd(ix0,jx0);
1017 dy00 = _mm_sub_pd(iy0,jy0);
1018 dz00 = _mm_sub_pd(iz0,jz0);
1019 dx10 = _mm_sub_pd(ix1,jx0);
1020 dy10 = _mm_sub_pd(iy1,jy0);
1021 dz10 = _mm_sub_pd(iz1,jz0);
1022 dx20 = _mm_sub_pd(ix2,jx0);
1023 dy20 = _mm_sub_pd(iy2,jy0);
1024 dz20 = _mm_sub_pd(iz2,jz0);
1025 dx30 = _mm_sub_pd(ix3,jx0);
1026 dy30 = _mm_sub_pd(iy3,jy0);
1027 dz30 = _mm_sub_pd(iz3,jz0);
1029 /* Calculate squared distance and things based on it */
1030 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1031 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1032 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1033 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1035 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1036 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1037 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1038 rinv30 = gmx_mm_invsqrt_pd(rsq30);
1040 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1041 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1042 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1043 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
1045 /* Load parameters for j particles */
1046 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
1047 vdwjidx0A = 2*vdwtype[jnrA+0];
1048 vdwjidx0B = 2*vdwtype[jnrB+0];
1050 fjx0 = _mm_setzero_pd();
1051 fjy0 = _mm_setzero_pd();
1052 fjz0 = _mm_setzero_pd();
1054 /**************************
1055 * CALCULATE INTERACTIONS *
1056 **************************/
1058 if (gmx_mm_any_lt(rsq00,rcutoff2))
1061 r00 = _mm_mul_pd(rsq00,rinv00);
1063 /* Compute parameters for interactions between i and j atoms */
1064 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
1065 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
1067 /* LENNARD-JONES DISPERSION/REPULSION */
1069 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1070 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1071 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1072 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
1073 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1075 d = _mm_sub_pd(r00,rswitch);
1076 d = _mm_max_pd(d,_mm_setzero_pd());
1077 d2 = _mm_mul_pd(d,d);
1078 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1080 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1082 /* Evaluate switch function */
1083 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1084 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1085 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1089 fscal = _mm_and_pd(fscal,cutoff_mask);
1091 /* Update vectorial force */
1092 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1093 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1094 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1096 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1097 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1098 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1102 /**************************
1103 * CALCULATE INTERACTIONS *
1104 **************************/
1106 if (gmx_mm_any_lt(rsq10,rcutoff2))
1109 r10 = _mm_mul_pd(rsq10,rinv10);
1111 /* Compute parameters for interactions between i and j atoms */
1112 qq10 = _mm_mul_pd(iq1,jq0);
1114 /* EWALD ELECTROSTATICS */
1116 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1117 ewrt = _mm_mul_pd(r10,ewtabscale);
1118 ewitab = _mm_cvttpd_epi32(ewrt);
1120 eweps = _mm_frcz_pd(ewrt);
1122 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1124 twoeweps = _mm_add_pd(eweps,eweps);
1125 ewitab = _mm_slli_epi32(ewitab,2);
1126 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1127 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1128 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1129 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1130 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1131 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1132 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1133 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1134 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1135 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1137 d = _mm_sub_pd(r10,rswitch);
1138 d = _mm_max_pd(d,_mm_setzero_pd());
1139 d2 = _mm_mul_pd(d,d);
1140 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1142 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1144 /* Evaluate switch function */
1145 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1146 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1147 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1151 fscal = _mm_and_pd(fscal,cutoff_mask);
1153 /* Update vectorial force */
1154 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1155 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1156 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1158 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1159 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1160 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1164 /**************************
1165 * CALCULATE INTERACTIONS *
1166 **************************/
1168 if (gmx_mm_any_lt(rsq20,rcutoff2))
1171 r20 = _mm_mul_pd(rsq20,rinv20);
1173 /* Compute parameters for interactions between i and j atoms */
1174 qq20 = _mm_mul_pd(iq2,jq0);
1176 /* EWALD ELECTROSTATICS */
1178 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1179 ewrt = _mm_mul_pd(r20,ewtabscale);
1180 ewitab = _mm_cvttpd_epi32(ewrt);
1182 eweps = _mm_frcz_pd(ewrt);
1184 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1186 twoeweps = _mm_add_pd(eweps,eweps);
1187 ewitab = _mm_slli_epi32(ewitab,2);
1188 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1189 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1190 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1191 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1192 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1193 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1194 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1195 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1196 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1197 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1199 d = _mm_sub_pd(r20,rswitch);
1200 d = _mm_max_pd(d,_mm_setzero_pd());
1201 d2 = _mm_mul_pd(d,d);
1202 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1204 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1206 /* Evaluate switch function */
1207 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1208 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1209 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1213 fscal = _mm_and_pd(fscal,cutoff_mask);
1215 /* Update vectorial force */
1216 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1217 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1218 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1220 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1221 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1222 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1226 /**************************
1227 * CALCULATE INTERACTIONS *
1228 **************************/
1230 if (gmx_mm_any_lt(rsq30,rcutoff2))
1233 r30 = _mm_mul_pd(rsq30,rinv30);
1235 /* Compute parameters for interactions between i and j atoms */
1236 qq30 = _mm_mul_pd(iq3,jq0);
1238 /* EWALD ELECTROSTATICS */
1240 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1241 ewrt = _mm_mul_pd(r30,ewtabscale);
1242 ewitab = _mm_cvttpd_epi32(ewrt);
1244 eweps = _mm_frcz_pd(ewrt);
1246 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1248 twoeweps = _mm_add_pd(eweps,eweps);
1249 ewitab = _mm_slli_epi32(ewitab,2);
1250 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1251 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1252 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1253 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1254 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
1255 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1256 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1257 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1258 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
1259 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1261 d = _mm_sub_pd(r30,rswitch);
1262 d = _mm_max_pd(d,_mm_setzero_pd());
1263 d2 = _mm_mul_pd(d,d);
1264 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1266 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1268 /* Evaluate switch function */
1269 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1270 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
1271 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1275 fscal = _mm_and_pd(fscal,cutoff_mask);
1277 /* Update vectorial force */
1278 fix3 = _mm_macc_pd(dx30,fscal,fix3);
1279 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
1280 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
1282 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
1283 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
1284 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
1288 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1290 /* Inner loop uses 257 flops */
1293 if(jidx<j_index_end)
1297 j_coord_offsetA = DIM*jnrA;
1299 /* load j atom coordinates */
1300 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1303 /* Calculate displacement vector */
1304 dx00 = _mm_sub_pd(ix0,jx0);
1305 dy00 = _mm_sub_pd(iy0,jy0);
1306 dz00 = _mm_sub_pd(iz0,jz0);
1307 dx10 = _mm_sub_pd(ix1,jx0);
1308 dy10 = _mm_sub_pd(iy1,jy0);
1309 dz10 = _mm_sub_pd(iz1,jz0);
1310 dx20 = _mm_sub_pd(ix2,jx0);
1311 dy20 = _mm_sub_pd(iy2,jy0);
1312 dz20 = _mm_sub_pd(iz2,jz0);
1313 dx30 = _mm_sub_pd(ix3,jx0);
1314 dy30 = _mm_sub_pd(iy3,jy0);
1315 dz30 = _mm_sub_pd(iz3,jz0);
1317 /* Calculate squared distance and things based on it */
1318 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1319 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1320 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1321 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1323 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1324 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1325 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1326 rinv30 = gmx_mm_invsqrt_pd(rsq30);
1328 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1329 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1330 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1331 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
1333 /* Load parameters for j particles */
1334 jq0 = _mm_load_sd(charge+jnrA+0);
1335 vdwjidx0A = 2*vdwtype[jnrA+0];
1337 fjx0 = _mm_setzero_pd();
1338 fjy0 = _mm_setzero_pd();
1339 fjz0 = _mm_setzero_pd();
1341 /**************************
1342 * CALCULATE INTERACTIONS *
1343 **************************/
1345 if (gmx_mm_any_lt(rsq00,rcutoff2))
1348 r00 = _mm_mul_pd(rsq00,rinv00);
1350 /* Compute parameters for interactions between i and j atoms */
1351 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
1353 /* LENNARD-JONES DISPERSION/REPULSION */
1355 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1356 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1357 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1358 vvdw = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
1359 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1361 d = _mm_sub_pd(r00,rswitch);
1362 d = _mm_max_pd(d,_mm_setzero_pd());
1363 d2 = _mm_mul_pd(d,d);
1364 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1366 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1368 /* Evaluate switch function */
1369 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1370 fvdw = _mm_msub_pd( fvdw,sw , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1371 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1375 fscal = _mm_and_pd(fscal,cutoff_mask);
1377 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1379 /* Update vectorial force */
1380 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1381 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1382 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1384 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1385 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1386 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1390 /**************************
1391 * CALCULATE INTERACTIONS *
1392 **************************/
1394 if (gmx_mm_any_lt(rsq10,rcutoff2))
1397 r10 = _mm_mul_pd(rsq10,rinv10);
1399 /* Compute parameters for interactions between i and j atoms */
1400 qq10 = _mm_mul_pd(iq1,jq0);
1402 /* EWALD ELECTROSTATICS */
1404 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1405 ewrt = _mm_mul_pd(r10,ewtabscale);
1406 ewitab = _mm_cvttpd_epi32(ewrt);
1408 eweps = _mm_frcz_pd(ewrt);
1410 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1412 twoeweps = _mm_add_pd(eweps,eweps);
1413 ewitab = _mm_slli_epi32(ewitab,2);
1414 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1415 ewtabD = _mm_setzero_pd();
1416 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1417 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1418 ewtabFn = _mm_setzero_pd();
1419 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1420 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1421 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1422 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1423 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1425 d = _mm_sub_pd(r10,rswitch);
1426 d = _mm_max_pd(d,_mm_setzero_pd());
1427 d2 = _mm_mul_pd(d,d);
1428 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1430 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1432 /* Evaluate switch function */
1433 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1434 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1435 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1439 fscal = _mm_and_pd(fscal,cutoff_mask);
1441 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1443 /* Update vectorial force */
1444 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1445 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1446 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1448 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1449 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1450 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1454 /**************************
1455 * CALCULATE INTERACTIONS *
1456 **************************/
1458 if (gmx_mm_any_lt(rsq20,rcutoff2))
1461 r20 = _mm_mul_pd(rsq20,rinv20);
1463 /* Compute parameters for interactions between i and j atoms */
1464 qq20 = _mm_mul_pd(iq2,jq0);
1466 /* EWALD ELECTROSTATICS */
1468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1469 ewrt = _mm_mul_pd(r20,ewtabscale);
1470 ewitab = _mm_cvttpd_epi32(ewrt);
1472 eweps = _mm_frcz_pd(ewrt);
1474 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1476 twoeweps = _mm_add_pd(eweps,eweps);
1477 ewitab = _mm_slli_epi32(ewitab,2);
1478 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1479 ewtabD = _mm_setzero_pd();
1480 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1481 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1482 ewtabFn = _mm_setzero_pd();
1483 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1484 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1485 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1486 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1487 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1489 d = _mm_sub_pd(r20,rswitch);
1490 d = _mm_max_pd(d,_mm_setzero_pd());
1491 d2 = _mm_mul_pd(d,d);
1492 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1494 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1496 /* Evaluate switch function */
1497 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1498 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1499 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1503 fscal = _mm_and_pd(fscal,cutoff_mask);
1505 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1507 /* Update vectorial force */
1508 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1509 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1510 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1512 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1513 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1514 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1518 /**************************
1519 * CALCULATE INTERACTIONS *
1520 **************************/
1522 if (gmx_mm_any_lt(rsq30,rcutoff2))
1525 r30 = _mm_mul_pd(rsq30,rinv30);
1527 /* Compute parameters for interactions between i and j atoms */
1528 qq30 = _mm_mul_pd(iq3,jq0);
1530 /* EWALD ELECTROSTATICS */
1532 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1533 ewrt = _mm_mul_pd(r30,ewtabscale);
1534 ewitab = _mm_cvttpd_epi32(ewrt);
1536 eweps = _mm_frcz_pd(ewrt);
1538 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1540 twoeweps = _mm_add_pd(eweps,eweps);
1541 ewitab = _mm_slli_epi32(ewitab,2);
1542 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1543 ewtabD = _mm_setzero_pd();
1544 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1545 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1546 ewtabFn = _mm_setzero_pd();
1547 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1548 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1549 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1550 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
1551 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1553 d = _mm_sub_pd(r30,rswitch);
1554 d = _mm_max_pd(d,_mm_setzero_pd());
1555 d2 = _mm_mul_pd(d,d);
1556 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_macc_pd(d,_mm_macc_pd(d,swV5,swV4),swV3))));
1558 dsw = _mm_mul_pd(d2,_mm_macc_pd(d,_mm_macc_pd(d,swF4,swF3),swF2));
1560 /* Evaluate switch function */
1561 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1562 felec = _mm_msub_pd( felec,sw , _mm_mul_pd(rinv30,_mm_mul_pd(velec,dsw)) );
1563 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1567 fscal = _mm_and_pd(fscal,cutoff_mask);
1569 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1571 /* Update vectorial force */
1572 fix3 = _mm_macc_pd(dx30,fscal,fix3);
1573 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
1574 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
1576 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
1577 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
1578 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
1582 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1584 /* Inner loop uses 257 flops */
1587 /* End of innermost loop */
1589 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1590 f+i_coord_offset,fshift+i_shift_offset);
1592 /* Increment number of inner iterations */
1593 inneriter += j_index_end - j_index_start;
1595 /* Outer loop uses 24 flops */
1598 /* Increment number of outer iterations */
1601 /* Update outer/inner flops */
1603 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*257);