2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_sse2_double.h"
50 #include "kernelutil_x86_sse2_double.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_double
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water3-Particle
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_sse2_double
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
77 int j_coord_offsetA,j_coord_offsetB;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
91 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
92 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
93 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
96 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
100 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
102 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
104 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
105 real rswitch_scalar,d_scalar;
106 __m128d dummy_mask,cutoff_mask;
107 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
108 __m128d one = _mm_set1_pd(1.0);
109 __m128d two = _mm_set1_pd(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm_set1_pd(fr->epsfac);
122 charge = mdatoms->chargeA;
123 nvdwtype = fr->ntype;
125 vdwtype = mdatoms->typeA;
127 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
128 ewtab = fr->ic->tabq_coul_FDV0;
129 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
130 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
132 /* Setup water-specific parameters */
133 inr = nlist->iinr[0];
134 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
135 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
136 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
137 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
139 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
140 rcutoff_scalar = fr->rcoulomb;
141 rcutoff = _mm_set1_pd(rcutoff_scalar);
142 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
144 rswitch_scalar = fr->rcoulomb_switch;
145 rswitch = _mm_set1_pd(rswitch_scalar);
146 /* Setup switch parameters */
147 d_scalar = rcutoff_scalar-rswitch_scalar;
148 d = _mm_set1_pd(d_scalar);
149 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
150 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
151 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
152 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
153 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
154 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
156 /* Avoid stupid compiler warnings */
164 /* Start outer loop over neighborlists */
165 for(iidx=0; iidx<nri; iidx++)
167 /* Load shift vector for this list */
168 i_shift_offset = DIM*shiftidx[iidx];
170 /* Load limits for loop over neighbors */
171 j_index_start = jindex[iidx];
172 j_index_end = jindex[iidx+1];
174 /* Get outer coordinate index */
176 i_coord_offset = DIM*inr;
178 /* Load i particle coords and add shift vector */
179 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
180 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
182 fix0 = _mm_setzero_pd();
183 fiy0 = _mm_setzero_pd();
184 fiz0 = _mm_setzero_pd();
185 fix1 = _mm_setzero_pd();
186 fiy1 = _mm_setzero_pd();
187 fiz1 = _mm_setzero_pd();
188 fix2 = _mm_setzero_pd();
189 fiy2 = _mm_setzero_pd();
190 fiz2 = _mm_setzero_pd();
192 /* Reset potential sums */
193 velecsum = _mm_setzero_pd();
194 vvdwsum = _mm_setzero_pd();
196 /* Start inner kernel loop */
197 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
200 /* Get j neighbor index, and coordinate index */
203 j_coord_offsetA = DIM*jnrA;
204 j_coord_offsetB = DIM*jnrB;
206 /* load j atom coordinates */
207 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
210 /* Calculate displacement vector */
211 dx00 = _mm_sub_pd(ix0,jx0);
212 dy00 = _mm_sub_pd(iy0,jy0);
213 dz00 = _mm_sub_pd(iz0,jz0);
214 dx10 = _mm_sub_pd(ix1,jx0);
215 dy10 = _mm_sub_pd(iy1,jy0);
216 dz10 = _mm_sub_pd(iz1,jz0);
217 dx20 = _mm_sub_pd(ix2,jx0);
218 dy20 = _mm_sub_pd(iy2,jy0);
219 dz20 = _mm_sub_pd(iz2,jz0);
221 /* Calculate squared distance and things based on it */
222 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
223 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
224 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
226 rinv00 = gmx_mm_invsqrt_pd(rsq00);
227 rinv10 = gmx_mm_invsqrt_pd(rsq10);
228 rinv20 = gmx_mm_invsqrt_pd(rsq20);
230 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
231 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
232 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
234 /* Load parameters for j particles */
235 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
236 vdwjidx0A = 2*vdwtype[jnrA+0];
237 vdwjidx0B = 2*vdwtype[jnrB+0];
239 fjx0 = _mm_setzero_pd();
240 fjy0 = _mm_setzero_pd();
241 fjz0 = _mm_setzero_pd();
243 /**************************
244 * CALCULATE INTERACTIONS *
245 **************************/
247 if (gmx_mm_any_lt(rsq00,rcutoff2))
250 r00 = _mm_mul_pd(rsq00,rinv00);
252 /* Compute parameters for interactions between i and j atoms */
253 qq00 = _mm_mul_pd(iq0,jq0);
254 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
255 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
257 /* EWALD ELECTROSTATICS */
259 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
260 ewrt = _mm_mul_pd(r00,ewtabscale);
261 ewitab = _mm_cvttpd_epi32(ewrt);
262 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
263 ewitab = _mm_slli_epi32(ewitab,2);
264 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
265 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
266 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
267 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
268 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
269 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
270 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
271 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
272 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
273 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
275 /* LENNARD-JONES DISPERSION/REPULSION */
277 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
278 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
279 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
280 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
281 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
283 d = _mm_sub_pd(r00,rswitch);
284 d = _mm_max_pd(d,_mm_setzero_pd());
285 d2 = _mm_mul_pd(d,d);
286 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
288 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
290 /* Evaluate switch function */
291 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
292 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
293 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
294 velec = _mm_mul_pd(velec,sw);
295 vvdw = _mm_mul_pd(vvdw,sw);
296 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
298 /* Update potential sum for this i atom from the interaction with this j atom. */
299 velec = _mm_and_pd(velec,cutoff_mask);
300 velecsum = _mm_add_pd(velecsum,velec);
301 vvdw = _mm_and_pd(vvdw,cutoff_mask);
302 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
304 fscal = _mm_add_pd(felec,fvdw);
306 fscal = _mm_and_pd(fscal,cutoff_mask);
308 /* Calculate temporary vectorial force */
309 tx = _mm_mul_pd(fscal,dx00);
310 ty = _mm_mul_pd(fscal,dy00);
311 tz = _mm_mul_pd(fscal,dz00);
313 /* Update vectorial force */
314 fix0 = _mm_add_pd(fix0,tx);
315 fiy0 = _mm_add_pd(fiy0,ty);
316 fiz0 = _mm_add_pd(fiz0,tz);
318 fjx0 = _mm_add_pd(fjx0,tx);
319 fjy0 = _mm_add_pd(fjy0,ty);
320 fjz0 = _mm_add_pd(fjz0,tz);
324 /**************************
325 * CALCULATE INTERACTIONS *
326 **************************/
328 if (gmx_mm_any_lt(rsq10,rcutoff2))
331 r10 = _mm_mul_pd(rsq10,rinv10);
333 /* Compute parameters for interactions between i and j atoms */
334 qq10 = _mm_mul_pd(iq1,jq0);
336 /* EWALD ELECTROSTATICS */
338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
339 ewrt = _mm_mul_pd(r10,ewtabscale);
340 ewitab = _mm_cvttpd_epi32(ewrt);
341 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
342 ewitab = _mm_slli_epi32(ewitab,2);
343 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
344 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
345 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
346 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
347 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
348 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
349 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
350 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
351 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
352 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
354 d = _mm_sub_pd(r10,rswitch);
355 d = _mm_max_pd(d,_mm_setzero_pd());
356 d2 = _mm_mul_pd(d,d);
357 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
359 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
361 /* Evaluate switch function */
362 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
363 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
364 velec = _mm_mul_pd(velec,sw);
365 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
367 /* Update potential sum for this i atom from the interaction with this j atom. */
368 velec = _mm_and_pd(velec,cutoff_mask);
369 velecsum = _mm_add_pd(velecsum,velec);
373 fscal = _mm_and_pd(fscal,cutoff_mask);
375 /* Calculate temporary vectorial force */
376 tx = _mm_mul_pd(fscal,dx10);
377 ty = _mm_mul_pd(fscal,dy10);
378 tz = _mm_mul_pd(fscal,dz10);
380 /* Update vectorial force */
381 fix1 = _mm_add_pd(fix1,tx);
382 fiy1 = _mm_add_pd(fiy1,ty);
383 fiz1 = _mm_add_pd(fiz1,tz);
385 fjx0 = _mm_add_pd(fjx0,tx);
386 fjy0 = _mm_add_pd(fjy0,ty);
387 fjz0 = _mm_add_pd(fjz0,tz);
391 /**************************
392 * CALCULATE INTERACTIONS *
393 **************************/
395 if (gmx_mm_any_lt(rsq20,rcutoff2))
398 r20 = _mm_mul_pd(rsq20,rinv20);
400 /* Compute parameters for interactions between i and j atoms */
401 qq20 = _mm_mul_pd(iq2,jq0);
403 /* EWALD ELECTROSTATICS */
405 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
406 ewrt = _mm_mul_pd(r20,ewtabscale);
407 ewitab = _mm_cvttpd_epi32(ewrt);
408 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
409 ewitab = _mm_slli_epi32(ewitab,2);
410 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
411 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
412 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
413 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
414 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
415 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
416 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
417 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
418 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
419 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
421 d = _mm_sub_pd(r20,rswitch);
422 d = _mm_max_pd(d,_mm_setzero_pd());
423 d2 = _mm_mul_pd(d,d);
424 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
426 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
428 /* Evaluate switch function */
429 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
430 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
431 velec = _mm_mul_pd(velec,sw);
432 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
434 /* Update potential sum for this i atom from the interaction with this j atom. */
435 velec = _mm_and_pd(velec,cutoff_mask);
436 velecsum = _mm_add_pd(velecsum,velec);
440 fscal = _mm_and_pd(fscal,cutoff_mask);
442 /* Calculate temporary vectorial force */
443 tx = _mm_mul_pd(fscal,dx20);
444 ty = _mm_mul_pd(fscal,dy20);
445 tz = _mm_mul_pd(fscal,dz20);
447 /* Update vectorial force */
448 fix2 = _mm_add_pd(fix2,tx);
449 fiy2 = _mm_add_pd(fiy2,ty);
450 fiz2 = _mm_add_pd(fiz2,tz);
452 fjx0 = _mm_add_pd(fjx0,tx);
453 fjy0 = _mm_add_pd(fjy0,ty);
454 fjz0 = _mm_add_pd(fjz0,tz);
458 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
460 /* Inner loop uses 216 flops */
467 j_coord_offsetA = DIM*jnrA;
469 /* load j atom coordinates */
470 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
473 /* Calculate displacement vector */
474 dx00 = _mm_sub_pd(ix0,jx0);
475 dy00 = _mm_sub_pd(iy0,jy0);
476 dz00 = _mm_sub_pd(iz0,jz0);
477 dx10 = _mm_sub_pd(ix1,jx0);
478 dy10 = _mm_sub_pd(iy1,jy0);
479 dz10 = _mm_sub_pd(iz1,jz0);
480 dx20 = _mm_sub_pd(ix2,jx0);
481 dy20 = _mm_sub_pd(iy2,jy0);
482 dz20 = _mm_sub_pd(iz2,jz0);
484 /* Calculate squared distance and things based on it */
485 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
486 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
487 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
489 rinv00 = gmx_mm_invsqrt_pd(rsq00);
490 rinv10 = gmx_mm_invsqrt_pd(rsq10);
491 rinv20 = gmx_mm_invsqrt_pd(rsq20);
493 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
494 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
495 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
497 /* Load parameters for j particles */
498 jq0 = _mm_load_sd(charge+jnrA+0);
499 vdwjidx0A = 2*vdwtype[jnrA+0];
501 fjx0 = _mm_setzero_pd();
502 fjy0 = _mm_setzero_pd();
503 fjz0 = _mm_setzero_pd();
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 if (gmx_mm_any_lt(rsq00,rcutoff2))
512 r00 = _mm_mul_pd(rsq00,rinv00);
514 /* Compute parameters for interactions between i and j atoms */
515 qq00 = _mm_mul_pd(iq0,jq0);
516 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
518 /* EWALD ELECTROSTATICS */
520 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
521 ewrt = _mm_mul_pd(r00,ewtabscale);
522 ewitab = _mm_cvttpd_epi32(ewrt);
523 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
524 ewitab = _mm_slli_epi32(ewitab,2);
525 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
526 ewtabD = _mm_setzero_pd();
527 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
528 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
529 ewtabFn = _mm_setzero_pd();
530 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
531 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
532 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
533 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
534 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
536 /* LENNARD-JONES DISPERSION/REPULSION */
538 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
539 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
540 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
541 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
542 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
544 d = _mm_sub_pd(r00,rswitch);
545 d = _mm_max_pd(d,_mm_setzero_pd());
546 d2 = _mm_mul_pd(d,d);
547 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
549 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
551 /* Evaluate switch function */
552 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
553 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
554 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
555 velec = _mm_mul_pd(velec,sw);
556 vvdw = _mm_mul_pd(vvdw,sw);
557 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
559 /* Update potential sum for this i atom from the interaction with this j atom. */
560 velec = _mm_and_pd(velec,cutoff_mask);
561 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
562 velecsum = _mm_add_pd(velecsum,velec);
563 vvdw = _mm_and_pd(vvdw,cutoff_mask);
564 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
565 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
567 fscal = _mm_add_pd(felec,fvdw);
569 fscal = _mm_and_pd(fscal,cutoff_mask);
571 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
573 /* Calculate temporary vectorial force */
574 tx = _mm_mul_pd(fscal,dx00);
575 ty = _mm_mul_pd(fscal,dy00);
576 tz = _mm_mul_pd(fscal,dz00);
578 /* Update vectorial force */
579 fix0 = _mm_add_pd(fix0,tx);
580 fiy0 = _mm_add_pd(fiy0,ty);
581 fiz0 = _mm_add_pd(fiz0,tz);
583 fjx0 = _mm_add_pd(fjx0,tx);
584 fjy0 = _mm_add_pd(fjy0,ty);
585 fjz0 = _mm_add_pd(fjz0,tz);
589 /**************************
590 * CALCULATE INTERACTIONS *
591 **************************/
593 if (gmx_mm_any_lt(rsq10,rcutoff2))
596 r10 = _mm_mul_pd(rsq10,rinv10);
598 /* Compute parameters for interactions between i and j atoms */
599 qq10 = _mm_mul_pd(iq1,jq0);
601 /* EWALD ELECTROSTATICS */
603 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
604 ewrt = _mm_mul_pd(r10,ewtabscale);
605 ewitab = _mm_cvttpd_epi32(ewrt);
606 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
607 ewitab = _mm_slli_epi32(ewitab,2);
608 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
609 ewtabD = _mm_setzero_pd();
610 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
611 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
612 ewtabFn = _mm_setzero_pd();
613 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
614 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
615 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
616 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
617 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
619 d = _mm_sub_pd(r10,rswitch);
620 d = _mm_max_pd(d,_mm_setzero_pd());
621 d2 = _mm_mul_pd(d,d);
622 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
624 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
626 /* Evaluate switch function */
627 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
628 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
629 velec = _mm_mul_pd(velec,sw);
630 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
632 /* Update potential sum for this i atom from the interaction with this j atom. */
633 velec = _mm_and_pd(velec,cutoff_mask);
634 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
635 velecsum = _mm_add_pd(velecsum,velec);
639 fscal = _mm_and_pd(fscal,cutoff_mask);
641 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
643 /* Calculate temporary vectorial force */
644 tx = _mm_mul_pd(fscal,dx10);
645 ty = _mm_mul_pd(fscal,dy10);
646 tz = _mm_mul_pd(fscal,dz10);
648 /* Update vectorial force */
649 fix1 = _mm_add_pd(fix1,tx);
650 fiy1 = _mm_add_pd(fiy1,ty);
651 fiz1 = _mm_add_pd(fiz1,tz);
653 fjx0 = _mm_add_pd(fjx0,tx);
654 fjy0 = _mm_add_pd(fjy0,ty);
655 fjz0 = _mm_add_pd(fjz0,tz);
659 /**************************
660 * CALCULATE INTERACTIONS *
661 **************************/
663 if (gmx_mm_any_lt(rsq20,rcutoff2))
666 r20 = _mm_mul_pd(rsq20,rinv20);
668 /* Compute parameters for interactions between i and j atoms */
669 qq20 = _mm_mul_pd(iq2,jq0);
671 /* EWALD ELECTROSTATICS */
673 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
674 ewrt = _mm_mul_pd(r20,ewtabscale);
675 ewitab = _mm_cvttpd_epi32(ewrt);
676 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
677 ewitab = _mm_slli_epi32(ewitab,2);
678 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
679 ewtabD = _mm_setzero_pd();
680 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
681 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
682 ewtabFn = _mm_setzero_pd();
683 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
684 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
685 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
686 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
687 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
689 d = _mm_sub_pd(r20,rswitch);
690 d = _mm_max_pd(d,_mm_setzero_pd());
691 d2 = _mm_mul_pd(d,d);
692 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
694 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
696 /* Evaluate switch function */
697 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
698 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
699 velec = _mm_mul_pd(velec,sw);
700 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
702 /* Update potential sum for this i atom from the interaction with this j atom. */
703 velec = _mm_and_pd(velec,cutoff_mask);
704 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
705 velecsum = _mm_add_pd(velecsum,velec);
709 fscal = _mm_and_pd(fscal,cutoff_mask);
711 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
713 /* Calculate temporary vectorial force */
714 tx = _mm_mul_pd(fscal,dx20);
715 ty = _mm_mul_pd(fscal,dy20);
716 tz = _mm_mul_pd(fscal,dz20);
718 /* Update vectorial force */
719 fix2 = _mm_add_pd(fix2,tx);
720 fiy2 = _mm_add_pd(fiy2,ty);
721 fiz2 = _mm_add_pd(fiz2,tz);
723 fjx0 = _mm_add_pd(fjx0,tx);
724 fjy0 = _mm_add_pd(fjy0,ty);
725 fjz0 = _mm_add_pd(fjz0,tz);
729 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
731 /* Inner loop uses 216 flops */
734 /* End of innermost loop */
736 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
737 f+i_coord_offset,fshift+i_shift_offset);
740 /* Update potential energies */
741 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
742 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
744 /* Increment number of inner iterations */
745 inneriter += j_index_end - j_index_start;
747 /* Outer loop uses 20 flops */
750 /* Increment number of outer iterations */
753 /* Update outer/inner flops */
755 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*216);
758 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_double
759 * Electrostatics interaction: Ewald
760 * VdW interaction: LennardJones
761 * Geometry: Water3-Particle
762 * Calculate force/pot: Force
765 nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_sse2_double
766 (t_nblist * gmx_restrict nlist,
767 rvec * gmx_restrict xx,
768 rvec * gmx_restrict ff,
769 t_forcerec * gmx_restrict fr,
770 t_mdatoms * gmx_restrict mdatoms,
771 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
772 t_nrnb * gmx_restrict nrnb)
774 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
775 * just 0 for non-waters.
776 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
777 * jnr indices corresponding to data put in the four positions in the SIMD register.
779 int i_shift_offset,i_coord_offset,outeriter,inneriter;
780 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
782 int j_coord_offsetA,j_coord_offsetB;
783 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
785 real *shiftvec,*fshift,*x,*f;
786 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
788 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
790 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
792 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
793 int vdwjidx0A,vdwjidx0B;
794 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
795 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
796 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
797 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
798 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
801 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
804 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
805 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
807 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
809 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
810 real rswitch_scalar,d_scalar;
811 __m128d dummy_mask,cutoff_mask;
812 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
813 __m128d one = _mm_set1_pd(1.0);
814 __m128d two = _mm_set1_pd(2.0);
820 jindex = nlist->jindex;
822 shiftidx = nlist->shift;
824 shiftvec = fr->shift_vec[0];
825 fshift = fr->fshift[0];
826 facel = _mm_set1_pd(fr->epsfac);
827 charge = mdatoms->chargeA;
828 nvdwtype = fr->ntype;
830 vdwtype = mdatoms->typeA;
832 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
833 ewtab = fr->ic->tabq_coul_FDV0;
834 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
835 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
837 /* Setup water-specific parameters */
838 inr = nlist->iinr[0];
839 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
840 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
841 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
842 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
844 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
845 rcutoff_scalar = fr->rcoulomb;
846 rcutoff = _mm_set1_pd(rcutoff_scalar);
847 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
849 rswitch_scalar = fr->rcoulomb_switch;
850 rswitch = _mm_set1_pd(rswitch_scalar);
851 /* Setup switch parameters */
852 d_scalar = rcutoff_scalar-rswitch_scalar;
853 d = _mm_set1_pd(d_scalar);
854 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
855 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
856 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
857 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
858 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
859 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
861 /* Avoid stupid compiler warnings */
869 /* Start outer loop over neighborlists */
870 for(iidx=0; iidx<nri; iidx++)
872 /* Load shift vector for this list */
873 i_shift_offset = DIM*shiftidx[iidx];
875 /* Load limits for loop over neighbors */
876 j_index_start = jindex[iidx];
877 j_index_end = jindex[iidx+1];
879 /* Get outer coordinate index */
881 i_coord_offset = DIM*inr;
883 /* Load i particle coords and add shift vector */
884 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
885 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
887 fix0 = _mm_setzero_pd();
888 fiy0 = _mm_setzero_pd();
889 fiz0 = _mm_setzero_pd();
890 fix1 = _mm_setzero_pd();
891 fiy1 = _mm_setzero_pd();
892 fiz1 = _mm_setzero_pd();
893 fix2 = _mm_setzero_pd();
894 fiy2 = _mm_setzero_pd();
895 fiz2 = _mm_setzero_pd();
897 /* Start inner kernel loop */
898 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
901 /* Get j neighbor index, and coordinate index */
904 j_coord_offsetA = DIM*jnrA;
905 j_coord_offsetB = DIM*jnrB;
907 /* load j atom coordinates */
908 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
911 /* Calculate displacement vector */
912 dx00 = _mm_sub_pd(ix0,jx0);
913 dy00 = _mm_sub_pd(iy0,jy0);
914 dz00 = _mm_sub_pd(iz0,jz0);
915 dx10 = _mm_sub_pd(ix1,jx0);
916 dy10 = _mm_sub_pd(iy1,jy0);
917 dz10 = _mm_sub_pd(iz1,jz0);
918 dx20 = _mm_sub_pd(ix2,jx0);
919 dy20 = _mm_sub_pd(iy2,jy0);
920 dz20 = _mm_sub_pd(iz2,jz0);
922 /* Calculate squared distance and things based on it */
923 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
924 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
925 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
927 rinv00 = gmx_mm_invsqrt_pd(rsq00);
928 rinv10 = gmx_mm_invsqrt_pd(rsq10);
929 rinv20 = gmx_mm_invsqrt_pd(rsq20);
931 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
932 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
933 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
935 /* Load parameters for j particles */
936 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
937 vdwjidx0A = 2*vdwtype[jnrA+0];
938 vdwjidx0B = 2*vdwtype[jnrB+0];
940 fjx0 = _mm_setzero_pd();
941 fjy0 = _mm_setzero_pd();
942 fjz0 = _mm_setzero_pd();
944 /**************************
945 * CALCULATE INTERACTIONS *
946 **************************/
948 if (gmx_mm_any_lt(rsq00,rcutoff2))
951 r00 = _mm_mul_pd(rsq00,rinv00);
953 /* Compute parameters for interactions between i and j atoms */
954 qq00 = _mm_mul_pd(iq0,jq0);
955 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
956 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
958 /* EWALD ELECTROSTATICS */
960 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
961 ewrt = _mm_mul_pd(r00,ewtabscale);
962 ewitab = _mm_cvttpd_epi32(ewrt);
963 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
964 ewitab = _mm_slli_epi32(ewitab,2);
965 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
966 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
967 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
968 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
969 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
970 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
971 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
972 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
973 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
974 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
976 /* LENNARD-JONES DISPERSION/REPULSION */
978 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
979 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
980 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
981 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
982 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
984 d = _mm_sub_pd(r00,rswitch);
985 d = _mm_max_pd(d,_mm_setzero_pd());
986 d2 = _mm_mul_pd(d,d);
987 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
989 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
991 /* Evaluate switch function */
992 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
993 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
994 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
995 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
997 fscal = _mm_add_pd(felec,fvdw);
999 fscal = _mm_and_pd(fscal,cutoff_mask);
1001 /* Calculate temporary vectorial force */
1002 tx = _mm_mul_pd(fscal,dx00);
1003 ty = _mm_mul_pd(fscal,dy00);
1004 tz = _mm_mul_pd(fscal,dz00);
1006 /* Update vectorial force */
1007 fix0 = _mm_add_pd(fix0,tx);
1008 fiy0 = _mm_add_pd(fiy0,ty);
1009 fiz0 = _mm_add_pd(fiz0,tz);
1011 fjx0 = _mm_add_pd(fjx0,tx);
1012 fjy0 = _mm_add_pd(fjy0,ty);
1013 fjz0 = _mm_add_pd(fjz0,tz);
1017 /**************************
1018 * CALCULATE INTERACTIONS *
1019 **************************/
1021 if (gmx_mm_any_lt(rsq10,rcutoff2))
1024 r10 = _mm_mul_pd(rsq10,rinv10);
1026 /* Compute parameters for interactions between i and j atoms */
1027 qq10 = _mm_mul_pd(iq1,jq0);
1029 /* EWALD ELECTROSTATICS */
1031 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1032 ewrt = _mm_mul_pd(r10,ewtabscale);
1033 ewitab = _mm_cvttpd_epi32(ewrt);
1034 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1035 ewitab = _mm_slli_epi32(ewitab,2);
1036 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1037 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1038 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1039 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1040 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1041 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1042 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1043 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1044 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1045 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1047 d = _mm_sub_pd(r10,rswitch);
1048 d = _mm_max_pd(d,_mm_setzero_pd());
1049 d2 = _mm_mul_pd(d,d);
1050 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1052 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1054 /* Evaluate switch function */
1055 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1056 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1057 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1061 fscal = _mm_and_pd(fscal,cutoff_mask);
1063 /* Calculate temporary vectorial force */
1064 tx = _mm_mul_pd(fscal,dx10);
1065 ty = _mm_mul_pd(fscal,dy10);
1066 tz = _mm_mul_pd(fscal,dz10);
1068 /* Update vectorial force */
1069 fix1 = _mm_add_pd(fix1,tx);
1070 fiy1 = _mm_add_pd(fiy1,ty);
1071 fiz1 = _mm_add_pd(fiz1,tz);
1073 fjx0 = _mm_add_pd(fjx0,tx);
1074 fjy0 = _mm_add_pd(fjy0,ty);
1075 fjz0 = _mm_add_pd(fjz0,tz);
1079 /**************************
1080 * CALCULATE INTERACTIONS *
1081 **************************/
1083 if (gmx_mm_any_lt(rsq20,rcutoff2))
1086 r20 = _mm_mul_pd(rsq20,rinv20);
1088 /* Compute parameters for interactions between i and j atoms */
1089 qq20 = _mm_mul_pd(iq2,jq0);
1091 /* EWALD ELECTROSTATICS */
1093 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1094 ewrt = _mm_mul_pd(r20,ewtabscale);
1095 ewitab = _mm_cvttpd_epi32(ewrt);
1096 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1097 ewitab = _mm_slli_epi32(ewitab,2);
1098 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1099 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1100 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1101 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1102 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1103 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1104 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1105 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1106 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1107 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1109 d = _mm_sub_pd(r20,rswitch);
1110 d = _mm_max_pd(d,_mm_setzero_pd());
1111 d2 = _mm_mul_pd(d,d);
1112 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1114 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1116 /* Evaluate switch function */
1117 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1118 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1119 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1123 fscal = _mm_and_pd(fscal,cutoff_mask);
1125 /* Calculate temporary vectorial force */
1126 tx = _mm_mul_pd(fscal,dx20);
1127 ty = _mm_mul_pd(fscal,dy20);
1128 tz = _mm_mul_pd(fscal,dz20);
1130 /* Update vectorial force */
1131 fix2 = _mm_add_pd(fix2,tx);
1132 fiy2 = _mm_add_pd(fiy2,ty);
1133 fiz2 = _mm_add_pd(fiz2,tz);
1135 fjx0 = _mm_add_pd(fjx0,tx);
1136 fjy0 = _mm_add_pd(fjy0,ty);
1137 fjz0 = _mm_add_pd(fjz0,tz);
1141 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1143 /* Inner loop uses 204 flops */
1146 if(jidx<j_index_end)
1150 j_coord_offsetA = DIM*jnrA;
1152 /* load j atom coordinates */
1153 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1156 /* Calculate displacement vector */
1157 dx00 = _mm_sub_pd(ix0,jx0);
1158 dy00 = _mm_sub_pd(iy0,jy0);
1159 dz00 = _mm_sub_pd(iz0,jz0);
1160 dx10 = _mm_sub_pd(ix1,jx0);
1161 dy10 = _mm_sub_pd(iy1,jy0);
1162 dz10 = _mm_sub_pd(iz1,jz0);
1163 dx20 = _mm_sub_pd(ix2,jx0);
1164 dy20 = _mm_sub_pd(iy2,jy0);
1165 dz20 = _mm_sub_pd(iz2,jz0);
1167 /* Calculate squared distance and things based on it */
1168 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1169 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1170 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1172 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1173 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1174 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1176 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1177 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1178 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1180 /* Load parameters for j particles */
1181 jq0 = _mm_load_sd(charge+jnrA+0);
1182 vdwjidx0A = 2*vdwtype[jnrA+0];
1184 fjx0 = _mm_setzero_pd();
1185 fjy0 = _mm_setzero_pd();
1186 fjz0 = _mm_setzero_pd();
1188 /**************************
1189 * CALCULATE INTERACTIONS *
1190 **************************/
1192 if (gmx_mm_any_lt(rsq00,rcutoff2))
1195 r00 = _mm_mul_pd(rsq00,rinv00);
1197 /* Compute parameters for interactions between i and j atoms */
1198 qq00 = _mm_mul_pd(iq0,jq0);
1199 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
1201 /* EWALD ELECTROSTATICS */
1203 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1204 ewrt = _mm_mul_pd(r00,ewtabscale);
1205 ewitab = _mm_cvttpd_epi32(ewrt);
1206 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1207 ewitab = _mm_slli_epi32(ewitab,2);
1208 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1209 ewtabD = _mm_setzero_pd();
1210 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1211 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1212 ewtabFn = _mm_setzero_pd();
1213 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1214 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1215 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1216 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
1217 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1219 /* LENNARD-JONES DISPERSION/REPULSION */
1221 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1222 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1223 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1224 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1225 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1227 d = _mm_sub_pd(r00,rswitch);
1228 d = _mm_max_pd(d,_mm_setzero_pd());
1229 d2 = _mm_mul_pd(d,d);
1230 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1232 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1234 /* Evaluate switch function */
1235 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1236 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1237 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1238 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1240 fscal = _mm_add_pd(felec,fvdw);
1242 fscal = _mm_and_pd(fscal,cutoff_mask);
1244 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1246 /* Calculate temporary vectorial force */
1247 tx = _mm_mul_pd(fscal,dx00);
1248 ty = _mm_mul_pd(fscal,dy00);
1249 tz = _mm_mul_pd(fscal,dz00);
1251 /* Update vectorial force */
1252 fix0 = _mm_add_pd(fix0,tx);
1253 fiy0 = _mm_add_pd(fiy0,ty);
1254 fiz0 = _mm_add_pd(fiz0,tz);
1256 fjx0 = _mm_add_pd(fjx0,tx);
1257 fjy0 = _mm_add_pd(fjy0,ty);
1258 fjz0 = _mm_add_pd(fjz0,tz);
1262 /**************************
1263 * CALCULATE INTERACTIONS *
1264 **************************/
1266 if (gmx_mm_any_lt(rsq10,rcutoff2))
1269 r10 = _mm_mul_pd(rsq10,rinv10);
1271 /* Compute parameters for interactions between i and j atoms */
1272 qq10 = _mm_mul_pd(iq1,jq0);
1274 /* EWALD ELECTROSTATICS */
1276 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1277 ewrt = _mm_mul_pd(r10,ewtabscale);
1278 ewitab = _mm_cvttpd_epi32(ewrt);
1279 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1280 ewitab = _mm_slli_epi32(ewitab,2);
1281 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1282 ewtabD = _mm_setzero_pd();
1283 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1284 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1285 ewtabFn = _mm_setzero_pd();
1286 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1287 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1288 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1289 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1290 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1292 d = _mm_sub_pd(r10,rswitch);
1293 d = _mm_max_pd(d,_mm_setzero_pd());
1294 d2 = _mm_mul_pd(d,d);
1295 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1297 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1299 /* Evaluate switch function */
1300 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1301 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1302 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1306 fscal = _mm_and_pd(fscal,cutoff_mask);
1308 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1310 /* Calculate temporary vectorial force */
1311 tx = _mm_mul_pd(fscal,dx10);
1312 ty = _mm_mul_pd(fscal,dy10);
1313 tz = _mm_mul_pd(fscal,dz10);
1315 /* Update vectorial force */
1316 fix1 = _mm_add_pd(fix1,tx);
1317 fiy1 = _mm_add_pd(fiy1,ty);
1318 fiz1 = _mm_add_pd(fiz1,tz);
1320 fjx0 = _mm_add_pd(fjx0,tx);
1321 fjy0 = _mm_add_pd(fjy0,ty);
1322 fjz0 = _mm_add_pd(fjz0,tz);
1326 /**************************
1327 * CALCULATE INTERACTIONS *
1328 **************************/
1330 if (gmx_mm_any_lt(rsq20,rcutoff2))
1333 r20 = _mm_mul_pd(rsq20,rinv20);
1335 /* Compute parameters for interactions between i and j atoms */
1336 qq20 = _mm_mul_pd(iq2,jq0);
1338 /* EWALD ELECTROSTATICS */
1340 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1341 ewrt = _mm_mul_pd(r20,ewtabscale);
1342 ewitab = _mm_cvttpd_epi32(ewrt);
1343 eweps = _mm_sub_pd(ewrt,_mm_cvtepi32_pd(ewitab));
1344 ewitab = _mm_slli_epi32(ewitab,2);
1345 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1346 ewtabD = _mm_setzero_pd();
1347 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1348 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1349 ewtabFn = _mm_setzero_pd();
1350 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1351 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1352 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1353 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1354 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1356 d = _mm_sub_pd(r20,rswitch);
1357 d = _mm_max_pd(d,_mm_setzero_pd());
1358 d2 = _mm_mul_pd(d,d);
1359 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1361 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1363 /* Evaluate switch function */
1364 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1365 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1366 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1370 fscal = _mm_and_pd(fscal,cutoff_mask);
1372 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1374 /* Calculate temporary vectorial force */
1375 tx = _mm_mul_pd(fscal,dx20);
1376 ty = _mm_mul_pd(fscal,dy20);
1377 tz = _mm_mul_pd(fscal,dz20);
1379 /* Update vectorial force */
1380 fix2 = _mm_add_pd(fix2,tx);
1381 fiy2 = _mm_add_pd(fiy2,ty);
1382 fiz2 = _mm_add_pd(fiz2,tz);
1384 fjx0 = _mm_add_pd(fjx0,tx);
1385 fjy0 = _mm_add_pd(fjy0,ty);
1386 fjz0 = _mm_add_pd(fjz0,tz);
1390 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1392 /* Inner loop uses 204 flops */
1395 /* End of innermost loop */
1397 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1398 f+i_coord_offset,fshift+i_shift_offset);
1400 /* Increment number of inner iterations */
1401 inneriter += j_index_end - j_index_start;
1403 /* Outer loop uses 18 flops */
1406 /* Increment number of outer iterations */
1409 /* Update outer/inner flops */
1411 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*204);