2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse4_1_double.h"
48 #include "kernelutil_x86_sse4_1_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
87 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88 int vdwjidx0A,vdwjidx0B;
89 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B;
91 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B;
93 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 int vdwjidx3A,vdwjidx3B;
95 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
96 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
109 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
113 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
115 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
117 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
118 real rswitch_scalar,d_scalar;
119 __m128d dummy_mask,cutoff_mask;
120 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
121 __m128d one = _mm_set1_pd(1.0);
122 __m128d two = _mm_set1_pd(2.0);
128 jindex = nlist->jindex;
130 shiftidx = nlist->shift;
132 shiftvec = fr->shift_vec[0];
133 fshift = fr->fshift[0];
134 facel = _mm_set1_pd(fr->epsfac);
135 charge = mdatoms->chargeA;
136 nvdwtype = fr->ntype;
138 vdwtype = mdatoms->typeA;
140 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
141 ewtab = fr->ic->tabq_coul_FDV0;
142 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
143 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
145 /* Setup water-specific parameters */
146 inr = nlist->iinr[0];
147 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
148 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
149 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
150 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
152 jq1 = _mm_set1_pd(charge[inr+1]);
153 jq2 = _mm_set1_pd(charge[inr+2]);
154 jq3 = _mm_set1_pd(charge[inr+3]);
155 vdwjidx0A = 2*vdwtype[inr+0];
156 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
157 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
158 qq11 = _mm_mul_pd(iq1,jq1);
159 qq12 = _mm_mul_pd(iq1,jq2);
160 qq13 = _mm_mul_pd(iq1,jq3);
161 qq21 = _mm_mul_pd(iq2,jq1);
162 qq22 = _mm_mul_pd(iq2,jq2);
163 qq23 = _mm_mul_pd(iq2,jq3);
164 qq31 = _mm_mul_pd(iq3,jq1);
165 qq32 = _mm_mul_pd(iq3,jq2);
166 qq33 = _mm_mul_pd(iq3,jq3);
168 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
169 rcutoff_scalar = fr->rcoulomb;
170 rcutoff = _mm_set1_pd(rcutoff_scalar);
171 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
173 rswitch_scalar = fr->rcoulomb_switch;
174 rswitch = _mm_set1_pd(rswitch_scalar);
175 /* Setup switch parameters */
176 d_scalar = rcutoff_scalar-rswitch_scalar;
177 d = _mm_set1_pd(d_scalar);
178 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
179 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
182 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
183 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
185 /* Avoid stupid compiler warnings */
193 /* Start outer loop over neighborlists */
194 for(iidx=0; iidx<nri; iidx++)
196 /* Load shift vector for this list */
197 i_shift_offset = DIM*shiftidx[iidx];
199 /* Load limits for loop over neighbors */
200 j_index_start = jindex[iidx];
201 j_index_end = jindex[iidx+1];
203 /* Get outer coordinate index */
205 i_coord_offset = DIM*inr;
207 /* Load i particle coords and add shift vector */
208 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
209 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
211 fix0 = _mm_setzero_pd();
212 fiy0 = _mm_setzero_pd();
213 fiz0 = _mm_setzero_pd();
214 fix1 = _mm_setzero_pd();
215 fiy1 = _mm_setzero_pd();
216 fiz1 = _mm_setzero_pd();
217 fix2 = _mm_setzero_pd();
218 fiy2 = _mm_setzero_pd();
219 fiz2 = _mm_setzero_pd();
220 fix3 = _mm_setzero_pd();
221 fiy3 = _mm_setzero_pd();
222 fiz3 = _mm_setzero_pd();
224 /* Reset potential sums */
225 velecsum = _mm_setzero_pd();
226 vvdwsum = _mm_setzero_pd();
228 /* Start inner kernel loop */
229 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
232 /* Get j neighbor index, and coordinate index */
235 j_coord_offsetA = DIM*jnrA;
236 j_coord_offsetB = DIM*jnrB;
238 /* load j atom coordinates */
239 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
240 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
241 &jy2,&jz2,&jx3,&jy3,&jz3);
243 /* Calculate displacement vector */
244 dx00 = _mm_sub_pd(ix0,jx0);
245 dy00 = _mm_sub_pd(iy0,jy0);
246 dz00 = _mm_sub_pd(iz0,jz0);
247 dx11 = _mm_sub_pd(ix1,jx1);
248 dy11 = _mm_sub_pd(iy1,jy1);
249 dz11 = _mm_sub_pd(iz1,jz1);
250 dx12 = _mm_sub_pd(ix1,jx2);
251 dy12 = _mm_sub_pd(iy1,jy2);
252 dz12 = _mm_sub_pd(iz1,jz2);
253 dx13 = _mm_sub_pd(ix1,jx3);
254 dy13 = _mm_sub_pd(iy1,jy3);
255 dz13 = _mm_sub_pd(iz1,jz3);
256 dx21 = _mm_sub_pd(ix2,jx1);
257 dy21 = _mm_sub_pd(iy2,jy1);
258 dz21 = _mm_sub_pd(iz2,jz1);
259 dx22 = _mm_sub_pd(ix2,jx2);
260 dy22 = _mm_sub_pd(iy2,jy2);
261 dz22 = _mm_sub_pd(iz2,jz2);
262 dx23 = _mm_sub_pd(ix2,jx3);
263 dy23 = _mm_sub_pd(iy2,jy3);
264 dz23 = _mm_sub_pd(iz2,jz3);
265 dx31 = _mm_sub_pd(ix3,jx1);
266 dy31 = _mm_sub_pd(iy3,jy1);
267 dz31 = _mm_sub_pd(iz3,jz1);
268 dx32 = _mm_sub_pd(ix3,jx2);
269 dy32 = _mm_sub_pd(iy3,jy2);
270 dz32 = _mm_sub_pd(iz3,jz2);
271 dx33 = _mm_sub_pd(ix3,jx3);
272 dy33 = _mm_sub_pd(iy3,jy3);
273 dz33 = _mm_sub_pd(iz3,jz3);
275 /* Calculate squared distance and things based on it */
276 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
277 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
278 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
279 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
280 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
281 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
282 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
283 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
284 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
285 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
287 rinv00 = gmx_mm_invsqrt_pd(rsq00);
288 rinv11 = gmx_mm_invsqrt_pd(rsq11);
289 rinv12 = gmx_mm_invsqrt_pd(rsq12);
290 rinv13 = gmx_mm_invsqrt_pd(rsq13);
291 rinv21 = gmx_mm_invsqrt_pd(rsq21);
292 rinv22 = gmx_mm_invsqrt_pd(rsq22);
293 rinv23 = gmx_mm_invsqrt_pd(rsq23);
294 rinv31 = gmx_mm_invsqrt_pd(rsq31);
295 rinv32 = gmx_mm_invsqrt_pd(rsq32);
296 rinv33 = gmx_mm_invsqrt_pd(rsq33);
298 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
299 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
300 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
301 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
302 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
303 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
304 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
305 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
306 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
307 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
309 fjx0 = _mm_setzero_pd();
310 fjy0 = _mm_setzero_pd();
311 fjz0 = _mm_setzero_pd();
312 fjx1 = _mm_setzero_pd();
313 fjy1 = _mm_setzero_pd();
314 fjz1 = _mm_setzero_pd();
315 fjx2 = _mm_setzero_pd();
316 fjy2 = _mm_setzero_pd();
317 fjz2 = _mm_setzero_pd();
318 fjx3 = _mm_setzero_pd();
319 fjy3 = _mm_setzero_pd();
320 fjz3 = _mm_setzero_pd();
322 /**************************
323 * CALCULATE INTERACTIONS *
324 **************************/
326 if (gmx_mm_any_lt(rsq00,rcutoff2))
329 r00 = _mm_mul_pd(rsq00,rinv00);
331 /* LENNARD-JONES DISPERSION/REPULSION */
333 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
334 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
335 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
336 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
337 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
339 d = _mm_sub_pd(r00,rswitch);
340 d = _mm_max_pd(d,_mm_setzero_pd());
341 d2 = _mm_mul_pd(d,d);
342 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
344 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
346 /* Evaluate switch function */
347 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
348 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
349 vvdw = _mm_mul_pd(vvdw,sw);
350 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
352 /* Update potential sum for this i atom from the interaction with this j atom. */
353 vvdw = _mm_and_pd(vvdw,cutoff_mask);
354 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
358 fscal = _mm_and_pd(fscal,cutoff_mask);
360 /* Calculate temporary vectorial force */
361 tx = _mm_mul_pd(fscal,dx00);
362 ty = _mm_mul_pd(fscal,dy00);
363 tz = _mm_mul_pd(fscal,dz00);
365 /* Update vectorial force */
366 fix0 = _mm_add_pd(fix0,tx);
367 fiy0 = _mm_add_pd(fiy0,ty);
368 fiz0 = _mm_add_pd(fiz0,tz);
370 fjx0 = _mm_add_pd(fjx0,tx);
371 fjy0 = _mm_add_pd(fjy0,ty);
372 fjz0 = _mm_add_pd(fjz0,tz);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 if (gmx_mm_any_lt(rsq11,rcutoff2))
383 r11 = _mm_mul_pd(rsq11,rinv11);
385 /* EWALD ELECTROSTATICS */
387 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
388 ewrt = _mm_mul_pd(r11,ewtabscale);
389 ewitab = _mm_cvttpd_epi32(ewrt);
390 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
391 ewitab = _mm_slli_epi32(ewitab,2);
392 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
393 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
394 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
395 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
396 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
397 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
398 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
399 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
400 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
401 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
403 d = _mm_sub_pd(r11,rswitch);
404 d = _mm_max_pd(d,_mm_setzero_pd());
405 d2 = _mm_mul_pd(d,d);
406 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
408 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
410 /* Evaluate switch function */
411 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
412 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
413 velec = _mm_mul_pd(velec,sw);
414 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
416 /* Update potential sum for this i atom from the interaction with this j atom. */
417 velec = _mm_and_pd(velec,cutoff_mask);
418 velecsum = _mm_add_pd(velecsum,velec);
422 fscal = _mm_and_pd(fscal,cutoff_mask);
424 /* Calculate temporary vectorial force */
425 tx = _mm_mul_pd(fscal,dx11);
426 ty = _mm_mul_pd(fscal,dy11);
427 tz = _mm_mul_pd(fscal,dz11);
429 /* Update vectorial force */
430 fix1 = _mm_add_pd(fix1,tx);
431 fiy1 = _mm_add_pd(fiy1,ty);
432 fiz1 = _mm_add_pd(fiz1,tz);
434 fjx1 = _mm_add_pd(fjx1,tx);
435 fjy1 = _mm_add_pd(fjy1,ty);
436 fjz1 = _mm_add_pd(fjz1,tz);
440 /**************************
441 * CALCULATE INTERACTIONS *
442 **************************/
444 if (gmx_mm_any_lt(rsq12,rcutoff2))
447 r12 = _mm_mul_pd(rsq12,rinv12);
449 /* EWALD ELECTROSTATICS */
451 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
452 ewrt = _mm_mul_pd(r12,ewtabscale);
453 ewitab = _mm_cvttpd_epi32(ewrt);
454 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
455 ewitab = _mm_slli_epi32(ewitab,2);
456 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
457 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
458 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
459 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
460 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
461 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
462 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
463 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
464 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
465 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
467 d = _mm_sub_pd(r12,rswitch);
468 d = _mm_max_pd(d,_mm_setzero_pd());
469 d2 = _mm_mul_pd(d,d);
470 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
472 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
474 /* Evaluate switch function */
475 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
476 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
477 velec = _mm_mul_pd(velec,sw);
478 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
480 /* Update potential sum for this i atom from the interaction with this j atom. */
481 velec = _mm_and_pd(velec,cutoff_mask);
482 velecsum = _mm_add_pd(velecsum,velec);
486 fscal = _mm_and_pd(fscal,cutoff_mask);
488 /* Calculate temporary vectorial force */
489 tx = _mm_mul_pd(fscal,dx12);
490 ty = _mm_mul_pd(fscal,dy12);
491 tz = _mm_mul_pd(fscal,dz12);
493 /* Update vectorial force */
494 fix1 = _mm_add_pd(fix1,tx);
495 fiy1 = _mm_add_pd(fiy1,ty);
496 fiz1 = _mm_add_pd(fiz1,tz);
498 fjx2 = _mm_add_pd(fjx2,tx);
499 fjy2 = _mm_add_pd(fjy2,ty);
500 fjz2 = _mm_add_pd(fjz2,tz);
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
508 if (gmx_mm_any_lt(rsq13,rcutoff2))
511 r13 = _mm_mul_pd(rsq13,rinv13);
513 /* EWALD ELECTROSTATICS */
515 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
516 ewrt = _mm_mul_pd(r13,ewtabscale);
517 ewitab = _mm_cvttpd_epi32(ewrt);
518 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
519 ewitab = _mm_slli_epi32(ewitab,2);
520 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
521 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
522 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
523 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
524 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
525 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
526 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
527 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
528 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
529 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
531 d = _mm_sub_pd(r13,rswitch);
532 d = _mm_max_pd(d,_mm_setzero_pd());
533 d2 = _mm_mul_pd(d,d);
534 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
536 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
538 /* Evaluate switch function */
539 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
540 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
541 velec = _mm_mul_pd(velec,sw);
542 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
544 /* Update potential sum for this i atom from the interaction with this j atom. */
545 velec = _mm_and_pd(velec,cutoff_mask);
546 velecsum = _mm_add_pd(velecsum,velec);
550 fscal = _mm_and_pd(fscal,cutoff_mask);
552 /* Calculate temporary vectorial force */
553 tx = _mm_mul_pd(fscal,dx13);
554 ty = _mm_mul_pd(fscal,dy13);
555 tz = _mm_mul_pd(fscal,dz13);
557 /* Update vectorial force */
558 fix1 = _mm_add_pd(fix1,tx);
559 fiy1 = _mm_add_pd(fiy1,ty);
560 fiz1 = _mm_add_pd(fiz1,tz);
562 fjx3 = _mm_add_pd(fjx3,tx);
563 fjy3 = _mm_add_pd(fjy3,ty);
564 fjz3 = _mm_add_pd(fjz3,tz);
568 /**************************
569 * CALCULATE INTERACTIONS *
570 **************************/
572 if (gmx_mm_any_lt(rsq21,rcutoff2))
575 r21 = _mm_mul_pd(rsq21,rinv21);
577 /* EWALD ELECTROSTATICS */
579 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
580 ewrt = _mm_mul_pd(r21,ewtabscale);
581 ewitab = _mm_cvttpd_epi32(ewrt);
582 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
583 ewitab = _mm_slli_epi32(ewitab,2);
584 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
585 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
586 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
587 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
588 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
589 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
590 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
591 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
592 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
593 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
595 d = _mm_sub_pd(r21,rswitch);
596 d = _mm_max_pd(d,_mm_setzero_pd());
597 d2 = _mm_mul_pd(d,d);
598 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
600 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
602 /* Evaluate switch function */
603 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
604 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
605 velec = _mm_mul_pd(velec,sw);
606 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
608 /* Update potential sum for this i atom from the interaction with this j atom. */
609 velec = _mm_and_pd(velec,cutoff_mask);
610 velecsum = _mm_add_pd(velecsum,velec);
614 fscal = _mm_and_pd(fscal,cutoff_mask);
616 /* Calculate temporary vectorial force */
617 tx = _mm_mul_pd(fscal,dx21);
618 ty = _mm_mul_pd(fscal,dy21);
619 tz = _mm_mul_pd(fscal,dz21);
621 /* Update vectorial force */
622 fix2 = _mm_add_pd(fix2,tx);
623 fiy2 = _mm_add_pd(fiy2,ty);
624 fiz2 = _mm_add_pd(fiz2,tz);
626 fjx1 = _mm_add_pd(fjx1,tx);
627 fjy1 = _mm_add_pd(fjy1,ty);
628 fjz1 = _mm_add_pd(fjz1,tz);
632 /**************************
633 * CALCULATE INTERACTIONS *
634 **************************/
636 if (gmx_mm_any_lt(rsq22,rcutoff2))
639 r22 = _mm_mul_pd(rsq22,rinv22);
641 /* EWALD ELECTROSTATICS */
643 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
644 ewrt = _mm_mul_pd(r22,ewtabscale);
645 ewitab = _mm_cvttpd_epi32(ewrt);
646 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
647 ewitab = _mm_slli_epi32(ewitab,2);
648 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
649 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
650 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
651 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
652 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
653 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
654 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
655 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
656 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
657 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
659 d = _mm_sub_pd(r22,rswitch);
660 d = _mm_max_pd(d,_mm_setzero_pd());
661 d2 = _mm_mul_pd(d,d);
662 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
664 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
666 /* Evaluate switch function */
667 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
668 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
669 velec = _mm_mul_pd(velec,sw);
670 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
672 /* Update potential sum for this i atom from the interaction with this j atom. */
673 velec = _mm_and_pd(velec,cutoff_mask);
674 velecsum = _mm_add_pd(velecsum,velec);
678 fscal = _mm_and_pd(fscal,cutoff_mask);
680 /* Calculate temporary vectorial force */
681 tx = _mm_mul_pd(fscal,dx22);
682 ty = _mm_mul_pd(fscal,dy22);
683 tz = _mm_mul_pd(fscal,dz22);
685 /* Update vectorial force */
686 fix2 = _mm_add_pd(fix2,tx);
687 fiy2 = _mm_add_pd(fiy2,ty);
688 fiz2 = _mm_add_pd(fiz2,tz);
690 fjx2 = _mm_add_pd(fjx2,tx);
691 fjy2 = _mm_add_pd(fjy2,ty);
692 fjz2 = _mm_add_pd(fjz2,tz);
696 /**************************
697 * CALCULATE INTERACTIONS *
698 **************************/
700 if (gmx_mm_any_lt(rsq23,rcutoff2))
703 r23 = _mm_mul_pd(rsq23,rinv23);
705 /* EWALD ELECTROSTATICS */
707 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
708 ewrt = _mm_mul_pd(r23,ewtabscale);
709 ewitab = _mm_cvttpd_epi32(ewrt);
710 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
711 ewitab = _mm_slli_epi32(ewitab,2);
712 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
713 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
714 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
715 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
716 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
717 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
718 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
719 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
720 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
721 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
723 d = _mm_sub_pd(r23,rswitch);
724 d = _mm_max_pd(d,_mm_setzero_pd());
725 d2 = _mm_mul_pd(d,d);
726 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
728 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
730 /* Evaluate switch function */
731 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
732 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
733 velec = _mm_mul_pd(velec,sw);
734 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
736 /* Update potential sum for this i atom from the interaction with this j atom. */
737 velec = _mm_and_pd(velec,cutoff_mask);
738 velecsum = _mm_add_pd(velecsum,velec);
742 fscal = _mm_and_pd(fscal,cutoff_mask);
744 /* Calculate temporary vectorial force */
745 tx = _mm_mul_pd(fscal,dx23);
746 ty = _mm_mul_pd(fscal,dy23);
747 tz = _mm_mul_pd(fscal,dz23);
749 /* Update vectorial force */
750 fix2 = _mm_add_pd(fix2,tx);
751 fiy2 = _mm_add_pd(fiy2,ty);
752 fiz2 = _mm_add_pd(fiz2,tz);
754 fjx3 = _mm_add_pd(fjx3,tx);
755 fjy3 = _mm_add_pd(fjy3,ty);
756 fjz3 = _mm_add_pd(fjz3,tz);
760 /**************************
761 * CALCULATE INTERACTIONS *
762 **************************/
764 if (gmx_mm_any_lt(rsq31,rcutoff2))
767 r31 = _mm_mul_pd(rsq31,rinv31);
769 /* EWALD ELECTROSTATICS */
771 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
772 ewrt = _mm_mul_pd(r31,ewtabscale);
773 ewitab = _mm_cvttpd_epi32(ewrt);
774 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
775 ewitab = _mm_slli_epi32(ewitab,2);
776 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
777 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
778 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
779 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
780 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
781 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
782 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
783 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
784 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
785 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
787 d = _mm_sub_pd(r31,rswitch);
788 d = _mm_max_pd(d,_mm_setzero_pd());
789 d2 = _mm_mul_pd(d,d);
790 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
792 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
794 /* Evaluate switch function */
795 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
796 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
797 velec = _mm_mul_pd(velec,sw);
798 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
800 /* Update potential sum for this i atom from the interaction with this j atom. */
801 velec = _mm_and_pd(velec,cutoff_mask);
802 velecsum = _mm_add_pd(velecsum,velec);
806 fscal = _mm_and_pd(fscal,cutoff_mask);
808 /* Calculate temporary vectorial force */
809 tx = _mm_mul_pd(fscal,dx31);
810 ty = _mm_mul_pd(fscal,dy31);
811 tz = _mm_mul_pd(fscal,dz31);
813 /* Update vectorial force */
814 fix3 = _mm_add_pd(fix3,tx);
815 fiy3 = _mm_add_pd(fiy3,ty);
816 fiz3 = _mm_add_pd(fiz3,tz);
818 fjx1 = _mm_add_pd(fjx1,tx);
819 fjy1 = _mm_add_pd(fjy1,ty);
820 fjz1 = _mm_add_pd(fjz1,tz);
824 /**************************
825 * CALCULATE INTERACTIONS *
826 **************************/
828 if (gmx_mm_any_lt(rsq32,rcutoff2))
831 r32 = _mm_mul_pd(rsq32,rinv32);
833 /* EWALD ELECTROSTATICS */
835 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
836 ewrt = _mm_mul_pd(r32,ewtabscale);
837 ewitab = _mm_cvttpd_epi32(ewrt);
838 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
839 ewitab = _mm_slli_epi32(ewitab,2);
840 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
841 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
842 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
843 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
844 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
845 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
846 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
847 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
848 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
849 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
851 d = _mm_sub_pd(r32,rswitch);
852 d = _mm_max_pd(d,_mm_setzero_pd());
853 d2 = _mm_mul_pd(d,d);
854 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
856 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
858 /* Evaluate switch function */
859 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
860 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
861 velec = _mm_mul_pd(velec,sw);
862 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
864 /* Update potential sum for this i atom from the interaction with this j atom. */
865 velec = _mm_and_pd(velec,cutoff_mask);
866 velecsum = _mm_add_pd(velecsum,velec);
870 fscal = _mm_and_pd(fscal,cutoff_mask);
872 /* Calculate temporary vectorial force */
873 tx = _mm_mul_pd(fscal,dx32);
874 ty = _mm_mul_pd(fscal,dy32);
875 tz = _mm_mul_pd(fscal,dz32);
877 /* Update vectorial force */
878 fix3 = _mm_add_pd(fix3,tx);
879 fiy3 = _mm_add_pd(fiy3,ty);
880 fiz3 = _mm_add_pd(fiz3,tz);
882 fjx2 = _mm_add_pd(fjx2,tx);
883 fjy2 = _mm_add_pd(fjy2,ty);
884 fjz2 = _mm_add_pd(fjz2,tz);
888 /**************************
889 * CALCULATE INTERACTIONS *
890 **************************/
892 if (gmx_mm_any_lt(rsq33,rcutoff2))
895 r33 = _mm_mul_pd(rsq33,rinv33);
897 /* EWALD ELECTROSTATICS */
899 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
900 ewrt = _mm_mul_pd(r33,ewtabscale);
901 ewitab = _mm_cvttpd_epi32(ewrt);
902 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
903 ewitab = _mm_slli_epi32(ewitab,2);
904 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
905 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
906 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
907 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
908 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
909 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
910 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
911 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
912 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
913 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
915 d = _mm_sub_pd(r33,rswitch);
916 d = _mm_max_pd(d,_mm_setzero_pd());
917 d2 = _mm_mul_pd(d,d);
918 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
920 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
922 /* Evaluate switch function */
923 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
924 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
925 velec = _mm_mul_pd(velec,sw);
926 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
928 /* Update potential sum for this i atom from the interaction with this j atom. */
929 velec = _mm_and_pd(velec,cutoff_mask);
930 velecsum = _mm_add_pd(velecsum,velec);
934 fscal = _mm_and_pd(fscal,cutoff_mask);
936 /* Calculate temporary vectorial force */
937 tx = _mm_mul_pd(fscal,dx33);
938 ty = _mm_mul_pd(fscal,dy33);
939 tz = _mm_mul_pd(fscal,dz33);
941 /* Update vectorial force */
942 fix3 = _mm_add_pd(fix3,tx);
943 fiy3 = _mm_add_pd(fiy3,ty);
944 fiz3 = _mm_add_pd(fiz3,tz);
946 fjx3 = _mm_add_pd(fjx3,tx);
947 fjy3 = _mm_add_pd(fjy3,ty);
948 fjz3 = _mm_add_pd(fjz3,tz);
952 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
954 /* Inner loop uses 647 flops */
961 j_coord_offsetA = DIM*jnrA;
963 /* load j atom coordinates */
964 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
965 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
966 &jy2,&jz2,&jx3,&jy3,&jz3);
968 /* Calculate displacement vector */
969 dx00 = _mm_sub_pd(ix0,jx0);
970 dy00 = _mm_sub_pd(iy0,jy0);
971 dz00 = _mm_sub_pd(iz0,jz0);
972 dx11 = _mm_sub_pd(ix1,jx1);
973 dy11 = _mm_sub_pd(iy1,jy1);
974 dz11 = _mm_sub_pd(iz1,jz1);
975 dx12 = _mm_sub_pd(ix1,jx2);
976 dy12 = _mm_sub_pd(iy1,jy2);
977 dz12 = _mm_sub_pd(iz1,jz2);
978 dx13 = _mm_sub_pd(ix1,jx3);
979 dy13 = _mm_sub_pd(iy1,jy3);
980 dz13 = _mm_sub_pd(iz1,jz3);
981 dx21 = _mm_sub_pd(ix2,jx1);
982 dy21 = _mm_sub_pd(iy2,jy1);
983 dz21 = _mm_sub_pd(iz2,jz1);
984 dx22 = _mm_sub_pd(ix2,jx2);
985 dy22 = _mm_sub_pd(iy2,jy2);
986 dz22 = _mm_sub_pd(iz2,jz2);
987 dx23 = _mm_sub_pd(ix2,jx3);
988 dy23 = _mm_sub_pd(iy2,jy3);
989 dz23 = _mm_sub_pd(iz2,jz3);
990 dx31 = _mm_sub_pd(ix3,jx1);
991 dy31 = _mm_sub_pd(iy3,jy1);
992 dz31 = _mm_sub_pd(iz3,jz1);
993 dx32 = _mm_sub_pd(ix3,jx2);
994 dy32 = _mm_sub_pd(iy3,jy2);
995 dz32 = _mm_sub_pd(iz3,jz2);
996 dx33 = _mm_sub_pd(ix3,jx3);
997 dy33 = _mm_sub_pd(iy3,jy3);
998 dz33 = _mm_sub_pd(iz3,jz3);
1000 /* Calculate squared distance and things based on it */
1001 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1002 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1003 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1004 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1005 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1006 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1007 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1008 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1009 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1010 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1012 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1013 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1014 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1015 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1016 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1017 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1018 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1019 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1020 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1021 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1023 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1024 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1025 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1026 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1027 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1028 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1029 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1030 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1031 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1032 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1034 fjx0 = _mm_setzero_pd();
1035 fjy0 = _mm_setzero_pd();
1036 fjz0 = _mm_setzero_pd();
1037 fjx1 = _mm_setzero_pd();
1038 fjy1 = _mm_setzero_pd();
1039 fjz1 = _mm_setzero_pd();
1040 fjx2 = _mm_setzero_pd();
1041 fjy2 = _mm_setzero_pd();
1042 fjz2 = _mm_setzero_pd();
1043 fjx3 = _mm_setzero_pd();
1044 fjy3 = _mm_setzero_pd();
1045 fjz3 = _mm_setzero_pd();
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 if (gmx_mm_any_lt(rsq00,rcutoff2))
1054 r00 = _mm_mul_pd(rsq00,rinv00);
1056 /* LENNARD-JONES DISPERSION/REPULSION */
1058 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1059 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1060 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1061 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1062 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1064 d = _mm_sub_pd(r00,rswitch);
1065 d = _mm_max_pd(d,_mm_setzero_pd());
1066 d2 = _mm_mul_pd(d,d);
1067 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1069 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1071 /* Evaluate switch function */
1072 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1073 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1074 vvdw = _mm_mul_pd(vvdw,sw);
1075 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1077 /* Update potential sum for this i atom from the interaction with this j atom. */
1078 vvdw = _mm_and_pd(vvdw,cutoff_mask);
1079 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1080 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
1084 fscal = _mm_and_pd(fscal,cutoff_mask);
1086 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1088 /* Calculate temporary vectorial force */
1089 tx = _mm_mul_pd(fscal,dx00);
1090 ty = _mm_mul_pd(fscal,dy00);
1091 tz = _mm_mul_pd(fscal,dz00);
1093 /* Update vectorial force */
1094 fix0 = _mm_add_pd(fix0,tx);
1095 fiy0 = _mm_add_pd(fiy0,ty);
1096 fiz0 = _mm_add_pd(fiz0,tz);
1098 fjx0 = _mm_add_pd(fjx0,tx);
1099 fjy0 = _mm_add_pd(fjy0,ty);
1100 fjz0 = _mm_add_pd(fjz0,tz);
1104 /**************************
1105 * CALCULATE INTERACTIONS *
1106 **************************/
1108 if (gmx_mm_any_lt(rsq11,rcutoff2))
1111 r11 = _mm_mul_pd(rsq11,rinv11);
1113 /* EWALD ELECTROSTATICS */
1115 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1116 ewrt = _mm_mul_pd(r11,ewtabscale);
1117 ewitab = _mm_cvttpd_epi32(ewrt);
1118 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1119 ewitab = _mm_slli_epi32(ewitab,2);
1120 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1121 ewtabD = _mm_setzero_pd();
1122 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1123 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1124 ewtabFn = _mm_setzero_pd();
1125 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1126 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1127 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1128 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1129 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1131 d = _mm_sub_pd(r11,rswitch);
1132 d = _mm_max_pd(d,_mm_setzero_pd());
1133 d2 = _mm_mul_pd(d,d);
1134 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1136 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1138 /* Evaluate switch function */
1139 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1140 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1141 velec = _mm_mul_pd(velec,sw);
1142 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1144 /* Update potential sum for this i atom from the interaction with this j atom. */
1145 velec = _mm_and_pd(velec,cutoff_mask);
1146 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1147 velecsum = _mm_add_pd(velecsum,velec);
1151 fscal = _mm_and_pd(fscal,cutoff_mask);
1153 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1155 /* Calculate temporary vectorial force */
1156 tx = _mm_mul_pd(fscal,dx11);
1157 ty = _mm_mul_pd(fscal,dy11);
1158 tz = _mm_mul_pd(fscal,dz11);
1160 /* Update vectorial force */
1161 fix1 = _mm_add_pd(fix1,tx);
1162 fiy1 = _mm_add_pd(fiy1,ty);
1163 fiz1 = _mm_add_pd(fiz1,tz);
1165 fjx1 = _mm_add_pd(fjx1,tx);
1166 fjy1 = _mm_add_pd(fjy1,ty);
1167 fjz1 = _mm_add_pd(fjz1,tz);
1171 /**************************
1172 * CALCULATE INTERACTIONS *
1173 **************************/
1175 if (gmx_mm_any_lt(rsq12,rcutoff2))
1178 r12 = _mm_mul_pd(rsq12,rinv12);
1180 /* EWALD ELECTROSTATICS */
1182 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1183 ewrt = _mm_mul_pd(r12,ewtabscale);
1184 ewitab = _mm_cvttpd_epi32(ewrt);
1185 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1186 ewitab = _mm_slli_epi32(ewitab,2);
1187 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1188 ewtabD = _mm_setzero_pd();
1189 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1190 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1191 ewtabFn = _mm_setzero_pd();
1192 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1193 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1194 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1195 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1196 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1198 d = _mm_sub_pd(r12,rswitch);
1199 d = _mm_max_pd(d,_mm_setzero_pd());
1200 d2 = _mm_mul_pd(d,d);
1201 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1203 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1205 /* Evaluate switch function */
1206 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1207 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1208 velec = _mm_mul_pd(velec,sw);
1209 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1211 /* Update potential sum for this i atom from the interaction with this j atom. */
1212 velec = _mm_and_pd(velec,cutoff_mask);
1213 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1214 velecsum = _mm_add_pd(velecsum,velec);
1218 fscal = _mm_and_pd(fscal,cutoff_mask);
1220 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1222 /* Calculate temporary vectorial force */
1223 tx = _mm_mul_pd(fscal,dx12);
1224 ty = _mm_mul_pd(fscal,dy12);
1225 tz = _mm_mul_pd(fscal,dz12);
1227 /* Update vectorial force */
1228 fix1 = _mm_add_pd(fix1,tx);
1229 fiy1 = _mm_add_pd(fiy1,ty);
1230 fiz1 = _mm_add_pd(fiz1,tz);
1232 fjx2 = _mm_add_pd(fjx2,tx);
1233 fjy2 = _mm_add_pd(fjy2,ty);
1234 fjz2 = _mm_add_pd(fjz2,tz);
1238 /**************************
1239 * CALCULATE INTERACTIONS *
1240 **************************/
1242 if (gmx_mm_any_lt(rsq13,rcutoff2))
1245 r13 = _mm_mul_pd(rsq13,rinv13);
1247 /* EWALD ELECTROSTATICS */
1249 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1250 ewrt = _mm_mul_pd(r13,ewtabscale);
1251 ewitab = _mm_cvttpd_epi32(ewrt);
1252 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1253 ewitab = _mm_slli_epi32(ewitab,2);
1254 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1255 ewtabD = _mm_setzero_pd();
1256 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1257 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1258 ewtabFn = _mm_setzero_pd();
1259 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1260 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1261 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1262 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
1263 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1265 d = _mm_sub_pd(r13,rswitch);
1266 d = _mm_max_pd(d,_mm_setzero_pd());
1267 d2 = _mm_mul_pd(d,d);
1268 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1270 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1272 /* Evaluate switch function */
1273 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1274 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
1275 velec = _mm_mul_pd(velec,sw);
1276 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1278 /* Update potential sum for this i atom from the interaction with this j atom. */
1279 velec = _mm_and_pd(velec,cutoff_mask);
1280 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1281 velecsum = _mm_add_pd(velecsum,velec);
1285 fscal = _mm_and_pd(fscal,cutoff_mask);
1287 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1289 /* Calculate temporary vectorial force */
1290 tx = _mm_mul_pd(fscal,dx13);
1291 ty = _mm_mul_pd(fscal,dy13);
1292 tz = _mm_mul_pd(fscal,dz13);
1294 /* Update vectorial force */
1295 fix1 = _mm_add_pd(fix1,tx);
1296 fiy1 = _mm_add_pd(fiy1,ty);
1297 fiz1 = _mm_add_pd(fiz1,tz);
1299 fjx3 = _mm_add_pd(fjx3,tx);
1300 fjy3 = _mm_add_pd(fjy3,ty);
1301 fjz3 = _mm_add_pd(fjz3,tz);
1305 /**************************
1306 * CALCULATE INTERACTIONS *
1307 **************************/
1309 if (gmx_mm_any_lt(rsq21,rcutoff2))
1312 r21 = _mm_mul_pd(rsq21,rinv21);
1314 /* EWALD ELECTROSTATICS */
1316 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1317 ewrt = _mm_mul_pd(r21,ewtabscale);
1318 ewitab = _mm_cvttpd_epi32(ewrt);
1319 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1320 ewitab = _mm_slli_epi32(ewitab,2);
1321 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1322 ewtabD = _mm_setzero_pd();
1323 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1324 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1325 ewtabFn = _mm_setzero_pd();
1326 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1327 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1328 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1329 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1330 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1332 d = _mm_sub_pd(r21,rswitch);
1333 d = _mm_max_pd(d,_mm_setzero_pd());
1334 d2 = _mm_mul_pd(d,d);
1335 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1337 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1339 /* Evaluate switch function */
1340 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1341 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1342 velec = _mm_mul_pd(velec,sw);
1343 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1345 /* Update potential sum for this i atom from the interaction with this j atom. */
1346 velec = _mm_and_pd(velec,cutoff_mask);
1347 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1348 velecsum = _mm_add_pd(velecsum,velec);
1352 fscal = _mm_and_pd(fscal,cutoff_mask);
1354 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1356 /* Calculate temporary vectorial force */
1357 tx = _mm_mul_pd(fscal,dx21);
1358 ty = _mm_mul_pd(fscal,dy21);
1359 tz = _mm_mul_pd(fscal,dz21);
1361 /* Update vectorial force */
1362 fix2 = _mm_add_pd(fix2,tx);
1363 fiy2 = _mm_add_pd(fiy2,ty);
1364 fiz2 = _mm_add_pd(fiz2,tz);
1366 fjx1 = _mm_add_pd(fjx1,tx);
1367 fjy1 = _mm_add_pd(fjy1,ty);
1368 fjz1 = _mm_add_pd(fjz1,tz);
1372 /**************************
1373 * CALCULATE INTERACTIONS *
1374 **************************/
1376 if (gmx_mm_any_lt(rsq22,rcutoff2))
1379 r22 = _mm_mul_pd(rsq22,rinv22);
1381 /* EWALD ELECTROSTATICS */
1383 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1384 ewrt = _mm_mul_pd(r22,ewtabscale);
1385 ewitab = _mm_cvttpd_epi32(ewrt);
1386 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1387 ewitab = _mm_slli_epi32(ewitab,2);
1388 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1389 ewtabD = _mm_setzero_pd();
1390 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1391 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1392 ewtabFn = _mm_setzero_pd();
1393 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1394 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1395 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1396 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1397 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1399 d = _mm_sub_pd(r22,rswitch);
1400 d = _mm_max_pd(d,_mm_setzero_pd());
1401 d2 = _mm_mul_pd(d,d);
1402 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1404 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1406 /* Evaluate switch function */
1407 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1408 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1409 velec = _mm_mul_pd(velec,sw);
1410 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1412 /* Update potential sum for this i atom from the interaction with this j atom. */
1413 velec = _mm_and_pd(velec,cutoff_mask);
1414 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1415 velecsum = _mm_add_pd(velecsum,velec);
1419 fscal = _mm_and_pd(fscal,cutoff_mask);
1421 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1423 /* Calculate temporary vectorial force */
1424 tx = _mm_mul_pd(fscal,dx22);
1425 ty = _mm_mul_pd(fscal,dy22);
1426 tz = _mm_mul_pd(fscal,dz22);
1428 /* Update vectorial force */
1429 fix2 = _mm_add_pd(fix2,tx);
1430 fiy2 = _mm_add_pd(fiy2,ty);
1431 fiz2 = _mm_add_pd(fiz2,tz);
1433 fjx2 = _mm_add_pd(fjx2,tx);
1434 fjy2 = _mm_add_pd(fjy2,ty);
1435 fjz2 = _mm_add_pd(fjz2,tz);
1439 /**************************
1440 * CALCULATE INTERACTIONS *
1441 **************************/
1443 if (gmx_mm_any_lt(rsq23,rcutoff2))
1446 r23 = _mm_mul_pd(rsq23,rinv23);
1448 /* EWALD ELECTROSTATICS */
1450 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1451 ewrt = _mm_mul_pd(r23,ewtabscale);
1452 ewitab = _mm_cvttpd_epi32(ewrt);
1453 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1454 ewitab = _mm_slli_epi32(ewitab,2);
1455 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1456 ewtabD = _mm_setzero_pd();
1457 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1458 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1459 ewtabFn = _mm_setzero_pd();
1460 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1461 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1462 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1463 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
1464 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1466 d = _mm_sub_pd(r23,rswitch);
1467 d = _mm_max_pd(d,_mm_setzero_pd());
1468 d2 = _mm_mul_pd(d,d);
1469 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1471 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1473 /* Evaluate switch function */
1474 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1475 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
1476 velec = _mm_mul_pd(velec,sw);
1477 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1479 /* Update potential sum for this i atom from the interaction with this j atom. */
1480 velec = _mm_and_pd(velec,cutoff_mask);
1481 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1482 velecsum = _mm_add_pd(velecsum,velec);
1486 fscal = _mm_and_pd(fscal,cutoff_mask);
1488 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1490 /* Calculate temporary vectorial force */
1491 tx = _mm_mul_pd(fscal,dx23);
1492 ty = _mm_mul_pd(fscal,dy23);
1493 tz = _mm_mul_pd(fscal,dz23);
1495 /* Update vectorial force */
1496 fix2 = _mm_add_pd(fix2,tx);
1497 fiy2 = _mm_add_pd(fiy2,ty);
1498 fiz2 = _mm_add_pd(fiz2,tz);
1500 fjx3 = _mm_add_pd(fjx3,tx);
1501 fjy3 = _mm_add_pd(fjy3,ty);
1502 fjz3 = _mm_add_pd(fjz3,tz);
1506 /**************************
1507 * CALCULATE INTERACTIONS *
1508 **************************/
1510 if (gmx_mm_any_lt(rsq31,rcutoff2))
1513 r31 = _mm_mul_pd(rsq31,rinv31);
1515 /* EWALD ELECTROSTATICS */
1517 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1518 ewrt = _mm_mul_pd(r31,ewtabscale);
1519 ewitab = _mm_cvttpd_epi32(ewrt);
1520 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1521 ewitab = _mm_slli_epi32(ewitab,2);
1522 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1523 ewtabD = _mm_setzero_pd();
1524 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1525 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1526 ewtabFn = _mm_setzero_pd();
1527 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1528 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1529 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1530 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
1531 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1533 d = _mm_sub_pd(r31,rswitch);
1534 d = _mm_max_pd(d,_mm_setzero_pd());
1535 d2 = _mm_mul_pd(d,d);
1536 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1538 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1540 /* Evaluate switch function */
1541 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1542 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
1543 velec = _mm_mul_pd(velec,sw);
1544 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1546 /* Update potential sum for this i atom from the interaction with this j atom. */
1547 velec = _mm_and_pd(velec,cutoff_mask);
1548 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1549 velecsum = _mm_add_pd(velecsum,velec);
1553 fscal = _mm_and_pd(fscal,cutoff_mask);
1555 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1557 /* Calculate temporary vectorial force */
1558 tx = _mm_mul_pd(fscal,dx31);
1559 ty = _mm_mul_pd(fscal,dy31);
1560 tz = _mm_mul_pd(fscal,dz31);
1562 /* Update vectorial force */
1563 fix3 = _mm_add_pd(fix3,tx);
1564 fiy3 = _mm_add_pd(fiy3,ty);
1565 fiz3 = _mm_add_pd(fiz3,tz);
1567 fjx1 = _mm_add_pd(fjx1,tx);
1568 fjy1 = _mm_add_pd(fjy1,ty);
1569 fjz1 = _mm_add_pd(fjz1,tz);
1573 /**************************
1574 * CALCULATE INTERACTIONS *
1575 **************************/
1577 if (gmx_mm_any_lt(rsq32,rcutoff2))
1580 r32 = _mm_mul_pd(rsq32,rinv32);
1582 /* EWALD ELECTROSTATICS */
1584 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1585 ewrt = _mm_mul_pd(r32,ewtabscale);
1586 ewitab = _mm_cvttpd_epi32(ewrt);
1587 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1588 ewitab = _mm_slli_epi32(ewitab,2);
1589 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1590 ewtabD = _mm_setzero_pd();
1591 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1592 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1593 ewtabFn = _mm_setzero_pd();
1594 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1595 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1596 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1597 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
1598 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1600 d = _mm_sub_pd(r32,rswitch);
1601 d = _mm_max_pd(d,_mm_setzero_pd());
1602 d2 = _mm_mul_pd(d,d);
1603 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1605 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1607 /* Evaluate switch function */
1608 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1609 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
1610 velec = _mm_mul_pd(velec,sw);
1611 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1613 /* Update potential sum for this i atom from the interaction with this j atom. */
1614 velec = _mm_and_pd(velec,cutoff_mask);
1615 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1616 velecsum = _mm_add_pd(velecsum,velec);
1620 fscal = _mm_and_pd(fscal,cutoff_mask);
1622 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1624 /* Calculate temporary vectorial force */
1625 tx = _mm_mul_pd(fscal,dx32);
1626 ty = _mm_mul_pd(fscal,dy32);
1627 tz = _mm_mul_pd(fscal,dz32);
1629 /* Update vectorial force */
1630 fix3 = _mm_add_pd(fix3,tx);
1631 fiy3 = _mm_add_pd(fiy3,ty);
1632 fiz3 = _mm_add_pd(fiz3,tz);
1634 fjx2 = _mm_add_pd(fjx2,tx);
1635 fjy2 = _mm_add_pd(fjy2,ty);
1636 fjz2 = _mm_add_pd(fjz2,tz);
1640 /**************************
1641 * CALCULATE INTERACTIONS *
1642 **************************/
1644 if (gmx_mm_any_lt(rsq33,rcutoff2))
1647 r33 = _mm_mul_pd(rsq33,rinv33);
1649 /* EWALD ELECTROSTATICS */
1651 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1652 ewrt = _mm_mul_pd(r33,ewtabscale);
1653 ewitab = _mm_cvttpd_epi32(ewrt);
1654 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1655 ewitab = _mm_slli_epi32(ewitab,2);
1656 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1657 ewtabD = _mm_setzero_pd();
1658 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1659 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1660 ewtabFn = _mm_setzero_pd();
1661 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1662 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1663 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1664 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
1665 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1667 d = _mm_sub_pd(r33,rswitch);
1668 d = _mm_max_pd(d,_mm_setzero_pd());
1669 d2 = _mm_mul_pd(d,d);
1670 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1672 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1674 /* Evaluate switch function */
1675 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1676 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
1677 velec = _mm_mul_pd(velec,sw);
1678 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1680 /* Update potential sum for this i atom from the interaction with this j atom. */
1681 velec = _mm_and_pd(velec,cutoff_mask);
1682 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1683 velecsum = _mm_add_pd(velecsum,velec);
1687 fscal = _mm_and_pd(fscal,cutoff_mask);
1689 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1691 /* Calculate temporary vectorial force */
1692 tx = _mm_mul_pd(fscal,dx33);
1693 ty = _mm_mul_pd(fscal,dy33);
1694 tz = _mm_mul_pd(fscal,dz33);
1696 /* Update vectorial force */
1697 fix3 = _mm_add_pd(fix3,tx);
1698 fiy3 = _mm_add_pd(fiy3,ty);
1699 fiz3 = _mm_add_pd(fiz3,tz);
1701 fjx3 = _mm_add_pd(fjx3,tx);
1702 fjy3 = _mm_add_pd(fjy3,ty);
1703 fjz3 = _mm_add_pd(fjz3,tz);
1707 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1709 /* Inner loop uses 647 flops */
1712 /* End of innermost loop */
1714 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1715 f+i_coord_offset,fshift+i_shift_offset);
1718 /* Update potential energies */
1719 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1720 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1722 /* Increment number of inner iterations */
1723 inneriter += j_index_end - j_index_start;
1725 /* Outer loop uses 26 flops */
1728 /* Increment number of outer iterations */
1731 /* Update outer/inner flops */
1733 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*647);
1736 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_double
1737 * Electrostatics interaction: Ewald
1738 * VdW interaction: LennardJones
1739 * Geometry: Water4-Water4
1740 * Calculate force/pot: Force
1743 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_double
1744 (t_nblist * gmx_restrict nlist,
1745 rvec * gmx_restrict xx,
1746 rvec * gmx_restrict ff,
1747 t_forcerec * gmx_restrict fr,
1748 t_mdatoms * gmx_restrict mdatoms,
1749 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1750 t_nrnb * gmx_restrict nrnb)
1752 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1753 * just 0 for non-waters.
1754 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1755 * jnr indices corresponding to data put in the four positions in the SIMD register.
1757 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1758 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1760 int j_coord_offsetA,j_coord_offsetB;
1761 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1762 real rcutoff_scalar;
1763 real *shiftvec,*fshift,*x,*f;
1764 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1766 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1768 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1770 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1772 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1773 int vdwjidx0A,vdwjidx0B;
1774 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1775 int vdwjidx1A,vdwjidx1B;
1776 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1777 int vdwjidx2A,vdwjidx2B;
1778 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1779 int vdwjidx3A,vdwjidx3B;
1780 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1781 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1782 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1783 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1784 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1785 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1786 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1787 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1788 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1789 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1790 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1791 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1794 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1797 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1798 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1800 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1802 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1803 real rswitch_scalar,d_scalar;
1804 __m128d dummy_mask,cutoff_mask;
1805 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1806 __m128d one = _mm_set1_pd(1.0);
1807 __m128d two = _mm_set1_pd(2.0);
1813 jindex = nlist->jindex;
1815 shiftidx = nlist->shift;
1817 shiftvec = fr->shift_vec[0];
1818 fshift = fr->fshift[0];
1819 facel = _mm_set1_pd(fr->epsfac);
1820 charge = mdatoms->chargeA;
1821 nvdwtype = fr->ntype;
1822 vdwparam = fr->nbfp;
1823 vdwtype = mdatoms->typeA;
1825 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1826 ewtab = fr->ic->tabq_coul_FDV0;
1827 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1828 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1830 /* Setup water-specific parameters */
1831 inr = nlist->iinr[0];
1832 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1833 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1834 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1835 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1837 jq1 = _mm_set1_pd(charge[inr+1]);
1838 jq2 = _mm_set1_pd(charge[inr+2]);
1839 jq3 = _mm_set1_pd(charge[inr+3]);
1840 vdwjidx0A = 2*vdwtype[inr+0];
1841 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1842 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1843 qq11 = _mm_mul_pd(iq1,jq1);
1844 qq12 = _mm_mul_pd(iq1,jq2);
1845 qq13 = _mm_mul_pd(iq1,jq3);
1846 qq21 = _mm_mul_pd(iq2,jq1);
1847 qq22 = _mm_mul_pd(iq2,jq2);
1848 qq23 = _mm_mul_pd(iq2,jq3);
1849 qq31 = _mm_mul_pd(iq3,jq1);
1850 qq32 = _mm_mul_pd(iq3,jq2);
1851 qq33 = _mm_mul_pd(iq3,jq3);
1853 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1854 rcutoff_scalar = fr->rcoulomb;
1855 rcutoff = _mm_set1_pd(rcutoff_scalar);
1856 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1858 rswitch_scalar = fr->rcoulomb_switch;
1859 rswitch = _mm_set1_pd(rswitch_scalar);
1860 /* Setup switch parameters */
1861 d_scalar = rcutoff_scalar-rswitch_scalar;
1862 d = _mm_set1_pd(d_scalar);
1863 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1864 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1865 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1866 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1867 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1868 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1870 /* Avoid stupid compiler warnings */
1872 j_coord_offsetA = 0;
1873 j_coord_offsetB = 0;
1878 /* Start outer loop over neighborlists */
1879 for(iidx=0; iidx<nri; iidx++)
1881 /* Load shift vector for this list */
1882 i_shift_offset = DIM*shiftidx[iidx];
1884 /* Load limits for loop over neighbors */
1885 j_index_start = jindex[iidx];
1886 j_index_end = jindex[iidx+1];
1888 /* Get outer coordinate index */
1890 i_coord_offset = DIM*inr;
1892 /* Load i particle coords and add shift vector */
1893 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1894 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1896 fix0 = _mm_setzero_pd();
1897 fiy0 = _mm_setzero_pd();
1898 fiz0 = _mm_setzero_pd();
1899 fix1 = _mm_setzero_pd();
1900 fiy1 = _mm_setzero_pd();
1901 fiz1 = _mm_setzero_pd();
1902 fix2 = _mm_setzero_pd();
1903 fiy2 = _mm_setzero_pd();
1904 fiz2 = _mm_setzero_pd();
1905 fix3 = _mm_setzero_pd();
1906 fiy3 = _mm_setzero_pd();
1907 fiz3 = _mm_setzero_pd();
1909 /* Start inner kernel loop */
1910 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1913 /* Get j neighbor index, and coordinate index */
1915 jnrB = jjnr[jidx+1];
1916 j_coord_offsetA = DIM*jnrA;
1917 j_coord_offsetB = DIM*jnrB;
1919 /* load j atom coordinates */
1920 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1921 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1922 &jy2,&jz2,&jx3,&jy3,&jz3);
1924 /* Calculate displacement vector */
1925 dx00 = _mm_sub_pd(ix0,jx0);
1926 dy00 = _mm_sub_pd(iy0,jy0);
1927 dz00 = _mm_sub_pd(iz0,jz0);
1928 dx11 = _mm_sub_pd(ix1,jx1);
1929 dy11 = _mm_sub_pd(iy1,jy1);
1930 dz11 = _mm_sub_pd(iz1,jz1);
1931 dx12 = _mm_sub_pd(ix1,jx2);
1932 dy12 = _mm_sub_pd(iy1,jy2);
1933 dz12 = _mm_sub_pd(iz1,jz2);
1934 dx13 = _mm_sub_pd(ix1,jx3);
1935 dy13 = _mm_sub_pd(iy1,jy3);
1936 dz13 = _mm_sub_pd(iz1,jz3);
1937 dx21 = _mm_sub_pd(ix2,jx1);
1938 dy21 = _mm_sub_pd(iy2,jy1);
1939 dz21 = _mm_sub_pd(iz2,jz1);
1940 dx22 = _mm_sub_pd(ix2,jx2);
1941 dy22 = _mm_sub_pd(iy2,jy2);
1942 dz22 = _mm_sub_pd(iz2,jz2);
1943 dx23 = _mm_sub_pd(ix2,jx3);
1944 dy23 = _mm_sub_pd(iy2,jy3);
1945 dz23 = _mm_sub_pd(iz2,jz3);
1946 dx31 = _mm_sub_pd(ix3,jx1);
1947 dy31 = _mm_sub_pd(iy3,jy1);
1948 dz31 = _mm_sub_pd(iz3,jz1);
1949 dx32 = _mm_sub_pd(ix3,jx2);
1950 dy32 = _mm_sub_pd(iy3,jy2);
1951 dz32 = _mm_sub_pd(iz3,jz2);
1952 dx33 = _mm_sub_pd(ix3,jx3);
1953 dy33 = _mm_sub_pd(iy3,jy3);
1954 dz33 = _mm_sub_pd(iz3,jz3);
1956 /* Calculate squared distance and things based on it */
1957 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1958 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1959 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1960 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1961 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1962 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1963 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1964 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1965 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1966 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1968 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1969 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1970 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1971 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1972 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1973 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1974 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1975 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1976 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1977 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1979 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1980 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1981 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1982 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1983 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1984 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1985 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1986 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1987 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1988 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1990 fjx0 = _mm_setzero_pd();
1991 fjy0 = _mm_setzero_pd();
1992 fjz0 = _mm_setzero_pd();
1993 fjx1 = _mm_setzero_pd();
1994 fjy1 = _mm_setzero_pd();
1995 fjz1 = _mm_setzero_pd();
1996 fjx2 = _mm_setzero_pd();
1997 fjy2 = _mm_setzero_pd();
1998 fjz2 = _mm_setzero_pd();
1999 fjx3 = _mm_setzero_pd();
2000 fjy3 = _mm_setzero_pd();
2001 fjz3 = _mm_setzero_pd();
2003 /**************************
2004 * CALCULATE INTERACTIONS *
2005 **************************/
2007 if (gmx_mm_any_lt(rsq00,rcutoff2))
2010 r00 = _mm_mul_pd(rsq00,rinv00);
2012 /* LENNARD-JONES DISPERSION/REPULSION */
2014 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2015 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2016 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2017 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2018 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2020 d = _mm_sub_pd(r00,rswitch);
2021 d = _mm_max_pd(d,_mm_setzero_pd());
2022 d2 = _mm_mul_pd(d,d);
2023 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2025 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2027 /* Evaluate switch function */
2028 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2029 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2030 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2034 fscal = _mm_and_pd(fscal,cutoff_mask);
2036 /* Calculate temporary vectorial force */
2037 tx = _mm_mul_pd(fscal,dx00);
2038 ty = _mm_mul_pd(fscal,dy00);
2039 tz = _mm_mul_pd(fscal,dz00);
2041 /* Update vectorial force */
2042 fix0 = _mm_add_pd(fix0,tx);
2043 fiy0 = _mm_add_pd(fiy0,ty);
2044 fiz0 = _mm_add_pd(fiz0,tz);
2046 fjx0 = _mm_add_pd(fjx0,tx);
2047 fjy0 = _mm_add_pd(fjy0,ty);
2048 fjz0 = _mm_add_pd(fjz0,tz);
2052 /**************************
2053 * CALCULATE INTERACTIONS *
2054 **************************/
2056 if (gmx_mm_any_lt(rsq11,rcutoff2))
2059 r11 = _mm_mul_pd(rsq11,rinv11);
2061 /* EWALD ELECTROSTATICS */
2063 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2064 ewrt = _mm_mul_pd(r11,ewtabscale);
2065 ewitab = _mm_cvttpd_epi32(ewrt);
2066 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2067 ewitab = _mm_slli_epi32(ewitab,2);
2068 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2069 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2070 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2071 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2072 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2073 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2074 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2075 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2076 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2077 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2079 d = _mm_sub_pd(r11,rswitch);
2080 d = _mm_max_pd(d,_mm_setzero_pd());
2081 d2 = _mm_mul_pd(d,d);
2082 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2084 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2086 /* Evaluate switch function */
2087 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2088 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2089 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2093 fscal = _mm_and_pd(fscal,cutoff_mask);
2095 /* Calculate temporary vectorial force */
2096 tx = _mm_mul_pd(fscal,dx11);
2097 ty = _mm_mul_pd(fscal,dy11);
2098 tz = _mm_mul_pd(fscal,dz11);
2100 /* Update vectorial force */
2101 fix1 = _mm_add_pd(fix1,tx);
2102 fiy1 = _mm_add_pd(fiy1,ty);
2103 fiz1 = _mm_add_pd(fiz1,tz);
2105 fjx1 = _mm_add_pd(fjx1,tx);
2106 fjy1 = _mm_add_pd(fjy1,ty);
2107 fjz1 = _mm_add_pd(fjz1,tz);
2111 /**************************
2112 * CALCULATE INTERACTIONS *
2113 **************************/
2115 if (gmx_mm_any_lt(rsq12,rcutoff2))
2118 r12 = _mm_mul_pd(rsq12,rinv12);
2120 /* EWALD ELECTROSTATICS */
2122 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2123 ewrt = _mm_mul_pd(r12,ewtabscale);
2124 ewitab = _mm_cvttpd_epi32(ewrt);
2125 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2126 ewitab = _mm_slli_epi32(ewitab,2);
2127 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2128 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2129 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2130 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2131 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2132 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2133 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2134 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2135 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2136 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2138 d = _mm_sub_pd(r12,rswitch);
2139 d = _mm_max_pd(d,_mm_setzero_pd());
2140 d2 = _mm_mul_pd(d,d);
2141 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2143 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2145 /* Evaluate switch function */
2146 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2147 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2148 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2152 fscal = _mm_and_pd(fscal,cutoff_mask);
2154 /* Calculate temporary vectorial force */
2155 tx = _mm_mul_pd(fscal,dx12);
2156 ty = _mm_mul_pd(fscal,dy12);
2157 tz = _mm_mul_pd(fscal,dz12);
2159 /* Update vectorial force */
2160 fix1 = _mm_add_pd(fix1,tx);
2161 fiy1 = _mm_add_pd(fiy1,ty);
2162 fiz1 = _mm_add_pd(fiz1,tz);
2164 fjx2 = _mm_add_pd(fjx2,tx);
2165 fjy2 = _mm_add_pd(fjy2,ty);
2166 fjz2 = _mm_add_pd(fjz2,tz);
2170 /**************************
2171 * CALCULATE INTERACTIONS *
2172 **************************/
2174 if (gmx_mm_any_lt(rsq13,rcutoff2))
2177 r13 = _mm_mul_pd(rsq13,rinv13);
2179 /* EWALD ELECTROSTATICS */
2181 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2182 ewrt = _mm_mul_pd(r13,ewtabscale);
2183 ewitab = _mm_cvttpd_epi32(ewrt);
2184 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2185 ewitab = _mm_slli_epi32(ewitab,2);
2186 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2187 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2188 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2189 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2190 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2191 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2192 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2193 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2194 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2195 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2197 d = _mm_sub_pd(r13,rswitch);
2198 d = _mm_max_pd(d,_mm_setzero_pd());
2199 d2 = _mm_mul_pd(d,d);
2200 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2202 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2204 /* Evaluate switch function */
2205 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2206 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2207 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2211 fscal = _mm_and_pd(fscal,cutoff_mask);
2213 /* Calculate temporary vectorial force */
2214 tx = _mm_mul_pd(fscal,dx13);
2215 ty = _mm_mul_pd(fscal,dy13);
2216 tz = _mm_mul_pd(fscal,dz13);
2218 /* Update vectorial force */
2219 fix1 = _mm_add_pd(fix1,tx);
2220 fiy1 = _mm_add_pd(fiy1,ty);
2221 fiz1 = _mm_add_pd(fiz1,tz);
2223 fjx3 = _mm_add_pd(fjx3,tx);
2224 fjy3 = _mm_add_pd(fjy3,ty);
2225 fjz3 = _mm_add_pd(fjz3,tz);
2229 /**************************
2230 * CALCULATE INTERACTIONS *
2231 **************************/
2233 if (gmx_mm_any_lt(rsq21,rcutoff2))
2236 r21 = _mm_mul_pd(rsq21,rinv21);
2238 /* EWALD ELECTROSTATICS */
2240 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2241 ewrt = _mm_mul_pd(r21,ewtabscale);
2242 ewitab = _mm_cvttpd_epi32(ewrt);
2243 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2244 ewitab = _mm_slli_epi32(ewitab,2);
2245 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2246 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2247 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2248 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2249 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2250 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2251 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2252 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2253 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2254 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2256 d = _mm_sub_pd(r21,rswitch);
2257 d = _mm_max_pd(d,_mm_setzero_pd());
2258 d2 = _mm_mul_pd(d,d);
2259 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2261 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2263 /* Evaluate switch function */
2264 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2265 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2266 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2270 fscal = _mm_and_pd(fscal,cutoff_mask);
2272 /* Calculate temporary vectorial force */
2273 tx = _mm_mul_pd(fscal,dx21);
2274 ty = _mm_mul_pd(fscal,dy21);
2275 tz = _mm_mul_pd(fscal,dz21);
2277 /* Update vectorial force */
2278 fix2 = _mm_add_pd(fix2,tx);
2279 fiy2 = _mm_add_pd(fiy2,ty);
2280 fiz2 = _mm_add_pd(fiz2,tz);
2282 fjx1 = _mm_add_pd(fjx1,tx);
2283 fjy1 = _mm_add_pd(fjy1,ty);
2284 fjz1 = _mm_add_pd(fjz1,tz);
2288 /**************************
2289 * CALCULATE INTERACTIONS *
2290 **************************/
2292 if (gmx_mm_any_lt(rsq22,rcutoff2))
2295 r22 = _mm_mul_pd(rsq22,rinv22);
2297 /* EWALD ELECTROSTATICS */
2299 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2300 ewrt = _mm_mul_pd(r22,ewtabscale);
2301 ewitab = _mm_cvttpd_epi32(ewrt);
2302 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2303 ewitab = _mm_slli_epi32(ewitab,2);
2304 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2305 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2306 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2307 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2308 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2309 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2310 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2311 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2312 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2313 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2315 d = _mm_sub_pd(r22,rswitch);
2316 d = _mm_max_pd(d,_mm_setzero_pd());
2317 d2 = _mm_mul_pd(d,d);
2318 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2320 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2322 /* Evaluate switch function */
2323 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2324 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2325 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2329 fscal = _mm_and_pd(fscal,cutoff_mask);
2331 /* Calculate temporary vectorial force */
2332 tx = _mm_mul_pd(fscal,dx22);
2333 ty = _mm_mul_pd(fscal,dy22);
2334 tz = _mm_mul_pd(fscal,dz22);
2336 /* Update vectorial force */
2337 fix2 = _mm_add_pd(fix2,tx);
2338 fiy2 = _mm_add_pd(fiy2,ty);
2339 fiz2 = _mm_add_pd(fiz2,tz);
2341 fjx2 = _mm_add_pd(fjx2,tx);
2342 fjy2 = _mm_add_pd(fjy2,ty);
2343 fjz2 = _mm_add_pd(fjz2,tz);
2347 /**************************
2348 * CALCULATE INTERACTIONS *
2349 **************************/
2351 if (gmx_mm_any_lt(rsq23,rcutoff2))
2354 r23 = _mm_mul_pd(rsq23,rinv23);
2356 /* EWALD ELECTROSTATICS */
2358 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2359 ewrt = _mm_mul_pd(r23,ewtabscale);
2360 ewitab = _mm_cvttpd_epi32(ewrt);
2361 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2362 ewitab = _mm_slli_epi32(ewitab,2);
2363 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2364 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2365 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2366 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2367 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2368 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2369 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2370 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2371 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
2372 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2374 d = _mm_sub_pd(r23,rswitch);
2375 d = _mm_max_pd(d,_mm_setzero_pd());
2376 d2 = _mm_mul_pd(d,d);
2377 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2379 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2381 /* Evaluate switch function */
2382 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2383 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
2384 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2388 fscal = _mm_and_pd(fscal,cutoff_mask);
2390 /* Calculate temporary vectorial force */
2391 tx = _mm_mul_pd(fscal,dx23);
2392 ty = _mm_mul_pd(fscal,dy23);
2393 tz = _mm_mul_pd(fscal,dz23);
2395 /* Update vectorial force */
2396 fix2 = _mm_add_pd(fix2,tx);
2397 fiy2 = _mm_add_pd(fiy2,ty);
2398 fiz2 = _mm_add_pd(fiz2,tz);
2400 fjx3 = _mm_add_pd(fjx3,tx);
2401 fjy3 = _mm_add_pd(fjy3,ty);
2402 fjz3 = _mm_add_pd(fjz3,tz);
2406 /**************************
2407 * CALCULATE INTERACTIONS *
2408 **************************/
2410 if (gmx_mm_any_lt(rsq31,rcutoff2))
2413 r31 = _mm_mul_pd(rsq31,rinv31);
2415 /* EWALD ELECTROSTATICS */
2417 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2418 ewrt = _mm_mul_pd(r31,ewtabscale);
2419 ewitab = _mm_cvttpd_epi32(ewrt);
2420 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2421 ewitab = _mm_slli_epi32(ewitab,2);
2422 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2423 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2424 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2425 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2426 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2427 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2428 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2429 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2430 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
2431 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2433 d = _mm_sub_pd(r31,rswitch);
2434 d = _mm_max_pd(d,_mm_setzero_pd());
2435 d2 = _mm_mul_pd(d,d);
2436 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2438 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2440 /* Evaluate switch function */
2441 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2442 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
2443 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2447 fscal = _mm_and_pd(fscal,cutoff_mask);
2449 /* Calculate temporary vectorial force */
2450 tx = _mm_mul_pd(fscal,dx31);
2451 ty = _mm_mul_pd(fscal,dy31);
2452 tz = _mm_mul_pd(fscal,dz31);
2454 /* Update vectorial force */
2455 fix3 = _mm_add_pd(fix3,tx);
2456 fiy3 = _mm_add_pd(fiy3,ty);
2457 fiz3 = _mm_add_pd(fiz3,tz);
2459 fjx1 = _mm_add_pd(fjx1,tx);
2460 fjy1 = _mm_add_pd(fjy1,ty);
2461 fjz1 = _mm_add_pd(fjz1,tz);
2465 /**************************
2466 * CALCULATE INTERACTIONS *
2467 **************************/
2469 if (gmx_mm_any_lt(rsq32,rcutoff2))
2472 r32 = _mm_mul_pd(rsq32,rinv32);
2474 /* EWALD ELECTROSTATICS */
2476 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2477 ewrt = _mm_mul_pd(r32,ewtabscale);
2478 ewitab = _mm_cvttpd_epi32(ewrt);
2479 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2480 ewitab = _mm_slli_epi32(ewitab,2);
2481 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2482 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2483 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2484 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2485 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2486 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2487 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2488 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2489 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
2490 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2492 d = _mm_sub_pd(r32,rswitch);
2493 d = _mm_max_pd(d,_mm_setzero_pd());
2494 d2 = _mm_mul_pd(d,d);
2495 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2497 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2499 /* Evaluate switch function */
2500 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2501 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
2502 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2506 fscal = _mm_and_pd(fscal,cutoff_mask);
2508 /* Calculate temporary vectorial force */
2509 tx = _mm_mul_pd(fscal,dx32);
2510 ty = _mm_mul_pd(fscal,dy32);
2511 tz = _mm_mul_pd(fscal,dz32);
2513 /* Update vectorial force */
2514 fix3 = _mm_add_pd(fix3,tx);
2515 fiy3 = _mm_add_pd(fiy3,ty);
2516 fiz3 = _mm_add_pd(fiz3,tz);
2518 fjx2 = _mm_add_pd(fjx2,tx);
2519 fjy2 = _mm_add_pd(fjy2,ty);
2520 fjz2 = _mm_add_pd(fjz2,tz);
2524 /**************************
2525 * CALCULATE INTERACTIONS *
2526 **************************/
2528 if (gmx_mm_any_lt(rsq33,rcutoff2))
2531 r33 = _mm_mul_pd(rsq33,rinv33);
2533 /* EWALD ELECTROSTATICS */
2535 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2536 ewrt = _mm_mul_pd(r33,ewtabscale);
2537 ewitab = _mm_cvttpd_epi32(ewrt);
2538 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2539 ewitab = _mm_slli_epi32(ewitab,2);
2540 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2541 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2542 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2543 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2544 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2545 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2546 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2547 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2548 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
2549 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2551 d = _mm_sub_pd(r33,rswitch);
2552 d = _mm_max_pd(d,_mm_setzero_pd());
2553 d2 = _mm_mul_pd(d,d);
2554 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2556 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2558 /* Evaluate switch function */
2559 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2560 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
2561 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2565 fscal = _mm_and_pd(fscal,cutoff_mask);
2567 /* Calculate temporary vectorial force */
2568 tx = _mm_mul_pd(fscal,dx33);
2569 ty = _mm_mul_pd(fscal,dy33);
2570 tz = _mm_mul_pd(fscal,dz33);
2572 /* Update vectorial force */
2573 fix3 = _mm_add_pd(fix3,tx);
2574 fiy3 = _mm_add_pd(fiy3,ty);
2575 fiz3 = _mm_add_pd(fiz3,tz);
2577 fjx3 = _mm_add_pd(fjx3,tx);
2578 fjy3 = _mm_add_pd(fjy3,ty);
2579 fjz3 = _mm_add_pd(fjz3,tz);
2583 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2585 /* Inner loop uses 617 flops */
2588 if(jidx<j_index_end)
2592 j_coord_offsetA = DIM*jnrA;
2594 /* load j atom coordinates */
2595 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2596 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2597 &jy2,&jz2,&jx3,&jy3,&jz3);
2599 /* Calculate displacement vector */
2600 dx00 = _mm_sub_pd(ix0,jx0);
2601 dy00 = _mm_sub_pd(iy0,jy0);
2602 dz00 = _mm_sub_pd(iz0,jz0);
2603 dx11 = _mm_sub_pd(ix1,jx1);
2604 dy11 = _mm_sub_pd(iy1,jy1);
2605 dz11 = _mm_sub_pd(iz1,jz1);
2606 dx12 = _mm_sub_pd(ix1,jx2);
2607 dy12 = _mm_sub_pd(iy1,jy2);
2608 dz12 = _mm_sub_pd(iz1,jz2);
2609 dx13 = _mm_sub_pd(ix1,jx3);
2610 dy13 = _mm_sub_pd(iy1,jy3);
2611 dz13 = _mm_sub_pd(iz1,jz3);
2612 dx21 = _mm_sub_pd(ix2,jx1);
2613 dy21 = _mm_sub_pd(iy2,jy1);
2614 dz21 = _mm_sub_pd(iz2,jz1);
2615 dx22 = _mm_sub_pd(ix2,jx2);
2616 dy22 = _mm_sub_pd(iy2,jy2);
2617 dz22 = _mm_sub_pd(iz2,jz2);
2618 dx23 = _mm_sub_pd(ix2,jx3);
2619 dy23 = _mm_sub_pd(iy2,jy3);
2620 dz23 = _mm_sub_pd(iz2,jz3);
2621 dx31 = _mm_sub_pd(ix3,jx1);
2622 dy31 = _mm_sub_pd(iy3,jy1);
2623 dz31 = _mm_sub_pd(iz3,jz1);
2624 dx32 = _mm_sub_pd(ix3,jx2);
2625 dy32 = _mm_sub_pd(iy3,jy2);
2626 dz32 = _mm_sub_pd(iz3,jz2);
2627 dx33 = _mm_sub_pd(ix3,jx3);
2628 dy33 = _mm_sub_pd(iy3,jy3);
2629 dz33 = _mm_sub_pd(iz3,jz3);
2631 /* Calculate squared distance and things based on it */
2632 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2633 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2634 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2635 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2636 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2637 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2638 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2639 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2640 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2641 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2643 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2644 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2645 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2646 rinv13 = gmx_mm_invsqrt_pd(rsq13);
2647 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2648 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2649 rinv23 = gmx_mm_invsqrt_pd(rsq23);
2650 rinv31 = gmx_mm_invsqrt_pd(rsq31);
2651 rinv32 = gmx_mm_invsqrt_pd(rsq32);
2652 rinv33 = gmx_mm_invsqrt_pd(rsq33);
2654 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2655 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2656 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2657 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
2658 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2659 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2660 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
2661 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
2662 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
2663 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
2665 fjx0 = _mm_setzero_pd();
2666 fjy0 = _mm_setzero_pd();
2667 fjz0 = _mm_setzero_pd();
2668 fjx1 = _mm_setzero_pd();
2669 fjy1 = _mm_setzero_pd();
2670 fjz1 = _mm_setzero_pd();
2671 fjx2 = _mm_setzero_pd();
2672 fjy2 = _mm_setzero_pd();
2673 fjz2 = _mm_setzero_pd();
2674 fjx3 = _mm_setzero_pd();
2675 fjy3 = _mm_setzero_pd();
2676 fjz3 = _mm_setzero_pd();
2678 /**************************
2679 * CALCULATE INTERACTIONS *
2680 **************************/
2682 if (gmx_mm_any_lt(rsq00,rcutoff2))
2685 r00 = _mm_mul_pd(rsq00,rinv00);
2687 /* LENNARD-JONES DISPERSION/REPULSION */
2689 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2690 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2691 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2692 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2693 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2695 d = _mm_sub_pd(r00,rswitch);
2696 d = _mm_max_pd(d,_mm_setzero_pd());
2697 d2 = _mm_mul_pd(d,d);
2698 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2700 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2702 /* Evaluate switch function */
2703 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2704 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2705 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2709 fscal = _mm_and_pd(fscal,cutoff_mask);
2711 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2713 /* Calculate temporary vectorial force */
2714 tx = _mm_mul_pd(fscal,dx00);
2715 ty = _mm_mul_pd(fscal,dy00);
2716 tz = _mm_mul_pd(fscal,dz00);
2718 /* Update vectorial force */
2719 fix0 = _mm_add_pd(fix0,tx);
2720 fiy0 = _mm_add_pd(fiy0,ty);
2721 fiz0 = _mm_add_pd(fiz0,tz);
2723 fjx0 = _mm_add_pd(fjx0,tx);
2724 fjy0 = _mm_add_pd(fjy0,ty);
2725 fjz0 = _mm_add_pd(fjz0,tz);
2729 /**************************
2730 * CALCULATE INTERACTIONS *
2731 **************************/
2733 if (gmx_mm_any_lt(rsq11,rcutoff2))
2736 r11 = _mm_mul_pd(rsq11,rinv11);
2738 /* EWALD ELECTROSTATICS */
2740 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2741 ewrt = _mm_mul_pd(r11,ewtabscale);
2742 ewitab = _mm_cvttpd_epi32(ewrt);
2743 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2744 ewitab = _mm_slli_epi32(ewitab,2);
2745 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2746 ewtabD = _mm_setzero_pd();
2747 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2748 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2749 ewtabFn = _mm_setzero_pd();
2750 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2751 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2752 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2753 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2754 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2756 d = _mm_sub_pd(r11,rswitch);
2757 d = _mm_max_pd(d,_mm_setzero_pd());
2758 d2 = _mm_mul_pd(d,d);
2759 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2761 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2763 /* Evaluate switch function */
2764 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2765 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2766 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2770 fscal = _mm_and_pd(fscal,cutoff_mask);
2772 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2774 /* Calculate temporary vectorial force */
2775 tx = _mm_mul_pd(fscal,dx11);
2776 ty = _mm_mul_pd(fscal,dy11);
2777 tz = _mm_mul_pd(fscal,dz11);
2779 /* Update vectorial force */
2780 fix1 = _mm_add_pd(fix1,tx);
2781 fiy1 = _mm_add_pd(fiy1,ty);
2782 fiz1 = _mm_add_pd(fiz1,tz);
2784 fjx1 = _mm_add_pd(fjx1,tx);
2785 fjy1 = _mm_add_pd(fjy1,ty);
2786 fjz1 = _mm_add_pd(fjz1,tz);
2790 /**************************
2791 * CALCULATE INTERACTIONS *
2792 **************************/
2794 if (gmx_mm_any_lt(rsq12,rcutoff2))
2797 r12 = _mm_mul_pd(rsq12,rinv12);
2799 /* EWALD ELECTROSTATICS */
2801 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2802 ewrt = _mm_mul_pd(r12,ewtabscale);
2803 ewitab = _mm_cvttpd_epi32(ewrt);
2804 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2805 ewitab = _mm_slli_epi32(ewitab,2);
2806 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2807 ewtabD = _mm_setzero_pd();
2808 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2809 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2810 ewtabFn = _mm_setzero_pd();
2811 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2812 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2813 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2814 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2815 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2817 d = _mm_sub_pd(r12,rswitch);
2818 d = _mm_max_pd(d,_mm_setzero_pd());
2819 d2 = _mm_mul_pd(d,d);
2820 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2822 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2824 /* Evaluate switch function */
2825 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2826 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2827 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2831 fscal = _mm_and_pd(fscal,cutoff_mask);
2833 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2835 /* Calculate temporary vectorial force */
2836 tx = _mm_mul_pd(fscal,dx12);
2837 ty = _mm_mul_pd(fscal,dy12);
2838 tz = _mm_mul_pd(fscal,dz12);
2840 /* Update vectorial force */
2841 fix1 = _mm_add_pd(fix1,tx);
2842 fiy1 = _mm_add_pd(fiy1,ty);
2843 fiz1 = _mm_add_pd(fiz1,tz);
2845 fjx2 = _mm_add_pd(fjx2,tx);
2846 fjy2 = _mm_add_pd(fjy2,ty);
2847 fjz2 = _mm_add_pd(fjz2,tz);
2851 /**************************
2852 * CALCULATE INTERACTIONS *
2853 **************************/
2855 if (gmx_mm_any_lt(rsq13,rcutoff2))
2858 r13 = _mm_mul_pd(rsq13,rinv13);
2860 /* EWALD ELECTROSTATICS */
2862 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2863 ewrt = _mm_mul_pd(r13,ewtabscale);
2864 ewitab = _mm_cvttpd_epi32(ewrt);
2865 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2866 ewitab = _mm_slli_epi32(ewitab,2);
2867 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2868 ewtabD = _mm_setzero_pd();
2869 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2870 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2871 ewtabFn = _mm_setzero_pd();
2872 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2873 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2874 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2875 velec = _mm_mul_pd(qq13,_mm_sub_pd(rinv13,velec));
2876 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2878 d = _mm_sub_pd(r13,rswitch);
2879 d = _mm_max_pd(d,_mm_setzero_pd());
2880 d2 = _mm_mul_pd(d,d);
2881 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2883 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2885 /* Evaluate switch function */
2886 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2887 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv13,_mm_mul_pd(velec,dsw)) );
2888 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2892 fscal = _mm_and_pd(fscal,cutoff_mask);
2894 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2896 /* Calculate temporary vectorial force */
2897 tx = _mm_mul_pd(fscal,dx13);
2898 ty = _mm_mul_pd(fscal,dy13);
2899 tz = _mm_mul_pd(fscal,dz13);
2901 /* Update vectorial force */
2902 fix1 = _mm_add_pd(fix1,tx);
2903 fiy1 = _mm_add_pd(fiy1,ty);
2904 fiz1 = _mm_add_pd(fiz1,tz);
2906 fjx3 = _mm_add_pd(fjx3,tx);
2907 fjy3 = _mm_add_pd(fjy3,ty);
2908 fjz3 = _mm_add_pd(fjz3,tz);
2912 /**************************
2913 * CALCULATE INTERACTIONS *
2914 **************************/
2916 if (gmx_mm_any_lt(rsq21,rcutoff2))
2919 r21 = _mm_mul_pd(rsq21,rinv21);
2921 /* EWALD ELECTROSTATICS */
2923 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2924 ewrt = _mm_mul_pd(r21,ewtabscale);
2925 ewitab = _mm_cvttpd_epi32(ewrt);
2926 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2927 ewitab = _mm_slli_epi32(ewitab,2);
2928 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2929 ewtabD = _mm_setzero_pd();
2930 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2931 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2932 ewtabFn = _mm_setzero_pd();
2933 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2934 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2935 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2936 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2937 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2939 d = _mm_sub_pd(r21,rswitch);
2940 d = _mm_max_pd(d,_mm_setzero_pd());
2941 d2 = _mm_mul_pd(d,d);
2942 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2944 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2946 /* Evaluate switch function */
2947 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2948 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2949 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2953 fscal = _mm_and_pd(fscal,cutoff_mask);
2955 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2957 /* Calculate temporary vectorial force */
2958 tx = _mm_mul_pd(fscal,dx21);
2959 ty = _mm_mul_pd(fscal,dy21);
2960 tz = _mm_mul_pd(fscal,dz21);
2962 /* Update vectorial force */
2963 fix2 = _mm_add_pd(fix2,tx);
2964 fiy2 = _mm_add_pd(fiy2,ty);
2965 fiz2 = _mm_add_pd(fiz2,tz);
2967 fjx1 = _mm_add_pd(fjx1,tx);
2968 fjy1 = _mm_add_pd(fjy1,ty);
2969 fjz1 = _mm_add_pd(fjz1,tz);
2973 /**************************
2974 * CALCULATE INTERACTIONS *
2975 **************************/
2977 if (gmx_mm_any_lt(rsq22,rcutoff2))
2980 r22 = _mm_mul_pd(rsq22,rinv22);
2982 /* EWALD ELECTROSTATICS */
2984 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2985 ewrt = _mm_mul_pd(r22,ewtabscale);
2986 ewitab = _mm_cvttpd_epi32(ewrt);
2987 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2988 ewitab = _mm_slli_epi32(ewitab,2);
2989 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2990 ewtabD = _mm_setzero_pd();
2991 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2992 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2993 ewtabFn = _mm_setzero_pd();
2994 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2995 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2996 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2997 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2998 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
3000 d = _mm_sub_pd(r22,rswitch);
3001 d = _mm_max_pd(d,_mm_setzero_pd());
3002 d2 = _mm_mul_pd(d,d);
3003 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3005 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3007 /* Evaluate switch function */
3008 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3009 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
3010 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
3014 fscal = _mm_and_pd(fscal,cutoff_mask);
3016 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3018 /* Calculate temporary vectorial force */
3019 tx = _mm_mul_pd(fscal,dx22);
3020 ty = _mm_mul_pd(fscal,dy22);
3021 tz = _mm_mul_pd(fscal,dz22);
3023 /* Update vectorial force */
3024 fix2 = _mm_add_pd(fix2,tx);
3025 fiy2 = _mm_add_pd(fiy2,ty);
3026 fiz2 = _mm_add_pd(fiz2,tz);
3028 fjx2 = _mm_add_pd(fjx2,tx);
3029 fjy2 = _mm_add_pd(fjy2,ty);
3030 fjz2 = _mm_add_pd(fjz2,tz);
3034 /**************************
3035 * CALCULATE INTERACTIONS *
3036 **************************/
3038 if (gmx_mm_any_lt(rsq23,rcutoff2))
3041 r23 = _mm_mul_pd(rsq23,rinv23);
3043 /* EWALD ELECTROSTATICS */
3045 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3046 ewrt = _mm_mul_pd(r23,ewtabscale);
3047 ewitab = _mm_cvttpd_epi32(ewrt);
3048 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3049 ewitab = _mm_slli_epi32(ewitab,2);
3050 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3051 ewtabD = _mm_setzero_pd();
3052 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3053 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3054 ewtabFn = _mm_setzero_pd();
3055 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3056 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3057 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3058 velec = _mm_mul_pd(qq23,_mm_sub_pd(rinv23,velec));
3059 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
3061 d = _mm_sub_pd(r23,rswitch);
3062 d = _mm_max_pd(d,_mm_setzero_pd());
3063 d2 = _mm_mul_pd(d,d);
3064 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3066 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3068 /* Evaluate switch function */
3069 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3070 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv23,_mm_mul_pd(velec,dsw)) );
3071 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
3075 fscal = _mm_and_pd(fscal,cutoff_mask);
3077 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3079 /* Calculate temporary vectorial force */
3080 tx = _mm_mul_pd(fscal,dx23);
3081 ty = _mm_mul_pd(fscal,dy23);
3082 tz = _mm_mul_pd(fscal,dz23);
3084 /* Update vectorial force */
3085 fix2 = _mm_add_pd(fix2,tx);
3086 fiy2 = _mm_add_pd(fiy2,ty);
3087 fiz2 = _mm_add_pd(fiz2,tz);
3089 fjx3 = _mm_add_pd(fjx3,tx);
3090 fjy3 = _mm_add_pd(fjy3,ty);
3091 fjz3 = _mm_add_pd(fjz3,tz);
3095 /**************************
3096 * CALCULATE INTERACTIONS *
3097 **************************/
3099 if (gmx_mm_any_lt(rsq31,rcutoff2))
3102 r31 = _mm_mul_pd(rsq31,rinv31);
3104 /* EWALD ELECTROSTATICS */
3106 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3107 ewrt = _mm_mul_pd(r31,ewtabscale);
3108 ewitab = _mm_cvttpd_epi32(ewrt);
3109 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3110 ewitab = _mm_slli_epi32(ewitab,2);
3111 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3112 ewtabD = _mm_setzero_pd();
3113 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3114 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3115 ewtabFn = _mm_setzero_pd();
3116 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3117 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3118 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3119 velec = _mm_mul_pd(qq31,_mm_sub_pd(rinv31,velec));
3120 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
3122 d = _mm_sub_pd(r31,rswitch);
3123 d = _mm_max_pd(d,_mm_setzero_pd());
3124 d2 = _mm_mul_pd(d,d);
3125 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3127 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3129 /* Evaluate switch function */
3130 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3131 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv31,_mm_mul_pd(velec,dsw)) );
3132 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
3136 fscal = _mm_and_pd(fscal,cutoff_mask);
3138 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3140 /* Calculate temporary vectorial force */
3141 tx = _mm_mul_pd(fscal,dx31);
3142 ty = _mm_mul_pd(fscal,dy31);
3143 tz = _mm_mul_pd(fscal,dz31);
3145 /* Update vectorial force */
3146 fix3 = _mm_add_pd(fix3,tx);
3147 fiy3 = _mm_add_pd(fiy3,ty);
3148 fiz3 = _mm_add_pd(fiz3,tz);
3150 fjx1 = _mm_add_pd(fjx1,tx);
3151 fjy1 = _mm_add_pd(fjy1,ty);
3152 fjz1 = _mm_add_pd(fjz1,tz);
3156 /**************************
3157 * CALCULATE INTERACTIONS *
3158 **************************/
3160 if (gmx_mm_any_lt(rsq32,rcutoff2))
3163 r32 = _mm_mul_pd(rsq32,rinv32);
3165 /* EWALD ELECTROSTATICS */
3167 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3168 ewrt = _mm_mul_pd(r32,ewtabscale);
3169 ewitab = _mm_cvttpd_epi32(ewrt);
3170 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3171 ewitab = _mm_slli_epi32(ewitab,2);
3172 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3173 ewtabD = _mm_setzero_pd();
3174 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3175 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3176 ewtabFn = _mm_setzero_pd();
3177 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3178 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3179 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3180 velec = _mm_mul_pd(qq32,_mm_sub_pd(rinv32,velec));
3181 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
3183 d = _mm_sub_pd(r32,rswitch);
3184 d = _mm_max_pd(d,_mm_setzero_pd());
3185 d2 = _mm_mul_pd(d,d);
3186 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3188 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3190 /* Evaluate switch function */
3191 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3192 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv32,_mm_mul_pd(velec,dsw)) );
3193 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
3197 fscal = _mm_and_pd(fscal,cutoff_mask);
3199 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3201 /* Calculate temporary vectorial force */
3202 tx = _mm_mul_pd(fscal,dx32);
3203 ty = _mm_mul_pd(fscal,dy32);
3204 tz = _mm_mul_pd(fscal,dz32);
3206 /* Update vectorial force */
3207 fix3 = _mm_add_pd(fix3,tx);
3208 fiy3 = _mm_add_pd(fiy3,ty);
3209 fiz3 = _mm_add_pd(fiz3,tz);
3211 fjx2 = _mm_add_pd(fjx2,tx);
3212 fjy2 = _mm_add_pd(fjy2,ty);
3213 fjz2 = _mm_add_pd(fjz2,tz);
3217 /**************************
3218 * CALCULATE INTERACTIONS *
3219 **************************/
3221 if (gmx_mm_any_lt(rsq33,rcutoff2))
3224 r33 = _mm_mul_pd(rsq33,rinv33);
3226 /* EWALD ELECTROSTATICS */
3228 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3229 ewrt = _mm_mul_pd(r33,ewtabscale);
3230 ewitab = _mm_cvttpd_epi32(ewrt);
3231 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3232 ewitab = _mm_slli_epi32(ewitab,2);
3233 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3234 ewtabD = _mm_setzero_pd();
3235 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3236 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3237 ewtabFn = _mm_setzero_pd();
3238 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3239 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3240 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3241 velec = _mm_mul_pd(qq33,_mm_sub_pd(rinv33,velec));
3242 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
3244 d = _mm_sub_pd(r33,rswitch);
3245 d = _mm_max_pd(d,_mm_setzero_pd());
3246 d2 = _mm_mul_pd(d,d);
3247 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3249 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3251 /* Evaluate switch function */
3252 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3253 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv33,_mm_mul_pd(velec,dsw)) );
3254 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
3258 fscal = _mm_and_pd(fscal,cutoff_mask);
3260 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3262 /* Calculate temporary vectorial force */
3263 tx = _mm_mul_pd(fscal,dx33);
3264 ty = _mm_mul_pd(fscal,dy33);
3265 tz = _mm_mul_pd(fscal,dz33);
3267 /* Update vectorial force */
3268 fix3 = _mm_add_pd(fix3,tx);
3269 fiy3 = _mm_add_pd(fiy3,ty);
3270 fiz3 = _mm_add_pd(fiz3,tz);
3272 fjx3 = _mm_add_pd(fjx3,tx);
3273 fjy3 = _mm_add_pd(fjy3,ty);
3274 fjz3 = _mm_add_pd(fjz3,tz);
3278 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3280 /* Inner loop uses 617 flops */
3283 /* End of innermost loop */
3285 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3286 f+i_coord_offset,fshift+i_shift_offset);
3288 /* Increment number of inner iterations */
3289 inneriter += j_index_end - j_index_start;
3291 /* Outer loop uses 24 flops */
3294 /* Increment number of outer iterations */
3297 /* Update outer/inner flops */
3299 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*617);