2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse4_1_double.h"
48 #include "kernelutil_x86_sse4_1_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 int vdwjidx0A,vdwjidx0B;
87 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
88 int vdwjidx1A,vdwjidx1B;
89 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90 int vdwjidx2A,vdwjidx2B;
91 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
94 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
95 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
96 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
99 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
104 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
108 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
110 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
112 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
113 real rswitch_scalar,d_scalar;
114 __m128d dummy_mask,cutoff_mask;
115 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
116 __m128d one = _mm_set1_pd(1.0);
117 __m128d two = _mm_set1_pd(2.0);
123 jindex = nlist->jindex;
125 shiftidx = nlist->shift;
127 shiftvec = fr->shift_vec[0];
128 fshift = fr->fshift[0];
129 facel = _mm_set1_pd(fr->epsfac);
130 charge = mdatoms->chargeA;
131 nvdwtype = fr->ntype;
133 vdwtype = mdatoms->typeA;
135 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
136 ewtab = fr->ic->tabq_coul_FDV0;
137 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
138 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
140 /* Setup water-specific parameters */
141 inr = nlist->iinr[0];
142 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
143 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
144 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
145 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
147 jq0 = _mm_set1_pd(charge[inr+0]);
148 jq1 = _mm_set1_pd(charge[inr+1]);
149 jq2 = _mm_set1_pd(charge[inr+2]);
150 vdwjidx0A = 2*vdwtype[inr+0];
151 qq00 = _mm_mul_pd(iq0,jq0);
152 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq01 = _mm_mul_pd(iq0,jq1);
155 qq02 = _mm_mul_pd(iq0,jq2);
156 qq10 = _mm_mul_pd(iq1,jq0);
157 qq11 = _mm_mul_pd(iq1,jq1);
158 qq12 = _mm_mul_pd(iq1,jq2);
159 qq20 = _mm_mul_pd(iq2,jq0);
160 qq21 = _mm_mul_pd(iq2,jq1);
161 qq22 = _mm_mul_pd(iq2,jq2);
163 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
164 rcutoff_scalar = fr->rcoulomb;
165 rcutoff = _mm_set1_pd(rcutoff_scalar);
166 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
168 rswitch_scalar = fr->rcoulomb_switch;
169 rswitch = _mm_set1_pd(rswitch_scalar);
170 /* Setup switch parameters */
171 d_scalar = rcutoff_scalar-rswitch_scalar;
172 d = _mm_set1_pd(d_scalar);
173 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
174 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
175 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
176 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
177 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
178 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
180 /* Avoid stupid compiler warnings */
188 /* Start outer loop over neighborlists */
189 for(iidx=0; iidx<nri; iidx++)
191 /* Load shift vector for this list */
192 i_shift_offset = DIM*shiftidx[iidx];
194 /* Load limits for loop over neighbors */
195 j_index_start = jindex[iidx];
196 j_index_end = jindex[iidx+1];
198 /* Get outer coordinate index */
200 i_coord_offset = DIM*inr;
202 /* Load i particle coords and add shift vector */
203 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
204 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
206 fix0 = _mm_setzero_pd();
207 fiy0 = _mm_setzero_pd();
208 fiz0 = _mm_setzero_pd();
209 fix1 = _mm_setzero_pd();
210 fiy1 = _mm_setzero_pd();
211 fiz1 = _mm_setzero_pd();
212 fix2 = _mm_setzero_pd();
213 fiy2 = _mm_setzero_pd();
214 fiz2 = _mm_setzero_pd();
216 /* Reset potential sums */
217 velecsum = _mm_setzero_pd();
218 vvdwsum = _mm_setzero_pd();
220 /* Start inner kernel loop */
221 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
224 /* Get j neighbor index, and coordinate index */
227 j_coord_offsetA = DIM*jnrA;
228 j_coord_offsetB = DIM*jnrB;
230 /* load j atom coordinates */
231 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
232 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
234 /* Calculate displacement vector */
235 dx00 = _mm_sub_pd(ix0,jx0);
236 dy00 = _mm_sub_pd(iy0,jy0);
237 dz00 = _mm_sub_pd(iz0,jz0);
238 dx01 = _mm_sub_pd(ix0,jx1);
239 dy01 = _mm_sub_pd(iy0,jy1);
240 dz01 = _mm_sub_pd(iz0,jz1);
241 dx02 = _mm_sub_pd(ix0,jx2);
242 dy02 = _mm_sub_pd(iy0,jy2);
243 dz02 = _mm_sub_pd(iz0,jz2);
244 dx10 = _mm_sub_pd(ix1,jx0);
245 dy10 = _mm_sub_pd(iy1,jy0);
246 dz10 = _mm_sub_pd(iz1,jz0);
247 dx11 = _mm_sub_pd(ix1,jx1);
248 dy11 = _mm_sub_pd(iy1,jy1);
249 dz11 = _mm_sub_pd(iz1,jz1);
250 dx12 = _mm_sub_pd(ix1,jx2);
251 dy12 = _mm_sub_pd(iy1,jy2);
252 dz12 = _mm_sub_pd(iz1,jz2);
253 dx20 = _mm_sub_pd(ix2,jx0);
254 dy20 = _mm_sub_pd(iy2,jy0);
255 dz20 = _mm_sub_pd(iz2,jz0);
256 dx21 = _mm_sub_pd(ix2,jx1);
257 dy21 = _mm_sub_pd(iy2,jy1);
258 dz21 = _mm_sub_pd(iz2,jz1);
259 dx22 = _mm_sub_pd(ix2,jx2);
260 dy22 = _mm_sub_pd(iy2,jy2);
261 dz22 = _mm_sub_pd(iz2,jz2);
263 /* Calculate squared distance and things based on it */
264 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
265 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
266 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
267 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
268 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
269 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
270 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
271 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
272 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
274 rinv00 = gmx_mm_invsqrt_pd(rsq00);
275 rinv01 = gmx_mm_invsqrt_pd(rsq01);
276 rinv02 = gmx_mm_invsqrt_pd(rsq02);
277 rinv10 = gmx_mm_invsqrt_pd(rsq10);
278 rinv11 = gmx_mm_invsqrt_pd(rsq11);
279 rinv12 = gmx_mm_invsqrt_pd(rsq12);
280 rinv20 = gmx_mm_invsqrt_pd(rsq20);
281 rinv21 = gmx_mm_invsqrt_pd(rsq21);
282 rinv22 = gmx_mm_invsqrt_pd(rsq22);
284 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
285 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
286 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
287 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
288 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
289 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
290 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
291 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
292 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
294 fjx0 = _mm_setzero_pd();
295 fjy0 = _mm_setzero_pd();
296 fjz0 = _mm_setzero_pd();
297 fjx1 = _mm_setzero_pd();
298 fjy1 = _mm_setzero_pd();
299 fjz1 = _mm_setzero_pd();
300 fjx2 = _mm_setzero_pd();
301 fjy2 = _mm_setzero_pd();
302 fjz2 = _mm_setzero_pd();
304 /**************************
305 * CALCULATE INTERACTIONS *
306 **************************/
308 if (gmx_mm_any_lt(rsq00,rcutoff2))
311 r00 = _mm_mul_pd(rsq00,rinv00);
313 /* EWALD ELECTROSTATICS */
315 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
316 ewrt = _mm_mul_pd(r00,ewtabscale);
317 ewitab = _mm_cvttpd_epi32(ewrt);
318 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
319 ewitab = _mm_slli_epi32(ewitab,2);
320 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
321 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
322 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
323 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
324 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
325 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
326 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
327 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
328 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
329 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
331 /* LENNARD-JONES DISPERSION/REPULSION */
333 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
334 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
335 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
336 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
337 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
339 d = _mm_sub_pd(r00,rswitch);
340 d = _mm_max_pd(d,_mm_setzero_pd());
341 d2 = _mm_mul_pd(d,d);
342 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
344 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
346 /* Evaluate switch function */
347 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
348 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
349 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
350 velec = _mm_mul_pd(velec,sw);
351 vvdw = _mm_mul_pd(vvdw,sw);
352 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
354 /* Update potential sum for this i atom from the interaction with this j atom. */
355 velec = _mm_and_pd(velec,cutoff_mask);
356 velecsum = _mm_add_pd(velecsum,velec);
357 vvdw = _mm_and_pd(vvdw,cutoff_mask);
358 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
360 fscal = _mm_add_pd(felec,fvdw);
362 fscal = _mm_and_pd(fscal,cutoff_mask);
364 /* Calculate temporary vectorial force */
365 tx = _mm_mul_pd(fscal,dx00);
366 ty = _mm_mul_pd(fscal,dy00);
367 tz = _mm_mul_pd(fscal,dz00);
369 /* Update vectorial force */
370 fix0 = _mm_add_pd(fix0,tx);
371 fiy0 = _mm_add_pd(fiy0,ty);
372 fiz0 = _mm_add_pd(fiz0,tz);
374 fjx0 = _mm_add_pd(fjx0,tx);
375 fjy0 = _mm_add_pd(fjy0,ty);
376 fjz0 = _mm_add_pd(fjz0,tz);
380 /**************************
381 * CALCULATE INTERACTIONS *
382 **************************/
384 if (gmx_mm_any_lt(rsq01,rcutoff2))
387 r01 = _mm_mul_pd(rsq01,rinv01);
389 /* EWALD ELECTROSTATICS */
391 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
392 ewrt = _mm_mul_pd(r01,ewtabscale);
393 ewitab = _mm_cvttpd_epi32(ewrt);
394 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
395 ewitab = _mm_slli_epi32(ewitab,2);
396 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
397 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
398 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
399 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
400 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
401 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
402 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
403 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
404 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
405 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
407 d = _mm_sub_pd(r01,rswitch);
408 d = _mm_max_pd(d,_mm_setzero_pd());
409 d2 = _mm_mul_pd(d,d);
410 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
412 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
414 /* Evaluate switch function */
415 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
416 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
417 velec = _mm_mul_pd(velec,sw);
418 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
420 /* Update potential sum for this i atom from the interaction with this j atom. */
421 velec = _mm_and_pd(velec,cutoff_mask);
422 velecsum = _mm_add_pd(velecsum,velec);
426 fscal = _mm_and_pd(fscal,cutoff_mask);
428 /* Calculate temporary vectorial force */
429 tx = _mm_mul_pd(fscal,dx01);
430 ty = _mm_mul_pd(fscal,dy01);
431 tz = _mm_mul_pd(fscal,dz01);
433 /* Update vectorial force */
434 fix0 = _mm_add_pd(fix0,tx);
435 fiy0 = _mm_add_pd(fiy0,ty);
436 fiz0 = _mm_add_pd(fiz0,tz);
438 fjx1 = _mm_add_pd(fjx1,tx);
439 fjy1 = _mm_add_pd(fjy1,ty);
440 fjz1 = _mm_add_pd(fjz1,tz);
444 /**************************
445 * CALCULATE INTERACTIONS *
446 **************************/
448 if (gmx_mm_any_lt(rsq02,rcutoff2))
451 r02 = _mm_mul_pd(rsq02,rinv02);
453 /* EWALD ELECTROSTATICS */
455 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
456 ewrt = _mm_mul_pd(r02,ewtabscale);
457 ewitab = _mm_cvttpd_epi32(ewrt);
458 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
459 ewitab = _mm_slli_epi32(ewitab,2);
460 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
461 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
462 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
463 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
464 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
465 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
466 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
467 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
468 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
469 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
471 d = _mm_sub_pd(r02,rswitch);
472 d = _mm_max_pd(d,_mm_setzero_pd());
473 d2 = _mm_mul_pd(d,d);
474 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
476 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
478 /* Evaluate switch function */
479 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
480 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
481 velec = _mm_mul_pd(velec,sw);
482 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
484 /* Update potential sum for this i atom from the interaction with this j atom. */
485 velec = _mm_and_pd(velec,cutoff_mask);
486 velecsum = _mm_add_pd(velecsum,velec);
490 fscal = _mm_and_pd(fscal,cutoff_mask);
492 /* Calculate temporary vectorial force */
493 tx = _mm_mul_pd(fscal,dx02);
494 ty = _mm_mul_pd(fscal,dy02);
495 tz = _mm_mul_pd(fscal,dz02);
497 /* Update vectorial force */
498 fix0 = _mm_add_pd(fix0,tx);
499 fiy0 = _mm_add_pd(fiy0,ty);
500 fiz0 = _mm_add_pd(fiz0,tz);
502 fjx2 = _mm_add_pd(fjx2,tx);
503 fjy2 = _mm_add_pd(fjy2,ty);
504 fjz2 = _mm_add_pd(fjz2,tz);
508 /**************************
509 * CALCULATE INTERACTIONS *
510 **************************/
512 if (gmx_mm_any_lt(rsq10,rcutoff2))
515 r10 = _mm_mul_pd(rsq10,rinv10);
517 /* EWALD ELECTROSTATICS */
519 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
520 ewrt = _mm_mul_pd(r10,ewtabscale);
521 ewitab = _mm_cvttpd_epi32(ewrt);
522 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
523 ewitab = _mm_slli_epi32(ewitab,2);
524 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
525 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
526 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
527 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
528 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
529 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
530 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
531 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
532 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
533 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
535 d = _mm_sub_pd(r10,rswitch);
536 d = _mm_max_pd(d,_mm_setzero_pd());
537 d2 = _mm_mul_pd(d,d);
538 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
540 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
542 /* Evaluate switch function */
543 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
544 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
545 velec = _mm_mul_pd(velec,sw);
546 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
548 /* Update potential sum for this i atom from the interaction with this j atom. */
549 velec = _mm_and_pd(velec,cutoff_mask);
550 velecsum = _mm_add_pd(velecsum,velec);
554 fscal = _mm_and_pd(fscal,cutoff_mask);
556 /* Calculate temporary vectorial force */
557 tx = _mm_mul_pd(fscal,dx10);
558 ty = _mm_mul_pd(fscal,dy10);
559 tz = _mm_mul_pd(fscal,dz10);
561 /* Update vectorial force */
562 fix1 = _mm_add_pd(fix1,tx);
563 fiy1 = _mm_add_pd(fiy1,ty);
564 fiz1 = _mm_add_pd(fiz1,tz);
566 fjx0 = _mm_add_pd(fjx0,tx);
567 fjy0 = _mm_add_pd(fjy0,ty);
568 fjz0 = _mm_add_pd(fjz0,tz);
572 /**************************
573 * CALCULATE INTERACTIONS *
574 **************************/
576 if (gmx_mm_any_lt(rsq11,rcutoff2))
579 r11 = _mm_mul_pd(rsq11,rinv11);
581 /* EWALD ELECTROSTATICS */
583 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
584 ewrt = _mm_mul_pd(r11,ewtabscale);
585 ewitab = _mm_cvttpd_epi32(ewrt);
586 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
587 ewitab = _mm_slli_epi32(ewitab,2);
588 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
589 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
590 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
591 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
592 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
593 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
594 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
595 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
596 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
597 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
599 d = _mm_sub_pd(r11,rswitch);
600 d = _mm_max_pd(d,_mm_setzero_pd());
601 d2 = _mm_mul_pd(d,d);
602 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
604 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
606 /* Evaluate switch function */
607 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
608 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
609 velec = _mm_mul_pd(velec,sw);
610 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
612 /* Update potential sum for this i atom from the interaction with this j atom. */
613 velec = _mm_and_pd(velec,cutoff_mask);
614 velecsum = _mm_add_pd(velecsum,velec);
618 fscal = _mm_and_pd(fscal,cutoff_mask);
620 /* Calculate temporary vectorial force */
621 tx = _mm_mul_pd(fscal,dx11);
622 ty = _mm_mul_pd(fscal,dy11);
623 tz = _mm_mul_pd(fscal,dz11);
625 /* Update vectorial force */
626 fix1 = _mm_add_pd(fix1,tx);
627 fiy1 = _mm_add_pd(fiy1,ty);
628 fiz1 = _mm_add_pd(fiz1,tz);
630 fjx1 = _mm_add_pd(fjx1,tx);
631 fjy1 = _mm_add_pd(fjy1,ty);
632 fjz1 = _mm_add_pd(fjz1,tz);
636 /**************************
637 * CALCULATE INTERACTIONS *
638 **************************/
640 if (gmx_mm_any_lt(rsq12,rcutoff2))
643 r12 = _mm_mul_pd(rsq12,rinv12);
645 /* EWALD ELECTROSTATICS */
647 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
648 ewrt = _mm_mul_pd(r12,ewtabscale);
649 ewitab = _mm_cvttpd_epi32(ewrt);
650 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
651 ewitab = _mm_slli_epi32(ewitab,2);
652 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
653 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
654 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
655 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
656 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
657 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
658 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
659 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
660 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
661 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
663 d = _mm_sub_pd(r12,rswitch);
664 d = _mm_max_pd(d,_mm_setzero_pd());
665 d2 = _mm_mul_pd(d,d);
666 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
668 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
670 /* Evaluate switch function */
671 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
672 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
673 velec = _mm_mul_pd(velec,sw);
674 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
676 /* Update potential sum for this i atom from the interaction with this j atom. */
677 velec = _mm_and_pd(velec,cutoff_mask);
678 velecsum = _mm_add_pd(velecsum,velec);
682 fscal = _mm_and_pd(fscal,cutoff_mask);
684 /* Calculate temporary vectorial force */
685 tx = _mm_mul_pd(fscal,dx12);
686 ty = _mm_mul_pd(fscal,dy12);
687 tz = _mm_mul_pd(fscal,dz12);
689 /* Update vectorial force */
690 fix1 = _mm_add_pd(fix1,tx);
691 fiy1 = _mm_add_pd(fiy1,ty);
692 fiz1 = _mm_add_pd(fiz1,tz);
694 fjx2 = _mm_add_pd(fjx2,tx);
695 fjy2 = _mm_add_pd(fjy2,ty);
696 fjz2 = _mm_add_pd(fjz2,tz);
700 /**************************
701 * CALCULATE INTERACTIONS *
702 **************************/
704 if (gmx_mm_any_lt(rsq20,rcutoff2))
707 r20 = _mm_mul_pd(rsq20,rinv20);
709 /* EWALD ELECTROSTATICS */
711 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
712 ewrt = _mm_mul_pd(r20,ewtabscale);
713 ewitab = _mm_cvttpd_epi32(ewrt);
714 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
715 ewitab = _mm_slli_epi32(ewitab,2);
716 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
717 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
718 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
719 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
720 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
721 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
722 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
723 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
724 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
725 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
727 d = _mm_sub_pd(r20,rswitch);
728 d = _mm_max_pd(d,_mm_setzero_pd());
729 d2 = _mm_mul_pd(d,d);
730 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
732 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
734 /* Evaluate switch function */
735 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
736 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
737 velec = _mm_mul_pd(velec,sw);
738 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
740 /* Update potential sum for this i atom from the interaction with this j atom. */
741 velec = _mm_and_pd(velec,cutoff_mask);
742 velecsum = _mm_add_pd(velecsum,velec);
746 fscal = _mm_and_pd(fscal,cutoff_mask);
748 /* Calculate temporary vectorial force */
749 tx = _mm_mul_pd(fscal,dx20);
750 ty = _mm_mul_pd(fscal,dy20);
751 tz = _mm_mul_pd(fscal,dz20);
753 /* Update vectorial force */
754 fix2 = _mm_add_pd(fix2,tx);
755 fiy2 = _mm_add_pd(fiy2,ty);
756 fiz2 = _mm_add_pd(fiz2,tz);
758 fjx0 = _mm_add_pd(fjx0,tx);
759 fjy0 = _mm_add_pd(fjy0,ty);
760 fjz0 = _mm_add_pd(fjz0,tz);
764 /**************************
765 * CALCULATE INTERACTIONS *
766 **************************/
768 if (gmx_mm_any_lt(rsq21,rcutoff2))
771 r21 = _mm_mul_pd(rsq21,rinv21);
773 /* EWALD ELECTROSTATICS */
775 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
776 ewrt = _mm_mul_pd(r21,ewtabscale);
777 ewitab = _mm_cvttpd_epi32(ewrt);
778 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
779 ewitab = _mm_slli_epi32(ewitab,2);
780 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
781 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
782 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
783 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
784 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
785 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
786 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
787 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
788 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
789 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
791 d = _mm_sub_pd(r21,rswitch);
792 d = _mm_max_pd(d,_mm_setzero_pd());
793 d2 = _mm_mul_pd(d,d);
794 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
796 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
798 /* Evaluate switch function */
799 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
800 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
801 velec = _mm_mul_pd(velec,sw);
802 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
804 /* Update potential sum for this i atom from the interaction with this j atom. */
805 velec = _mm_and_pd(velec,cutoff_mask);
806 velecsum = _mm_add_pd(velecsum,velec);
810 fscal = _mm_and_pd(fscal,cutoff_mask);
812 /* Calculate temporary vectorial force */
813 tx = _mm_mul_pd(fscal,dx21);
814 ty = _mm_mul_pd(fscal,dy21);
815 tz = _mm_mul_pd(fscal,dz21);
817 /* Update vectorial force */
818 fix2 = _mm_add_pd(fix2,tx);
819 fiy2 = _mm_add_pd(fiy2,ty);
820 fiz2 = _mm_add_pd(fiz2,tz);
822 fjx1 = _mm_add_pd(fjx1,tx);
823 fjy1 = _mm_add_pd(fjy1,ty);
824 fjz1 = _mm_add_pd(fjz1,tz);
828 /**************************
829 * CALCULATE INTERACTIONS *
830 **************************/
832 if (gmx_mm_any_lt(rsq22,rcutoff2))
835 r22 = _mm_mul_pd(rsq22,rinv22);
837 /* EWALD ELECTROSTATICS */
839 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
840 ewrt = _mm_mul_pd(r22,ewtabscale);
841 ewitab = _mm_cvttpd_epi32(ewrt);
842 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
843 ewitab = _mm_slli_epi32(ewitab,2);
844 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
845 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
846 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
847 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
848 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
849 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
850 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
851 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
852 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
853 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
855 d = _mm_sub_pd(r22,rswitch);
856 d = _mm_max_pd(d,_mm_setzero_pd());
857 d2 = _mm_mul_pd(d,d);
858 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
860 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
862 /* Evaluate switch function */
863 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
864 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
865 velec = _mm_mul_pd(velec,sw);
866 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
868 /* Update potential sum for this i atom from the interaction with this j atom. */
869 velec = _mm_and_pd(velec,cutoff_mask);
870 velecsum = _mm_add_pd(velecsum,velec);
874 fscal = _mm_and_pd(fscal,cutoff_mask);
876 /* Calculate temporary vectorial force */
877 tx = _mm_mul_pd(fscal,dx22);
878 ty = _mm_mul_pd(fscal,dy22);
879 tz = _mm_mul_pd(fscal,dz22);
881 /* Update vectorial force */
882 fix2 = _mm_add_pd(fix2,tx);
883 fiy2 = _mm_add_pd(fiy2,ty);
884 fiz2 = _mm_add_pd(fiz2,tz);
886 fjx2 = _mm_add_pd(fjx2,tx);
887 fjy2 = _mm_add_pd(fjy2,ty);
888 fjz2 = _mm_add_pd(fjz2,tz);
892 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
894 /* Inner loop uses 603 flops */
901 j_coord_offsetA = DIM*jnrA;
903 /* load j atom coordinates */
904 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
905 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
907 /* Calculate displacement vector */
908 dx00 = _mm_sub_pd(ix0,jx0);
909 dy00 = _mm_sub_pd(iy0,jy0);
910 dz00 = _mm_sub_pd(iz0,jz0);
911 dx01 = _mm_sub_pd(ix0,jx1);
912 dy01 = _mm_sub_pd(iy0,jy1);
913 dz01 = _mm_sub_pd(iz0,jz1);
914 dx02 = _mm_sub_pd(ix0,jx2);
915 dy02 = _mm_sub_pd(iy0,jy2);
916 dz02 = _mm_sub_pd(iz0,jz2);
917 dx10 = _mm_sub_pd(ix1,jx0);
918 dy10 = _mm_sub_pd(iy1,jy0);
919 dz10 = _mm_sub_pd(iz1,jz0);
920 dx11 = _mm_sub_pd(ix1,jx1);
921 dy11 = _mm_sub_pd(iy1,jy1);
922 dz11 = _mm_sub_pd(iz1,jz1);
923 dx12 = _mm_sub_pd(ix1,jx2);
924 dy12 = _mm_sub_pd(iy1,jy2);
925 dz12 = _mm_sub_pd(iz1,jz2);
926 dx20 = _mm_sub_pd(ix2,jx0);
927 dy20 = _mm_sub_pd(iy2,jy0);
928 dz20 = _mm_sub_pd(iz2,jz0);
929 dx21 = _mm_sub_pd(ix2,jx1);
930 dy21 = _mm_sub_pd(iy2,jy1);
931 dz21 = _mm_sub_pd(iz2,jz1);
932 dx22 = _mm_sub_pd(ix2,jx2);
933 dy22 = _mm_sub_pd(iy2,jy2);
934 dz22 = _mm_sub_pd(iz2,jz2);
936 /* Calculate squared distance and things based on it */
937 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
938 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
939 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
940 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
941 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
942 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
943 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
944 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
945 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
947 rinv00 = gmx_mm_invsqrt_pd(rsq00);
948 rinv01 = gmx_mm_invsqrt_pd(rsq01);
949 rinv02 = gmx_mm_invsqrt_pd(rsq02);
950 rinv10 = gmx_mm_invsqrt_pd(rsq10);
951 rinv11 = gmx_mm_invsqrt_pd(rsq11);
952 rinv12 = gmx_mm_invsqrt_pd(rsq12);
953 rinv20 = gmx_mm_invsqrt_pd(rsq20);
954 rinv21 = gmx_mm_invsqrt_pd(rsq21);
955 rinv22 = gmx_mm_invsqrt_pd(rsq22);
957 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
958 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
959 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
960 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
961 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
962 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
963 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
964 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
965 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
967 fjx0 = _mm_setzero_pd();
968 fjy0 = _mm_setzero_pd();
969 fjz0 = _mm_setzero_pd();
970 fjx1 = _mm_setzero_pd();
971 fjy1 = _mm_setzero_pd();
972 fjz1 = _mm_setzero_pd();
973 fjx2 = _mm_setzero_pd();
974 fjy2 = _mm_setzero_pd();
975 fjz2 = _mm_setzero_pd();
977 /**************************
978 * CALCULATE INTERACTIONS *
979 **************************/
981 if (gmx_mm_any_lt(rsq00,rcutoff2))
984 r00 = _mm_mul_pd(rsq00,rinv00);
986 /* EWALD ELECTROSTATICS */
988 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
989 ewrt = _mm_mul_pd(r00,ewtabscale);
990 ewitab = _mm_cvttpd_epi32(ewrt);
991 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
992 ewitab = _mm_slli_epi32(ewitab,2);
993 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
994 ewtabD = _mm_setzero_pd();
995 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
996 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
997 ewtabFn = _mm_setzero_pd();
998 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
999 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1000 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1001 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
1002 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1004 /* LENNARD-JONES DISPERSION/REPULSION */
1006 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1007 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1008 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1009 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1010 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1012 d = _mm_sub_pd(r00,rswitch);
1013 d = _mm_max_pd(d,_mm_setzero_pd());
1014 d2 = _mm_mul_pd(d,d);
1015 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1017 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1019 /* Evaluate switch function */
1020 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1021 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1022 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1023 velec = _mm_mul_pd(velec,sw);
1024 vvdw = _mm_mul_pd(vvdw,sw);
1025 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1027 /* Update potential sum for this i atom from the interaction with this j atom. */
1028 velec = _mm_and_pd(velec,cutoff_mask);
1029 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1030 velecsum = _mm_add_pd(velecsum,velec);
1031 vvdw = _mm_and_pd(vvdw,cutoff_mask);
1032 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
1033 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
1035 fscal = _mm_add_pd(felec,fvdw);
1037 fscal = _mm_and_pd(fscal,cutoff_mask);
1039 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1041 /* Calculate temporary vectorial force */
1042 tx = _mm_mul_pd(fscal,dx00);
1043 ty = _mm_mul_pd(fscal,dy00);
1044 tz = _mm_mul_pd(fscal,dz00);
1046 /* Update vectorial force */
1047 fix0 = _mm_add_pd(fix0,tx);
1048 fiy0 = _mm_add_pd(fiy0,ty);
1049 fiz0 = _mm_add_pd(fiz0,tz);
1051 fjx0 = _mm_add_pd(fjx0,tx);
1052 fjy0 = _mm_add_pd(fjy0,ty);
1053 fjz0 = _mm_add_pd(fjz0,tz);
1057 /**************************
1058 * CALCULATE INTERACTIONS *
1059 **************************/
1061 if (gmx_mm_any_lt(rsq01,rcutoff2))
1064 r01 = _mm_mul_pd(rsq01,rinv01);
1066 /* EWALD ELECTROSTATICS */
1068 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1069 ewrt = _mm_mul_pd(r01,ewtabscale);
1070 ewitab = _mm_cvttpd_epi32(ewrt);
1071 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1072 ewitab = _mm_slli_epi32(ewitab,2);
1073 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1074 ewtabD = _mm_setzero_pd();
1075 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1076 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1077 ewtabFn = _mm_setzero_pd();
1078 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1079 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1080 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1081 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1082 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1084 d = _mm_sub_pd(r01,rswitch);
1085 d = _mm_max_pd(d,_mm_setzero_pd());
1086 d2 = _mm_mul_pd(d,d);
1087 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1089 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1091 /* Evaluate switch function */
1092 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1093 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1094 velec = _mm_mul_pd(velec,sw);
1095 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1097 /* Update potential sum for this i atom from the interaction with this j atom. */
1098 velec = _mm_and_pd(velec,cutoff_mask);
1099 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1100 velecsum = _mm_add_pd(velecsum,velec);
1104 fscal = _mm_and_pd(fscal,cutoff_mask);
1106 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1108 /* Calculate temporary vectorial force */
1109 tx = _mm_mul_pd(fscal,dx01);
1110 ty = _mm_mul_pd(fscal,dy01);
1111 tz = _mm_mul_pd(fscal,dz01);
1113 /* Update vectorial force */
1114 fix0 = _mm_add_pd(fix0,tx);
1115 fiy0 = _mm_add_pd(fiy0,ty);
1116 fiz0 = _mm_add_pd(fiz0,tz);
1118 fjx1 = _mm_add_pd(fjx1,tx);
1119 fjy1 = _mm_add_pd(fjy1,ty);
1120 fjz1 = _mm_add_pd(fjz1,tz);
1124 /**************************
1125 * CALCULATE INTERACTIONS *
1126 **************************/
1128 if (gmx_mm_any_lt(rsq02,rcutoff2))
1131 r02 = _mm_mul_pd(rsq02,rinv02);
1133 /* EWALD ELECTROSTATICS */
1135 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1136 ewrt = _mm_mul_pd(r02,ewtabscale);
1137 ewitab = _mm_cvttpd_epi32(ewrt);
1138 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1139 ewitab = _mm_slli_epi32(ewitab,2);
1140 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1141 ewtabD = _mm_setzero_pd();
1142 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1143 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1144 ewtabFn = _mm_setzero_pd();
1145 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1146 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1147 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1148 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
1149 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1151 d = _mm_sub_pd(r02,rswitch);
1152 d = _mm_max_pd(d,_mm_setzero_pd());
1153 d2 = _mm_mul_pd(d,d);
1154 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1156 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1158 /* Evaluate switch function */
1159 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1160 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
1161 velec = _mm_mul_pd(velec,sw);
1162 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1164 /* Update potential sum for this i atom from the interaction with this j atom. */
1165 velec = _mm_and_pd(velec,cutoff_mask);
1166 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1167 velecsum = _mm_add_pd(velecsum,velec);
1171 fscal = _mm_and_pd(fscal,cutoff_mask);
1173 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1175 /* Calculate temporary vectorial force */
1176 tx = _mm_mul_pd(fscal,dx02);
1177 ty = _mm_mul_pd(fscal,dy02);
1178 tz = _mm_mul_pd(fscal,dz02);
1180 /* Update vectorial force */
1181 fix0 = _mm_add_pd(fix0,tx);
1182 fiy0 = _mm_add_pd(fiy0,ty);
1183 fiz0 = _mm_add_pd(fiz0,tz);
1185 fjx2 = _mm_add_pd(fjx2,tx);
1186 fjy2 = _mm_add_pd(fjy2,ty);
1187 fjz2 = _mm_add_pd(fjz2,tz);
1191 /**************************
1192 * CALCULATE INTERACTIONS *
1193 **************************/
1195 if (gmx_mm_any_lt(rsq10,rcutoff2))
1198 r10 = _mm_mul_pd(rsq10,rinv10);
1200 /* EWALD ELECTROSTATICS */
1202 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1203 ewrt = _mm_mul_pd(r10,ewtabscale);
1204 ewitab = _mm_cvttpd_epi32(ewrt);
1205 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1206 ewitab = _mm_slli_epi32(ewitab,2);
1207 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1208 ewtabD = _mm_setzero_pd();
1209 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1210 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1211 ewtabFn = _mm_setzero_pd();
1212 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1213 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1214 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1215 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1216 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1218 d = _mm_sub_pd(r10,rswitch);
1219 d = _mm_max_pd(d,_mm_setzero_pd());
1220 d2 = _mm_mul_pd(d,d);
1221 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1223 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1225 /* Evaluate switch function */
1226 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1227 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
1228 velec = _mm_mul_pd(velec,sw);
1229 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1231 /* Update potential sum for this i atom from the interaction with this j atom. */
1232 velec = _mm_and_pd(velec,cutoff_mask);
1233 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1234 velecsum = _mm_add_pd(velecsum,velec);
1238 fscal = _mm_and_pd(fscal,cutoff_mask);
1240 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1242 /* Calculate temporary vectorial force */
1243 tx = _mm_mul_pd(fscal,dx10);
1244 ty = _mm_mul_pd(fscal,dy10);
1245 tz = _mm_mul_pd(fscal,dz10);
1247 /* Update vectorial force */
1248 fix1 = _mm_add_pd(fix1,tx);
1249 fiy1 = _mm_add_pd(fiy1,ty);
1250 fiz1 = _mm_add_pd(fiz1,tz);
1252 fjx0 = _mm_add_pd(fjx0,tx);
1253 fjy0 = _mm_add_pd(fjy0,ty);
1254 fjz0 = _mm_add_pd(fjz0,tz);
1258 /**************************
1259 * CALCULATE INTERACTIONS *
1260 **************************/
1262 if (gmx_mm_any_lt(rsq11,rcutoff2))
1265 r11 = _mm_mul_pd(rsq11,rinv11);
1267 /* EWALD ELECTROSTATICS */
1269 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1270 ewrt = _mm_mul_pd(r11,ewtabscale);
1271 ewitab = _mm_cvttpd_epi32(ewrt);
1272 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1273 ewitab = _mm_slli_epi32(ewitab,2);
1274 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1275 ewtabD = _mm_setzero_pd();
1276 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1277 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1278 ewtabFn = _mm_setzero_pd();
1279 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1280 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1281 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1282 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1283 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1285 d = _mm_sub_pd(r11,rswitch);
1286 d = _mm_max_pd(d,_mm_setzero_pd());
1287 d2 = _mm_mul_pd(d,d);
1288 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1290 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1292 /* Evaluate switch function */
1293 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1294 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
1295 velec = _mm_mul_pd(velec,sw);
1296 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1298 /* Update potential sum for this i atom from the interaction with this j atom. */
1299 velec = _mm_and_pd(velec,cutoff_mask);
1300 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1301 velecsum = _mm_add_pd(velecsum,velec);
1305 fscal = _mm_and_pd(fscal,cutoff_mask);
1307 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1309 /* Calculate temporary vectorial force */
1310 tx = _mm_mul_pd(fscal,dx11);
1311 ty = _mm_mul_pd(fscal,dy11);
1312 tz = _mm_mul_pd(fscal,dz11);
1314 /* Update vectorial force */
1315 fix1 = _mm_add_pd(fix1,tx);
1316 fiy1 = _mm_add_pd(fiy1,ty);
1317 fiz1 = _mm_add_pd(fiz1,tz);
1319 fjx1 = _mm_add_pd(fjx1,tx);
1320 fjy1 = _mm_add_pd(fjy1,ty);
1321 fjz1 = _mm_add_pd(fjz1,tz);
1325 /**************************
1326 * CALCULATE INTERACTIONS *
1327 **************************/
1329 if (gmx_mm_any_lt(rsq12,rcutoff2))
1332 r12 = _mm_mul_pd(rsq12,rinv12);
1334 /* EWALD ELECTROSTATICS */
1336 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1337 ewrt = _mm_mul_pd(r12,ewtabscale);
1338 ewitab = _mm_cvttpd_epi32(ewrt);
1339 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1340 ewitab = _mm_slli_epi32(ewitab,2);
1341 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1342 ewtabD = _mm_setzero_pd();
1343 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1344 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1345 ewtabFn = _mm_setzero_pd();
1346 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1347 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1348 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1349 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1350 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1352 d = _mm_sub_pd(r12,rswitch);
1353 d = _mm_max_pd(d,_mm_setzero_pd());
1354 d2 = _mm_mul_pd(d,d);
1355 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1357 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1359 /* Evaluate switch function */
1360 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1361 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
1362 velec = _mm_mul_pd(velec,sw);
1363 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1365 /* Update potential sum for this i atom from the interaction with this j atom. */
1366 velec = _mm_and_pd(velec,cutoff_mask);
1367 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1368 velecsum = _mm_add_pd(velecsum,velec);
1372 fscal = _mm_and_pd(fscal,cutoff_mask);
1374 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1376 /* Calculate temporary vectorial force */
1377 tx = _mm_mul_pd(fscal,dx12);
1378 ty = _mm_mul_pd(fscal,dy12);
1379 tz = _mm_mul_pd(fscal,dz12);
1381 /* Update vectorial force */
1382 fix1 = _mm_add_pd(fix1,tx);
1383 fiy1 = _mm_add_pd(fiy1,ty);
1384 fiz1 = _mm_add_pd(fiz1,tz);
1386 fjx2 = _mm_add_pd(fjx2,tx);
1387 fjy2 = _mm_add_pd(fjy2,ty);
1388 fjz2 = _mm_add_pd(fjz2,tz);
1392 /**************************
1393 * CALCULATE INTERACTIONS *
1394 **************************/
1396 if (gmx_mm_any_lt(rsq20,rcutoff2))
1399 r20 = _mm_mul_pd(rsq20,rinv20);
1401 /* EWALD ELECTROSTATICS */
1403 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1404 ewrt = _mm_mul_pd(r20,ewtabscale);
1405 ewitab = _mm_cvttpd_epi32(ewrt);
1406 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1407 ewitab = _mm_slli_epi32(ewitab,2);
1408 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1409 ewtabD = _mm_setzero_pd();
1410 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1411 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1412 ewtabFn = _mm_setzero_pd();
1413 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1414 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1415 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1416 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1417 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1419 d = _mm_sub_pd(r20,rswitch);
1420 d = _mm_max_pd(d,_mm_setzero_pd());
1421 d2 = _mm_mul_pd(d,d);
1422 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1424 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1426 /* Evaluate switch function */
1427 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1428 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
1429 velec = _mm_mul_pd(velec,sw);
1430 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1432 /* Update potential sum for this i atom from the interaction with this j atom. */
1433 velec = _mm_and_pd(velec,cutoff_mask);
1434 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1435 velecsum = _mm_add_pd(velecsum,velec);
1439 fscal = _mm_and_pd(fscal,cutoff_mask);
1441 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1443 /* Calculate temporary vectorial force */
1444 tx = _mm_mul_pd(fscal,dx20);
1445 ty = _mm_mul_pd(fscal,dy20);
1446 tz = _mm_mul_pd(fscal,dz20);
1448 /* Update vectorial force */
1449 fix2 = _mm_add_pd(fix2,tx);
1450 fiy2 = _mm_add_pd(fiy2,ty);
1451 fiz2 = _mm_add_pd(fiz2,tz);
1453 fjx0 = _mm_add_pd(fjx0,tx);
1454 fjy0 = _mm_add_pd(fjy0,ty);
1455 fjz0 = _mm_add_pd(fjz0,tz);
1459 /**************************
1460 * CALCULATE INTERACTIONS *
1461 **************************/
1463 if (gmx_mm_any_lt(rsq21,rcutoff2))
1466 r21 = _mm_mul_pd(rsq21,rinv21);
1468 /* EWALD ELECTROSTATICS */
1470 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1471 ewrt = _mm_mul_pd(r21,ewtabscale);
1472 ewitab = _mm_cvttpd_epi32(ewrt);
1473 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1474 ewitab = _mm_slli_epi32(ewitab,2);
1475 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1476 ewtabD = _mm_setzero_pd();
1477 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1478 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1479 ewtabFn = _mm_setzero_pd();
1480 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1481 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1482 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1483 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1484 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1486 d = _mm_sub_pd(r21,rswitch);
1487 d = _mm_max_pd(d,_mm_setzero_pd());
1488 d2 = _mm_mul_pd(d,d);
1489 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1491 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1493 /* Evaluate switch function */
1494 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1495 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
1496 velec = _mm_mul_pd(velec,sw);
1497 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1499 /* Update potential sum for this i atom from the interaction with this j atom. */
1500 velec = _mm_and_pd(velec,cutoff_mask);
1501 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1502 velecsum = _mm_add_pd(velecsum,velec);
1506 fscal = _mm_and_pd(fscal,cutoff_mask);
1508 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1510 /* Calculate temporary vectorial force */
1511 tx = _mm_mul_pd(fscal,dx21);
1512 ty = _mm_mul_pd(fscal,dy21);
1513 tz = _mm_mul_pd(fscal,dz21);
1515 /* Update vectorial force */
1516 fix2 = _mm_add_pd(fix2,tx);
1517 fiy2 = _mm_add_pd(fiy2,ty);
1518 fiz2 = _mm_add_pd(fiz2,tz);
1520 fjx1 = _mm_add_pd(fjx1,tx);
1521 fjy1 = _mm_add_pd(fjy1,ty);
1522 fjz1 = _mm_add_pd(fjz1,tz);
1526 /**************************
1527 * CALCULATE INTERACTIONS *
1528 **************************/
1530 if (gmx_mm_any_lt(rsq22,rcutoff2))
1533 r22 = _mm_mul_pd(rsq22,rinv22);
1535 /* EWALD ELECTROSTATICS */
1537 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1538 ewrt = _mm_mul_pd(r22,ewtabscale);
1539 ewitab = _mm_cvttpd_epi32(ewrt);
1540 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1541 ewitab = _mm_slli_epi32(ewitab,2);
1542 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1543 ewtabD = _mm_setzero_pd();
1544 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1545 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1546 ewtabFn = _mm_setzero_pd();
1547 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1548 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1549 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1550 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1551 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1553 d = _mm_sub_pd(r22,rswitch);
1554 d = _mm_max_pd(d,_mm_setzero_pd());
1555 d2 = _mm_mul_pd(d,d);
1556 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1558 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1560 /* Evaluate switch function */
1561 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1562 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
1563 velec = _mm_mul_pd(velec,sw);
1564 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1566 /* Update potential sum for this i atom from the interaction with this j atom. */
1567 velec = _mm_and_pd(velec,cutoff_mask);
1568 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1569 velecsum = _mm_add_pd(velecsum,velec);
1573 fscal = _mm_and_pd(fscal,cutoff_mask);
1575 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1577 /* Calculate temporary vectorial force */
1578 tx = _mm_mul_pd(fscal,dx22);
1579 ty = _mm_mul_pd(fscal,dy22);
1580 tz = _mm_mul_pd(fscal,dz22);
1582 /* Update vectorial force */
1583 fix2 = _mm_add_pd(fix2,tx);
1584 fiy2 = _mm_add_pd(fiy2,ty);
1585 fiz2 = _mm_add_pd(fiz2,tz);
1587 fjx2 = _mm_add_pd(fjx2,tx);
1588 fjy2 = _mm_add_pd(fjy2,ty);
1589 fjz2 = _mm_add_pd(fjz2,tz);
1593 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1595 /* Inner loop uses 603 flops */
1598 /* End of innermost loop */
1600 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1601 f+i_coord_offset,fshift+i_shift_offset);
1604 /* Update potential energies */
1605 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1606 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1608 /* Increment number of inner iterations */
1609 inneriter += j_index_end - j_index_start;
1611 /* Outer loop uses 20 flops */
1614 /* Increment number of outer iterations */
1617 /* Update outer/inner flops */
1619 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*603);
1622 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_double
1623 * Electrostatics interaction: Ewald
1624 * VdW interaction: LennardJones
1625 * Geometry: Water3-Water3
1626 * Calculate force/pot: Force
1629 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_double
1630 (t_nblist * gmx_restrict nlist,
1631 rvec * gmx_restrict xx,
1632 rvec * gmx_restrict ff,
1633 t_forcerec * gmx_restrict fr,
1634 t_mdatoms * gmx_restrict mdatoms,
1635 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1636 t_nrnb * gmx_restrict nrnb)
1638 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1639 * just 0 for non-waters.
1640 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1641 * jnr indices corresponding to data put in the four positions in the SIMD register.
1643 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1644 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1646 int j_coord_offsetA,j_coord_offsetB;
1647 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1648 real rcutoff_scalar;
1649 real *shiftvec,*fshift,*x,*f;
1650 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1652 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1654 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1656 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1657 int vdwjidx0A,vdwjidx0B;
1658 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1659 int vdwjidx1A,vdwjidx1B;
1660 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1661 int vdwjidx2A,vdwjidx2B;
1662 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1663 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1664 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1665 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1666 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1667 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1668 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1669 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1670 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1671 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1672 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1675 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1678 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1679 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1681 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1683 __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1684 real rswitch_scalar,d_scalar;
1685 __m128d dummy_mask,cutoff_mask;
1686 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1687 __m128d one = _mm_set1_pd(1.0);
1688 __m128d two = _mm_set1_pd(2.0);
1694 jindex = nlist->jindex;
1696 shiftidx = nlist->shift;
1698 shiftvec = fr->shift_vec[0];
1699 fshift = fr->fshift[0];
1700 facel = _mm_set1_pd(fr->epsfac);
1701 charge = mdatoms->chargeA;
1702 nvdwtype = fr->ntype;
1703 vdwparam = fr->nbfp;
1704 vdwtype = mdatoms->typeA;
1706 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1707 ewtab = fr->ic->tabq_coul_FDV0;
1708 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1709 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1711 /* Setup water-specific parameters */
1712 inr = nlist->iinr[0];
1713 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1714 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1715 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1716 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1718 jq0 = _mm_set1_pd(charge[inr+0]);
1719 jq1 = _mm_set1_pd(charge[inr+1]);
1720 jq2 = _mm_set1_pd(charge[inr+2]);
1721 vdwjidx0A = 2*vdwtype[inr+0];
1722 qq00 = _mm_mul_pd(iq0,jq0);
1723 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1724 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1725 qq01 = _mm_mul_pd(iq0,jq1);
1726 qq02 = _mm_mul_pd(iq0,jq2);
1727 qq10 = _mm_mul_pd(iq1,jq0);
1728 qq11 = _mm_mul_pd(iq1,jq1);
1729 qq12 = _mm_mul_pd(iq1,jq2);
1730 qq20 = _mm_mul_pd(iq2,jq0);
1731 qq21 = _mm_mul_pd(iq2,jq1);
1732 qq22 = _mm_mul_pd(iq2,jq2);
1734 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1735 rcutoff_scalar = fr->rcoulomb;
1736 rcutoff = _mm_set1_pd(rcutoff_scalar);
1737 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1739 rswitch_scalar = fr->rcoulomb_switch;
1740 rswitch = _mm_set1_pd(rswitch_scalar);
1741 /* Setup switch parameters */
1742 d_scalar = rcutoff_scalar-rswitch_scalar;
1743 d = _mm_set1_pd(d_scalar);
1744 swV3 = _mm_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
1745 swV4 = _mm_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1746 swV5 = _mm_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1747 swF2 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
1748 swF3 = _mm_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1749 swF4 = _mm_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1751 /* Avoid stupid compiler warnings */
1753 j_coord_offsetA = 0;
1754 j_coord_offsetB = 0;
1759 /* Start outer loop over neighborlists */
1760 for(iidx=0; iidx<nri; iidx++)
1762 /* Load shift vector for this list */
1763 i_shift_offset = DIM*shiftidx[iidx];
1765 /* Load limits for loop over neighbors */
1766 j_index_start = jindex[iidx];
1767 j_index_end = jindex[iidx+1];
1769 /* Get outer coordinate index */
1771 i_coord_offset = DIM*inr;
1773 /* Load i particle coords and add shift vector */
1774 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1775 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1777 fix0 = _mm_setzero_pd();
1778 fiy0 = _mm_setzero_pd();
1779 fiz0 = _mm_setzero_pd();
1780 fix1 = _mm_setzero_pd();
1781 fiy1 = _mm_setzero_pd();
1782 fiz1 = _mm_setzero_pd();
1783 fix2 = _mm_setzero_pd();
1784 fiy2 = _mm_setzero_pd();
1785 fiz2 = _mm_setzero_pd();
1787 /* Start inner kernel loop */
1788 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1791 /* Get j neighbor index, and coordinate index */
1793 jnrB = jjnr[jidx+1];
1794 j_coord_offsetA = DIM*jnrA;
1795 j_coord_offsetB = DIM*jnrB;
1797 /* load j atom coordinates */
1798 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1799 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1801 /* Calculate displacement vector */
1802 dx00 = _mm_sub_pd(ix0,jx0);
1803 dy00 = _mm_sub_pd(iy0,jy0);
1804 dz00 = _mm_sub_pd(iz0,jz0);
1805 dx01 = _mm_sub_pd(ix0,jx1);
1806 dy01 = _mm_sub_pd(iy0,jy1);
1807 dz01 = _mm_sub_pd(iz0,jz1);
1808 dx02 = _mm_sub_pd(ix0,jx2);
1809 dy02 = _mm_sub_pd(iy0,jy2);
1810 dz02 = _mm_sub_pd(iz0,jz2);
1811 dx10 = _mm_sub_pd(ix1,jx0);
1812 dy10 = _mm_sub_pd(iy1,jy0);
1813 dz10 = _mm_sub_pd(iz1,jz0);
1814 dx11 = _mm_sub_pd(ix1,jx1);
1815 dy11 = _mm_sub_pd(iy1,jy1);
1816 dz11 = _mm_sub_pd(iz1,jz1);
1817 dx12 = _mm_sub_pd(ix1,jx2);
1818 dy12 = _mm_sub_pd(iy1,jy2);
1819 dz12 = _mm_sub_pd(iz1,jz2);
1820 dx20 = _mm_sub_pd(ix2,jx0);
1821 dy20 = _mm_sub_pd(iy2,jy0);
1822 dz20 = _mm_sub_pd(iz2,jz0);
1823 dx21 = _mm_sub_pd(ix2,jx1);
1824 dy21 = _mm_sub_pd(iy2,jy1);
1825 dz21 = _mm_sub_pd(iz2,jz1);
1826 dx22 = _mm_sub_pd(ix2,jx2);
1827 dy22 = _mm_sub_pd(iy2,jy2);
1828 dz22 = _mm_sub_pd(iz2,jz2);
1830 /* Calculate squared distance and things based on it */
1831 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1832 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1833 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1834 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1835 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1836 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1837 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1838 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1839 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1841 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1842 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1843 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1844 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1845 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1846 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1847 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1848 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1849 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1851 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1852 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1853 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1854 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1855 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1856 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1857 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1858 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1859 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1861 fjx0 = _mm_setzero_pd();
1862 fjy0 = _mm_setzero_pd();
1863 fjz0 = _mm_setzero_pd();
1864 fjx1 = _mm_setzero_pd();
1865 fjy1 = _mm_setzero_pd();
1866 fjz1 = _mm_setzero_pd();
1867 fjx2 = _mm_setzero_pd();
1868 fjy2 = _mm_setzero_pd();
1869 fjz2 = _mm_setzero_pd();
1871 /**************************
1872 * CALCULATE INTERACTIONS *
1873 **************************/
1875 if (gmx_mm_any_lt(rsq00,rcutoff2))
1878 r00 = _mm_mul_pd(rsq00,rinv00);
1880 /* EWALD ELECTROSTATICS */
1882 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1883 ewrt = _mm_mul_pd(r00,ewtabscale);
1884 ewitab = _mm_cvttpd_epi32(ewrt);
1885 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1886 ewitab = _mm_slli_epi32(ewitab,2);
1887 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1888 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1889 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1890 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1891 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1892 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1893 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1894 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1895 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
1896 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1898 /* LENNARD-JONES DISPERSION/REPULSION */
1900 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1901 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
1902 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
1903 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
1904 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
1906 d = _mm_sub_pd(r00,rswitch);
1907 d = _mm_max_pd(d,_mm_setzero_pd());
1908 d2 = _mm_mul_pd(d,d);
1909 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1911 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1913 /* Evaluate switch function */
1914 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1915 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
1916 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
1917 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1919 fscal = _mm_add_pd(felec,fvdw);
1921 fscal = _mm_and_pd(fscal,cutoff_mask);
1923 /* Calculate temporary vectorial force */
1924 tx = _mm_mul_pd(fscal,dx00);
1925 ty = _mm_mul_pd(fscal,dy00);
1926 tz = _mm_mul_pd(fscal,dz00);
1928 /* Update vectorial force */
1929 fix0 = _mm_add_pd(fix0,tx);
1930 fiy0 = _mm_add_pd(fiy0,ty);
1931 fiz0 = _mm_add_pd(fiz0,tz);
1933 fjx0 = _mm_add_pd(fjx0,tx);
1934 fjy0 = _mm_add_pd(fjy0,ty);
1935 fjz0 = _mm_add_pd(fjz0,tz);
1939 /**************************
1940 * CALCULATE INTERACTIONS *
1941 **************************/
1943 if (gmx_mm_any_lt(rsq01,rcutoff2))
1946 r01 = _mm_mul_pd(rsq01,rinv01);
1948 /* EWALD ELECTROSTATICS */
1950 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1951 ewrt = _mm_mul_pd(r01,ewtabscale);
1952 ewitab = _mm_cvttpd_epi32(ewrt);
1953 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1954 ewitab = _mm_slli_epi32(ewitab,2);
1955 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1956 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1957 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1958 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1959 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
1960 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1961 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1962 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1963 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
1964 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1966 d = _mm_sub_pd(r01,rswitch);
1967 d = _mm_max_pd(d,_mm_setzero_pd());
1968 d2 = _mm_mul_pd(d,d);
1969 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
1971 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
1973 /* Evaluate switch function */
1974 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1975 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
1976 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1980 fscal = _mm_and_pd(fscal,cutoff_mask);
1982 /* Calculate temporary vectorial force */
1983 tx = _mm_mul_pd(fscal,dx01);
1984 ty = _mm_mul_pd(fscal,dy01);
1985 tz = _mm_mul_pd(fscal,dz01);
1987 /* Update vectorial force */
1988 fix0 = _mm_add_pd(fix0,tx);
1989 fiy0 = _mm_add_pd(fiy0,ty);
1990 fiz0 = _mm_add_pd(fiz0,tz);
1992 fjx1 = _mm_add_pd(fjx1,tx);
1993 fjy1 = _mm_add_pd(fjy1,ty);
1994 fjz1 = _mm_add_pd(fjz1,tz);
1998 /**************************
1999 * CALCULATE INTERACTIONS *
2000 **************************/
2002 if (gmx_mm_any_lt(rsq02,rcutoff2))
2005 r02 = _mm_mul_pd(rsq02,rinv02);
2007 /* EWALD ELECTROSTATICS */
2009 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2010 ewrt = _mm_mul_pd(r02,ewtabscale);
2011 ewitab = _mm_cvttpd_epi32(ewrt);
2012 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2013 ewitab = _mm_slli_epi32(ewitab,2);
2014 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2015 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2016 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2017 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2018 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2019 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2020 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2021 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2022 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2023 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2025 d = _mm_sub_pd(r02,rswitch);
2026 d = _mm_max_pd(d,_mm_setzero_pd());
2027 d2 = _mm_mul_pd(d,d);
2028 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2030 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2032 /* Evaluate switch function */
2033 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2034 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2035 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2039 fscal = _mm_and_pd(fscal,cutoff_mask);
2041 /* Calculate temporary vectorial force */
2042 tx = _mm_mul_pd(fscal,dx02);
2043 ty = _mm_mul_pd(fscal,dy02);
2044 tz = _mm_mul_pd(fscal,dz02);
2046 /* Update vectorial force */
2047 fix0 = _mm_add_pd(fix0,tx);
2048 fiy0 = _mm_add_pd(fiy0,ty);
2049 fiz0 = _mm_add_pd(fiz0,tz);
2051 fjx2 = _mm_add_pd(fjx2,tx);
2052 fjy2 = _mm_add_pd(fjy2,ty);
2053 fjz2 = _mm_add_pd(fjz2,tz);
2057 /**************************
2058 * CALCULATE INTERACTIONS *
2059 **************************/
2061 if (gmx_mm_any_lt(rsq10,rcutoff2))
2064 r10 = _mm_mul_pd(rsq10,rinv10);
2066 /* EWALD ELECTROSTATICS */
2068 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2069 ewrt = _mm_mul_pd(r10,ewtabscale);
2070 ewitab = _mm_cvttpd_epi32(ewrt);
2071 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2072 ewitab = _mm_slli_epi32(ewitab,2);
2073 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2074 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2075 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2076 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2077 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2078 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2079 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2080 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2081 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2082 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2084 d = _mm_sub_pd(r10,rswitch);
2085 d = _mm_max_pd(d,_mm_setzero_pd());
2086 d2 = _mm_mul_pd(d,d);
2087 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2089 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2091 /* Evaluate switch function */
2092 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2093 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2094 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2098 fscal = _mm_and_pd(fscal,cutoff_mask);
2100 /* Calculate temporary vectorial force */
2101 tx = _mm_mul_pd(fscal,dx10);
2102 ty = _mm_mul_pd(fscal,dy10);
2103 tz = _mm_mul_pd(fscal,dz10);
2105 /* Update vectorial force */
2106 fix1 = _mm_add_pd(fix1,tx);
2107 fiy1 = _mm_add_pd(fiy1,ty);
2108 fiz1 = _mm_add_pd(fiz1,tz);
2110 fjx0 = _mm_add_pd(fjx0,tx);
2111 fjy0 = _mm_add_pd(fjy0,ty);
2112 fjz0 = _mm_add_pd(fjz0,tz);
2116 /**************************
2117 * CALCULATE INTERACTIONS *
2118 **************************/
2120 if (gmx_mm_any_lt(rsq11,rcutoff2))
2123 r11 = _mm_mul_pd(rsq11,rinv11);
2125 /* EWALD ELECTROSTATICS */
2127 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2128 ewrt = _mm_mul_pd(r11,ewtabscale);
2129 ewitab = _mm_cvttpd_epi32(ewrt);
2130 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2131 ewitab = _mm_slli_epi32(ewitab,2);
2132 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2133 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2134 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2135 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2136 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2137 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2138 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2139 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2140 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2141 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2143 d = _mm_sub_pd(r11,rswitch);
2144 d = _mm_max_pd(d,_mm_setzero_pd());
2145 d2 = _mm_mul_pd(d,d);
2146 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2148 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2150 /* Evaluate switch function */
2151 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2152 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2153 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2157 fscal = _mm_and_pd(fscal,cutoff_mask);
2159 /* Calculate temporary vectorial force */
2160 tx = _mm_mul_pd(fscal,dx11);
2161 ty = _mm_mul_pd(fscal,dy11);
2162 tz = _mm_mul_pd(fscal,dz11);
2164 /* Update vectorial force */
2165 fix1 = _mm_add_pd(fix1,tx);
2166 fiy1 = _mm_add_pd(fiy1,ty);
2167 fiz1 = _mm_add_pd(fiz1,tz);
2169 fjx1 = _mm_add_pd(fjx1,tx);
2170 fjy1 = _mm_add_pd(fjy1,ty);
2171 fjz1 = _mm_add_pd(fjz1,tz);
2175 /**************************
2176 * CALCULATE INTERACTIONS *
2177 **************************/
2179 if (gmx_mm_any_lt(rsq12,rcutoff2))
2182 r12 = _mm_mul_pd(rsq12,rinv12);
2184 /* EWALD ELECTROSTATICS */
2186 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2187 ewrt = _mm_mul_pd(r12,ewtabscale);
2188 ewitab = _mm_cvttpd_epi32(ewrt);
2189 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2190 ewitab = _mm_slli_epi32(ewitab,2);
2191 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2192 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2193 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2194 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2195 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2196 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2197 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2198 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2199 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2200 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2202 d = _mm_sub_pd(r12,rswitch);
2203 d = _mm_max_pd(d,_mm_setzero_pd());
2204 d2 = _mm_mul_pd(d,d);
2205 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2207 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2209 /* Evaluate switch function */
2210 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2211 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2212 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2216 fscal = _mm_and_pd(fscal,cutoff_mask);
2218 /* Calculate temporary vectorial force */
2219 tx = _mm_mul_pd(fscal,dx12);
2220 ty = _mm_mul_pd(fscal,dy12);
2221 tz = _mm_mul_pd(fscal,dz12);
2223 /* Update vectorial force */
2224 fix1 = _mm_add_pd(fix1,tx);
2225 fiy1 = _mm_add_pd(fiy1,ty);
2226 fiz1 = _mm_add_pd(fiz1,tz);
2228 fjx2 = _mm_add_pd(fjx2,tx);
2229 fjy2 = _mm_add_pd(fjy2,ty);
2230 fjz2 = _mm_add_pd(fjz2,tz);
2234 /**************************
2235 * CALCULATE INTERACTIONS *
2236 **************************/
2238 if (gmx_mm_any_lt(rsq20,rcutoff2))
2241 r20 = _mm_mul_pd(rsq20,rinv20);
2243 /* EWALD ELECTROSTATICS */
2245 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2246 ewrt = _mm_mul_pd(r20,ewtabscale);
2247 ewitab = _mm_cvttpd_epi32(ewrt);
2248 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2249 ewitab = _mm_slli_epi32(ewitab,2);
2250 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2251 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2252 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2253 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2254 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2255 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2256 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2257 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2258 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2259 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2261 d = _mm_sub_pd(r20,rswitch);
2262 d = _mm_max_pd(d,_mm_setzero_pd());
2263 d2 = _mm_mul_pd(d,d);
2264 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2266 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2268 /* Evaluate switch function */
2269 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2270 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2271 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2275 fscal = _mm_and_pd(fscal,cutoff_mask);
2277 /* Calculate temporary vectorial force */
2278 tx = _mm_mul_pd(fscal,dx20);
2279 ty = _mm_mul_pd(fscal,dy20);
2280 tz = _mm_mul_pd(fscal,dz20);
2282 /* Update vectorial force */
2283 fix2 = _mm_add_pd(fix2,tx);
2284 fiy2 = _mm_add_pd(fiy2,ty);
2285 fiz2 = _mm_add_pd(fiz2,tz);
2287 fjx0 = _mm_add_pd(fjx0,tx);
2288 fjy0 = _mm_add_pd(fjy0,ty);
2289 fjz0 = _mm_add_pd(fjz0,tz);
2293 /**************************
2294 * CALCULATE INTERACTIONS *
2295 **************************/
2297 if (gmx_mm_any_lt(rsq21,rcutoff2))
2300 r21 = _mm_mul_pd(rsq21,rinv21);
2302 /* EWALD ELECTROSTATICS */
2304 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2305 ewrt = _mm_mul_pd(r21,ewtabscale);
2306 ewitab = _mm_cvttpd_epi32(ewrt);
2307 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2308 ewitab = _mm_slli_epi32(ewitab,2);
2309 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2310 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2311 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2312 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2313 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2314 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2315 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2316 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2317 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2318 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2320 d = _mm_sub_pd(r21,rswitch);
2321 d = _mm_max_pd(d,_mm_setzero_pd());
2322 d2 = _mm_mul_pd(d,d);
2323 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2325 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2327 /* Evaluate switch function */
2328 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2329 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2330 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2334 fscal = _mm_and_pd(fscal,cutoff_mask);
2336 /* Calculate temporary vectorial force */
2337 tx = _mm_mul_pd(fscal,dx21);
2338 ty = _mm_mul_pd(fscal,dy21);
2339 tz = _mm_mul_pd(fscal,dz21);
2341 /* Update vectorial force */
2342 fix2 = _mm_add_pd(fix2,tx);
2343 fiy2 = _mm_add_pd(fiy2,ty);
2344 fiz2 = _mm_add_pd(fiz2,tz);
2346 fjx1 = _mm_add_pd(fjx1,tx);
2347 fjy1 = _mm_add_pd(fjy1,ty);
2348 fjz1 = _mm_add_pd(fjz1,tz);
2352 /**************************
2353 * CALCULATE INTERACTIONS *
2354 **************************/
2356 if (gmx_mm_any_lt(rsq22,rcutoff2))
2359 r22 = _mm_mul_pd(rsq22,rinv22);
2361 /* EWALD ELECTROSTATICS */
2363 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2364 ewrt = _mm_mul_pd(r22,ewtabscale);
2365 ewitab = _mm_cvttpd_epi32(ewrt);
2366 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2367 ewitab = _mm_slli_epi32(ewitab,2);
2368 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2369 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2370 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2371 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2372 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
2373 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2374 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2375 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2376 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
2377 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2379 d = _mm_sub_pd(r22,rswitch);
2380 d = _mm_max_pd(d,_mm_setzero_pd());
2381 d2 = _mm_mul_pd(d,d);
2382 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2384 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2386 /* Evaluate switch function */
2387 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2388 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
2389 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2393 fscal = _mm_and_pd(fscal,cutoff_mask);
2395 /* Calculate temporary vectorial force */
2396 tx = _mm_mul_pd(fscal,dx22);
2397 ty = _mm_mul_pd(fscal,dy22);
2398 tz = _mm_mul_pd(fscal,dz22);
2400 /* Update vectorial force */
2401 fix2 = _mm_add_pd(fix2,tx);
2402 fiy2 = _mm_add_pd(fiy2,ty);
2403 fiz2 = _mm_add_pd(fiz2,tz);
2405 fjx2 = _mm_add_pd(fjx2,tx);
2406 fjy2 = _mm_add_pd(fjy2,ty);
2407 fjz2 = _mm_add_pd(fjz2,tz);
2411 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2413 /* Inner loop uses 573 flops */
2416 if(jidx<j_index_end)
2420 j_coord_offsetA = DIM*jnrA;
2422 /* load j atom coordinates */
2423 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2424 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2426 /* Calculate displacement vector */
2427 dx00 = _mm_sub_pd(ix0,jx0);
2428 dy00 = _mm_sub_pd(iy0,jy0);
2429 dz00 = _mm_sub_pd(iz0,jz0);
2430 dx01 = _mm_sub_pd(ix0,jx1);
2431 dy01 = _mm_sub_pd(iy0,jy1);
2432 dz01 = _mm_sub_pd(iz0,jz1);
2433 dx02 = _mm_sub_pd(ix0,jx2);
2434 dy02 = _mm_sub_pd(iy0,jy2);
2435 dz02 = _mm_sub_pd(iz0,jz2);
2436 dx10 = _mm_sub_pd(ix1,jx0);
2437 dy10 = _mm_sub_pd(iy1,jy0);
2438 dz10 = _mm_sub_pd(iz1,jz0);
2439 dx11 = _mm_sub_pd(ix1,jx1);
2440 dy11 = _mm_sub_pd(iy1,jy1);
2441 dz11 = _mm_sub_pd(iz1,jz1);
2442 dx12 = _mm_sub_pd(ix1,jx2);
2443 dy12 = _mm_sub_pd(iy1,jy2);
2444 dz12 = _mm_sub_pd(iz1,jz2);
2445 dx20 = _mm_sub_pd(ix2,jx0);
2446 dy20 = _mm_sub_pd(iy2,jy0);
2447 dz20 = _mm_sub_pd(iz2,jz0);
2448 dx21 = _mm_sub_pd(ix2,jx1);
2449 dy21 = _mm_sub_pd(iy2,jy1);
2450 dz21 = _mm_sub_pd(iz2,jz1);
2451 dx22 = _mm_sub_pd(ix2,jx2);
2452 dy22 = _mm_sub_pd(iy2,jy2);
2453 dz22 = _mm_sub_pd(iz2,jz2);
2455 /* Calculate squared distance and things based on it */
2456 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2457 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2458 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2459 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2460 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2461 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2462 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2463 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2464 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2466 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2467 rinv01 = gmx_mm_invsqrt_pd(rsq01);
2468 rinv02 = gmx_mm_invsqrt_pd(rsq02);
2469 rinv10 = gmx_mm_invsqrt_pd(rsq10);
2470 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2471 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2472 rinv20 = gmx_mm_invsqrt_pd(rsq20);
2473 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2474 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2476 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2477 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
2478 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
2479 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
2480 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2481 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2482 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
2483 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2484 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2486 fjx0 = _mm_setzero_pd();
2487 fjy0 = _mm_setzero_pd();
2488 fjz0 = _mm_setzero_pd();
2489 fjx1 = _mm_setzero_pd();
2490 fjy1 = _mm_setzero_pd();
2491 fjz1 = _mm_setzero_pd();
2492 fjx2 = _mm_setzero_pd();
2493 fjy2 = _mm_setzero_pd();
2494 fjz2 = _mm_setzero_pd();
2496 /**************************
2497 * CALCULATE INTERACTIONS *
2498 **************************/
2500 if (gmx_mm_any_lt(rsq00,rcutoff2))
2503 r00 = _mm_mul_pd(rsq00,rinv00);
2505 /* EWALD ELECTROSTATICS */
2507 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2508 ewrt = _mm_mul_pd(r00,ewtabscale);
2509 ewitab = _mm_cvttpd_epi32(ewrt);
2510 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2511 ewitab = _mm_slli_epi32(ewitab,2);
2512 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2513 ewtabD = _mm_setzero_pd();
2514 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2515 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2516 ewtabFn = _mm_setzero_pd();
2517 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2518 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2519 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2520 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
2521 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2523 /* LENNARD-JONES DISPERSION/REPULSION */
2525 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2526 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
2527 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
2528 vvdw = _mm_sub_pd( _mm_mul_pd(vvdw12,one_twelfth) , _mm_mul_pd(vvdw6,one_sixth) );
2529 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
2531 d = _mm_sub_pd(r00,rswitch);
2532 d = _mm_max_pd(d,_mm_setzero_pd());
2533 d2 = _mm_mul_pd(d,d);
2534 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2536 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2538 /* Evaluate switch function */
2539 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2540 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(velec,dsw)) );
2541 fvdw = _mm_sub_pd( _mm_mul_pd(fvdw,sw) , _mm_mul_pd(rinv00,_mm_mul_pd(vvdw,dsw)) );
2542 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2544 fscal = _mm_add_pd(felec,fvdw);
2546 fscal = _mm_and_pd(fscal,cutoff_mask);
2548 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2550 /* Calculate temporary vectorial force */
2551 tx = _mm_mul_pd(fscal,dx00);
2552 ty = _mm_mul_pd(fscal,dy00);
2553 tz = _mm_mul_pd(fscal,dz00);
2555 /* Update vectorial force */
2556 fix0 = _mm_add_pd(fix0,tx);
2557 fiy0 = _mm_add_pd(fiy0,ty);
2558 fiz0 = _mm_add_pd(fiz0,tz);
2560 fjx0 = _mm_add_pd(fjx0,tx);
2561 fjy0 = _mm_add_pd(fjy0,ty);
2562 fjz0 = _mm_add_pd(fjz0,tz);
2566 /**************************
2567 * CALCULATE INTERACTIONS *
2568 **************************/
2570 if (gmx_mm_any_lt(rsq01,rcutoff2))
2573 r01 = _mm_mul_pd(rsq01,rinv01);
2575 /* EWALD ELECTROSTATICS */
2577 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2578 ewrt = _mm_mul_pd(r01,ewtabscale);
2579 ewitab = _mm_cvttpd_epi32(ewrt);
2580 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2581 ewitab = _mm_slli_epi32(ewitab,2);
2582 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2583 ewtabD = _mm_setzero_pd();
2584 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2585 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2586 ewtabFn = _mm_setzero_pd();
2587 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2588 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2589 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2590 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
2591 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2593 d = _mm_sub_pd(r01,rswitch);
2594 d = _mm_max_pd(d,_mm_setzero_pd());
2595 d2 = _mm_mul_pd(d,d);
2596 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2598 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2600 /* Evaluate switch function */
2601 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2602 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv01,_mm_mul_pd(velec,dsw)) );
2603 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
2607 fscal = _mm_and_pd(fscal,cutoff_mask);
2609 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2611 /* Calculate temporary vectorial force */
2612 tx = _mm_mul_pd(fscal,dx01);
2613 ty = _mm_mul_pd(fscal,dy01);
2614 tz = _mm_mul_pd(fscal,dz01);
2616 /* Update vectorial force */
2617 fix0 = _mm_add_pd(fix0,tx);
2618 fiy0 = _mm_add_pd(fiy0,ty);
2619 fiz0 = _mm_add_pd(fiz0,tz);
2621 fjx1 = _mm_add_pd(fjx1,tx);
2622 fjy1 = _mm_add_pd(fjy1,ty);
2623 fjz1 = _mm_add_pd(fjz1,tz);
2627 /**************************
2628 * CALCULATE INTERACTIONS *
2629 **************************/
2631 if (gmx_mm_any_lt(rsq02,rcutoff2))
2634 r02 = _mm_mul_pd(rsq02,rinv02);
2636 /* EWALD ELECTROSTATICS */
2638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2639 ewrt = _mm_mul_pd(r02,ewtabscale);
2640 ewitab = _mm_cvttpd_epi32(ewrt);
2641 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2642 ewitab = _mm_slli_epi32(ewitab,2);
2643 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2644 ewtabD = _mm_setzero_pd();
2645 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2646 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2647 ewtabFn = _mm_setzero_pd();
2648 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2649 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2650 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2651 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
2652 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2654 d = _mm_sub_pd(r02,rswitch);
2655 d = _mm_max_pd(d,_mm_setzero_pd());
2656 d2 = _mm_mul_pd(d,d);
2657 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2659 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2661 /* Evaluate switch function */
2662 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2663 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv02,_mm_mul_pd(velec,dsw)) );
2664 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2668 fscal = _mm_and_pd(fscal,cutoff_mask);
2670 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2672 /* Calculate temporary vectorial force */
2673 tx = _mm_mul_pd(fscal,dx02);
2674 ty = _mm_mul_pd(fscal,dy02);
2675 tz = _mm_mul_pd(fscal,dz02);
2677 /* Update vectorial force */
2678 fix0 = _mm_add_pd(fix0,tx);
2679 fiy0 = _mm_add_pd(fiy0,ty);
2680 fiz0 = _mm_add_pd(fiz0,tz);
2682 fjx2 = _mm_add_pd(fjx2,tx);
2683 fjy2 = _mm_add_pd(fjy2,ty);
2684 fjz2 = _mm_add_pd(fjz2,tz);
2688 /**************************
2689 * CALCULATE INTERACTIONS *
2690 **************************/
2692 if (gmx_mm_any_lt(rsq10,rcutoff2))
2695 r10 = _mm_mul_pd(rsq10,rinv10);
2697 /* EWALD ELECTROSTATICS */
2699 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2700 ewrt = _mm_mul_pd(r10,ewtabscale);
2701 ewitab = _mm_cvttpd_epi32(ewrt);
2702 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2703 ewitab = _mm_slli_epi32(ewitab,2);
2704 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2705 ewtabD = _mm_setzero_pd();
2706 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2707 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2708 ewtabFn = _mm_setzero_pd();
2709 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2710 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2711 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2712 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
2713 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2715 d = _mm_sub_pd(r10,rswitch);
2716 d = _mm_max_pd(d,_mm_setzero_pd());
2717 d2 = _mm_mul_pd(d,d);
2718 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2720 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2722 /* Evaluate switch function */
2723 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2724 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv10,_mm_mul_pd(velec,dsw)) );
2725 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2729 fscal = _mm_and_pd(fscal,cutoff_mask);
2731 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2733 /* Calculate temporary vectorial force */
2734 tx = _mm_mul_pd(fscal,dx10);
2735 ty = _mm_mul_pd(fscal,dy10);
2736 tz = _mm_mul_pd(fscal,dz10);
2738 /* Update vectorial force */
2739 fix1 = _mm_add_pd(fix1,tx);
2740 fiy1 = _mm_add_pd(fiy1,ty);
2741 fiz1 = _mm_add_pd(fiz1,tz);
2743 fjx0 = _mm_add_pd(fjx0,tx);
2744 fjy0 = _mm_add_pd(fjy0,ty);
2745 fjz0 = _mm_add_pd(fjz0,tz);
2749 /**************************
2750 * CALCULATE INTERACTIONS *
2751 **************************/
2753 if (gmx_mm_any_lt(rsq11,rcutoff2))
2756 r11 = _mm_mul_pd(rsq11,rinv11);
2758 /* EWALD ELECTROSTATICS */
2760 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2761 ewrt = _mm_mul_pd(r11,ewtabscale);
2762 ewitab = _mm_cvttpd_epi32(ewrt);
2763 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2764 ewitab = _mm_slli_epi32(ewitab,2);
2765 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2766 ewtabD = _mm_setzero_pd();
2767 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2768 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2769 ewtabFn = _mm_setzero_pd();
2770 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2771 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2772 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2773 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
2774 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2776 d = _mm_sub_pd(r11,rswitch);
2777 d = _mm_max_pd(d,_mm_setzero_pd());
2778 d2 = _mm_mul_pd(d,d);
2779 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2781 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2783 /* Evaluate switch function */
2784 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2785 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv11,_mm_mul_pd(velec,dsw)) );
2786 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2790 fscal = _mm_and_pd(fscal,cutoff_mask);
2792 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2794 /* Calculate temporary vectorial force */
2795 tx = _mm_mul_pd(fscal,dx11);
2796 ty = _mm_mul_pd(fscal,dy11);
2797 tz = _mm_mul_pd(fscal,dz11);
2799 /* Update vectorial force */
2800 fix1 = _mm_add_pd(fix1,tx);
2801 fiy1 = _mm_add_pd(fiy1,ty);
2802 fiz1 = _mm_add_pd(fiz1,tz);
2804 fjx1 = _mm_add_pd(fjx1,tx);
2805 fjy1 = _mm_add_pd(fjy1,ty);
2806 fjz1 = _mm_add_pd(fjz1,tz);
2810 /**************************
2811 * CALCULATE INTERACTIONS *
2812 **************************/
2814 if (gmx_mm_any_lt(rsq12,rcutoff2))
2817 r12 = _mm_mul_pd(rsq12,rinv12);
2819 /* EWALD ELECTROSTATICS */
2821 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2822 ewrt = _mm_mul_pd(r12,ewtabscale);
2823 ewitab = _mm_cvttpd_epi32(ewrt);
2824 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2825 ewitab = _mm_slli_epi32(ewitab,2);
2826 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2827 ewtabD = _mm_setzero_pd();
2828 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2829 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2830 ewtabFn = _mm_setzero_pd();
2831 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2832 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2833 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2834 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
2835 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2837 d = _mm_sub_pd(r12,rswitch);
2838 d = _mm_max_pd(d,_mm_setzero_pd());
2839 d2 = _mm_mul_pd(d,d);
2840 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2842 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2844 /* Evaluate switch function */
2845 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2846 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv12,_mm_mul_pd(velec,dsw)) );
2847 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2851 fscal = _mm_and_pd(fscal,cutoff_mask);
2853 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2855 /* Calculate temporary vectorial force */
2856 tx = _mm_mul_pd(fscal,dx12);
2857 ty = _mm_mul_pd(fscal,dy12);
2858 tz = _mm_mul_pd(fscal,dz12);
2860 /* Update vectorial force */
2861 fix1 = _mm_add_pd(fix1,tx);
2862 fiy1 = _mm_add_pd(fiy1,ty);
2863 fiz1 = _mm_add_pd(fiz1,tz);
2865 fjx2 = _mm_add_pd(fjx2,tx);
2866 fjy2 = _mm_add_pd(fjy2,ty);
2867 fjz2 = _mm_add_pd(fjz2,tz);
2871 /**************************
2872 * CALCULATE INTERACTIONS *
2873 **************************/
2875 if (gmx_mm_any_lt(rsq20,rcutoff2))
2878 r20 = _mm_mul_pd(rsq20,rinv20);
2880 /* EWALD ELECTROSTATICS */
2882 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2883 ewrt = _mm_mul_pd(r20,ewtabscale);
2884 ewitab = _mm_cvttpd_epi32(ewrt);
2885 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2886 ewitab = _mm_slli_epi32(ewitab,2);
2887 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2888 ewtabD = _mm_setzero_pd();
2889 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2890 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2891 ewtabFn = _mm_setzero_pd();
2892 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2893 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2894 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2895 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
2896 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2898 d = _mm_sub_pd(r20,rswitch);
2899 d = _mm_max_pd(d,_mm_setzero_pd());
2900 d2 = _mm_mul_pd(d,d);
2901 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2903 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2905 /* Evaluate switch function */
2906 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2907 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv20,_mm_mul_pd(velec,dsw)) );
2908 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2912 fscal = _mm_and_pd(fscal,cutoff_mask);
2914 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2916 /* Calculate temporary vectorial force */
2917 tx = _mm_mul_pd(fscal,dx20);
2918 ty = _mm_mul_pd(fscal,dy20);
2919 tz = _mm_mul_pd(fscal,dz20);
2921 /* Update vectorial force */
2922 fix2 = _mm_add_pd(fix2,tx);
2923 fiy2 = _mm_add_pd(fiy2,ty);
2924 fiz2 = _mm_add_pd(fiz2,tz);
2926 fjx0 = _mm_add_pd(fjx0,tx);
2927 fjy0 = _mm_add_pd(fjy0,ty);
2928 fjz0 = _mm_add_pd(fjz0,tz);
2932 /**************************
2933 * CALCULATE INTERACTIONS *
2934 **************************/
2936 if (gmx_mm_any_lt(rsq21,rcutoff2))
2939 r21 = _mm_mul_pd(rsq21,rinv21);
2941 /* EWALD ELECTROSTATICS */
2943 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2944 ewrt = _mm_mul_pd(r21,ewtabscale);
2945 ewitab = _mm_cvttpd_epi32(ewrt);
2946 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2947 ewitab = _mm_slli_epi32(ewitab,2);
2948 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2949 ewtabD = _mm_setzero_pd();
2950 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
2951 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
2952 ewtabFn = _mm_setzero_pd();
2953 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
2954 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
2955 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
2956 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
2957 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2959 d = _mm_sub_pd(r21,rswitch);
2960 d = _mm_max_pd(d,_mm_setzero_pd());
2961 d2 = _mm_mul_pd(d,d);
2962 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
2964 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
2966 /* Evaluate switch function */
2967 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2968 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv21,_mm_mul_pd(velec,dsw)) );
2969 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2973 fscal = _mm_and_pd(fscal,cutoff_mask);
2975 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2977 /* Calculate temporary vectorial force */
2978 tx = _mm_mul_pd(fscal,dx21);
2979 ty = _mm_mul_pd(fscal,dy21);
2980 tz = _mm_mul_pd(fscal,dz21);
2982 /* Update vectorial force */
2983 fix2 = _mm_add_pd(fix2,tx);
2984 fiy2 = _mm_add_pd(fiy2,ty);
2985 fiz2 = _mm_add_pd(fiz2,tz);
2987 fjx1 = _mm_add_pd(fjx1,tx);
2988 fjy1 = _mm_add_pd(fjy1,ty);
2989 fjz1 = _mm_add_pd(fjz1,tz);
2993 /**************************
2994 * CALCULATE INTERACTIONS *
2995 **************************/
2997 if (gmx_mm_any_lt(rsq22,rcutoff2))
3000 r22 = _mm_mul_pd(rsq22,rinv22);
3002 /* EWALD ELECTROSTATICS */
3004 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3005 ewrt = _mm_mul_pd(r22,ewtabscale);
3006 ewitab = _mm_cvttpd_epi32(ewrt);
3007 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
3008 ewitab = _mm_slli_epi32(ewitab,2);
3009 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3010 ewtabD = _mm_setzero_pd();
3011 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
3012 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
3013 ewtabFn = _mm_setzero_pd();
3014 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
3015 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
3016 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
3017 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
3018 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
3020 d = _mm_sub_pd(r22,rswitch);
3021 d = _mm_max_pd(d,_mm_setzero_pd());
3022 d2 = _mm_mul_pd(d,d);
3023 sw = _mm_add_pd(one,_mm_mul_pd(d2,_mm_mul_pd(d,_mm_add_pd(swV3,_mm_mul_pd(d,_mm_add_pd(swV4,_mm_mul_pd(d,swV5)))))));
3025 dsw = _mm_mul_pd(d2,_mm_add_pd(swF2,_mm_mul_pd(d,_mm_add_pd(swF3,_mm_mul_pd(d,swF4)))));
3027 /* Evaluate switch function */
3028 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3029 felec = _mm_sub_pd( _mm_mul_pd(felec,sw) , _mm_mul_pd(rinv22,_mm_mul_pd(velec,dsw)) );
3030 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
3034 fscal = _mm_and_pd(fscal,cutoff_mask);
3036 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
3038 /* Calculate temporary vectorial force */
3039 tx = _mm_mul_pd(fscal,dx22);
3040 ty = _mm_mul_pd(fscal,dy22);
3041 tz = _mm_mul_pd(fscal,dz22);
3043 /* Update vectorial force */
3044 fix2 = _mm_add_pd(fix2,tx);
3045 fiy2 = _mm_add_pd(fiy2,ty);
3046 fiz2 = _mm_add_pd(fiz2,tz);
3048 fjx2 = _mm_add_pd(fjx2,tx);
3049 fjy2 = _mm_add_pd(fjy2,ty);
3050 fjz2 = _mm_add_pd(fjz2,tz);
3054 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3056 /* Inner loop uses 573 flops */
3059 /* End of innermost loop */
3061 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3062 f+i_coord_offset,fshift+i_shift_offset);
3064 /* Increment number of inner iterations */
3065 inneriter += j_index_end - j_index_start;
3067 /* Outer loop uses 18 flops */
3070 /* Increment number of outer iterations */
3073 /* Update outer/inner flops */
3075 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*573);