2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
97 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
111 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
114 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
115 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
117 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
119 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
120 real rswitch_scalar,d_scalar;
121 __m128 dummy_mask,cutoff_mask;
122 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
123 __m128 one = _mm_set1_ps(1.0);
124 __m128 two = _mm_set1_ps(2.0);
130 jindex = nlist->jindex;
132 shiftidx = nlist->shift;
134 shiftvec = fr->shift_vec[0];
135 fshift = fr->fshift[0];
136 facel = _mm_set1_ps(fr->ic->epsfac);
137 charge = mdatoms->chargeA;
138 nvdwtype = fr->ntype;
140 vdwtype = mdatoms->typeA;
142 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
143 ewtab = fr->ic->tabq_coul_FDV0;
144 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
145 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
147 /* Setup water-specific parameters */
148 inr = nlist->iinr[0];
149 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
150 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
151 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
152 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
154 jq1 = _mm_set1_ps(charge[inr+1]);
155 jq2 = _mm_set1_ps(charge[inr+2]);
156 jq3 = _mm_set1_ps(charge[inr+3]);
157 vdwjidx0A = 2*vdwtype[inr+0];
158 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
159 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
160 qq11 = _mm_mul_ps(iq1,jq1);
161 qq12 = _mm_mul_ps(iq1,jq2);
162 qq13 = _mm_mul_ps(iq1,jq3);
163 qq21 = _mm_mul_ps(iq2,jq1);
164 qq22 = _mm_mul_ps(iq2,jq2);
165 qq23 = _mm_mul_ps(iq2,jq3);
166 qq31 = _mm_mul_ps(iq3,jq1);
167 qq32 = _mm_mul_ps(iq3,jq2);
168 qq33 = _mm_mul_ps(iq3,jq3);
170 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
171 rcutoff_scalar = fr->ic->rcoulomb;
172 rcutoff = _mm_set1_ps(rcutoff_scalar);
173 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
175 rswitch_scalar = fr->ic->rcoulomb_switch;
176 rswitch = _mm_set1_ps(rswitch_scalar);
177 /* Setup switch parameters */
178 d_scalar = rcutoff_scalar-rswitch_scalar;
179 d = _mm_set1_ps(d_scalar);
180 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
181 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
182 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
183 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
184 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
185 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
187 /* Avoid stupid compiler warnings */
188 jnrA = jnrB = jnrC = jnrD = 0;
197 for(iidx=0;iidx<4*DIM;iidx++)
202 /* Start outer loop over neighborlists */
203 for(iidx=0; iidx<nri; iidx++)
205 /* Load shift vector for this list */
206 i_shift_offset = DIM*shiftidx[iidx];
208 /* Load limits for loop over neighbors */
209 j_index_start = jindex[iidx];
210 j_index_end = jindex[iidx+1];
212 /* Get outer coordinate index */
214 i_coord_offset = DIM*inr;
216 /* Load i particle coords and add shift vector */
217 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
218 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
220 fix0 = _mm_setzero_ps();
221 fiy0 = _mm_setzero_ps();
222 fiz0 = _mm_setzero_ps();
223 fix1 = _mm_setzero_ps();
224 fiy1 = _mm_setzero_ps();
225 fiz1 = _mm_setzero_ps();
226 fix2 = _mm_setzero_ps();
227 fiy2 = _mm_setzero_ps();
228 fiz2 = _mm_setzero_ps();
229 fix3 = _mm_setzero_ps();
230 fiy3 = _mm_setzero_ps();
231 fiz3 = _mm_setzero_ps();
233 /* Reset potential sums */
234 velecsum = _mm_setzero_ps();
235 vvdwsum = _mm_setzero_ps();
237 /* Start inner kernel loop */
238 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
241 /* Get j neighbor index, and coordinate index */
246 j_coord_offsetA = DIM*jnrA;
247 j_coord_offsetB = DIM*jnrB;
248 j_coord_offsetC = DIM*jnrC;
249 j_coord_offsetD = DIM*jnrD;
251 /* load j atom coordinates */
252 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
253 x+j_coord_offsetC,x+j_coord_offsetD,
254 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
255 &jy2,&jz2,&jx3,&jy3,&jz3);
257 /* Calculate displacement vector */
258 dx00 = _mm_sub_ps(ix0,jx0);
259 dy00 = _mm_sub_ps(iy0,jy0);
260 dz00 = _mm_sub_ps(iz0,jz0);
261 dx11 = _mm_sub_ps(ix1,jx1);
262 dy11 = _mm_sub_ps(iy1,jy1);
263 dz11 = _mm_sub_ps(iz1,jz1);
264 dx12 = _mm_sub_ps(ix1,jx2);
265 dy12 = _mm_sub_ps(iy1,jy2);
266 dz12 = _mm_sub_ps(iz1,jz2);
267 dx13 = _mm_sub_ps(ix1,jx3);
268 dy13 = _mm_sub_ps(iy1,jy3);
269 dz13 = _mm_sub_ps(iz1,jz3);
270 dx21 = _mm_sub_ps(ix2,jx1);
271 dy21 = _mm_sub_ps(iy2,jy1);
272 dz21 = _mm_sub_ps(iz2,jz1);
273 dx22 = _mm_sub_ps(ix2,jx2);
274 dy22 = _mm_sub_ps(iy2,jy2);
275 dz22 = _mm_sub_ps(iz2,jz2);
276 dx23 = _mm_sub_ps(ix2,jx3);
277 dy23 = _mm_sub_ps(iy2,jy3);
278 dz23 = _mm_sub_ps(iz2,jz3);
279 dx31 = _mm_sub_ps(ix3,jx1);
280 dy31 = _mm_sub_ps(iy3,jy1);
281 dz31 = _mm_sub_ps(iz3,jz1);
282 dx32 = _mm_sub_ps(ix3,jx2);
283 dy32 = _mm_sub_ps(iy3,jy2);
284 dz32 = _mm_sub_ps(iz3,jz2);
285 dx33 = _mm_sub_ps(ix3,jx3);
286 dy33 = _mm_sub_ps(iy3,jy3);
287 dz33 = _mm_sub_ps(iz3,jz3);
289 /* Calculate squared distance and things based on it */
290 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
291 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
292 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
293 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
294 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
295 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
296 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
297 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
298 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
299 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
301 rinv00 = sse41_invsqrt_f(rsq00);
302 rinv11 = sse41_invsqrt_f(rsq11);
303 rinv12 = sse41_invsqrt_f(rsq12);
304 rinv13 = sse41_invsqrt_f(rsq13);
305 rinv21 = sse41_invsqrt_f(rsq21);
306 rinv22 = sse41_invsqrt_f(rsq22);
307 rinv23 = sse41_invsqrt_f(rsq23);
308 rinv31 = sse41_invsqrt_f(rsq31);
309 rinv32 = sse41_invsqrt_f(rsq32);
310 rinv33 = sse41_invsqrt_f(rsq33);
312 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
313 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
314 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
315 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
316 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
317 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
318 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
319 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
320 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
321 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
323 fjx0 = _mm_setzero_ps();
324 fjy0 = _mm_setzero_ps();
325 fjz0 = _mm_setzero_ps();
326 fjx1 = _mm_setzero_ps();
327 fjy1 = _mm_setzero_ps();
328 fjz1 = _mm_setzero_ps();
329 fjx2 = _mm_setzero_ps();
330 fjy2 = _mm_setzero_ps();
331 fjz2 = _mm_setzero_ps();
332 fjx3 = _mm_setzero_ps();
333 fjy3 = _mm_setzero_ps();
334 fjz3 = _mm_setzero_ps();
336 /**************************
337 * CALCULATE INTERACTIONS *
338 **************************/
340 if (gmx_mm_any_lt(rsq00,rcutoff2))
343 r00 = _mm_mul_ps(rsq00,rinv00);
345 /* LENNARD-JONES DISPERSION/REPULSION */
347 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
348 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
349 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
350 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
351 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
353 d = _mm_sub_ps(r00,rswitch);
354 d = _mm_max_ps(d,_mm_setzero_ps());
355 d2 = _mm_mul_ps(d,d);
356 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
358 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
360 /* Evaluate switch function */
361 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
362 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
363 vvdw = _mm_mul_ps(vvdw,sw);
364 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
366 /* Update potential sum for this i atom from the interaction with this j atom. */
367 vvdw = _mm_and_ps(vvdw,cutoff_mask);
368 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
372 fscal = _mm_and_ps(fscal,cutoff_mask);
374 /* Calculate temporary vectorial force */
375 tx = _mm_mul_ps(fscal,dx00);
376 ty = _mm_mul_ps(fscal,dy00);
377 tz = _mm_mul_ps(fscal,dz00);
379 /* Update vectorial force */
380 fix0 = _mm_add_ps(fix0,tx);
381 fiy0 = _mm_add_ps(fiy0,ty);
382 fiz0 = _mm_add_ps(fiz0,tz);
384 fjx0 = _mm_add_ps(fjx0,tx);
385 fjy0 = _mm_add_ps(fjy0,ty);
386 fjz0 = _mm_add_ps(fjz0,tz);
390 /**************************
391 * CALCULATE INTERACTIONS *
392 **************************/
394 if (gmx_mm_any_lt(rsq11,rcutoff2))
397 r11 = _mm_mul_ps(rsq11,rinv11);
399 /* EWALD ELECTROSTATICS */
401 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
402 ewrt = _mm_mul_ps(r11,ewtabscale);
403 ewitab = _mm_cvttps_epi32(ewrt);
404 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
405 ewitab = _mm_slli_epi32(ewitab,2);
406 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
407 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
408 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
409 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
410 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
411 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
412 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
413 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
414 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
416 d = _mm_sub_ps(r11,rswitch);
417 d = _mm_max_ps(d,_mm_setzero_ps());
418 d2 = _mm_mul_ps(d,d);
419 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
421 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
423 /* Evaluate switch function */
424 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
425 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
426 velec = _mm_mul_ps(velec,sw);
427 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
429 /* Update potential sum for this i atom from the interaction with this j atom. */
430 velec = _mm_and_ps(velec,cutoff_mask);
431 velecsum = _mm_add_ps(velecsum,velec);
435 fscal = _mm_and_ps(fscal,cutoff_mask);
437 /* Calculate temporary vectorial force */
438 tx = _mm_mul_ps(fscal,dx11);
439 ty = _mm_mul_ps(fscal,dy11);
440 tz = _mm_mul_ps(fscal,dz11);
442 /* Update vectorial force */
443 fix1 = _mm_add_ps(fix1,tx);
444 fiy1 = _mm_add_ps(fiy1,ty);
445 fiz1 = _mm_add_ps(fiz1,tz);
447 fjx1 = _mm_add_ps(fjx1,tx);
448 fjy1 = _mm_add_ps(fjy1,ty);
449 fjz1 = _mm_add_ps(fjz1,tz);
453 /**************************
454 * CALCULATE INTERACTIONS *
455 **************************/
457 if (gmx_mm_any_lt(rsq12,rcutoff2))
460 r12 = _mm_mul_ps(rsq12,rinv12);
462 /* EWALD ELECTROSTATICS */
464 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
465 ewrt = _mm_mul_ps(r12,ewtabscale);
466 ewitab = _mm_cvttps_epi32(ewrt);
467 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
468 ewitab = _mm_slli_epi32(ewitab,2);
469 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
470 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
471 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
472 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
473 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
474 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
475 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
476 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
477 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
479 d = _mm_sub_ps(r12,rswitch);
480 d = _mm_max_ps(d,_mm_setzero_ps());
481 d2 = _mm_mul_ps(d,d);
482 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
484 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
486 /* Evaluate switch function */
487 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
488 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
489 velec = _mm_mul_ps(velec,sw);
490 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
492 /* Update potential sum for this i atom from the interaction with this j atom. */
493 velec = _mm_and_ps(velec,cutoff_mask);
494 velecsum = _mm_add_ps(velecsum,velec);
498 fscal = _mm_and_ps(fscal,cutoff_mask);
500 /* Calculate temporary vectorial force */
501 tx = _mm_mul_ps(fscal,dx12);
502 ty = _mm_mul_ps(fscal,dy12);
503 tz = _mm_mul_ps(fscal,dz12);
505 /* Update vectorial force */
506 fix1 = _mm_add_ps(fix1,tx);
507 fiy1 = _mm_add_ps(fiy1,ty);
508 fiz1 = _mm_add_ps(fiz1,tz);
510 fjx2 = _mm_add_ps(fjx2,tx);
511 fjy2 = _mm_add_ps(fjy2,ty);
512 fjz2 = _mm_add_ps(fjz2,tz);
516 /**************************
517 * CALCULATE INTERACTIONS *
518 **************************/
520 if (gmx_mm_any_lt(rsq13,rcutoff2))
523 r13 = _mm_mul_ps(rsq13,rinv13);
525 /* EWALD ELECTROSTATICS */
527 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
528 ewrt = _mm_mul_ps(r13,ewtabscale);
529 ewitab = _mm_cvttps_epi32(ewrt);
530 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
531 ewitab = _mm_slli_epi32(ewitab,2);
532 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
533 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
534 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
535 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
536 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
537 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
538 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
539 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
540 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
542 d = _mm_sub_ps(r13,rswitch);
543 d = _mm_max_ps(d,_mm_setzero_ps());
544 d2 = _mm_mul_ps(d,d);
545 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
547 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
549 /* Evaluate switch function */
550 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
551 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
552 velec = _mm_mul_ps(velec,sw);
553 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
555 /* Update potential sum for this i atom from the interaction with this j atom. */
556 velec = _mm_and_ps(velec,cutoff_mask);
557 velecsum = _mm_add_ps(velecsum,velec);
561 fscal = _mm_and_ps(fscal,cutoff_mask);
563 /* Calculate temporary vectorial force */
564 tx = _mm_mul_ps(fscal,dx13);
565 ty = _mm_mul_ps(fscal,dy13);
566 tz = _mm_mul_ps(fscal,dz13);
568 /* Update vectorial force */
569 fix1 = _mm_add_ps(fix1,tx);
570 fiy1 = _mm_add_ps(fiy1,ty);
571 fiz1 = _mm_add_ps(fiz1,tz);
573 fjx3 = _mm_add_ps(fjx3,tx);
574 fjy3 = _mm_add_ps(fjy3,ty);
575 fjz3 = _mm_add_ps(fjz3,tz);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 if (gmx_mm_any_lt(rsq21,rcutoff2))
586 r21 = _mm_mul_ps(rsq21,rinv21);
588 /* EWALD ELECTROSTATICS */
590 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
591 ewrt = _mm_mul_ps(r21,ewtabscale);
592 ewitab = _mm_cvttps_epi32(ewrt);
593 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
594 ewitab = _mm_slli_epi32(ewitab,2);
595 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
596 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
597 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
598 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
599 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
600 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
601 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
602 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
603 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
605 d = _mm_sub_ps(r21,rswitch);
606 d = _mm_max_ps(d,_mm_setzero_ps());
607 d2 = _mm_mul_ps(d,d);
608 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
610 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
612 /* Evaluate switch function */
613 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
614 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
615 velec = _mm_mul_ps(velec,sw);
616 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
618 /* Update potential sum for this i atom from the interaction with this j atom. */
619 velec = _mm_and_ps(velec,cutoff_mask);
620 velecsum = _mm_add_ps(velecsum,velec);
624 fscal = _mm_and_ps(fscal,cutoff_mask);
626 /* Calculate temporary vectorial force */
627 tx = _mm_mul_ps(fscal,dx21);
628 ty = _mm_mul_ps(fscal,dy21);
629 tz = _mm_mul_ps(fscal,dz21);
631 /* Update vectorial force */
632 fix2 = _mm_add_ps(fix2,tx);
633 fiy2 = _mm_add_ps(fiy2,ty);
634 fiz2 = _mm_add_ps(fiz2,tz);
636 fjx1 = _mm_add_ps(fjx1,tx);
637 fjy1 = _mm_add_ps(fjy1,ty);
638 fjz1 = _mm_add_ps(fjz1,tz);
642 /**************************
643 * CALCULATE INTERACTIONS *
644 **************************/
646 if (gmx_mm_any_lt(rsq22,rcutoff2))
649 r22 = _mm_mul_ps(rsq22,rinv22);
651 /* EWALD ELECTROSTATICS */
653 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
654 ewrt = _mm_mul_ps(r22,ewtabscale);
655 ewitab = _mm_cvttps_epi32(ewrt);
656 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
657 ewitab = _mm_slli_epi32(ewitab,2);
658 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
659 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
660 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
661 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
662 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
663 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
664 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
665 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
666 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
668 d = _mm_sub_ps(r22,rswitch);
669 d = _mm_max_ps(d,_mm_setzero_ps());
670 d2 = _mm_mul_ps(d,d);
671 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
673 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
675 /* Evaluate switch function */
676 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
677 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
678 velec = _mm_mul_ps(velec,sw);
679 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
681 /* Update potential sum for this i atom from the interaction with this j atom. */
682 velec = _mm_and_ps(velec,cutoff_mask);
683 velecsum = _mm_add_ps(velecsum,velec);
687 fscal = _mm_and_ps(fscal,cutoff_mask);
689 /* Calculate temporary vectorial force */
690 tx = _mm_mul_ps(fscal,dx22);
691 ty = _mm_mul_ps(fscal,dy22);
692 tz = _mm_mul_ps(fscal,dz22);
694 /* Update vectorial force */
695 fix2 = _mm_add_ps(fix2,tx);
696 fiy2 = _mm_add_ps(fiy2,ty);
697 fiz2 = _mm_add_ps(fiz2,tz);
699 fjx2 = _mm_add_ps(fjx2,tx);
700 fjy2 = _mm_add_ps(fjy2,ty);
701 fjz2 = _mm_add_ps(fjz2,tz);
705 /**************************
706 * CALCULATE INTERACTIONS *
707 **************************/
709 if (gmx_mm_any_lt(rsq23,rcutoff2))
712 r23 = _mm_mul_ps(rsq23,rinv23);
714 /* EWALD ELECTROSTATICS */
716 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
717 ewrt = _mm_mul_ps(r23,ewtabscale);
718 ewitab = _mm_cvttps_epi32(ewrt);
719 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
720 ewitab = _mm_slli_epi32(ewitab,2);
721 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
722 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
723 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
724 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
725 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
726 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
727 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
728 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
729 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
731 d = _mm_sub_ps(r23,rswitch);
732 d = _mm_max_ps(d,_mm_setzero_ps());
733 d2 = _mm_mul_ps(d,d);
734 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
736 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
738 /* Evaluate switch function */
739 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
740 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
741 velec = _mm_mul_ps(velec,sw);
742 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
744 /* Update potential sum for this i atom from the interaction with this j atom. */
745 velec = _mm_and_ps(velec,cutoff_mask);
746 velecsum = _mm_add_ps(velecsum,velec);
750 fscal = _mm_and_ps(fscal,cutoff_mask);
752 /* Calculate temporary vectorial force */
753 tx = _mm_mul_ps(fscal,dx23);
754 ty = _mm_mul_ps(fscal,dy23);
755 tz = _mm_mul_ps(fscal,dz23);
757 /* Update vectorial force */
758 fix2 = _mm_add_ps(fix2,tx);
759 fiy2 = _mm_add_ps(fiy2,ty);
760 fiz2 = _mm_add_ps(fiz2,tz);
762 fjx3 = _mm_add_ps(fjx3,tx);
763 fjy3 = _mm_add_ps(fjy3,ty);
764 fjz3 = _mm_add_ps(fjz3,tz);
768 /**************************
769 * CALCULATE INTERACTIONS *
770 **************************/
772 if (gmx_mm_any_lt(rsq31,rcutoff2))
775 r31 = _mm_mul_ps(rsq31,rinv31);
777 /* EWALD ELECTROSTATICS */
779 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
780 ewrt = _mm_mul_ps(r31,ewtabscale);
781 ewitab = _mm_cvttps_epi32(ewrt);
782 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
783 ewitab = _mm_slli_epi32(ewitab,2);
784 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
785 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
786 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
787 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
788 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
789 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
790 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
791 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
792 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
794 d = _mm_sub_ps(r31,rswitch);
795 d = _mm_max_ps(d,_mm_setzero_ps());
796 d2 = _mm_mul_ps(d,d);
797 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
799 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
801 /* Evaluate switch function */
802 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
803 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
804 velec = _mm_mul_ps(velec,sw);
805 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
807 /* Update potential sum for this i atom from the interaction with this j atom. */
808 velec = _mm_and_ps(velec,cutoff_mask);
809 velecsum = _mm_add_ps(velecsum,velec);
813 fscal = _mm_and_ps(fscal,cutoff_mask);
815 /* Calculate temporary vectorial force */
816 tx = _mm_mul_ps(fscal,dx31);
817 ty = _mm_mul_ps(fscal,dy31);
818 tz = _mm_mul_ps(fscal,dz31);
820 /* Update vectorial force */
821 fix3 = _mm_add_ps(fix3,tx);
822 fiy3 = _mm_add_ps(fiy3,ty);
823 fiz3 = _mm_add_ps(fiz3,tz);
825 fjx1 = _mm_add_ps(fjx1,tx);
826 fjy1 = _mm_add_ps(fjy1,ty);
827 fjz1 = _mm_add_ps(fjz1,tz);
831 /**************************
832 * CALCULATE INTERACTIONS *
833 **************************/
835 if (gmx_mm_any_lt(rsq32,rcutoff2))
838 r32 = _mm_mul_ps(rsq32,rinv32);
840 /* EWALD ELECTROSTATICS */
842 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
843 ewrt = _mm_mul_ps(r32,ewtabscale);
844 ewitab = _mm_cvttps_epi32(ewrt);
845 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
846 ewitab = _mm_slli_epi32(ewitab,2);
847 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
848 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
849 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
850 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
851 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
852 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
853 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
854 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
855 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
857 d = _mm_sub_ps(r32,rswitch);
858 d = _mm_max_ps(d,_mm_setzero_ps());
859 d2 = _mm_mul_ps(d,d);
860 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
862 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
864 /* Evaluate switch function */
865 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
866 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
867 velec = _mm_mul_ps(velec,sw);
868 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec = _mm_and_ps(velec,cutoff_mask);
872 velecsum = _mm_add_ps(velecsum,velec);
876 fscal = _mm_and_ps(fscal,cutoff_mask);
878 /* Calculate temporary vectorial force */
879 tx = _mm_mul_ps(fscal,dx32);
880 ty = _mm_mul_ps(fscal,dy32);
881 tz = _mm_mul_ps(fscal,dz32);
883 /* Update vectorial force */
884 fix3 = _mm_add_ps(fix3,tx);
885 fiy3 = _mm_add_ps(fiy3,ty);
886 fiz3 = _mm_add_ps(fiz3,tz);
888 fjx2 = _mm_add_ps(fjx2,tx);
889 fjy2 = _mm_add_ps(fjy2,ty);
890 fjz2 = _mm_add_ps(fjz2,tz);
894 /**************************
895 * CALCULATE INTERACTIONS *
896 **************************/
898 if (gmx_mm_any_lt(rsq33,rcutoff2))
901 r33 = _mm_mul_ps(rsq33,rinv33);
903 /* EWALD ELECTROSTATICS */
905 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
906 ewrt = _mm_mul_ps(r33,ewtabscale);
907 ewitab = _mm_cvttps_epi32(ewrt);
908 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
909 ewitab = _mm_slli_epi32(ewitab,2);
910 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
911 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
912 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
913 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
914 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
915 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
916 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
917 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
918 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
920 d = _mm_sub_ps(r33,rswitch);
921 d = _mm_max_ps(d,_mm_setzero_ps());
922 d2 = _mm_mul_ps(d,d);
923 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
925 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
927 /* Evaluate switch function */
928 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
929 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
930 velec = _mm_mul_ps(velec,sw);
931 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
933 /* Update potential sum for this i atom from the interaction with this j atom. */
934 velec = _mm_and_ps(velec,cutoff_mask);
935 velecsum = _mm_add_ps(velecsum,velec);
939 fscal = _mm_and_ps(fscal,cutoff_mask);
941 /* Calculate temporary vectorial force */
942 tx = _mm_mul_ps(fscal,dx33);
943 ty = _mm_mul_ps(fscal,dy33);
944 tz = _mm_mul_ps(fscal,dz33);
946 /* Update vectorial force */
947 fix3 = _mm_add_ps(fix3,tx);
948 fiy3 = _mm_add_ps(fiy3,ty);
949 fiz3 = _mm_add_ps(fiz3,tz);
951 fjx3 = _mm_add_ps(fjx3,tx);
952 fjy3 = _mm_add_ps(fjy3,ty);
953 fjz3 = _mm_add_ps(fjz3,tz);
957 fjptrA = f+j_coord_offsetA;
958 fjptrB = f+j_coord_offsetB;
959 fjptrC = f+j_coord_offsetC;
960 fjptrD = f+j_coord_offsetD;
962 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
963 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
964 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
966 /* Inner loop uses 647 flops */
972 /* Get j neighbor index, and coordinate index */
973 jnrlistA = jjnr[jidx];
974 jnrlistB = jjnr[jidx+1];
975 jnrlistC = jjnr[jidx+2];
976 jnrlistD = jjnr[jidx+3];
977 /* Sign of each element will be negative for non-real atoms.
978 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
979 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
981 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
982 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
983 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
984 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
985 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
986 j_coord_offsetA = DIM*jnrA;
987 j_coord_offsetB = DIM*jnrB;
988 j_coord_offsetC = DIM*jnrC;
989 j_coord_offsetD = DIM*jnrD;
991 /* load j atom coordinates */
992 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
993 x+j_coord_offsetC,x+j_coord_offsetD,
994 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
995 &jy2,&jz2,&jx3,&jy3,&jz3);
997 /* Calculate displacement vector */
998 dx00 = _mm_sub_ps(ix0,jx0);
999 dy00 = _mm_sub_ps(iy0,jy0);
1000 dz00 = _mm_sub_ps(iz0,jz0);
1001 dx11 = _mm_sub_ps(ix1,jx1);
1002 dy11 = _mm_sub_ps(iy1,jy1);
1003 dz11 = _mm_sub_ps(iz1,jz1);
1004 dx12 = _mm_sub_ps(ix1,jx2);
1005 dy12 = _mm_sub_ps(iy1,jy2);
1006 dz12 = _mm_sub_ps(iz1,jz2);
1007 dx13 = _mm_sub_ps(ix1,jx3);
1008 dy13 = _mm_sub_ps(iy1,jy3);
1009 dz13 = _mm_sub_ps(iz1,jz3);
1010 dx21 = _mm_sub_ps(ix2,jx1);
1011 dy21 = _mm_sub_ps(iy2,jy1);
1012 dz21 = _mm_sub_ps(iz2,jz1);
1013 dx22 = _mm_sub_ps(ix2,jx2);
1014 dy22 = _mm_sub_ps(iy2,jy2);
1015 dz22 = _mm_sub_ps(iz2,jz2);
1016 dx23 = _mm_sub_ps(ix2,jx3);
1017 dy23 = _mm_sub_ps(iy2,jy3);
1018 dz23 = _mm_sub_ps(iz2,jz3);
1019 dx31 = _mm_sub_ps(ix3,jx1);
1020 dy31 = _mm_sub_ps(iy3,jy1);
1021 dz31 = _mm_sub_ps(iz3,jz1);
1022 dx32 = _mm_sub_ps(ix3,jx2);
1023 dy32 = _mm_sub_ps(iy3,jy2);
1024 dz32 = _mm_sub_ps(iz3,jz2);
1025 dx33 = _mm_sub_ps(ix3,jx3);
1026 dy33 = _mm_sub_ps(iy3,jy3);
1027 dz33 = _mm_sub_ps(iz3,jz3);
1029 /* Calculate squared distance and things based on it */
1030 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1031 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1032 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1033 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1034 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1035 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1036 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1037 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1038 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1039 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1041 rinv00 = sse41_invsqrt_f(rsq00);
1042 rinv11 = sse41_invsqrt_f(rsq11);
1043 rinv12 = sse41_invsqrt_f(rsq12);
1044 rinv13 = sse41_invsqrt_f(rsq13);
1045 rinv21 = sse41_invsqrt_f(rsq21);
1046 rinv22 = sse41_invsqrt_f(rsq22);
1047 rinv23 = sse41_invsqrt_f(rsq23);
1048 rinv31 = sse41_invsqrt_f(rsq31);
1049 rinv32 = sse41_invsqrt_f(rsq32);
1050 rinv33 = sse41_invsqrt_f(rsq33);
1052 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1053 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1054 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1055 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1056 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1057 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1058 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1059 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1060 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1061 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1063 fjx0 = _mm_setzero_ps();
1064 fjy0 = _mm_setzero_ps();
1065 fjz0 = _mm_setzero_ps();
1066 fjx1 = _mm_setzero_ps();
1067 fjy1 = _mm_setzero_ps();
1068 fjz1 = _mm_setzero_ps();
1069 fjx2 = _mm_setzero_ps();
1070 fjy2 = _mm_setzero_ps();
1071 fjz2 = _mm_setzero_ps();
1072 fjx3 = _mm_setzero_ps();
1073 fjy3 = _mm_setzero_ps();
1074 fjz3 = _mm_setzero_ps();
1076 /**************************
1077 * CALCULATE INTERACTIONS *
1078 **************************/
1080 if (gmx_mm_any_lt(rsq00,rcutoff2))
1083 r00 = _mm_mul_ps(rsq00,rinv00);
1084 r00 = _mm_andnot_ps(dummy_mask,r00);
1086 /* LENNARD-JONES DISPERSION/REPULSION */
1088 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1089 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1090 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1091 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1092 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1094 d = _mm_sub_ps(r00,rswitch);
1095 d = _mm_max_ps(d,_mm_setzero_ps());
1096 d2 = _mm_mul_ps(d,d);
1097 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1099 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1101 /* Evaluate switch function */
1102 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1103 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1104 vvdw = _mm_mul_ps(vvdw,sw);
1105 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1107 /* Update potential sum for this i atom from the interaction with this j atom. */
1108 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1109 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1110 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1114 fscal = _mm_and_ps(fscal,cutoff_mask);
1116 fscal = _mm_andnot_ps(dummy_mask,fscal);
1118 /* Calculate temporary vectorial force */
1119 tx = _mm_mul_ps(fscal,dx00);
1120 ty = _mm_mul_ps(fscal,dy00);
1121 tz = _mm_mul_ps(fscal,dz00);
1123 /* Update vectorial force */
1124 fix0 = _mm_add_ps(fix0,tx);
1125 fiy0 = _mm_add_ps(fiy0,ty);
1126 fiz0 = _mm_add_ps(fiz0,tz);
1128 fjx0 = _mm_add_ps(fjx0,tx);
1129 fjy0 = _mm_add_ps(fjy0,ty);
1130 fjz0 = _mm_add_ps(fjz0,tz);
1134 /**************************
1135 * CALCULATE INTERACTIONS *
1136 **************************/
1138 if (gmx_mm_any_lt(rsq11,rcutoff2))
1141 r11 = _mm_mul_ps(rsq11,rinv11);
1142 r11 = _mm_andnot_ps(dummy_mask,r11);
1144 /* EWALD ELECTROSTATICS */
1146 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1147 ewrt = _mm_mul_ps(r11,ewtabscale);
1148 ewitab = _mm_cvttps_epi32(ewrt);
1149 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1150 ewitab = _mm_slli_epi32(ewitab,2);
1151 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1152 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1153 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1154 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1155 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1156 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1157 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1158 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1159 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1161 d = _mm_sub_ps(r11,rswitch);
1162 d = _mm_max_ps(d,_mm_setzero_ps());
1163 d2 = _mm_mul_ps(d,d);
1164 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1166 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1168 /* Evaluate switch function */
1169 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1170 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1171 velec = _mm_mul_ps(velec,sw);
1172 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1174 /* Update potential sum for this i atom from the interaction with this j atom. */
1175 velec = _mm_and_ps(velec,cutoff_mask);
1176 velec = _mm_andnot_ps(dummy_mask,velec);
1177 velecsum = _mm_add_ps(velecsum,velec);
1181 fscal = _mm_and_ps(fscal,cutoff_mask);
1183 fscal = _mm_andnot_ps(dummy_mask,fscal);
1185 /* Calculate temporary vectorial force */
1186 tx = _mm_mul_ps(fscal,dx11);
1187 ty = _mm_mul_ps(fscal,dy11);
1188 tz = _mm_mul_ps(fscal,dz11);
1190 /* Update vectorial force */
1191 fix1 = _mm_add_ps(fix1,tx);
1192 fiy1 = _mm_add_ps(fiy1,ty);
1193 fiz1 = _mm_add_ps(fiz1,tz);
1195 fjx1 = _mm_add_ps(fjx1,tx);
1196 fjy1 = _mm_add_ps(fjy1,ty);
1197 fjz1 = _mm_add_ps(fjz1,tz);
1201 /**************************
1202 * CALCULATE INTERACTIONS *
1203 **************************/
1205 if (gmx_mm_any_lt(rsq12,rcutoff2))
1208 r12 = _mm_mul_ps(rsq12,rinv12);
1209 r12 = _mm_andnot_ps(dummy_mask,r12);
1211 /* EWALD ELECTROSTATICS */
1213 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1214 ewrt = _mm_mul_ps(r12,ewtabscale);
1215 ewitab = _mm_cvttps_epi32(ewrt);
1216 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1217 ewitab = _mm_slli_epi32(ewitab,2);
1218 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1219 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1220 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1221 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1222 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1223 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1224 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1225 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1226 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1228 d = _mm_sub_ps(r12,rswitch);
1229 d = _mm_max_ps(d,_mm_setzero_ps());
1230 d2 = _mm_mul_ps(d,d);
1231 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1233 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1235 /* Evaluate switch function */
1236 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1237 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1238 velec = _mm_mul_ps(velec,sw);
1239 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1241 /* Update potential sum for this i atom from the interaction with this j atom. */
1242 velec = _mm_and_ps(velec,cutoff_mask);
1243 velec = _mm_andnot_ps(dummy_mask,velec);
1244 velecsum = _mm_add_ps(velecsum,velec);
1248 fscal = _mm_and_ps(fscal,cutoff_mask);
1250 fscal = _mm_andnot_ps(dummy_mask,fscal);
1252 /* Calculate temporary vectorial force */
1253 tx = _mm_mul_ps(fscal,dx12);
1254 ty = _mm_mul_ps(fscal,dy12);
1255 tz = _mm_mul_ps(fscal,dz12);
1257 /* Update vectorial force */
1258 fix1 = _mm_add_ps(fix1,tx);
1259 fiy1 = _mm_add_ps(fiy1,ty);
1260 fiz1 = _mm_add_ps(fiz1,tz);
1262 fjx2 = _mm_add_ps(fjx2,tx);
1263 fjy2 = _mm_add_ps(fjy2,ty);
1264 fjz2 = _mm_add_ps(fjz2,tz);
1268 /**************************
1269 * CALCULATE INTERACTIONS *
1270 **************************/
1272 if (gmx_mm_any_lt(rsq13,rcutoff2))
1275 r13 = _mm_mul_ps(rsq13,rinv13);
1276 r13 = _mm_andnot_ps(dummy_mask,r13);
1278 /* EWALD ELECTROSTATICS */
1280 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1281 ewrt = _mm_mul_ps(r13,ewtabscale);
1282 ewitab = _mm_cvttps_epi32(ewrt);
1283 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1284 ewitab = _mm_slli_epi32(ewitab,2);
1285 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1286 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1287 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1288 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1289 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1290 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1291 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1292 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1293 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1295 d = _mm_sub_ps(r13,rswitch);
1296 d = _mm_max_ps(d,_mm_setzero_ps());
1297 d2 = _mm_mul_ps(d,d);
1298 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1300 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1302 /* Evaluate switch function */
1303 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1304 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1305 velec = _mm_mul_ps(velec,sw);
1306 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1308 /* Update potential sum for this i atom from the interaction with this j atom. */
1309 velec = _mm_and_ps(velec,cutoff_mask);
1310 velec = _mm_andnot_ps(dummy_mask,velec);
1311 velecsum = _mm_add_ps(velecsum,velec);
1315 fscal = _mm_and_ps(fscal,cutoff_mask);
1317 fscal = _mm_andnot_ps(dummy_mask,fscal);
1319 /* Calculate temporary vectorial force */
1320 tx = _mm_mul_ps(fscal,dx13);
1321 ty = _mm_mul_ps(fscal,dy13);
1322 tz = _mm_mul_ps(fscal,dz13);
1324 /* Update vectorial force */
1325 fix1 = _mm_add_ps(fix1,tx);
1326 fiy1 = _mm_add_ps(fiy1,ty);
1327 fiz1 = _mm_add_ps(fiz1,tz);
1329 fjx3 = _mm_add_ps(fjx3,tx);
1330 fjy3 = _mm_add_ps(fjy3,ty);
1331 fjz3 = _mm_add_ps(fjz3,tz);
1335 /**************************
1336 * CALCULATE INTERACTIONS *
1337 **************************/
1339 if (gmx_mm_any_lt(rsq21,rcutoff2))
1342 r21 = _mm_mul_ps(rsq21,rinv21);
1343 r21 = _mm_andnot_ps(dummy_mask,r21);
1345 /* EWALD ELECTROSTATICS */
1347 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1348 ewrt = _mm_mul_ps(r21,ewtabscale);
1349 ewitab = _mm_cvttps_epi32(ewrt);
1350 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1351 ewitab = _mm_slli_epi32(ewitab,2);
1352 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1353 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1354 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1355 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1356 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1357 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1358 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1359 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1360 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1362 d = _mm_sub_ps(r21,rswitch);
1363 d = _mm_max_ps(d,_mm_setzero_ps());
1364 d2 = _mm_mul_ps(d,d);
1365 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1367 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1369 /* Evaluate switch function */
1370 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1371 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1372 velec = _mm_mul_ps(velec,sw);
1373 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1375 /* Update potential sum for this i atom from the interaction with this j atom. */
1376 velec = _mm_and_ps(velec,cutoff_mask);
1377 velec = _mm_andnot_ps(dummy_mask,velec);
1378 velecsum = _mm_add_ps(velecsum,velec);
1382 fscal = _mm_and_ps(fscal,cutoff_mask);
1384 fscal = _mm_andnot_ps(dummy_mask,fscal);
1386 /* Calculate temporary vectorial force */
1387 tx = _mm_mul_ps(fscal,dx21);
1388 ty = _mm_mul_ps(fscal,dy21);
1389 tz = _mm_mul_ps(fscal,dz21);
1391 /* Update vectorial force */
1392 fix2 = _mm_add_ps(fix2,tx);
1393 fiy2 = _mm_add_ps(fiy2,ty);
1394 fiz2 = _mm_add_ps(fiz2,tz);
1396 fjx1 = _mm_add_ps(fjx1,tx);
1397 fjy1 = _mm_add_ps(fjy1,ty);
1398 fjz1 = _mm_add_ps(fjz1,tz);
1402 /**************************
1403 * CALCULATE INTERACTIONS *
1404 **************************/
1406 if (gmx_mm_any_lt(rsq22,rcutoff2))
1409 r22 = _mm_mul_ps(rsq22,rinv22);
1410 r22 = _mm_andnot_ps(dummy_mask,r22);
1412 /* EWALD ELECTROSTATICS */
1414 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1415 ewrt = _mm_mul_ps(r22,ewtabscale);
1416 ewitab = _mm_cvttps_epi32(ewrt);
1417 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1418 ewitab = _mm_slli_epi32(ewitab,2);
1419 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1420 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1421 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1422 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1423 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1424 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1425 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1426 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1427 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1429 d = _mm_sub_ps(r22,rswitch);
1430 d = _mm_max_ps(d,_mm_setzero_ps());
1431 d2 = _mm_mul_ps(d,d);
1432 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1434 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1436 /* Evaluate switch function */
1437 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1438 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1439 velec = _mm_mul_ps(velec,sw);
1440 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1442 /* Update potential sum for this i atom from the interaction with this j atom. */
1443 velec = _mm_and_ps(velec,cutoff_mask);
1444 velec = _mm_andnot_ps(dummy_mask,velec);
1445 velecsum = _mm_add_ps(velecsum,velec);
1449 fscal = _mm_and_ps(fscal,cutoff_mask);
1451 fscal = _mm_andnot_ps(dummy_mask,fscal);
1453 /* Calculate temporary vectorial force */
1454 tx = _mm_mul_ps(fscal,dx22);
1455 ty = _mm_mul_ps(fscal,dy22);
1456 tz = _mm_mul_ps(fscal,dz22);
1458 /* Update vectorial force */
1459 fix2 = _mm_add_ps(fix2,tx);
1460 fiy2 = _mm_add_ps(fiy2,ty);
1461 fiz2 = _mm_add_ps(fiz2,tz);
1463 fjx2 = _mm_add_ps(fjx2,tx);
1464 fjy2 = _mm_add_ps(fjy2,ty);
1465 fjz2 = _mm_add_ps(fjz2,tz);
1469 /**************************
1470 * CALCULATE INTERACTIONS *
1471 **************************/
1473 if (gmx_mm_any_lt(rsq23,rcutoff2))
1476 r23 = _mm_mul_ps(rsq23,rinv23);
1477 r23 = _mm_andnot_ps(dummy_mask,r23);
1479 /* EWALD ELECTROSTATICS */
1481 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1482 ewrt = _mm_mul_ps(r23,ewtabscale);
1483 ewitab = _mm_cvttps_epi32(ewrt);
1484 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1485 ewitab = _mm_slli_epi32(ewitab,2);
1486 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1487 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1488 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1489 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1490 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1491 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1492 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1493 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1494 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1496 d = _mm_sub_ps(r23,rswitch);
1497 d = _mm_max_ps(d,_mm_setzero_ps());
1498 d2 = _mm_mul_ps(d,d);
1499 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1501 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1503 /* Evaluate switch function */
1504 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1505 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1506 velec = _mm_mul_ps(velec,sw);
1507 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1509 /* Update potential sum for this i atom from the interaction with this j atom. */
1510 velec = _mm_and_ps(velec,cutoff_mask);
1511 velec = _mm_andnot_ps(dummy_mask,velec);
1512 velecsum = _mm_add_ps(velecsum,velec);
1516 fscal = _mm_and_ps(fscal,cutoff_mask);
1518 fscal = _mm_andnot_ps(dummy_mask,fscal);
1520 /* Calculate temporary vectorial force */
1521 tx = _mm_mul_ps(fscal,dx23);
1522 ty = _mm_mul_ps(fscal,dy23);
1523 tz = _mm_mul_ps(fscal,dz23);
1525 /* Update vectorial force */
1526 fix2 = _mm_add_ps(fix2,tx);
1527 fiy2 = _mm_add_ps(fiy2,ty);
1528 fiz2 = _mm_add_ps(fiz2,tz);
1530 fjx3 = _mm_add_ps(fjx3,tx);
1531 fjy3 = _mm_add_ps(fjy3,ty);
1532 fjz3 = _mm_add_ps(fjz3,tz);
1536 /**************************
1537 * CALCULATE INTERACTIONS *
1538 **************************/
1540 if (gmx_mm_any_lt(rsq31,rcutoff2))
1543 r31 = _mm_mul_ps(rsq31,rinv31);
1544 r31 = _mm_andnot_ps(dummy_mask,r31);
1546 /* EWALD ELECTROSTATICS */
1548 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1549 ewrt = _mm_mul_ps(r31,ewtabscale);
1550 ewitab = _mm_cvttps_epi32(ewrt);
1551 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1552 ewitab = _mm_slli_epi32(ewitab,2);
1553 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1554 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1555 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1556 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1557 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1558 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1559 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1560 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1561 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1563 d = _mm_sub_ps(r31,rswitch);
1564 d = _mm_max_ps(d,_mm_setzero_ps());
1565 d2 = _mm_mul_ps(d,d);
1566 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1568 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1570 /* Evaluate switch function */
1571 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1572 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1573 velec = _mm_mul_ps(velec,sw);
1574 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1576 /* Update potential sum for this i atom from the interaction with this j atom. */
1577 velec = _mm_and_ps(velec,cutoff_mask);
1578 velec = _mm_andnot_ps(dummy_mask,velec);
1579 velecsum = _mm_add_ps(velecsum,velec);
1583 fscal = _mm_and_ps(fscal,cutoff_mask);
1585 fscal = _mm_andnot_ps(dummy_mask,fscal);
1587 /* Calculate temporary vectorial force */
1588 tx = _mm_mul_ps(fscal,dx31);
1589 ty = _mm_mul_ps(fscal,dy31);
1590 tz = _mm_mul_ps(fscal,dz31);
1592 /* Update vectorial force */
1593 fix3 = _mm_add_ps(fix3,tx);
1594 fiy3 = _mm_add_ps(fiy3,ty);
1595 fiz3 = _mm_add_ps(fiz3,tz);
1597 fjx1 = _mm_add_ps(fjx1,tx);
1598 fjy1 = _mm_add_ps(fjy1,ty);
1599 fjz1 = _mm_add_ps(fjz1,tz);
1603 /**************************
1604 * CALCULATE INTERACTIONS *
1605 **************************/
1607 if (gmx_mm_any_lt(rsq32,rcutoff2))
1610 r32 = _mm_mul_ps(rsq32,rinv32);
1611 r32 = _mm_andnot_ps(dummy_mask,r32);
1613 /* EWALD ELECTROSTATICS */
1615 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1616 ewrt = _mm_mul_ps(r32,ewtabscale);
1617 ewitab = _mm_cvttps_epi32(ewrt);
1618 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1619 ewitab = _mm_slli_epi32(ewitab,2);
1620 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1621 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1622 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1623 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1624 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1625 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1626 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1627 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1628 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1630 d = _mm_sub_ps(r32,rswitch);
1631 d = _mm_max_ps(d,_mm_setzero_ps());
1632 d2 = _mm_mul_ps(d,d);
1633 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1635 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1637 /* Evaluate switch function */
1638 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1639 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1640 velec = _mm_mul_ps(velec,sw);
1641 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1643 /* Update potential sum for this i atom from the interaction with this j atom. */
1644 velec = _mm_and_ps(velec,cutoff_mask);
1645 velec = _mm_andnot_ps(dummy_mask,velec);
1646 velecsum = _mm_add_ps(velecsum,velec);
1650 fscal = _mm_and_ps(fscal,cutoff_mask);
1652 fscal = _mm_andnot_ps(dummy_mask,fscal);
1654 /* Calculate temporary vectorial force */
1655 tx = _mm_mul_ps(fscal,dx32);
1656 ty = _mm_mul_ps(fscal,dy32);
1657 tz = _mm_mul_ps(fscal,dz32);
1659 /* Update vectorial force */
1660 fix3 = _mm_add_ps(fix3,tx);
1661 fiy3 = _mm_add_ps(fiy3,ty);
1662 fiz3 = _mm_add_ps(fiz3,tz);
1664 fjx2 = _mm_add_ps(fjx2,tx);
1665 fjy2 = _mm_add_ps(fjy2,ty);
1666 fjz2 = _mm_add_ps(fjz2,tz);
1670 /**************************
1671 * CALCULATE INTERACTIONS *
1672 **************************/
1674 if (gmx_mm_any_lt(rsq33,rcutoff2))
1677 r33 = _mm_mul_ps(rsq33,rinv33);
1678 r33 = _mm_andnot_ps(dummy_mask,r33);
1680 /* EWALD ELECTROSTATICS */
1682 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1683 ewrt = _mm_mul_ps(r33,ewtabscale);
1684 ewitab = _mm_cvttps_epi32(ewrt);
1685 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1686 ewitab = _mm_slli_epi32(ewitab,2);
1687 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1688 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1689 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1690 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1691 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1692 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1693 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1694 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1695 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1697 d = _mm_sub_ps(r33,rswitch);
1698 d = _mm_max_ps(d,_mm_setzero_ps());
1699 d2 = _mm_mul_ps(d,d);
1700 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1702 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1704 /* Evaluate switch function */
1705 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1706 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1707 velec = _mm_mul_ps(velec,sw);
1708 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1710 /* Update potential sum for this i atom from the interaction with this j atom. */
1711 velec = _mm_and_ps(velec,cutoff_mask);
1712 velec = _mm_andnot_ps(dummy_mask,velec);
1713 velecsum = _mm_add_ps(velecsum,velec);
1717 fscal = _mm_and_ps(fscal,cutoff_mask);
1719 fscal = _mm_andnot_ps(dummy_mask,fscal);
1721 /* Calculate temporary vectorial force */
1722 tx = _mm_mul_ps(fscal,dx33);
1723 ty = _mm_mul_ps(fscal,dy33);
1724 tz = _mm_mul_ps(fscal,dz33);
1726 /* Update vectorial force */
1727 fix3 = _mm_add_ps(fix3,tx);
1728 fiy3 = _mm_add_ps(fiy3,ty);
1729 fiz3 = _mm_add_ps(fiz3,tz);
1731 fjx3 = _mm_add_ps(fjx3,tx);
1732 fjy3 = _mm_add_ps(fjy3,ty);
1733 fjz3 = _mm_add_ps(fjz3,tz);
1737 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1738 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1739 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1740 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1742 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1743 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1744 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1746 /* Inner loop uses 657 flops */
1749 /* End of innermost loop */
1751 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1752 f+i_coord_offset,fshift+i_shift_offset);
1755 /* Update potential energies */
1756 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1757 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1759 /* Increment number of inner iterations */
1760 inneriter += j_index_end - j_index_start;
1762 /* Outer loop uses 26 flops */
1765 /* Increment number of outer iterations */
1768 /* Update outer/inner flops */
1770 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
1773 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single
1774 * Electrostatics interaction: Ewald
1775 * VdW interaction: LennardJones
1776 * Geometry: Water4-Water4
1777 * Calculate force/pot: Force
1780 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single
1781 (t_nblist * gmx_restrict nlist,
1782 rvec * gmx_restrict xx,
1783 rvec * gmx_restrict ff,
1784 struct t_forcerec * gmx_restrict fr,
1785 t_mdatoms * gmx_restrict mdatoms,
1786 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1787 t_nrnb * gmx_restrict nrnb)
1789 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1790 * just 0 for non-waters.
1791 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1792 * jnr indices corresponding to data put in the four positions in the SIMD register.
1794 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1795 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1796 int jnrA,jnrB,jnrC,jnrD;
1797 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1798 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1799 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1800 real rcutoff_scalar;
1801 real *shiftvec,*fshift,*x,*f;
1802 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1803 real scratch[4*DIM];
1804 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1806 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1808 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1810 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1812 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1813 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1814 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1815 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1816 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1817 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1818 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1819 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1820 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1821 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1822 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1823 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1824 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1825 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1826 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1827 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1828 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1829 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1830 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1831 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1834 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1837 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1838 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1840 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1842 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1843 real rswitch_scalar,d_scalar;
1844 __m128 dummy_mask,cutoff_mask;
1845 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1846 __m128 one = _mm_set1_ps(1.0);
1847 __m128 two = _mm_set1_ps(2.0);
1853 jindex = nlist->jindex;
1855 shiftidx = nlist->shift;
1857 shiftvec = fr->shift_vec[0];
1858 fshift = fr->fshift[0];
1859 facel = _mm_set1_ps(fr->ic->epsfac);
1860 charge = mdatoms->chargeA;
1861 nvdwtype = fr->ntype;
1862 vdwparam = fr->nbfp;
1863 vdwtype = mdatoms->typeA;
1865 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1866 ewtab = fr->ic->tabq_coul_FDV0;
1867 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1868 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1870 /* Setup water-specific parameters */
1871 inr = nlist->iinr[0];
1872 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1873 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1874 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1875 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1877 jq1 = _mm_set1_ps(charge[inr+1]);
1878 jq2 = _mm_set1_ps(charge[inr+2]);
1879 jq3 = _mm_set1_ps(charge[inr+3]);
1880 vdwjidx0A = 2*vdwtype[inr+0];
1881 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1882 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1883 qq11 = _mm_mul_ps(iq1,jq1);
1884 qq12 = _mm_mul_ps(iq1,jq2);
1885 qq13 = _mm_mul_ps(iq1,jq3);
1886 qq21 = _mm_mul_ps(iq2,jq1);
1887 qq22 = _mm_mul_ps(iq2,jq2);
1888 qq23 = _mm_mul_ps(iq2,jq3);
1889 qq31 = _mm_mul_ps(iq3,jq1);
1890 qq32 = _mm_mul_ps(iq3,jq2);
1891 qq33 = _mm_mul_ps(iq3,jq3);
1893 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1894 rcutoff_scalar = fr->ic->rcoulomb;
1895 rcutoff = _mm_set1_ps(rcutoff_scalar);
1896 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1898 rswitch_scalar = fr->ic->rcoulomb_switch;
1899 rswitch = _mm_set1_ps(rswitch_scalar);
1900 /* Setup switch parameters */
1901 d_scalar = rcutoff_scalar-rswitch_scalar;
1902 d = _mm_set1_ps(d_scalar);
1903 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1904 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1905 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1906 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1907 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1908 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1910 /* Avoid stupid compiler warnings */
1911 jnrA = jnrB = jnrC = jnrD = 0;
1912 j_coord_offsetA = 0;
1913 j_coord_offsetB = 0;
1914 j_coord_offsetC = 0;
1915 j_coord_offsetD = 0;
1920 for(iidx=0;iidx<4*DIM;iidx++)
1922 scratch[iidx] = 0.0;
1925 /* Start outer loop over neighborlists */
1926 for(iidx=0; iidx<nri; iidx++)
1928 /* Load shift vector for this list */
1929 i_shift_offset = DIM*shiftidx[iidx];
1931 /* Load limits for loop over neighbors */
1932 j_index_start = jindex[iidx];
1933 j_index_end = jindex[iidx+1];
1935 /* Get outer coordinate index */
1937 i_coord_offset = DIM*inr;
1939 /* Load i particle coords and add shift vector */
1940 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1941 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1943 fix0 = _mm_setzero_ps();
1944 fiy0 = _mm_setzero_ps();
1945 fiz0 = _mm_setzero_ps();
1946 fix1 = _mm_setzero_ps();
1947 fiy1 = _mm_setzero_ps();
1948 fiz1 = _mm_setzero_ps();
1949 fix2 = _mm_setzero_ps();
1950 fiy2 = _mm_setzero_ps();
1951 fiz2 = _mm_setzero_ps();
1952 fix3 = _mm_setzero_ps();
1953 fiy3 = _mm_setzero_ps();
1954 fiz3 = _mm_setzero_ps();
1956 /* Start inner kernel loop */
1957 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1960 /* Get j neighbor index, and coordinate index */
1962 jnrB = jjnr[jidx+1];
1963 jnrC = jjnr[jidx+2];
1964 jnrD = jjnr[jidx+3];
1965 j_coord_offsetA = DIM*jnrA;
1966 j_coord_offsetB = DIM*jnrB;
1967 j_coord_offsetC = DIM*jnrC;
1968 j_coord_offsetD = DIM*jnrD;
1970 /* load j atom coordinates */
1971 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1972 x+j_coord_offsetC,x+j_coord_offsetD,
1973 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1974 &jy2,&jz2,&jx3,&jy3,&jz3);
1976 /* Calculate displacement vector */
1977 dx00 = _mm_sub_ps(ix0,jx0);
1978 dy00 = _mm_sub_ps(iy0,jy0);
1979 dz00 = _mm_sub_ps(iz0,jz0);
1980 dx11 = _mm_sub_ps(ix1,jx1);
1981 dy11 = _mm_sub_ps(iy1,jy1);
1982 dz11 = _mm_sub_ps(iz1,jz1);
1983 dx12 = _mm_sub_ps(ix1,jx2);
1984 dy12 = _mm_sub_ps(iy1,jy2);
1985 dz12 = _mm_sub_ps(iz1,jz2);
1986 dx13 = _mm_sub_ps(ix1,jx3);
1987 dy13 = _mm_sub_ps(iy1,jy3);
1988 dz13 = _mm_sub_ps(iz1,jz3);
1989 dx21 = _mm_sub_ps(ix2,jx1);
1990 dy21 = _mm_sub_ps(iy2,jy1);
1991 dz21 = _mm_sub_ps(iz2,jz1);
1992 dx22 = _mm_sub_ps(ix2,jx2);
1993 dy22 = _mm_sub_ps(iy2,jy2);
1994 dz22 = _mm_sub_ps(iz2,jz2);
1995 dx23 = _mm_sub_ps(ix2,jx3);
1996 dy23 = _mm_sub_ps(iy2,jy3);
1997 dz23 = _mm_sub_ps(iz2,jz3);
1998 dx31 = _mm_sub_ps(ix3,jx1);
1999 dy31 = _mm_sub_ps(iy3,jy1);
2000 dz31 = _mm_sub_ps(iz3,jz1);
2001 dx32 = _mm_sub_ps(ix3,jx2);
2002 dy32 = _mm_sub_ps(iy3,jy2);
2003 dz32 = _mm_sub_ps(iz3,jz2);
2004 dx33 = _mm_sub_ps(ix3,jx3);
2005 dy33 = _mm_sub_ps(iy3,jy3);
2006 dz33 = _mm_sub_ps(iz3,jz3);
2008 /* Calculate squared distance and things based on it */
2009 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2010 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2011 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2012 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2013 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2014 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2015 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2016 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2017 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2018 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2020 rinv00 = sse41_invsqrt_f(rsq00);
2021 rinv11 = sse41_invsqrt_f(rsq11);
2022 rinv12 = sse41_invsqrt_f(rsq12);
2023 rinv13 = sse41_invsqrt_f(rsq13);
2024 rinv21 = sse41_invsqrt_f(rsq21);
2025 rinv22 = sse41_invsqrt_f(rsq22);
2026 rinv23 = sse41_invsqrt_f(rsq23);
2027 rinv31 = sse41_invsqrt_f(rsq31);
2028 rinv32 = sse41_invsqrt_f(rsq32);
2029 rinv33 = sse41_invsqrt_f(rsq33);
2031 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2032 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2033 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2034 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2035 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2036 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2037 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2038 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2039 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2040 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2042 fjx0 = _mm_setzero_ps();
2043 fjy0 = _mm_setzero_ps();
2044 fjz0 = _mm_setzero_ps();
2045 fjx1 = _mm_setzero_ps();
2046 fjy1 = _mm_setzero_ps();
2047 fjz1 = _mm_setzero_ps();
2048 fjx2 = _mm_setzero_ps();
2049 fjy2 = _mm_setzero_ps();
2050 fjz2 = _mm_setzero_ps();
2051 fjx3 = _mm_setzero_ps();
2052 fjy3 = _mm_setzero_ps();
2053 fjz3 = _mm_setzero_ps();
2055 /**************************
2056 * CALCULATE INTERACTIONS *
2057 **************************/
2059 if (gmx_mm_any_lt(rsq00,rcutoff2))
2062 r00 = _mm_mul_ps(rsq00,rinv00);
2064 /* LENNARD-JONES DISPERSION/REPULSION */
2066 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2067 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2068 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2069 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2070 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2072 d = _mm_sub_ps(r00,rswitch);
2073 d = _mm_max_ps(d,_mm_setzero_ps());
2074 d2 = _mm_mul_ps(d,d);
2075 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2077 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2079 /* Evaluate switch function */
2080 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2081 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2082 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2086 fscal = _mm_and_ps(fscal,cutoff_mask);
2088 /* Calculate temporary vectorial force */
2089 tx = _mm_mul_ps(fscal,dx00);
2090 ty = _mm_mul_ps(fscal,dy00);
2091 tz = _mm_mul_ps(fscal,dz00);
2093 /* Update vectorial force */
2094 fix0 = _mm_add_ps(fix0,tx);
2095 fiy0 = _mm_add_ps(fiy0,ty);
2096 fiz0 = _mm_add_ps(fiz0,tz);
2098 fjx0 = _mm_add_ps(fjx0,tx);
2099 fjy0 = _mm_add_ps(fjy0,ty);
2100 fjz0 = _mm_add_ps(fjz0,tz);
2104 /**************************
2105 * CALCULATE INTERACTIONS *
2106 **************************/
2108 if (gmx_mm_any_lt(rsq11,rcutoff2))
2111 r11 = _mm_mul_ps(rsq11,rinv11);
2113 /* EWALD ELECTROSTATICS */
2115 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2116 ewrt = _mm_mul_ps(r11,ewtabscale);
2117 ewitab = _mm_cvttps_epi32(ewrt);
2118 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2119 ewitab = _mm_slli_epi32(ewitab,2);
2120 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2121 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2122 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2123 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2124 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2125 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2126 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2127 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2128 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2130 d = _mm_sub_ps(r11,rswitch);
2131 d = _mm_max_ps(d,_mm_setzero_ps());
2132 d2 = _mm_mul_ps(d,d);
2133 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2135 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2137 /* Evaluate switch function */
2138 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2139 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2140 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2144 fscal = _mm_and_ps(fscal,cutoff_mask);
2146 /* Calculate temporary vectorial force */
2147 tx = _mm_mul_ps(fscal,dx11);
2148 ty = _mm_mul_ps(fscal,dy11);
2149 tz = _mm_mul_ps(fscal,dz11);
2151 /* Update vectorial force */
2152 fix1 = _mm_add_ps(fix1,tx);
2153 fiy1 = _mm_add_ps(fiy1,ty);
2154 fiz1 = _mm_add_ps(fiz1,tz);
2156 fjx1 = _mm_add_ps(fjx1,tx);
2157 fjy1 = _mm_add_ps(fjy1,ty);
2158 fjz1 = _mm_add_ps(fjz1,tz);
2162 /**************************
2163 * CALCULATE INTERACTIONS *
2164 **************************/
2166 if (gmx_mm_any_lt(rsq12,rcutoff2))
2169 r12 = _mm_mul_ps(rsq12,rinv12);
2171 /* EWALD ELECTROSTATICS */
2173 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2174 ewrt = _mm_mul_ps(r12,ewtabscale);
2175 ewitab = _mm_cvttps_epi32(ewrt);
2176 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2177 ewitab = _mm_slli_epi32(ewitab,2);
2178 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2179 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2180 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2181 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2182 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2183 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2184 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2185 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2186 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2188 d = _mm_sub_ps(r12,rswitch);
2189 d = _mm_max_ps(d,_mm_setzero_ps());
2190 d2 = _mm_mul_ps(d,d);
2191 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2193 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2195 /* Evaluate switch function */
2196 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2197 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2198 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2202 fscal = _mm_and_ps(fscal,cutoff_mask);
2204 /* Calculate temporary vectorial force */
2205 tx = _mm_mul_ps(fscal,dx12);
2206 ty = _mm_mul_ps(fscal,dy12);
2207 tz = _mm_mul_ps(fscal,dz12);
2209 /* Update vectorial force */
2210 fix1 = _mm_add_ps(fix1,tx);
2211 fiy1 = _mm_add_ps(fiy1,ty);
2212 fiz1 = _mm_add_ps(fiz1,tz);
2214 fjx2 = _mm_add_ps(fjx2,tx);
2215 fjy2 = _mm_add_ps(fjy2,ty);
2216 fjz2 = _mm_add_ps(fjz2,tz);
2220 /**************************
2221 * CALCULATE INTERACTIONS *
2222 **************************/
2224 if (gmx_mm_any_lt(rsq13,rcutoff2))
2227 r13 = _mm_mul_ps(rsq13,rinv13);
2229 /* EWALD ELECTROSTATICS */
2231 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2232 ewrt = _mm_mul_ps(r13,ewtabscale);
2233 ewitab = _mm_cvttps_epi32(ewrt);
2234 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2235 ewitab = _mm_slli_epi32(ewitab,2);
2236 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2237 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2238 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2239 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2240 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2241 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2242 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2243 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2244 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2246 d = _mm_sub_ps(r13,rswitch);
2247 d = _mm_max_ps(d,_mm_setzero_ps());
2248 d2 = _mm_mul_ps(d,d);
2249 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2251 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2253 /* Evaluate switch function */
2254 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2255 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2256 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2260 fscal = _mm_and_ps(fscal,cutoff_mask);
2262 /* Calculate temporary vectorial force */
2263 tx = _mm_mul_ps(fscal,dx13);
2264 ty = _mm_mul_ps(fscal,dy13);
2265 tz = _mm_mul_ps(fscal,dz13);
2267 /* Update vectorial force */
2268 fix1 = _mm_add_ps(fix1,tx);
2269 fiy1 = _mm_add_ps(fiy1,ty);
2270 fiz1 = _mm_add_ps(fiz1,tz);
2272 fjx3 = _mm_add_ps(fjx3,tx);
2273 fjy3 = _mm_add_ps(fjy3,ty);
2274 fjz3 = _mm_add_ps(fjz3,tz);
2278 /**************************
2279 * CALCULATE INTERACTIONS *
2280 **************************/
2282 if (gmx_mm_any_lt(rsq21,rcutoff2))
2285 r21 = _mm_mul_ps(rsq21,rinv21);
2287 /* EWALD ELECTROSTATICS */
2289 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2290 ewrt = _mm_mul_ps(r21,ewtabscale);
2291 ewitab = _mm_cvttps_epi32(ewrt);
2292 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2293 ewitab = _mm_slli_epi32(ewitab,2);
2294 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2295 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2296 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2297 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2298 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2299 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2300 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2301 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2302 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2304 d = _mm_sub_ps(r21,rswitch);
2305 d = _mm_max_ps(d,_mm_setzero_ps());
2306 d2 = _mm_mul_ps(d,d);
2307 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2309 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2311 /* Evaluate switch function */
2312 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2313 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2314 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2318 fscal = _mm_and_ps(fscal,cutoff_mask);
2320 /* Calculate temporary vectorial force */
2321 tx = _mm_mul_ps(fscal,dx21);
2322 ty = _mm_mul_ps(fscal,dy21);
2323 tz = _mm_mul_ps(fscal,dz21);
2325 /* Update vectorial force */
2326 fix2 = _mm_add_ps(fix2,tx);
2327 fiy2 = _mm_add_ps(fiy2,ty);
2328 fiz2 = _mm_add_ps(fiz2,tz);
2330 fjx1 = _mm_add_ps(fjx1,tx);
2331 fjy1 = _mm_add_ps(fjy1,ty);
2332 fjz1 = _mm_add_ps(fjz1,tz);
2336 /**************************
2337 * CALCULATE INTERACTIONS *
2338 **************************/
2340 if (gmx_mm_any_lt(rsq22,rcutoff2))
2343 r22 = _mm_mul_ps(rsq22,rinv22);
2345 /* EWALD ELECTROSTATICS */
2347 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2348 ewrt = _mm_mul_ps(r22,ewtabscale);
2349 ewitab = _mm_cvttps_epi32(ewrt);
2350 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2351 ewitab = _mm_slli_epi32(ewitab,2);
2352 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2353 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2354 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2355 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2356 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2357 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2358 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2359 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2360 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2362 d = _mm_sub_ps(r22,rswitch);
2363 d = _mm_max_ps(d,_mm_setzero_ps());
2364 d2 = _mm_mul_ps(d,d);
2365 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2367 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2369 /* Evaluate switch function */
2370 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2371 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2372 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2376 fscal = _mm_and_ps(fscal,cutoff_mask);
2378 /* Calculate temporary vectorial force */
2379 tx = _mm_mul_ps(fscal,dx22);
2380 ty = _mm_mul_ps(fscal,dy22);
2381 tz = _mm_mul_ps(fscal,dz22);
2383 /* Update vectorial force */
2384 fix2 = _mm_add_ps(fix2,tx);
2385 fiy2 = _mm_add_ps(fiy2,ty);
2386 fiz2 = _mm_add_ps(fiz2,tz);
2388 fjx2 = _mm_add_ps(fjx2,tx);
2389 fjy2 = _mm_add_ps(fjy2,ty);
2390 fjz2 = _mm_add_ps(fjz2,tz);
2394 /**************************
2395 * CALCULATE INTERACTIONS *
2396 **************************/
2398 if (gmx_mm_any_lt(rsq23,rcutoff2))
2401 r23 = _mm_mul_ps(rsq23,rinv23);
2403 /* EWALD ELECTROSTATICS */
2405 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2406 ewrt = _mm_mul_ps(r23,ewtabscale);
2407 ewitab = _mm_cvttps_epi32(ewrt);
2408 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2409 ewitab = _mm_slli_epi32(ewitab,2);
2410 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2411 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2412 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2413 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2414 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2415 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2416 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2417 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2418 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2420 d = _mm_sub_ps(r23,rswitch);
2421 d = _mm_max_ps(d,_mm_setzero_ps());
2422 d2 = _mm_mul_ps(d,d);
2423 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2425 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2427 /* Evaluate switch function */
2428 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2429 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2430 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2434 fscal = _mm_and_ps(fscal,cutoff_mask);
2436 /* Calculate temporary vectorial force */
2437 tx = _mm_mul_ps(fscal,dx23);
2438 ty = _mm_mul_ps(fscal,dy23);
2439 tz = _mm_mul_ps(fscal,dz23);
2441 /* Update vectorial force */
2442 fix2 = _mm_add_ps(fix2,tx);
2443 fiy2 = _mm_add_ps(fiy2,ty);
2444 fiz2 = _mm_add_ps(fiz2,tz);
2446 fjx3 = _mm_add_ps(fjx3,tx);
2447 fjy3 = _mm_add_ps(fjy3,ty);
2448 fjz3 = _mm_add_ps(fjz3,tz);
2452 /**************************
2453 * CALCULATE INTERACTIONS *
2454 **************************/
2456 if (gmx_mm_any_lt(rsq31,rcutoff2))
2459 r31 = _mm_mul_ps(rsq31,rinv31);
2461 /* EWALD ELECTROSTATICS */
2463 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2464 ewrt = _mm_mul_ps(r31,ewtabscale);
2465 ewitab = _mm_cvttps_epi32(ewrt);
2466 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2467 ewitab = _mm_slli_epi32(ewitab,2);
2468 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2469 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2470 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2471 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2472 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2473 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2474 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2475 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2476 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2478 d = _mm_sub_ps(r31,rswitch);
2479 d = _mm_max_ps(d,_mm_setzero_ps());
2480 d2 = _mm_mul_ps(d,d);
2481 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2483 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2485 /* Evaluate switch function */
2486 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2487 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2488 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2492 fscal = _mm_and_ps(fscal,cutoff_mask);
2494 /* Calculate temporary vectorial force */
2495 tx = _mm_mul_ps(fscal,dx31);
2496 ty = _mm_mul_ps(fscal,dy31);
2497 tz = _mm_mul_ps(fscal,dz31);
2499 /* Update vectorial force */
2500 fix3 = _mm_add_ps(fix3,tx);
2501 fiy3 = _mm_add_ps(fiy3,ty);
2502 fiz3 = _mm_add_ps(fiz3,tz);
2504 fjx1 = _mm_add_ps(fjx1,tx);
2505 fjy1 = _mm_add_ps(fjy1,ty);
2506 fjz1 = _mm_add_ps(fjz1,tz);
2510 /**************************
2511 * CALCULATE INTERACTIONS *
2512 **************************/
2514 if (gmx_mm_any_lt(rsq32,rcutoff2))
2517 r32 = _mm_mul_ps(rsq32,rinv32);
2519 /* EWALD ELECTROSTATICS */
2521 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2522 ewrt = _mm_mul_ps(r32,ewtabscale);
2523 ewitab = _mm_cvttps_epi32(ewrt);
2524 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2525 ewitab = _mm_slli_epi32(ewitab,2);
2526 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2527 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2528 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2529 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2530 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2531 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2532 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2533 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2534 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2536 d = _mm_sub_ps(r32,rswitch);
2537 d = _mm_max_ps(d,_mm_setzero_ps());
2538 d2 = _mm_mul_ps(d,d);
2539 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2541 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2543 /* Evaluate switch function */
2544 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2545 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2546 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2550 fscal = _mm_and_ps(fscal,cutoff_mask);
2552 /* Calculate temporary vectorial force */
2553 tx = _mm_mul_ps(fscal,dx32);
2554 ty = _mm_mul_ps(fscal,dy32);
2555 tz = _mm_mul_ps(fscal,dz32);
2557 /* Update vectorial force */
2558 fix3 = _mm_add_ps(fix3,tx);
2559 fiy3 = _mm_add_ps(fiy3,ty);
2560 fiz3 = _mm_add_ps(fiz3,tz);
2562 fjx2 = _mm_add_ps(fjx2,tx);
2563 fjy2 = _mm_add_ps(fjy2,ty);
2564 fjz2 = _mm_add_ps(fjz2,tz);
2568 /**************************
2569 * CALCULATE INTERACTIONS *
2570 **************************/
2572 if (gmx_mm_any_lt(rsq33,rcutoff2))
2575 r33 = _mm_mul_ps(rsq33,rinv33);
2577 /* EWALD ELECTROSTATICS */
2579 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2580 ewrt = _mm_mul_ps(r33,ewtabscale);
2581 ewitab = _mm_cvttps_epi32(ewrt);
2582 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2583 ewitab = _mm_slli_epi32(ewitab,2);
2584 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2585 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2586 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2587 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2588 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2589 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2590 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2591 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2592 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2594 d = _mm_sub_ps(r33,rswitch);
2595 d = _mm_max_ps(d,_mm_setzero_ps());
2596 d2 = _mm_mul_ps(d,d);
2597 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2599 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2601 /* Evaluate switch function */
2602 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2603 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2604 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2608 fscal = _mm_and_ps(fscal,cutoff_mask);
2610 /* Calculate temporary vectorial force */
2611 tx = _mm_mul_ps(fscal,dx33);
2612 ty = _mm_mul_ps(fscal,dy33);
2613 tz = _mm_mul_ps(fscal,dz33);
2615 /* Update vectorial force */
2616 fix3 = _mm_add_ps(fix3,tx);
2617 fiy3 = _mm_add_ps(fiy3,ty);
2618 fiz3 = _mm_add_ps(fiz3,tz);
2620 fjx3 = _mm_add_ps(fjx3,tx);
2621 fjy3 = _mm_add_ps(fjy3,ty);
2622 fjz3 = _mm_add_ps(fjz3,tz);
2626 fjptrA = f+j_coord_offsetA;
2627 fjptrB = f+j_coord_offsetB;
2628 fjptrC = f+j_coord_offsetC;
2629 fjptrD = f+j_coord_offsetD;
2631 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2632 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2633 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2635 /* Inner loop uses 617 flops */
2638 if(jidx<j_index_end)
2641 /* Get j neighbor index, and coordinate index */
2642 jnrlistA = jjnr[jidx];
2643 jnrlistB = jjnr[jidx+1];
2644 jnrlistC = jjnr[jidx+2];
2645 jnrlistD = jjnr[jidx+3];
2646 /* Sign of each element will be negative for non-real atoms.
2647 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2648 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2650 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2651 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2652 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2653 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2654 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2655 j_coord_offsetA = DIM*jnrA;
2656 j_coord_offsetB = DIM*jnrB;
2657 j_coord_offsetC = DIM*jnrC;
2658 j_coord_offsetD = DIM*jnrD;
2660 /* load j atom coordinates */
2661 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2662 x+j_coord_offsetC,x+j_coord_offsetD,
2663 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2664 &jy2,&jz2,&jx3,&jy3,&jz3);
2666 /* Calculate displacement vector */
2667 dx00 = _mm_sub_ps(ix0,jx0);
2668 dy00 = _mm_sub_ps(iy0,jy0);
2669 dz00 = _mm_sub_ps(iz0,jz0);
2670 dx11 = _mm_sub_ps(ix1,jx1);
2671 dy11 = _mm_sub_ps(iy1,jy1);
2672 dz11 = _mm_sub_ps(iz1,jz1);
2673 dx12 = _mm_sub_ps(ix1,jx2);
2674 dy12 = _mm_sub_ps(iy1,jy2);
2675 dz12 = _mm_sub_ps(iz1,jz2);
2676 dx13 = _mm_sub_ps(ix1,jx3);
2677 dy13 = _mm_sub_ps(iy1,jy3);
2678 dz13 = _mm_sub_ps(iz1,jz3);
2679 dx21 = _mm_sub_ps(ix2,jx1);
2680 dy21 = _mm_sub_ps(iy2,jy1);
2681 dz21 = _mm_sub_ps(iz2,jz1);
2682 dx22 = _mm_sub_ps(ix2,jx2);
2683 dy22 = _mm_sub_ps(iy2,jy2);
2684 dz22 = _mm_sub_ps(iz2,jz2);
2685 dx23 = _mm_sub_ps(ix2,jx3);
2686 dy23 = _mm_sub_ps(iy2,jy3);
2687 dz23 = _mm_sub_ps(iz2,jz3);
2688 dx31 = _mm_sub_ps(ix3,jx1);
2689 dy31 = _mm_sub_ps(iy3,jy1);
2690 dz31 = _mm_sub_ps(iz3,jz1);
2691 dx32 = _mm_sub_ps(ix3,jx2);
2692 dy32 = _mm_sub_ps(iy3,jy2);
2693 dz32 = _mm_sub_ps(iz3,jz2);
2694 dx33 = _mm_sub_ps(ix3,jx3);
2695 dy33 = _mm_sub_ps(iy3,jy3);
2696 dz33 = _mm_sub_ps(iz3,jz3);
2698 /* Calculate squared distance and things based on it */
2699 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2700 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2701 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2702 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2703 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2704 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2705 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2706 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2707 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2708 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2710 rinv00 = sse41_invsqrt_f(rsq00);
2711 rinv11 = sse41_invsqrt_f(rsq11);
2712 rinv12 = sse41_invsqrt_f(rsq12);
2713 rinv13 = sse41_invsqrt_f(rsq13);
2714 rinv21 = sse41_invsqrt_f(rsq21);
2715 rinv22 = sse41_invsqrt_f(rsq22);
2716 rinv23 = sse41_invsqrt_f(rsq23);
2717 rinv31 = sse41_invsqrt_f(rsq31);
2718 rinv32 = sse41_invsqrt_f(rsq32);
2719 rinv33 = sse41_invsqrt_f(rsq33);
2721 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2722 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2723 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2724 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2725 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2726 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2727 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2728 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2729 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2730 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2732 fjx0 = _mm_setzero_ps();
2733 fjy0 = _mm_setzero_ps();
2734 fjz0 = _mm_setzero_ps();
2735 fjx1 = _mm_setzero_ps();
2736 fjy1 = _mm_setzero_ps();
2737 fjz1 = _mm_setzero_ps();
2738 fjx2 = _mm_setzero_ps();
2739 fjy2 = _mm_setzero_ps();
2740 fjz2 = _mm_setzero_ps();
2741 fjx3 = _mm_setzero_ps();
2742 fjy3 = _mm_setzero_ps();
2743 fjz3 = _mm_setzero_ps();
2745 /**************************
2746 * CALCULATE INTERACTIONS *
2747 **************************/
2749 if (gmx_mm_any_lt(rsq00,rcutoff2))
2752 r00 = _mm_mul_ps(rsq00,rinv00);
2753 r00 = _mm_andnot_ps(dummy_mask,r00);
2755 /* LENNARD-JONES DISPERSION/REPULSION */
2757 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2758 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2759 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2760 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2761 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2763 d = _mm_sub_ps(r00,rswitch);
2764 d = _mm_max_ps(d,_mm_setzero_ps());
2765 d2 = _mm_mul_ps(d,d);
2766 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2768 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2770 /* Evaluate switch function */
2771 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2772 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2773 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2777 fscal = _mm_and_ps(fscal,cutoff_mask);
2779 fscal = _mm_andnot_ps(dummy_mask,fscal);
2781 /* Calculate temporary vectorial force */
2782 tx = _mm_mul_ps(fscal,dx00);
2783 ty = _mm_mul_ps(fscal,dy00);
2784 tz = _mm_mul_ps(fscal,dz00);
2786 /* Update vectorial force */
2787 fix0 = _mm_add_ps(fix0,tx);
2788 fiy0 = _mm_add_ps(fiy0,ty);
2789 fiz0 = _mm_add_ps(fiz0,tz);
2791 fjx0 = _mm_add_ps(fjx0,tx);
2792 fjy0 = _mm_add_ps(fjy0,ty);
2793 fjz0 = _mm_add_ps(fjz0,tz);
2797 /**************************
2798 * CALCULATE INTERACTIONS *
2799 **************************/
2801 if (gmx_mm_any_lt(rsq11,rcutoff2))
2804 r11 = _mm_mul_ps(rsq11,rinv11);
2805 r11 = _mm_andnot_ps(dummy_mask,r11);
2807 /* EWALD ELECTROSTATICS */
2809 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2810 ewrt = _mm_mul_ps(r11,ewtabscale);
2811 ewitab = _mm_cvttps_epi32(ewrt);
2812 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2813 ewitab = _mm_slli_epi32(ewitab,2);
2814 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2815 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2816 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2817 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2818 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2819 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2820 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2821 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2822 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2824 d = _mm_sub_ps(r11,rswitch);
2825 d = _mm_max_ps(d,_mm_setzero_ps());
2826 d2 = _mm_mul_ps(d,d);
2827 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2829 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2831 /* Evaluate switch function */
2832 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2833 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2834 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2838 fscal = _mm_and_ps(fscal,cutoff_mask);
2840 fscal = _mm_andnot_ps(dummy_mask,fscal);
2842 /* Calculate temporary vectorial force */
2843 tx = _mm_mul_ps(fscal,dx11);
2844 ty = _mm_mul_ps(fscal,dy11);
2845 tz = _mm_mul_ps(fscal,dz11);
2847 /* Update vectorial force */
2848 fix1 = _mm_add_ps(fix1,tx);
2849 fiy1 = _mm_add_ps(fiy1,ty);
2850 fiz1 = _mm_add_ps(fiz1,tz);
2852 fjx1 = _mm_add_ps(fjx1,tx);
2853 fjy1 = _mm_add_ps(fjy1,ty);
2854 fjz1 = _mm_add_ps(fjz1,tz);
2858 /**************************
2859 * CALCULATE INTERACTIONS *
2860 **************************/
2862 if (gmx_mm_any_lt(rsq12,rcutoff2))
2865 r12 = _mm_mul_ps(rsq12,rinv12);
2866 r12 = _mm_andnot_ps(dummy_mask,r12);
2868 /* EWALD ELECTROSTATICS */
2870 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2871 ewrt = _mm_mul_ps(r12,ewtabscale);
2872 ewitab = _mm_cvttps_epi32(ewrt);
2873 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2874 ewitab = _mm_slli_epi32(ewitab,2);
2875 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2876 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2877 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2878 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2879 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2880 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2881 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2882 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2883 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2885 d = _mm_sub_ps(r12,rswitch);
2886 d = _mm_max_ps(d,_mm_setzero_ps());
2887 d2 = _mm_mul_ps(d,d);
2888 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2890 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2892 /* Evaluate switch function */
2893 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2894 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2895 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2899 fscal = _mm_and_ps(fscal,cutoff_mask);
2901 fscal = _mm_andnot_ps(dummy_mask,fscal);
2903 /* Calculate temporary vectorial force */
2904 tx = _mm_mul_ps(fscal,dx12);
2905 ty = _mm_mul_ps(fscal,dy12);
2906 tz = _mm_mul_ps(fscal,dz12);
2908 /* Update vectorial force */
2909 fix1 = _mm_add_ps(fix1,tx);
2910 fiy1 = _mm_add_ps(fiy1,ty);
2911 fiz1 = _mm_add_ps(fiz1,tz);
2913 fjx2 = _mm_add_ps(fjx2,tx);
2914 fjy2 = _mm_add_ps(fjy2,ty);
2915 fjz2 = _mm_add_ps(fjz2,tz);
2919 /**************************
2920 * CALCULATE INTERACTIONS *
2921 **************************/
2923 if (gmx_mm_any_lt(rsq13,rcutoff2))
2926 r13 = _mm_mul_ps(rsq13,rinv13);
2927 r13 = _mm_andnot_ps(dummy_mask,r13);
2929 /* EWALD ELECTROSTATICS */
2931 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2932 ewrt = _mm_mul_ps(r13,ewtabscale);
2933 ewitab = _mm_cvttps_epi32(ewrt);
2934 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2935 ewitab = _mm_slli_epi32(ewitab,2);
2936 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2937 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2938 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2939 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2940 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2941 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2942 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2943 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2944 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2946 d = _mm_sub_ps(r13,rswitch);
2947 d = _mm_max_ps(d,_mm_setzero_ps());
2948 d2 = _mm_mul_ps(d,d);
2949 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2951 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2953 /* Evaluate switch function */
2954 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2955 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2956 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2960 fscal = _mm_and_ps(fscal,cutoff_mask);
2962 fscal = _mm_andnot_ps(dummy_mask,fscal);
2964 /* Calculate temporary vectorial force */
2965 tx = _mm_mul_ps(fscal,dx13);
2966 ty = _mm_mul_ps(fscal,dy13);
2967 tz = _mm_mul_ps(fscal,dz13);
2969 /* Update vectorial force */
2970 fix1 = _mm_add_ps(fix1,tx);
2971 fiy1 = _mm_add_ps(fiy1,ty);
2972 fiz1 = _mm_add_ps(fiz1,tz);
2974 fjx3 = _mm_add_ps(fjx3,tx);
2975 fjy3 = _mm_add_ps(fjy3,ty);
2976 fjz3 = _mm_add_ps(fjz3,tz);
2980 /**************************
2981 * CALCULATE INTERACTIONS *
2982 **************************/
2984 if (gmx_mm_any_lt(rsq21,rcutoff2))
2987 r21 = _mm_mul_ps(rsq21,rinv21);
2988 r21 = _mm_andnot_ps(dummy_mask,r21);
2990 /* EWALD ELECTROSTATICS */
2992 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2993 ewrt = _mm_mul_ps(r21,ewtabscale);
2994 ewitab = _mm_cvttps_epi32(ewrt);
2995 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2996 ewitab = _mm_slli_epi32(ewitab,2);
2997 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2998 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2999 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3000 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3001 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3002 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3003 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3004 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3005 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3007 d = _mm_sub_ps(r21,rswitch);
3008 d = _mm_max_ps(d,_mm_setzero_ps());
3009 d2 = _mm_mul_ps(d,d);
3010 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3012 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3014 /* Evaluate switch function */
3015 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3016 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3017 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
3021 fscal = _mm_and_ps(fscal,cutoff_mask);
3023 fscal = _mm_andnot_ps(dummy_mask,fscal);
3025 /* Calculate temporary vectorial force */
3026 tx = _mm_mul_ps(fscal,dx21);
3027 ty = _mm_mul_ps(fscal,dy21);
3028 tz = _mm_mul_ps(fscal,dz21);
3030 /* Update vectorial force */
3031 fix2 = _mm_add_ps(fix2,tx);
3032 fiy2 = _mm_add_ps(fiy2,ty);
3033 fiz2 = _mm_add_ps(fiz2,tz);
3035 fjx1 = _mm_add_ps(fjx1,tx);
3036 fjy1 = _mm_add_ps(fjy1,ty);
3037 fjz1 = _mm_add_ps(fjz1,tz);
3041 /**************************
3042 * CALCULATE INTERACTIONS *
3043 **************************/
3045 if (gmx_mm_any_lt(rsq22,rcutoff2))
3048 r22 = _mm_mul_ps(rsq22,rinv22);
3049 r22 = _mm_andnot_ps(dummy_mask,r22);
3051 /* EWALD ELECTROSTATICS */
3053 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3054 ewrt = _mm_mul_ps(r22,ewtabscale);
3055 ewitab = _mm_cvttps_epi32(ewrt);
3056 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3057 ewitab = _mm_slli_epi32(ewitab,2);
3058 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3059 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3060 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3061 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3062 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3063 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3064 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3065 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3066 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3068 d = _mm_sub_ps(r22,rswitch);
3069 d = _mm_max_ps(d,_mm_setzero_ps());
3070 d2 = _mm_mul_ps(d,d);
3071 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3073 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3075 /* Evaluate switch function */
3076 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3077 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3078 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3082 fscal = _mm_and_ps(fscal,cutoff_mask);
3084 fscal = _mm_andnot_ps(dummy_mask,fscal);
3086 /* Calculate temporary vectorial force */
3087 tx = _mm_mul_ps(fscal,dx22);
3088 ty = _mm_mul_ps(fscal,dy22);
3089 tz = _mm_mul_ps(fscal,dz22);
3091 /* Update vectorial force */
3092 fix2 = _mm_add_ps(fix2,tx);
3093 fiy2 = _mm_add_ps(fiy2,ty);
3094 fiz2 = _mm_add_ps(fiz2,tz);
3096 fjx2 = _mm_add_ps(fjx2,tx);
3097 fjy2 = _mm_add_ps(fjy2,ty);
3098 fjz2 = _mm_add_ps(fjz2,tz);
3102 /**************************
3103 * CALCULATE INTERACTIONS *
3104 **************************/
3106 if (gmx_mm_any_lt(rsq23,rcutoff2))
3109 r23 = _mm_mul_ps(rsq23,rinv23);
3110 r23 = _mm_andnot_ps(dummy_mask,r23);
3112 /* EWALD ELECTROSTATICS */
3114 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3115 ewrt = _mm_mul_ps(r23,ewtabscale);
3116 ewitab = _mm_cvttps_epi32(ewrt);
3117 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3118 ewitab = _mm_slli_epi32(ewitab,2);
3119 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3120 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3121 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3122 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3123 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3124 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3125 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3126 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
3127 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
3129 d = _mm_sub_ps(r23,rswitch);
3130 d = _mm_max_ps(d,_mm_setzero_ps());
3131 d2 = _mm_mul_ps(d,d);
3132 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3134 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3136 /* Evaluate switch function */
3137 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3138 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
3139 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
3143 fscal = _mm_and_ps(fscal,cutoff_mask);
3145 fscal = _mm_andnot_ps(dummy_mask,fscal);
3147 /* Calculate temporary vectorial force */
3148 tx = _mm_mul_ps(fscal,dx23);
3149 ty = _mm_mul_ps(fscal,dy23);
3150 tz = _mm_mul_ps(fscal,dz23);
3152 /* Update vectorial force */
3153 fix2 = _mm_add_ps(fix2,tx);
3154 fiy2 = _mm_add_ps(fiy2,ty);
3155 fiz2 = _mm_add_ps(fiz2,tz);
3157 fjx3 = _mm_add_ps(fjx3,tx);
3158 fjy3 = _mm_add_ps(fjy3,ty);
3159 fjz3 = _mm_add_ps(fjz3,tz);
3163 /**************************
3164 * CALCULATE INTERACTIONS *
3165 **************************/
3167 if (gmx_mm_any_lt(rsq31,rcutoff2))
3170 r31 = _mm_mul_ps(rsq31,rinv31);
3171 r31 = _mm_andnot_ps(dummy_mask,r31);
3173 /* EWALD ELECTROSTATICS */
3175 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3176 ewrt = _mm_mul_ps(r31,ewtabscale);
3177 ewitab = _mm_cvttps_epi32(ewrt);
3178 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3179 ewitab = _mm_slli_epi32(ewitab,2);
3180 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3181 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3182 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3183 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3184 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3185 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3186 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3187 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
3188 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
3190 d = _mm_sub_ps(r31,rswitch);
3191 d = _mm_max_ps(d,_mm_setzero_ps());
3192 d2 = _mm_mul_ps(d,d);
3193 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3195 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3197 /* Evaluate switch function */
3198 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3199 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
3200 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
3204 fscal = _mm_and_ps(fscal,cutoff_mask);
3206 fscal = _mm_andnot_ps(dummy_mask,fscal);
3208 /* Calculate temporary vectorial force */
3209 tx = _mm_mul_ps(fscal,dx31);
3210 ty = _mm_mul_ps(fscal,dy31);
3211 tz = _mm_mul_ps(fscal,dz31);
3213 /* Update vectorial force */
3214 fix3 = _mm_add_ps(fix3,tx);
3215 fiy3 = _mm_add_ps(fiy3,ty);
3216 fiz3 = _mm_add_ps(fiz3,tz);
3218 fjx1 = _mm_add_ps(fjx1,tx);
3219 fjy1 = _mm_add_ps(fjy1,ty);
3220 fjz1 = _mm_add_ps(fjz1,tz);
3224 /**************************
3225 * CALCULATE INTERACTIONS *
3226 **************************/
3228 if (gmx_mm_any_lt(rsq32,rcutoff2))
3231 r32 = _mm_mul_ps(rsq32,rinv32);
3232 r32 = _mm_andnot_ps(dummy_mask,r32);
3234 /* EWALD ELECTROSTATICS */
3236 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3237 ewrt = _mm_mul_ps(r32,ewtabscale);
3238 ewitab = _mm_cvttps_epi32(ewrt);
3239 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3240 ewitab = _mm_slli_epi32(ewitab,2);
3241 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3242 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3243 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3244 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3245 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3246 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3247 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3248 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
3249 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
3251 d = _mm_sub_ps(r32,rswitch);
3252 d = _mm_max_ps(d,_mm_setzero_ps());
3253 d2 = _mm_mul_ps(d,d);
3254 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3256 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3258 /* Evaluate switch function */
3259 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3260 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
3261 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
3265 fscal = _mm_and_ps(fscal,cutoff_mask);
3267 fscal = _mm_andnot_ps(dummy_mask,fscal);
3269 /* Calculate temporary vectorial force */
3270 tx = _mm_mul_ps(fscal,dx32);
3271 ty = _mm_mul_ps(fscal,dy32);
3272 tz = _mm_mul_ps(fscal,dz32);
3274 /* Update vectorial force */
3275 fix3 = _mm_add_ps(fix3,tx);
3276 fiy3 = _mm_add_ps(fiy3,ty);
3277 fiz3 = _mm_add_ps(fiz3,tz);
3279 fjx2 = _mm_add_ps(fjx2,tx);
3280 fjy2 = _mm_add_ps(fjy2,ty);
3281 fjz2 = _mm_add_ps(fjz2,tz);
3285 /**************************
3286 * CALCULATE INTERACTIONS *
3287 **************************/
3289 if (gmx_mm_any_lt(rsq33,rcutoff2))
3292 r33 = _mm_mul_ps(rsq33,rinv33);
3293 r33 = _mm_andnot_ps(dummy_mask,r33);
3295 /* EWALD ELECTROSTATICS */
3297 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3298 ewrt = _mm_mul_ps(r33,ewtabscale);
3299 ewitab = _mm_cvttps_epi32(ewrt);
3300 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3301 ewitab = _mm_slli_epi32(ewitab,2);
3302 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3303 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3304 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3305 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3306 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3307 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3308 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3309 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3310 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3312 d = _mm_sub_ps(r33,rswitch);
3313 d = _mm_max_ps(d,_mm_setzero_ps());
3314 d2 = _mm_mul_ps(d,d);
3315 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3317 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3319 /* Evaluate switch function */
3320 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3321 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3322 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
3326 fscal = _mm_and_ps(fscal,cutoff_mask);
3328 fscal = _mm_andnot_ps(dummy_mask,fscal);
3330 /* Calculate temporary vectorial force */
3331 tx = _mm_mul_ps(fscal,dx33);
3332 ty = _mm_mul_ps(fscal,dy33);
3333 tz = _mm_mul_ps(fscal,dz33);
3335 /* Update vectorial force */
3336 fix3 = _mm_add_ps(fix3,tx);
3337 fiy3 = _mm_add_ps(fiy3,ty);
3338 fiz3 = _mm_add_ps(fiz3,tz);
3340 fjx3 = _mm_add_ps(fjx3,tx);
3341 fjy3 = _mm_add_ps(fjy3,ty);
3342 fjz3 = _mm_add_ps(fjz3,tz);
3346 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3347 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3348 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3349 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3351 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3352 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3353 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3355 /* Inner loop uses 627 flops */
3358 /* End of innermost loop */
3360 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3361 f+i_coord_offset,fshift+i_shift_offset);
3363 /* Increment number of inner iterations */
3364 inneriter += j_index_end - j_index_start;
3366 /* Outer loop uses 24 flops */
3369 /* Increment number of outer iterations */
3372 /* Update outer/inner flops */
3374 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);