2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_sse4_1_single.h"
50 #include "kernelutil_x86_sse4_1_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse4_1_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
114 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
117 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
118 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
120 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
122 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
123 real rswitch_scalar,d_scalar;
124 __m128 dummy_mask,cutoff_mask;
125 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
126 __m128 one = _mm_set1_ps(1.0);
127 __m128 two = _mm_set1_ps(2.0);
133 jindex = nlist->jindex;
135 shiftidx = nlist->shift;
137 shiftvec = fr->shift_vec[0];
138 fshift = fr->fshift[0];
139 facel = _mm_set1_ps(fr->epsfac);
140 charge = mdatoms->chargeA;
141 nvdwtype = fr->ntype;
143 vdwtype = mdatoms->typeA;
145 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
146 ewtab = fr->ic->tabq_coul_FDV0;
147 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
148 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
150 /* Setup water-specific parameters */
151 inr = nlist->iinr[0];
152 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
153 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
154 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
155 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
157 jq1 = _mm_set1_ps(charge[inr+1]);
158 jq2 = _mm_set1_ps(charge[inr+2]);
159 jq3 = _mm_set1_ps(charge[inr+3]);
160 vdwjidx0A = 2*vdwtype[inr+0];
161 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
162 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
163 qq11 = _mm_mul_ps(iq1,jq1);
164 qq12 = _mm_mul_ps(iq1,jq2);
165 qq13 = _mm_mul_ps(iq1,jq3);
166 qq21 = _mm_mul_ps(iq2,jq1);
167 qq22 = _mm_mul_ps(iq2,jq2);
168 qq23 = _mm_mul_ps(iq2,jq3);
169 qq31 = _mm_mul_ps(iq3,jq1);
170 qq32 = _mm_mul_ps(iq3,jq2);
171 qq33 = _mm_mul_ps(iq3,jq3);
173 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
174 rcutoff_scalar = fr->rcoulomb;
175 rcutoff = _mm_set1_ps(rcutoff_scalar);
176 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
178 rswitch_scalar = fr->rcoulomb_switch;
179 rswitch = _mm_set1_ps(rswitch_scalar);
180 /* Setup switch parameters */
181 d_scalar = rcutoff_scalar-rswitch_scalar;
182 d = _mm_set1_ps(d_scalar);
183 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
184 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
185 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
186 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
187 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
188 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
190 /* Avoid stupid compiler warnings */
191 jnrA = jnrB = jnrC = jnrD = 0;
200 for(iidx=0;iidx<4*DIM;iidx++)
205 /* Start outer loop over neighborlists */
206 for(iidx=0; iidx<nri; iidx++)
208 /* Load shift vector for this list */
209 i_shift_offset = DIM*shiftidx[iidx];
211 /* Load limits for loop over neighbors */
212 j_index_start = jindex[iidx];
213 j_index_end = jindex[iidx+1];
215 /* Get outer coordinate index */
217 i_coord_offset = DIM*inr;
219 /* Load i particle coords and add shift vector */
220 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
221 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
223 fix0 = _mm_setzero_ps();
224 fiy0 = _mm_setzero_ps();
225 fiz0 = _mm_setzero_ps();
226 fix1 = _mm_setzero_ps();
227 fiy1 = _mm_setzero_ps();
228 fiz1 = _mm_setzero_ps();
229 fix2 = _mm_setzero_ps();
230 fiy2 = _mm_setzero_ps();
231 fiz2 = _mm_setzero_ps();
232 fix3 = _mm_setzero_ps();
233 fiy3 = _mm_setzero_ps();
234 fiz3 = _mm_setzero_ps();
236 /* Reset potential sums */
237 velecsum = _mm_setzero_ps();
238 vvdwsum = _mm_setzero_ps();
240 /* Start inner kernel loop */
241 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
244 /* Get j neighbor index, and coordinate index */
249 j_coord_offsetA = DIM*jnrA;
250 j_coord_offsetB = DIM*jnrB;
251 j_coord_offsetC = DIM*jnrC;
252 j_coord_offsetD = DIM*jnrD;
254 /* load j atom coordinates */
255 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
256 x+j_coord_offsetC,x+j_coord_offsetD,
257 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
258 &jy2,&jz2,&jx3,&jy3,&jz3);
260 /* Calculate displacement vector */
261 dx00 = _mm_sub_ps(ix0,jx0);
262 dy00 = _mm_sub_ps(iy0,jy0);
263 dz00 = _mm_sub_ps(iz0,jz0);
264 dx11 = _mm_sub_ps(ix1,jx1);
265 dy11 = _mm_sub_ps(iy1,jy1);
266 dz11 = _mm_sub_ps(iz1,jz1);
267 dx12 = _mm_sub_ps(ix1,jx2);
268 dy12 = _mm_sub_ps(iy1,jy2);
269 dz12 = _mm_sub_ps(iz1,jz2);
270 dx13 = _mm_sub_ps(ix1,jx3);
271 dy13 = _mm_sub_ps(iy1,jy3);
272 dz13 = _mm_sub_ps(iz1,jz3);
273 dx21 = _mm_sub_ps(ix2,jx1);
274 dy21 = _mm_sub_ps(iy2,jy1);
275 dz21 = _mm_sub_ps(iz2,jz1);
276 dx22 = _mm_sub_ps(ix2,jx2);
277 dy22 = _mm_sub_ps(iy2,jy2);
278 dz22 = _mm_sub_ps(iz2,jz2);
279 dx23 = _mm_sub_ps(ix2,jx3);
280 dy23 = _mm_sub_ps(iy2,jy3);
281 dz23 = _mm_sub_ps(iz2,jz3);
282 dx31 = _mm_sub_ps(ix3,jx1);
283 dy31 = _mm_sub_ps(iy3,jy1);
284 dz31 = _mm_sub_ps(iz3,jz1);
285 dx32 = _mm_sub_ps(ix3,jx2);
286 dy32 = _mm_sub_ps(iy3,jy2);
287 dz32 = _mm_sub_ps(iz3,jz2);
288 dx33 = _mm_sub_ps(ix3,jx3);
289 dy33 = _mm_sub_ps(iy3,jy3);
290 dz33 = _mm_sub_ps(iz3,jz3);
292 /* Calculate squared distance and things based on it */
293 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
294 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
295 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
296 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
297 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
298 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
299 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
300 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
301 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
302 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
304 rinv00 = gmx_mm_invsqrt_ps(rsq00);
305 rinv11 = gmx_mm_invsqrt_ps(rsq11);
306 rinv12 = gmx_mm_invsqrt_ps(rsq12);
307 rinv13 = gmx_mm_invsqrt_ps(rsq13);
308 rinv21 = gmx_mm_invsqrt_ps(rsq21);
309 rinv22 = gmx_mm_invsqrt_ps(rsq22);
310 rinv23 = gmx_mm_invsqrt_ps(rsq23);
311 rinv31 = gmx_mm_invsqrt_ps(rsq31);
312 rinv32 = gmx_mm_invsqrt_ps(rsq32);
313 rinv33 = gmx_mm_invsqrt_ps(rsq33);
315 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
316 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
317 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
318 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
319 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
320 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
321 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
322 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
323 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
324 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
326 fjx0 = _mm_setzero_ps();
327 fjy0 = _mm_setzero_ps();
328 fjz0 = _mm_setzero_ps();
329 fjx1 = _mm_setzero_ps();
330 fjy1 = _mm_setzero_ps();
331 fjz1 = _mm_setzero_ps();
332 fjx2 = _mm_setzero_ps();
333 fjy2 = _mm_setzero_ps();
334 fjz2 = _mm_setzero_ps();
335 fjx3 = _mm_setzero_ps();
336 fjy3 = _mm_setzero_ps();
337 fjz3 = _mm_setzero_ps();
339 /**************************
340 * CALCULATE INTERACTIONS *
341 **************************/
343 if (gmx_mm_any_lt(rsq00,rcutoff2))
346 r00 = _mm_mul_ps(rsq00,rinv00);
348 /* LENNARD-JONES DISPERSION/REPULSION */
350 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
351 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
352 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
353 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
354 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
356 d = _mm_sub_ps(r00,rswitch);
357 d = _mm_max_ps(d,_mm_setzero_ps());
358 d2 = _mm_mul_ps(d,d);
359 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
361 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
363 /* Evaluate switch function */
364 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
365 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
366 vvdw = _mm_mul_ps(vvdw,sw);
367 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
369 /* Update potential sum for this i atom from the interaction with this j atom. */
370 vvdw = _mm_and_ps(vvdw,cutoff_mask);
371 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
375 fscal = _mm_and_ps(fscal,cutoff_mask);
377 /* Calculate temporary vectorial force */
378 tx = _mm_mul_ps(fscal,dx00);
379 ty = _mm_mul_ps(fscal,dy00);
380 tz = _mm_mul_ps(fscal,dz00);
382 /* Update vectorial force */
383 fix0 = _mm_add_ps(fix0,tx);
384 fiy0 = _mm_add_ps(fiy0,ty);
385 fiz0 = _mm_add_ps(fiz0,tz);
387 fjx0 = _mm_add_ps(fjx0,tx);
388 fjy0 = _mm_add_ps(fjy0,ty);
389 fjz0 = _mm_add_ps(fjz0,tz);
393 /**************************
394 * CALCULATE INTERACTIONS *
395 **************************/
397 if (gmx_mm_any_lt(rsq11,rcutoff2))
400 r11 = _mm_mul_ps(rsq11,rinv11);
402 /* EWALD ELECTROSTATICS */
404 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
405 ewrt = _mm_mul_ps(r11,ewtabscale);
406 ewitab = _mm_cvttps_epi32(ewrt);
407 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
408 ewitab = _mm_slli_epi32(ewitab,2);
409 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
410 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
411 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
412 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
413 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
414 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
415 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
416 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
417 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
419 d = _mm_sub_ps(r11,rswitch);
420 d = _mm_max_ps(d,_mm_setzero_ps());
421 d2 = _mm_mul_ps(d,d);
422 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
424 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
426 /* Evaluate switch function */
427 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
428 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
429 velec = _mm_mul_ps(velec,sw);
430 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velec = _mm_and_ps(velec,cutoff_mask);
434 velecsum = _mm_add_ps(velecsum,velec);
438 fscal = _mm_and_ps(fscal,cutoff_mask);
440 /* Calculate temporary vectorial force */
441 tx = _mm_mul_ps(fscal,dx11);
442 ty = _mm_mul_ps(fscal,dy11);
443 tz = _mm_mul_ps(fscal,dz11);
445 /* Update vectorial force */
446 fix1 = _mm_add_ps(fix1,tx);
447 fiy1 = _mm_add_ps(fiy1,ty);
448 fiz1 = _mm_add_ps(fiz1,tz);
450 fjx1 = _mm_add_ps(fjx1,tx);
451 fjy1 = _mm_add_ps(fjy1,ty);
452 fjz1 = _mm_add_ps(fjz1,tz);
456 /**************************
457 * CALCULATE INTERACTIONS *
458 **************************/
460 if (gmx_mm_any_lt(rsq12,rcutoff2))
463 r12 = _mm_mul_ps(rsq12,rinv12);
465 /* EWALD ELECTROSTATICS */
467 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
468 ewrt = _mm_mul_ps(r12,ewtabscale);
469 ewitab = _mm_cvttps_epi32(ewrt);
470 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
471 ewitab = _mm_slli_epi32(ewitab,2);
472 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
473 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
474 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
475 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
476 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
477 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
478 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
479 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
480 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
482 d = _mm_sub_ps(r12,rswitch);
483 d = _mm_max_ps(d,_mm_setzero_ps());
484 d2 = _mm_mul_ps(d,d);
485 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
487 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
489 /* Evaluate switch function */
490 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
491 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
492 velec = _mm_mul_ps(velec,sw);
493 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
495 /* Update potential sum for this i atom from the interaction with this j atom. */
496 velec = _mm_and_ps(velec,cutoff_mask);
497 velecsum = _mm_add_ps(velecsum,velec);
501 fscal = _mm_and_ps(fscal,cutoff_mask);
503 /* Calculate temporary vectorial force */
504 tx = _mm_mul_ps(fscal,dx12);
505 ty = _mm_mul_ps(fscal,dy12);
506 tz = _mm_mul_ps(fscal,dz12);
508 /* Update vectorial force */
509 fix1 = _mm_add_ps(fix1,tx);
510 fiy1 = _mm_add_ps(fiy1,ty);
511 fiz1 = _mm_add_ps(fiz1,tz);
513 fjx2 = _mm_add_ps(fjx2,tx);
514 fjy2 = _mm_add_ps(fjy2,ty);
515 fjz2 = _mm_add_ps(fjz2,tz);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 if (gmx_mm_any_lt(rsq13,rcutoff2))
526 r13 = _mm_mul_ps(rsq13,rinv13);
528 /* EWALD ELECTROSTATICS */
530 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
531 ewrt = _mm_mul_ps(r13,ewtabscale);
532 ewitab = _mm_cvttps_epi32(ewrt);
533 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
534 ewitab = _mm_slli_epi32(ewitab,2);
535 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
536 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
537 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
538 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
539 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
540 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
541 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
542 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
543 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
545 d = _mm_sub_ps(r13,rswitch);
546 d = _mm_max_ps(d,_mm_setzero_ps());
547 d2 = _mm_mul_ps(d,d);
548 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
550 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
552 /* Evaluate switch function */
553 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
554 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
555 velec = _mm_mul_ps(velec,sw);
556 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
558 /* Update potential sum for this i atom from the interaction with this j atom. */
559 velec = _mm_and_ps(velec,cutoff_mask);
560 velecsum = _mm_add_ps(velecsum,velec);
564 fscal = _mm_and_ps(fscal,cutoff_mask);
566 /* Calculate temporary vectorial force */
567 tx = _mm_mul_ps(fscal,dx13);
568 ty = _mm_mul_ps(fscal,dy13);
569 tz = _mm_mul_ps(fscal,dz13);
571 /* Update vectorial force */
572 fix1 = _mm_add_ps(fix1,tx);
573 fiy1 = _mm_add_ps(fiy1,ty);
574 fiz1 = _mm_add_ps(fiz1,tz);
576 fjx3 = _mm_add_ps(fjx3,tx);
577 fjy3 = _mm_add_ps(fjy3,ty);
578 fjz3 = _mm_add_ps(fjz3,tz);
582 /**************************
583 * CALCULATE INTERACTIONS *
584 **************************/
586 if (gmx_mm_any_lt(rsq21,rcutoff2))
589 r21 = _mm_mul_ps(rsq21,rinv21);
591 /* EWALD ELECTROSTATICS */
593 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
594 ewrt = _mm_mul_ps(r21,ewtabscale);
595 ewitab = _mm_cvttps_epi32(ewrt);
596 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
597 ewitab = _mm_slli_epi32(ewitab,2);
598 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
599 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
600 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
601 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
602 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
603 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
604 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
605 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
606 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
608 d = _mm_sub_ps(r21,rswitch);
609 d = _mm_max_ps(d,_mm_setzero_ps());
610 d2 = _mm_mul_ps(d,d);
611 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
613 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
615 /* Evaluate switch function */
616 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
617 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
618 velec = _mm_mul_ps(velec,sw);
619 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
621 /* Update potential sum for this i atom from the interaction with this j atom. */
622 velec = _mm_and_ps(velec,cutoff_mask);
623 velecsum = _mm_add_ps(velecsum,velec);
627 fscal = _mm_and_ps(fscal,cutoff_mask);
629 /* Calculate temporary vectorial force */
630 tx = _mm_mul_ps(fscal,dx21);
631 ty = _mm_mul_ps(fscal,dy21);
632 tz = _mm_mul_ps(fscal,dz21);
634 /* Update vectorial force */
635 fix2 = _mm_add_ps(fix2,tx);
636 fiy2 = _mm_add_ps(fiy2,ty);
637 fiz2 = _mm_add_ps(fiz2,tz);
639 fjx1 = _mm_add_ps(fjx1,tx);
640 fjy1 = _mm_add_ps(fjy1,ty);
641 fjz1 = _mm_add_ps(fjz1,tz);
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
649 if (gmx_mm_any_lt(rsq22,rcutoff2))
652 r22 = _mm_mul_ps(rsq22,rinv22);
654 /* EWALD ELECTROSTATICS */
656 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
657 ewrt = _mm_mul_ps(r22,ewtabscale);
658 ewitab = _mm_cvttps_epi32(ewrt);
659 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
660 ewitab = _mm_slli_epi32(ewitab,2);
661 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
662 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
663 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
664 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
665 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
666 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
667 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
668 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
669 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
671 d = _mm_sub_ps(r22,rswitch);
672 d = _mm_max_ps(d,_mm_setzero_ps());
673 d2 = _mm_mul_ps(d,d);
674 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
676 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
678 /* Evaluate switch function */
679 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
680 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
681 velec = _mm_mul_ps(velec,sw);
682 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
684 /* Update potential sum for this i atom from the interaction with this j atom. */
685 velec = _mm_and_ps(velec,cutoff_mask);
686 velecsum = _mm_add_ps(velecsum,velec);
690 fscal = _mm_and_ps(fscal,cutoff_mask);
692 /* Calculate temporary vectorial force */
693 tx = _mm_mul_ps(fscal,dx22);
694 ty = _mm_mul_ps(fscal,dy22);
695 tz = _mm_mul_ps(fscal,dz22);
697 /* Update vectorial force */
698 fix2 = _mm_add_ps(fix2,tx);
699 fiy2 = _mm_add_ps(fiy2,ty);
700 fiz2 = _mm_add_ps(fiz2,tz);
702 fjx2 = _mm_add_ps(fjx2,tx);
703 fjy2 = _mm_add_ps(fjy2,ty);
704 fjz2 = _mm_add_ps(fjz2,tz);
708 /**************************
709 * CALCULATE INTERACTIONS *
710 **************************/
712 if (gmx_mm_any_lt(rsq23,rcutoff2))
715 r23 = _mm_mul_ps(rsq23,rinv23);
717 /* EWALD ELECTROSTATICS */
719 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
720 ewrt = _mm_mul_ps(r23,ewtabscale);
721 ewitab = _mm_cvttps_epi32(ewrt);
722 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
723 ewitab = _mm_slli_epi32(ewitab,2);
724 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
725 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
726 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
727 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
728 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
729 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
730 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
731 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
732 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
734 d = _mm_sub_ps(r23,rswitch);
735 d = _mm_max_ps(d,_mm_setzero_ps());
736 d2 = _mm_mul_ps(d,d);
737 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
739 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
741 /* Evaluate switch function */
742 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
743 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
744 velec = _mm_mul_ps(velec,sw);
745 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
747 /* Update potential sum for this i atom from the interaction with this j atom. */
748 velec = _mm_and_ps(velec,cutoff_mask);
749 velecsum = _mm_add_ps(velecsum,velec);
753 fscal = _mm_and_ps(fscal,cutoff_mask);
755 /* Calculate temporary vectorial force */
756 tx = _mm_mul_ps(fscal,dx23);
757 ty = _mm_mul_ps(fscal,dy23);
758 tz = _mm_mul_ps(fscal,dz23);
760 /* Update vectorial force */
761 fix2 = _mm_add_ps(fix2,tx);
762 fiy2 = _mm_add_ps(fiy2,ty);
763 fiz2 = _mm_add_ps(fiz2,tz);
765 fjx3 = _mm_add_ps(fjx3,tx);
766 fjy3 = _mm_add_ps(fjy3,ty);
767 fjz3 = _mm_add_ps(fjz3,tz);
771 /**************************
772 * CALCULATE INTERACTIONS *
773 **************************/
775 if (gmx_mm_any_lt(rsq31,rcutoff2))
778 r31 = _mm_mul_ps(rsq31,rinv31);
780 /* EWALD ELECTROSTATICS */
782 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
783 ewrt = _mm_mul_ps(r31,ewtabscale);
784 ewitab = _mm_cvttps_epi32(ewrt);
785 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
786 ewitab = _mm_slli_epi32(ewitab,2);
787 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
788 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
789 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
790 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
791 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
792 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
793 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
794 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
795 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
797 d = _mm_sub_ps(r31,rswitch);
798 d = _mm_max_ps(d,_mm_setzero_ps());
799 d2 = _mm_mul_ps(d,d);
800 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
802 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
804 /* Evaluate switch function */
805 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
806 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
807 velec = _mm_mul_ps(velec,sw);
808 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
810 /* Update potential sum for this i atom from the interaction with this j atom. */
811 velec = _mm_and_ps(velec,cutoff_mask);
812 velecsum = _mm_add_ps(velecsum,velec);
816 fscal = _mm_and_ps(fscal,cutoff_mask);
818 /* Calculate temporary vectorial force */
819 tx = _mm_mul_ps(fscal,dx31);
820 ty = _mm_mul_ps(fscal,dy31);
821 tz = _mm_mul_ps(fscal,dz31);
823 /* Update vectorial force */
824 fix3 = _mm_add_ps(fix3,tx);
825 fiy3 = _mm_add_ps(fiy3,ty);
826 fiz3 = _mm_add_ps(fiz3,tz);
828 fjx1 = _mm_add_ps(fjx1,tx);
829 fjy1 = _mm_add_ps(fjy1,ty);
830 fjz1 = _mm_add_ps(fjz1,tz);
834 /**************************
835 * CALCULATE INTERACTIONS *
836 **************************/
838 if (gmx_mm_any_lt(rsq32,rcutoff2))
841 r32 = _mm_mul_ps(rsq32,rinv32);
843 /* EWALD ELECTROSTATICS */
845 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
846 ewrt = _mm_mul_ps(r32,ewtabscale);
847 ewitab = _mm_cvttps_epi32(ewrt);
848 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
849 ewitab = _mm_slli_epi32(ewitab,2);
850 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
851 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
852 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
853 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
854 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
855 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
856 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
857 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
858 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
860 d = _mm_sub_ps(r32,rswitch);
861 d = _mm_max_ps(d,_mm_setzero_ps());
862 d2 = _mm_mul_ps(d,d);
863 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
865 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
867 /* Evaluate switch function */
868 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
869 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
870 velec = _mm_mul_ps(velec,sw);
871 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 velec = _mm_and_ps(velec,cutoff_mask);
875 velecsum = _mm_add_ps(velecsum,velec);
879 fscal = _mm_and_ps(fscal,cutoff_mask);
881 /* Calculate temporary vectorial force */
882 tx = _mm_mul_ps(fscal,dx32);
883 ty = _mm_mul_ps(fscal,dy32);
884 tz = _mm_mul_ps(fscal,dz32);
886 /* Update vectorial force */
887 fix3 = _mm_add_ps(fix3,tx);
888 fiy3 = _mm_add_ps(fiy3,ty);
889 fiz3 = _mm_add_ps(fiz3,tz);
891 fjx2 = _mm_add_ps(fjx2,tx);
892 fjy2 = _mm_add_ps(fjy2,ty);
893 fjz2 = _mm_add_ps(fjz2,tz);
897 /**************************
898 * CALCULATE INTERACTIONS *
899 **************************/
901 if (gmx_mm_any_lt(rsq33,rcutoff2))
904 r33 = _mm_mul_ps(rsq33,rinv33);
906 /* EWALD ELECTROSTATICS */
908 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
909 ewrt = _mm_mul_ps(r33,ewtabscale);
910 ewitab = _mm_cvttps_epi32(ewrt);
911 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
912 ewitab = _mm_slli_epi32(ewitab,2);
913 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
914 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
915 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
916 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
917 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
918 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
919 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
920 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
921 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
923 d = _mm_sub_ps(r33,rswitch);
924 d = _mm_max_ps(d,_mm_setzero_ps());
925 d2 = _mm_mul_ps(d,d);
926 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
928 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
930 /* Evaluate switch function */
931 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
932 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
933 velec = _mm_mul_ps(velec,sw);
934 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
936 /* Update potential sum for this i atom from the interaction with this j atom. */
937 velec = _mm_and_ps(velec,cutoff_mask);
938 velecsum = _mm_add_ps(velecsum,velec);
942 fscal = _mm_and_ps(fscal,cutoff_mask);
944 /* Calculate temporary vectorial force */
945 tx = _mm_mul_ps(fscal,dx33);
946 ty = _mm_mul_ps(fscal,dy33);
947 tz = _mm_mul_ps(fscal,dz33);
949 /* Update vectorial force */
950 fix3 = _mm_add_ps(fix3,tx);
951 fiy3 = _mm_add_ps(fiy3,ty);
952 fiz3 = _mm_add_ps(fiz3,tz);
954 fjx3 = _mm_add_ps(fjx3,tx);
955 fjy3 = _mm_add_ps(fjy3,ty);
956 fjz3 = _mm_add_ps(fjz3,tz);
960 fjptrA = f+j_coord_offsetA;
961 fjptrB = f+j_coord_offsetB;
962 fjptrC = f+j_coord_offsetC;
963 fjptrD = f+j_coord_offsetD;
965 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
966 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
967 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
969 /* Inner loop uses 647 flops */
975 /* Get j neighbor index, and coordinate index */
976 jnrlistA = jjnr[jidx];
977 jnrlistB = jjnr[jidx+1];
978 jnrlistC = jjnr[jidx+2];
979 jnrlistD = jjnr[jidx+3];
980 /* Sign of each element will be negative for non-real atoms.
981 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
982 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
984 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
985 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
986 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
987 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
988 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
989 j_coord_offsetA = DIM*jnrA;
990 j_coord_offsetB = DIM*jnrB;
991 j_coord_offsetC = DIM*jnrC;
992 j_coord_offsetD = DIM*jnrD;
994 /* load j atom coordinates */
995 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
996 x+j_coord_offsetC,x+j_coord_offsetD,
997 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
998 &jy2,&jz2,&jx3,&jy3,&jz3);
1000 /* Calculate displacement vector */
1001 dx00 = _mm_sub_ps(ix0,jx0);
1002 dy00 = _mm_sub_ps(iy0,jy0);
1003 dz00 = _mm_sub_ps(iz0,jz0);
1004 dx11 = _mm_sub_ps(ix1,jx1);
1005 dy11 = _mm_sub_ps(iy1,jy1);
1006 dz11 = _mm_sub_ps(iz1,jz1);
1007 dx12 = _mm_sub_ps(ix1,jx2);
1008 dy12 = _mm_sub_ps(iy1,jy2);
1009 dz12 = _mm_sub_ps(iz1,jz2);
1010 dx13 = _mm_sub_ps(ix1,jx3);
1011 dy13 = _mm_sub_ps(iy1,jy3);
1012 dz13 = _mm_sub_ps(iz1,jz3);
1013 dx21 = _mm_sub_ps(ix2,jx1);
1014 dy21 = _mm_sub_ps(iy2,jy1);
1015 dz21 = _mm_sub_ps(iz2,jz1);
1016 dx22 = _mm_sub_ps(ix2,jx2);
1017 dy22 = _mm_sub_ps(iy2,jy2);
1018 dz22 = _mm_sub_ps(iz2,jz2);
1019 dx23 = _mm_sub_ps(ix2,jx3);
1020 dy23 = _mm_sub_ps(iy2,jy3);
1021 dz23 = _mm_sub_ps(iz2,jz3);
1022 dx31 = _mm_sub_ps(ix3,jx1);
1023 dy31 = _mm_sub_ps(iy3,jy1);
1024 dz31 = _mm_sub_ps(iz3,jz1);
1025 dx32 = _mm_sub_ps(ix3,jx2);
1026 dy32 = _mm_sub_ps(iy3,jy2);
1027 dz32 = _mm_sub_ps(iz3,jz2);
1028 dx33 = _mm_sub_ps(ix3,jx3);
1029 dy33 = _mm_sub_ps(iy3,jy3);
1030 dz33 = _mm_sub_ps(iz3,jz3);
1032 /* Calculate squared distance and things based on it */
1033 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1034 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1035 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1036 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1037 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1038 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1039 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1040 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1041 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1042 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1044 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1045 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1046 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1047 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1048 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1049 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1050 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1051 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1052 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1053 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1055 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1056 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1057 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1058 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1059 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1060 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1061 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1062 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1063 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1064 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1066 fjx0 = _mm_setzero_ps();
1067 fjy0 = _mm_setzero_ps();
1068 fjz0 = _mm_setzero_ps();
1069 fjx1 = _mm_setzero_ps();
1070 fjy1 = _mm_setzero_ps();
1071 fjz1 = _mm_setzero_ps();
1072 fjx2 = _mm_setzero_ps();
1073 fjy2 = _mm_setzero_ps();
1074 fjz2 = _mm_setzero_ps();
1075 fjx3 = _mm_setzero_ps();
1076 fjy3 = _mm_setzero_ps();
1077 fjz3 = _mm_setzero_ps();
1079 /**************************
1080 * CALCULATE INTERACTIONS *
1081 **************************/
1083 if (gmx_mm_any_lt(rsq00,rcutoff2))
1086 r00 = _mm_mul_ps(rsq00,rinv00);
1087 r00 = _mm_andnot_ps(dummy_mask,r00);
1089 /* LENNARD-JONES DISPERSION/REPULSION */
1091 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1092 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1093 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1094 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1095 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1097 d = _mm_sub_ps(r00,rswitch);
1098 d = _mm_max_ps(d,_mm_setzero_ps());
1099 d2 = _mm_mul_ps(d,d);
1100 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1102 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1104 /* Evaluate switch function */
1105 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1106 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1107 vvdw = _mm_mul_ps(vvdw,sw);
1108 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1110 /* Update potential sum for this i atom from the interaction with this j atom. */
1111 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1112 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1113 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1117 fscal = _mm_and_ps(fscal,cutoff_mask);
1119 fscal = _mm_andnot_ps(dummy_mask,fscal);
1121 /* Calculate temporary vectorial force */
1122 tx = _mm_mul_ps(fscal,dx00);
1123 ty = _mm_mul_ps(fscal,dy00);
1124 tz = _mm_mul_ps(fscal,dz00);
1126 /* Update vectorial force */
1127 fix0 = _mm_add_ps(fix0,tx);
1128 fiy0 = _mm_add_ps(fiy0,ty);
1129 fiz0 = _mm_add_ps(fiz0,tz);
1131 fjx0 = _mm_add_ps(fjx0,tx);
1132 fjy0 = _mm_add_ps(fjy0,ty);
1133 fjz0 = _mm_add_ps(fjz0,tz);
1137 /**************************
1138 * CALCULATE INTERACTIONS *
1139 **************************/
1141 if (gmx_mm_any_lt(rsq11,rcutoff2))
1144 r11 = _mm_mul_ps(rsq11,rinv11);
1145 r11 = _mm_andnot_ps(dummy_mask,r11);
1147 /* EWALD ELECTROSTATICS */
1149 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1150 ewrt = _mm_mul_ps(r11,ewtabscale);
1151 ewitab = _mm_cvttps_epi32(ewrt);
1152 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1153 ewitab = _mm_slli_epi32(ewitab,2);
1154 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1155 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1156 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1157 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1158 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1159 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1160 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1161 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1162 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1164 d = _mm_sub_ps(r11,rswitch);
1165 d = _mm_max_ps(d,_mm_setzero_ps());
1166 d2 = _mm_mul_ps(d,d);
1167 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1169 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1171 /* Evaluate switch function */
1172 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1173 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1174 velec = _mm_mul_ps(velec,sw);
1175 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1177 /* Update potential sum for this i atom from the interaction with this j atom. */
1178 velec = _mm_and_ps(velec,cutoff_mask);
1179 velec = _mm_andnot_ps(dummy_mask,velec);
1180 velecsum = _mm_add_ps(velecsum,velec);
1184 fscal = _mm_and_ps(fscal,cutoff_mask);
1186 fscal = _mm_andnot_ps(dummy_mask,fscal);
1188 /* Calculate temporary vectorial force */
1189 tx = _mm_mul_ps(fscal,dx11);
1190 ty = _mm_mul_ps(fscal,dy11);
1191 tz = _mm_mul_ps(fscal,dz11);
1193 /* Update vectorial force */
1194 fix1 = _mm_add_ps(fix1,tx);
1195 fiy1 = _mm_add_ps(fiy1,ty);
1196 fiz1 = _mm_add_ps(fiz1,tz);
1198 fjx1 = _mm_add_ps(fjx1,tx);
1199 fjy1 = _mm_add_ps(fjy1,ty);
1200 fjz1 = _mm_add_ps(fjz1,tz);
1204 /**************************
1205 * CALCULATE INTERACTIONS *
1206 **************************/
1208 if (gmx_mm_any_lt(rsq12,rcutoff2))
1211 r12 = _mm_mul_ps(rsq12,rinv12);
1212 r12 = _mm_andnot_ps(dummy_mask,r12);
1214 /* EWALD ELECTROSTATICS */
1216 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1217 ewrt = _mm_mul_ps(r12,ewtabscale);
1218 ewitab = _mm_cvttps_epi32(ewrt);
1219 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1220 ewitab = _mm_slli_epi32(ewitab,2);
1221 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1222 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1223 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1224 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1225 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1226 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1227 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1228 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1229 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1231 d = _mm_sub_ps(r12,rswitch);
1232 d = _mm_max_ps(d,_mm_setzero_ps());
1233 d2 = _mm_mul_ps(d,d);
1234 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1236 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1238 /* Evaluate switch function */
1239 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1240 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1241 velec = _mm_mul_ps(velec,sw);
1242 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1244 /* Update potential sum for this i atom from the interaction with this j atom. */
1245 velec = _mm_and_ps(velec,cutoff_mask);
1246 velec = _mm_andnot_ps(dummy_mask,velec);
1247 velecsum = _mm_add_ps(velecsum,velec);
1251 fscal = _mm_and_ps(fscal,cutoff_mask);
1253 fscal = _mm_andnot_ps(dummy_mask,fscal);
1255 /* Calculate temporary vectorial force */
1256 tx = _mm_mul_ps(fscal,dx12);
1257 ty = _mm_mul_ps(fscal,dy12);
1258 tz = _mm_mul_ps(fscal,dz12);
1260 /* Update vectorial force */
1261 fix1 = _mm_add_ps(fix1,tx);
1262 fiy1 = _mm_add_ps(fiy1,ty);
1263 fiz1 = _mm_add_ps(fiz1,tz);
1265 fjx2 = _mm_add_ps(fjx2,tx);
1266 fjy2 = _mm_add_ps(fjy2,ty);
1267 fjz2 = _mm_add_ps(fjz2,tz);
1271 /**************************
1272 * CALCULATE INTERACTIONS *
1273 **************************/
1275 if (gmx_mm_any_lt(rsq13,rcutoff2))
1278 r13 = _mm_mul_ps(rsq13,rinv13);
1279 r13 = _mm_andnot_ps(dummy_mask,r13);
1281 /* EWALD ELECTROSTATICS */
1283 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1284 ewrt = _mm_mul_ps(r13,ewtabscale);
1285 ewitab = _mm_cvttps_epi32(ewrt);
1286 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1287 ewitab = _mm_slli_epi32(ewitab,2);
1288 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1289 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1290 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1291 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1292 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1293 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1294 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1295 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1296 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1298 d = _mm_sub_ps(r13,rswitch);
1299 d = _mm_max_ps(d,_mm_setzero_ps());
1300 d2 = _mm_mul_ps(d,d);
1301 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1303 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1305 /* Evaluate switch function */
1306 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1307 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1308 velec = _mm_mul_ps(velec,sw);
1309 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1311 /* Update potential sum for this i atom from the interaction with this j atom. */
1312 velec = _mm_and_ps(velec,cutoff_mask);
1313 velec = _mm_andnot_ps(dummy_mask,velec);
1314 velecsum = _mm_add_ps(velecsum,velec);
1318 fscal = _mm_and_ps(fscal,cutoff_mask);
1320 fscal = _mm_andnot_ps(dummy_mask,fscal);
1322 /* Calculate temporary vectorial force */
1323 tx = _mm_mul_ps(fscal,dx13);
1324 ty = _mm_mul_ps(fscal,dy13);
1325 tz = _mm_mul_ps(fscal,dz13);
1327 /* Update vectorial force */
1328 fix1 = _mm_add_ps(fix1,tx);
1329 fiy1 = _mm_add_ps(fiy1,ty);
1330 fiz1 = _mm_add_ps(fiz1,tz);
1332 fjx3 = _mm_add_ps(fjx3,tx);
1333 fjy3 = _mm_add_ps(fjy3,ty);
1334 fjz3 = _mm_add_ps(fjz3,tz);
1338 /**************************
1339 * CALCULATE INTERACTIONS *
1340 **************************/
1342 if (gmx_mm_any_lt(rsq21,rcutoff2))
1345 r21 = _mm_mul_ps(rsq21,rinv21);
1346 r21 = _mm_andnot_ps(dummy_mask,r21);
1348 /* EWALD ELECTROSTATICS */
1350 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1351 ewrt = _mm_mul_ps(r21,ewtabscale);
1352 ewitab = _mm_cvttps_epi32(ewrt);
1353 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1354 ewitab = _mm_slli_epi32(ewitab,2);
1355 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1356 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1357 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1358 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1359 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1360 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1361 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1362 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1363 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1365 d = _mm_sub_ps(r21,rswitch);
1366 d = _mm_max_ps(d,_mm_setzero_ps());
1367 d2 = _mm_mul_ps(d,d);
1368 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1370 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1372 /* Evaluate switch function */
1373 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1374 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1375 velec = _mm_mul_ps(velec,sw);
1376 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1378 /* Update potential sum for this i atom from the interaction with this j atom. */
1379 velec = _mm_and_ps(velec,cutoff_mask);
1380 velec = _mm_andnot_ps(dummy_mask,velec);
1381 velecsum = _mm_add_ps(velecsum,velec);
1385 fscal = _mm_and_ps(fscal,cutoff_mask);
1387 fscal = _mm_andnot_ps(dummy_mask,fscal);
1389 /* Calculate temporary vectorial force */
1390 tx = _mm_mul_ps(fscal,dx21);
1391 ty = _mm_mul_ps(fscal,dy21);
1392 tz = _mm_mul_ps(fscal,dz21);
1394 /* Update vectorial force */
1395 fix2 = _mm_add_ps(fix2,tx);
1396 fiy2 = _mm_add_ps(fiy2,ty);
1397 fiz2 = _mm_add_ps(fiz2,tz);
1399 fjx1 = _mm_add_ps(fjx1,tx);
1400 fjy1 = _mm_add_ps(fjy1,ty);
1401 fjz1 = _mm_add_ps(fjz1,tz);
1405 /**************************
1406 * CALCULATE INTERACTIONS *
1407 **************************/
1409 if (gmx_mm_any_lt(rsq22,rcutoff2))
1412 r22 = _mm_mul_ps(rsq22,rinv22);
1413 r22 = _mm_andnot_ps(dummy_mask,r22);
1415 /* EWALD ELECTROSTATICS */
1417 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1418 ewrt = _mm_mul_ps(r22,ewtabscale);
1419 ewitab = _mm_cvttps_epi32(ewrt);
1420 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1421 ewitab = _mm_slli_epi32(ewitab,2);
1422 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1423 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1424 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1425 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1426 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1427 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1428 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1429 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1430 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1432 d = _mm_sub_ps(r22,rswitch);
1433 d = _mm_max_ps(d,_mm_setzero_ps());
1434 d2 = _mm_mul_ps(d,d);
1435 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1437 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1439 /* Evaluate switch function */
1440 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1441 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1442 velec = _mm_mul_ps(velec,sw);
1443 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1445 /* Update potential sum for this i atom from the interaction with this j atom. */
1446 velec = _mm_and_ps(velec,cutoff_mask);
1447 velec = _mm_andnot_ps(dummy_mask,velec);
1448 velecsum = _mm_add_ps(velecsum,velec);
1452 fscal = _mm_and_ps(fscal,cutoff_mask);
1454 fscal = _mm_andnot_ps(dummy_mask,fscal);
1456 /* Calculate temporary vectorial force */
1457 tx = _mm_mul_ps(fscal,dx22);
1458 ty = _mm_mul_ps(fscal,dy22);
1459 tz = _mm_mul_ps(fscal,dz22);
1461 /* Update vectorial force */
1462 fix2 = _mm_add_ps(fix2,tx);
1463 fiy2 = _mm_add_ps(fiy2,ty);
1464 fiz2 = _mm_add_ps(fiz2,tz);
1466 fjx2 = _mm_add_ps(fjx2,tx);
1467 fjy2 = _mm_add_ps(fjy2,ty);
1468 fjz2 = _mm_add_ps(fjz2,tz);
1472 /**************************
1473 * CALCULATE INTERACTIONS *
1474 **************************/
1476 if (gmx_mm_any_lt(rsq23,rcutoff2))
1479 r23 = _mm_mul_ps(rsq23,rinv23);
1480 r23 = _mm_andnot_ps(dummy_mask,r23);
1482 /* EWALD ELECTROSTATICS */
1484 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1485 ewrt = _mm_mul_ps(r23,ewtabscale);
1486 ewitab = _mm_cvttps_epi32(ewrt);
1487 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1488 ewitab = _mm_slli_epi32(ewitab,2);
1489 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1490 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1491 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1492 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1493 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1494 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1495 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1496 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1497 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1499 d = _mm_sub_ps(r23,rswitch);
1500 d = _mm_max_ps(d,_mm_setzero_ps());
1501 d2 = _mm_mul_ps(d,d);
1502 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1504 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1506 /* Evaluate switch function */
1507 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1508 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1509 velec = _mm_mul_ps(velec,sw);
1510 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1512 /* Update potential sum for this i atom from the interaction with this j atom. */
1513 velec = _mm_and_ps(velec,cutoff_mask);
1514 velec = _mm_andnot_ps(dummy_mask,velec);
1515 velecsum = _mm_add_ps(velecsum,velec);
1519 fscal = _mm_and_ps(fscal,cutoff_mask);
1521 fscal = _mm_andnot_ps(dummy_mask,fscal);
1523 /* Calculate temporary vectorial force */
1524 tx = _mm_mul_ps(fscal,dx23);
1525 ty = _mm_mul_ps(fscal,dy23);
1526 tz = _mm_mul_ps(fscal,dz23);
1528 /* Update vectorial force */
1529 fix2 = _mm_add_ps(fix2,tx);
1530 fiy2 = _mm_add_ps(fiy2,ty);
1531 fiz2 = _mm_add_ps(fiz2,tz);
1533 fjx3 = _mm_add_ps(fjx3,tx);
1534 fjy3 = _mm_add_ps(fjy3,ty);
1535 fjz3 = _mm_add_ps(fjz3,tz);
1539 /**************************
1540 * CALCULATE INTERACTIONS *
1541 **************************/
1543 if (gmx_mm_any_lt(rsq31,rcutoff2))
1546 r31 = _mm_mul_ps(rsq31,rinv31);
1547 r31 = _mm_andnot_ps(dummy_mask,r31);
1549 /* EWALD ELECTROSTATICS */
1551 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1552 ewrt = _mm_mul_ps(r31,ewtabscale);
1553 ewitab = _mm_cvttps_epi32(ewrt);
1554 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1555 ewitab = _mm_slli_epi32(ewitab,2);
1556 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1557 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1558 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1559 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1560 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1561 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1562 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1563 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1564 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1566 d = _mm_sub_ps(r31,rswitch);
1567 d = _mm_max_ps(d,_mm_setzero_ps());
1568 d2 = _mm_mul_ps(d,d);
1569 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1571 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1573 /* Evaluate switch function */
1574 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1575 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1576 velec = _mm_mul_ps(velec,sw);
1577 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1579 /* Update potential sum for this i atom from the interaction with this j atom. */
1580 velec = _mm_and_ps(velec,cutoff_mask);
1581 velec = _mm_andnot_ps(dummy_mask,velec);
1582 velecsum = _mm_add_ps(velecsum,velec);
1586 fscal = _mm_and_ps(fscal,cutoff_mask);
1588 fscal = _mm_andnot_ps(dummy_mask,fscal);
1590 /* Calculate temporary vectorial force */
1591 tx = _mm_mul_ps(fscal,dx31);
1592 ty = _mm_mul_ps(fscal,dy31);
1593 tz = _mm_mul_ps(fscal,dz31);
1595 /* Update vectorial force */
1596 fix3 = _mm_add_ps(fix3,tx);
1597 fiy3 = _mm_add_ps(fiy3,ty);
1598 fiz3 = _mm_add_ps(fiz3,tz);
1600 fjx1 = _mm_add_ps(fjx1,tx);
1601 fjy1 = _mm_add_ps(fjy1,ty);
1602 fjz1 = _mm_add_ps(fjz1,tz);
1606 /**************************
1607 * CALCULATE INTERACTIONS *
1608 **************************/
1610 if (gmx_mm_any_lt(rsq32,rcutoff2))
1613 r32 = _mm_mul_ps(rsq32,rinv32);
1614 r32 = _mm_andnot_ps(dummy_mask,r32);
1616 /* EWALD ELECTROSTATICS */
1618 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1619 ewrt = _mm_mul_ps(r32,ewtabscale);
1620 ewitab = _mm_cvttps_epi32(ewrt);
1621 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1622 ewitab = _mm_slli_epi32(ewitab,2);
1623 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1624 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1625 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1626 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1627 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1628 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1629 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1630 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1631 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1633 d = _mm_sub_ps(r32,rswitch);
1634 d = _mm_max_ps(d,_mm_setzero_ps());
1635 d2 = _mm_mul_ps(d,d);
1636 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1638 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1640 /* Evaluate switch function */
1641 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1642 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1643 velec = _mm_mul_ps(velec,sw);
1644 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1646 /* Update potential sum for this i atom from the interaction with this j atom. */
1647 velec = _mm_and_ps(velec,cutoff_mask);
1648 velec = _mm_andnot_ps(dummy_mask,velec);
1649 velecsum = _mm_add_ps(velecsum,velec);
1653 fscal = _mm_and_ps(fscal,cutoff_mask);
1655 fscal = _mm_andnot_ps(dummy_mask,fscal);
1657 /* Calculate temporary vectorial force */
1658 tx = _mm_mul_ps(fscal,dx32);
1659 ty = _mm_mul_ps(fscal,dy32);
1660 tz = _mm_mul_ps(fscal,dz32);
1662 /* Update vectorial force */
1663 fix3 = _mm_add_ps(fix3,tx);
1664 fiy3 = _mm_add_ps(fiy3,ty);
1665 fiz3 = _mm_add_ps(fiz3,tz);
1667 fjx2 = _mm_add_ps(fjx2,tx);
1668 fjy2 = _mm_add_ps(fjy2,ty);
1669 fjz2 = _mm_add_ps(fjz2,tz);
1673 /**************************
1674 * CALCULATE INTERACTIONS *
1675 **************************/
1677 if (gmx_mm_any_lt(rsq33,rcutoff2))
1680 r33 = _mm_mul_ps(rsq33,rinv33);
1681 r33 = _mm_andnot_ps(dummy_mask,r33);
1683 /* EWALD ELECTROSTATICS */
1685 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1686 ewrt = _mm_mul_ps(r33,ewtabscale);
1687 ewitab = _mm_cvttps_epi32(ewrt);
1688 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1689 ewitab = _mm_slli_epi32(ewitab,2);
1690 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1691 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1692 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1693 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1694 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1695 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1696 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1697 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1698 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1700 d = _mm_sub_ps(r33,rswitch);
1701 d = _mm_max_ps(d,_mm_setzero_ps());
1702 d2 = _mm_mul_ps(d,d);
1703 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1705 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1707 /* Evaluate switch function */
1708 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1709 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1710 velec = _mm_mul_ps(velec,sw);
1711 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1713 /* Update potential sum for this i atom from the interaction with this j atom. */
1714 velec = _mm_and_ps(velec,cutoff_mask);
1715 velec = _mm_andnot_ps(dummy_mask,velec);
1716 velecsum = _mm_add_ps(velecsum,velec);
1720 fscal = _mm_and_ps(fscal,cutoff_mask);
1722 fscal = _mm_andnot_ps(dummy_mask,fscal);
1724 /* Calculate temporary vectorial force */
1725 tx = _mm_mul_ps(fscal,dx33);
1726 ty = _mm_mul_ps(fscal,dy33);
1727 tz = _mm_mul_ps(fscal,dz33);
1729 /* Update vectorial force */
1730 fix3 = _mm_add_ps(fix3,tx);
1731 fiy3 = _mm_add_ps(fiy3,ty);
1732 fiz3 = _mm_add_ps(fiz3,tz);
1734 fjx3 = _mm_add_ps(fjx3,tx);
1735 fjy3 = _mm_add_ps(fjy3,ty);
1736 fjz3 = _mm_add_ps(fjz3,tz);
1740 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1741 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1742 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1743 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1745 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1746 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1747 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1749 /* Inner loop uses 657 flops */
1752 /* End of innermost loop */
1754 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1755 f+i_coord_offset,fshift+i_shift_offset);
1758 /* Update potential energies */
1759 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1760 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1762 /* Increment number of inner iterations */
1763 inneriter += j_index_end - j_index_start;
1765 /* Outer loop uses 26 flops */
1768 /* Increment number of outer iterations */
1771 /* Update outer/inner flops */
1773 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
1776 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single
1777 * Electrostatics interaction: Ewald
1778 * VdW interaction: LennardJones
1779 * Geometry: Water4-Water4
1780 * Calculate force/pot: Force
1783 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse4_1_single
1784 (t_nblist * gmx_restrict nlist,
1785 rvec * gmx_restrict xx,
1786 rvec * gmx_restrict ff,
1787 t_forcerec * gmx_restrict fr,
1788 t_mdatoms * gmx_restrict mdatoms,
1789 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1790 t_nrnb * gmx_restrict nrnb)
1792 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1793 * just 0 for non-waters.
1794 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1795 * jnr indices corresponding to data put in the four positions in the SIMD register.
1797 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1798 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1799 int jnrA,jnrB,jnrC,jnrD;
1800 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1801 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1802 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1803 real rcutoff_scalar;
1804 real *shiftvec,*fshift,*x,*f;
1805 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1806 real scratch[4*DIM];
1807 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1809 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1811 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1813 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1815 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1816 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1817 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1818 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1819 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1820 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1821 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1822 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1823 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1824 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1825 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1826 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1827 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1828 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1829 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1830 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1831 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1832 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1833 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1834 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1837 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1840 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1841 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1843 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1845 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1846 real rswitch_scalar,d_scalar;
1847 __m128 dummy_mask,cutoff_mask;
1848 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1849 __m128 one = _mm_set1_ps(1.0);
1850 __m128 two = _mm_set1_ps(2.0);
1856 jindex = nlist->jindex;
1858 shiftidx = nlist->shift;
1860 shiftvec = fr->shift_vec[0];
1861 fshift = fr->fshift[0];
1862 facel = _mm_set1_ps(fr->epsfac);
1863 charge = mdatoms->chargeA;
1864 nvdwtype = fr->ntype;
1865 vdwparam = fr->nbfp;
1866 vdwtype = mdatoms->typeA;
1868 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1869 ewtab = fr->ic->tabq_coul_FDV0;
1870 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1871 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1873 /* Setup water-specific parameters */
1874 inr = nlist->iinr[0];
1875 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1876 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1877 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1878 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1880 jq1 = _mm_set1_ps(charge[inr+1]);
1881 jq2 = _mm_set1_ps(charge[inr+2]);
1882 jq3 = _mm_set1_ps(charge[inr+3]);
1883 vdwjidx0A = 2*vdwtype[inr+0];
1884 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1885 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1886 qq11 = _mm_mul_ps(iq1,jq1);
1887 qq12 = _mm_mul_ps(iq1,jq2);
1888 qq13 = _mm_mul_ps(iq1,jq3);
1889 qq21 = _mm_mul_ps(iq2,jq1);
1890 qq22 = _mm_mul_ps(iq2,jq2);
1891 qq23 = _mm_mul_ps(iq2,jq3);
1892 qq31 = _mm_mul_ps(iq3,jq1);
1893 qq32 = _mm_mul_ps(iq3,jq2);
1894 qq33 = _mm_mul_ps(iq3,jq3);
1896 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1897 rcutoff_scalar = fr->rcoulomb;
1898 rcutoff = _mm_set1_ps(rcutoff_scalar);
1899 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1901 rswitch_scalar = fr->rcoulomb_switch;
1902 rswitch = _mm_set1_ps(rswitch_scalar);
1903 /* Setup switch parameters */
1904 d_scalar = rcutoff_scalar-rswitch_scalar;
1905 d = _mm_set1_ps(d_scalar);
1906 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1907 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1908 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1909 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1910 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1911 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1913 /* Avoid stupid compiler warnings */
1914 jnrA = jnrB = jnrC = jnrD = 0;
1915 j_coord_offsetA = 0;
1916 j_coord_offsetB = 0;
1917 j_coord_offsetC = 0;
1918 j_coord_offsetD = 0;
1923 for(iidx=0;iidx<4*DIM;iidx++)
1925 scratch[iidx] = 0.0;
1928 /* Start outer loop over neighborlists */
1929 for(iidx=0; iidx<nri; iidx++)
1931 /* Load shift vector for this list */
1932 i_shift_offset = DIM*shiftidx[iidx];
1934 /* Load limits for loop over neighbors */
1935 j_index_start = jindex[iidx];
1936 j_index_end = jindex[iidx+1];
1938 /* Get outer coordinate index */
1940 i_coord_offset = DIM*inr;
1942 /* Load i particle coords and add shift vector */
1943 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1944 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1946 fix0 = _mm_setzero_ps();
1947 fiy0 = _mm_setzero_ps();
1948 fiz0 = _mm_setzero_ps();
1949 fix1 = _mm_setzero_ps();
1950 fiy1 = _mm_setzero_ps();
1951 fiz1 = _mm_setzero_ps();
1952 fix2 = _mm_setzero_ps();
1953 fiy2 = _mm_setzero_ps();
1954 fiz2 = _mm_setzero_ps();
1955 fix3 = _mm_setzero_ps();
1956 fiy3 = _mm_setzero_ps();
1957 fiz3 = _mm_setzero_ps();
1959 /* Start inner kernel loop */
1960 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1963 /* Get j neighbor index, and coordinate index */
1965 jnrB = jjnr[jidx+1];
1966 jnrC = jjnr[jidx+2];
1967 jnrD = jjnr[jidx+3];
1968 j_coord_offsetA = DIM*jnrA;
1969 j_coord_offsetB = DIM*jnrB;
1970 j_coord_offsetC = DIM*jnrC;
1971 j_coord_offsetD = DIM*jnrD;
1973 /* load j atom coordinates */
1974 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1975 x+j_coord_offsetC,x+j_coord_offsetD,
1976 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1977 &jy2,&jz2,&jx3,&jy3,&jz3);
1979 /* Calculate displacement vector */
1980 dx00 = _mm_sub_ps(ix0,jx0);
1981 dy00 = _mm_sub_ps(iy0,jy0);
1982 dz00 = _mm_sub_ps(iz0,jz0);
1983 dx11 = _mm_sub_ps(ix1,jx1);
1984 dy11 = _mm_sub_ps(iy1,jy1);
1985 dz11 = _mm_sub_ps(iz1,jz1);
1986 dx12 = _mm_sub_ps(ix1,jx2);
1987 dy12 = _mm_sub_ps(iy1,jy2);
1988 dz12 = _mm_sub_ps(iz1,jz2);
1989 dx13 = _mm_sub_ps(ix1,jx3);
1990 dy13 = _mm_sub_ps(iy1,jy3);
1991 dz13 = _mm_sub_ps(iz1,jz3);
1992 dx21 = _mm_sub_ps(ix2,jx1);
1993 dy21 = _mm_sub_ps(iy2,jy1);
1994 dz21 = _mm_sub_ps(iz2,jz1);
1995 dx22 = _mm_sub_ps(ix2,jx2);
1996 dy22 = _mm_sub_ps(iy2,jy2);
1997 dz22 = _mm_sub_ps(iz2,jz2);
1998 dx23 = _mm_sub_ps(ix2,jx3);
1999 dy23 = _mm_sub_ps(iy2,jy3);
2000 dz23 = _mm_sub_ps(iz2,jz3);
2001 dx31 = _mm_sub_ps(ix3,jx1);
2002 dy31 = _mm_sub_ps(iy3,jy1);
2003 dz31 = _mm_sub_ps(iz3,jz1);
2004 dx32 = _mm_sub_ps(ix3,jx2);
2005 dy32 = _mm_sub_ps(iy3,jy2);
2006 dz32 = _mm_sub_ps(iz3,jz2);
2007 dx33 = _mm_sub_ps(ix3,jx3);
2008 dy33 = _mm_sub_ps(iy3,jy3);
2009 dz33 = _mm_sub_ps(iz3,jz3);
2011 /* Calculate squared distance and things based on it */
2012 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2013 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2014 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2015 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2016 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2017 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2018 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2019 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2020 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2021 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2023 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2024 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2025 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2026 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2027 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2028 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2029 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2030 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2031 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2032 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2034 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2035 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2036 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2037 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2038 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2039 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2040 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2041 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2042 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2043 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2045 fjx0 = _mm_setzero_ps();
2046 fjy0 = _mm_setzero_ps();
2047 fjz0 = _mm_setzero_ps();
2048 fjx1 = _mm_setzero_ps();
2049 fjy1 = _mm_setzero_ps();
2050 fjz1 = _mm_setzero_ps();
2051 fjx2 = _mm_setzero_ps();
2052 fjy2 = _mm_setzero_ps();
2053 fjz2 = _mm_setzero_ps();
2054 fjx3 = _mm_setzero_ps();
2055 fjy3 = _mm_setzero_ps();
2056 fjz3 = _mm_setzero_ps();
2058 /**************************
2059 * CALCULATE INTERACTIONS *
2060 **************************/
2062 if (gmx_mm_any_lt(rsq00,rcutoff2))
2065 r00 = _mm_mul_ps(rsq00,rinv00);
2067 /* LENNARD-JONES DISPERSION/REPULSION */
2069 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2070 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2071 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2072 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2073 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2075 d = _mm_sub_ps(r00,rswitch);
2076 d = _mm_max_ps(d,_mm_setzero_ps());
2077 d2 = _mm_mul_ps(d,d);
2078 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2080 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2082 /* Evaluate switch function */
2083 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2084 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2085 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2089 fscal = _mm_and_ps(fscal,cutoff_mask);
2091 /* Calculate temporary vectorial force */
2092 tx = _mm_mul_ps(fscal,dx00);
2093 ty = _mm_mul_ps(fscal,dy00);
2094 tz = _mm_mul_ps(fscal,dz00);
2096 /* Update vectorial force */
2097 fix0 = _mm_add_ps(fix0,tx);
2098 fiy0 = _mm_add_ps(fiy0,ty);
2099 fiz0 = _mm_add_ps(fiz0,tz);
2101 fjx0 = _mm_add_ps(fjx0,tx);
2102 fjy0 = _mm_add_ps(fjy0,ty);
2103 fjz0 = _mm_add_ps(fjz0,tz);
2107 /**************************
2108 * CALCULATE INTERACTIONS *
2109 **************************/
2111 if (gmx_mm_any_lt(rsq11,rcutoff2))
2114 r11 = _mm_mul_ps(rsq11,rinv11);
2116 /* EWALD ELECTROSTATICS */
2118 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2119 ewrt = _mm_mul_ps(r11,ewtabscale);
2120 ewitab = _mm_cvttps_epi32(ewrt);
2121 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2122 ewitab = _mm_slli_epi32(ewitab,2);
2123 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2124 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2125 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2126 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2127 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2128 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2129 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2130 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2131 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2133 d = _mm_sub_ps(r11,rswitch);
2134 d = _mm_max_ps(d,_mm_setzero_ps());
2135 d2 = _mm_mul_ps(d,d);
2136 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2138 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2140 /* Evaluate switch function */
2141 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2142 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2143 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2147 fscal = _mm_and_ps(fscal,cutoff_mask);
2149 /* Calculate temporary vectorial force */
2150 tx = _mm_mul_ps(fscal,dx11);
2151 ty = _mm_mul_ps(fscal,dy11);
2152 tz = _mm_mul_ps(fscal,dz11);
2154 /* Update vectorial force */
2155 fix1 = _mm_add_ps(fix1,tx);
2156 fiy1 = _mm_add_ps(fiy1,ty);
2157 fiz1 = _mm_add_ps(fiz1,tz);
2159 fjx1 = _mm_add_ps(fjx1,tx);
2160 fjy1 = _mm_add_ps(fjy1,ty);
2161 fjz1 = _mm_add_ps(fjz1,tz);
2165 /**************************
2166 * CALCULATE INTERACTIONS *
2167 **************************/
2169 if (gmx_mm_any_lt(rsq12,rcutoff2))
2172 r12 = _mm_mul_ps(rsq12,rinv12);
2174 /* EWALD ELECTROSTATICS */
2176 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2177 ewrt = _mm_mul_ps(r12,ewtabscale);
2178 ewitab = _mm_cvttps_epi32(ewrt);
2179 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2180 ewitab = _mm_slli_epi32(ewitab,2);
2181 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2182 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2183 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2184 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2185 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2186 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2187 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2188 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2189 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2191 d = _mm_sub_ps(r12,rswitch);
2192 d = _mm_max_ps(d,_mm_setzero_ps());
2193 d2 = _mm_mul_ps(d,d);
2194 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2196 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2198 /* Evaluate switch function */
2199 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2200 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2201 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2205 fscal = _mm_and_ps(fscal,cutoff_mask);
2207 /* Calculate temporary vectorial force */
2208 tx = _mm_mul_ps(fscal,dx12);
2209 ty = _mm_mul_ps(fscal,dy12);
2210 tz = _mm_mul_ps(fscal,dz12);
2212 /* Update vectorial force */
2213 fix1 = _mm_add_ps(fix1,tx);
2214 fiy1 = _mm_add_ps(fiy1,ty);
2215 fiz1 = _mm_add_ps(fiz1,tz);
2217 fjx2 = _mm_add_ps(fjx2,tx);
2218 fjy2 = _mm_add_ps(fjy2,ty);
2219 fjz2 = _mm_add_ps(fjz2,tz);
2223 /**************************
2224 * CALCULATE INTERACTIONS *
2225 **************************/
2227 if (gmx_mm_any_lt(rsq13,rcutoff2))
2230 r13 = _mm_mul_ps(rsq13,rinv13);
2232 /* EWALD ELECTROSTATICS */
2234 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2235 ewrt = _mm_mul_ps(r13,ewtabscale);
2236 ewitab = _mm_cvttps_epi32(ewrt);
2237 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2238 ewitab = _mm_slli_epi32(ewitab,2);
2239 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2240 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2241 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2242 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2243 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2244 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2245 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2246 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2247 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2249 d = _mm_sub_ps(r13,rswitch);
2250 d = _mm_max_ps(d,_mm_setzero_ps());
2251 d2 = _mm_mul_ps(d,d);
2252 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2254 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2256 /* Evaluate switch function */
2257 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2258 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2259 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2263 fscal = _mm_and_ps(fscal,cutoff_mask);
2265 /* Calculate temporary vectorial force */
2266 tx = _mm_mul_ps(fscal,dx13);
2267 ty = _mm_mul_ps(fscal,dy13);
2268 tz = _mm_mul_ps(fscal,dz13);
2270 /* Update vectorial force */
2271 fix1 = _mm_add_ps(fix1,tx);
2272 fiy1 = _mm_add_ps(fiy1,ty);
2273 fiz1 = _mm_add_ps(fiz1,tz);
2275 fjx3 = _mm_add_ps(fjx3,tx);
2276 fjy3 = _mm_add_ps(fjy3,ty);
2277 fjz3 = _mm_add_ps(fjz3,tz);
2281 /**************************
2282 * CALCULATE INTERACTIONS *
2283 **************************/
2285 if (gmx_mm_any_lt(rsq21,rcutoff2))
2288 r21 = _mm_mul_ps(rsq21,rinv21);
2290 /* EWALD ELECTROSTATICS */
2292 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2293 ewrt = _mm_mul_ps(r21,ewtabscale);
2294 ewitab = _mm_cvttps_epi32(ewrt);
2295 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2296 ewitab = _mm_slli_epi32(ewitab,2);
2297 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2298 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2299 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2300 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2301 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2302 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2303 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2304 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2305 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2307 d = _mm_sub_ps(r21,rswitch);
2308 d = _mm_max_ps(d,_mm_setzero_ps());
2309 d2 = _mm_mul_ps(d,d);
2310 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2312 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2314 /* Evaluate switch function */
2315 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2316 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2317 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2321 fscal = _mm_and_ps(fscal,cutoff_mask);
2323 /* Calculate temporary vectorial force */
2324 tx = _mm_mul_ps(fscal,dx21);
2325 ty = _mm_mul_ps(fscal,dy21);
2326 tz = _mm_mul_ps(fscal,dz21);
2328 /* Update vectorial force */
2329 fix2 = _mm_add_ps(fix2,tx);
2330 fiy2 = _mm_add_ps(fiy2,ty);
2331 fiz2 = _mm_add_ps(fiz2,tz);
2333 fjx1 = _mm_add_ps(fjx1,tx);
2334 fjy1 = _mm_add_ps(fjy1,ty);
2335 fjz1 = _mm_add_ps(fjz1,tz);
2339 /**************************
2340 * CALCULATE INTERACTIONS *
2341 **************************/
2343 if (gmx_mm_any_lt(rsq22,rcutoff2))
2346 r22 = _mm_mul_ps(rsq22,rinv22);
2348 /* EWALD ELECTROSTATICS */
2350 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2351 ewrt = _mm_mul_ps(r22,ewtabscale);
2352 ewitab = _mm_cvttps_epi32(ewrt);
2353 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2354 ewitab = _mm_slli_epi32(ewitab,2);
2355 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2356 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2357 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2358 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2359 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2360 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2361 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2362 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2363 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2365 d = _mm_sub_ps(r22,rswitch);
2366 d = _mm_max_ps(d,_mm_setzero_ps());
2367 d2 = _mm_mul_ps(d,d);
2368 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2370 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2372 /* Evaluate switch function */
2373 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2374 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2375 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2379 fscal = _mm_and_ps(fscal,cutoff_mask);
2381 /* Calculate temporary vectorial force */
2382 tx = _mm_mul_ps(fscal,dx22);
2383 ty = _mm_mul_ps(fscal,dy22);
2384 tz = _mm_mul_ps(fscal,dz22);
2386 /* Update vectorial force */
2387 fix2 = _mm_add_ps(fix2,tx);
2388 fiy2 = _mm_add_ps(fiy2,ty);
2389 fiz2 = _mm_add_ps(fiz2,tz);
2391 fjx2 = _mm_add_ps(fjx2,tx);
2392 fjy2 = _mm_add_ps(fjy2,ty);
2393 fjz2 = _mm_add_ps(fjz2,tz);
2397 /**************************
2398 * CALCULATE INTERACTIONS *
2399 **************************/
2401 if (gmx_mm_any_lt(rsq23,rcutoff2))
2404 r23 = _mm_mul_ps(rsq23,rinv23);
2406 /* EWALD ELECTROSTATICS */
2408 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2409 ewrt = _mm_mul_ps(r23,ewtabscale);
2410 ewitab = _mm_cvttps_epi32(ewrt);
2411 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2412 ewitab = _mm_slli_epi32(ewitab,2);
2413 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2414 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2415 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2416 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2417 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2418 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2419 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2420 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2421 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2423 d = _mm_sub_ps(r23,rswitch);
2424 d = _mm_max_ps(d,_mm_setzero_ps());
2425 d2 = _mm_mul_ps(d,d);
2426 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2428 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2430 /* Evaluate switch function */
2431 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2432 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2433 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2437 fscal = _mm_and_ps(fscal,cutoff_mask);
2439 /* Calculate temporary vectorial force */
2440 tx = _mm_mul_ps(fscal,dx23);
2441 ty = _mm_mul_ps(fscal,dy23);
2442 tz = _mm_mul_ps(fscal,dz23);
2444 /* Update vectorial force */
2445 fix2 = _mm_add_ps(fix2,tx);
2446 fiy2 = _mm_add_ps(fiy2,ty);
2447 fiz2 = _mm_add_ps(fiz2,tz);
2449 fjx3 = _mm_add_ps(fjx3,tx);
2450 fjy3 = _mm_add_ps(fjy3,ty);
2451 fjz3 = _mm_add_ps(fjz3,tz);
2455 /**************************
2456 * CALCULATE INTERACTIONS *
2457 **************************/
2459 if (gmx_mm_any_lt(rsq31,rcutoff2))
2462 r31 = _mm_mul_ps(rsq31,rinv31);
2464 /* EWALD ELECTROSTATICS */
2466 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2467 ewrt = _mm_mul_ps(r31,ewtabscale);
2468 ewitab = _mm_cvttps_epi32(ewrt);
2469 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2470 ewitab = _mm_slli_epi32(ewitab,2);
2471 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2472 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2473 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2474 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2475 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2476 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2477 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2478 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2479 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2481 d = _mm_sub_ps(r31,rswitch);
2482 d = _mm_max_ps(d,_mm_setzero_ps());
2483 d2 = _mm_mul_ps(d,d);
2484 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2486 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2488 /* Evaluate switch function */
2489 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2490 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2491 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2495 fscal = _mm_and_ps(fscal,cutoff_mask);
2497 /* Calculate temporary vectorial force */
2498 tx = _mm_mul_ps(fscal,dx31);
2499 ty = _mm_mul_ps(fscal,dy31);
2500 tz = _mm_mul_ps(fscal,dz31);
2502 /* Update vectorial force */
2503 fix3 = _mm_add_ps(fix3,tx);
2504 fiy3 = _mm_add_ps(fiy3,ty);
2505 fiz3 = _mm_add_ps(fiz3,tz);
2507 fjx1 = _mm_add_ps(fjx1,tx);
2508 fjy1 = _mm_add_ps(fjy1,ty);
2509 fjz1 = _mm_add_ps(fjz1,tz);
2513 /**************************
2514 * CALCULATE INTERACTIONS *
2515 **************************/
2517 if (gmx_mm_any_lt(rsq32,rcutoff2))
2520 r32 = _mm_mul_ps(rsq32,rinv32);
2522 /* EWALD ELECTROSTATICS */
2524 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2525 ewrt = _mm_mul_ps(r32,ewtabscale);
2526 ewitab = _mm_cvttps_epi32(ewrt);
2527 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2528 ewitab = _mm_slli_epi32(ewitab,2);
2529 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2530 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2531 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2532 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2533 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2534 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2535 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2536 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2537 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2539 d = _mm_sub_ps(r32,rswitch);
2540 d = _mm_max_ps(d,_mm_setzero_ps());
2541 d2 = _mm_mul_ps(d,d);
2542 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2544 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2546 /* Evaluate switch function */
2547 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2548 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2549 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2553 fscal = _mm_and_ps(fscal,cutoff_mask);
2555 /* Calculate temporary vectorial force */
2556 tx = _mm_mul_ps(fscal,dx32);
2557 ty = _mm_mul_ps(fscal,dy32);
2558 tz = _mm_mul_ps(fscal,dz32);
2560 /* Update vectorial force */
2561 fix3 = _mm_add_ps(fix3,tx);
2562 fiy3 = _mm_add_ps(fiy3,ty);
2563 fiz3 = _mm_add_ps(fiz3,tz);
2565 fjx2 = _mm_add_ps(fjx2,tx);
2566 fjy2 = _mm_add_ps(fjy2,ty);
2567 fjz2 = _mm_add_ps(fjz2,tz);
2571 /**************************
2572 * CALCULATE INTERACTIONS *
2573 **************************/
2575 if (gmx_mm_any_lt(rsq33,rcutoff2))
2578 r33 = _mm_mul_ps(rsq33,rinv33);
2580 /* EWALD ELECTROSTATICS */
2582 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2583 ewrt = _mm_mul_ps(r33,ewtabscale);
2584 ewitab = _mm_cvttps_epi32(ewrt);
2585 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2586 ewitab = _mm_slli_epi32(ewitab,2);
2587 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2588 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2589 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2590 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2591 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2592 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2593 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2594 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2595 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2597 d = _mm_sub_ps(r33,rswitch);
2598 d = _mm_max_ps(d,_mm_setzero_ps());
2599 d2 = _mm_mul_ps(d,d);
2600 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2602 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2604 /* Evaluate switch function */
2605 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2606 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2607 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2611 fscal = _mm_and_ps(fscal,cutoff_mask);
2613 /* Calculate temporary vectorial force */
2614 tx = _mm_mul_ps(fscal,dx33);
2615 ty = _mm_mul_ps(fscal,dy33);
2616 tz = _mm_mul_ps(fscal,dz33);
2618 /* Update vectorial force */
2619 fix3 = _mm_add_ps(fix3,tx);
2620 fiy3 = _mm_add_ps(fiy3,ty);
2621 fiz3 = _mm_add_ps(fiz3,tz);
2623 fjx3 = _mm_add_ps(fjx3,tx);
2624 fjy3 = _mm_add_ps(fjy3,ty);
2625 fjz3 = _mm_add_ps(fjz3,tz);
2629 fjptrA = f+j_coord_offsetA;
2630 fjptrB = f+j_coord_offsetB;
2631 fjptrC = f+j_coord_offsetC;
2632 fjptrD = f+j_coord_offsetD;
2634 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2635 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2636 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2638 /* Inner loop uses 617 flops */
2641 if(jidx<j_index_end)
2644 /* Get j neighbor index, and coordinate index */
2645 jnrlistA = jjnr[jidx];
2646 jnrlistB = jjnr[jidx+1];
2647 jnrlistC = jjnr[jidx+2];
2648 jnrlistD = jjnr[jidx+3];
2649 /* Sign of each element will be negative for non-real atoms.
2650 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2651 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2653 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2654 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2655 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2656 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2657 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2658 j_coord_offsetA = DIM*jnrA;
2659 j_coord_offsetB = DIM*jnrB;
2660 j_coord_offsetC = DIM*jnrC;
2661 j_coord_offsetD = DIM*jnrD;
2663 /* load j atom coordinates */
2664 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2665 x+j_coord_offsetC,x+j_coord_offsetD,
2666 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2667 &jy2,&jz2,&jx3,&jy3,&jz3);
2669 /* Calculate displacement vector */
2670 dx00 = _mm_sub_ps(ix0,jx0);
2671 dy00 = _mm_sub_ps(iy0,jy0);
2672 dz00 = _mm_sub_ps(iz0,jz0);
2673 dx11 = _mm_sub_ps(ix1,jx1);
2674 dy11 = _mm_sub_ps(iy1,jy1);
2675 dz11 = _mm_sub_ps(iz1,jz1);
2676 dx12 = _mm_sub_ps(ix1,jx2);
2677 dy12 = _mm_sub_ps(iy1,jy2);
2678 dz12 = _mm_sub_ps(iz1,jz2);
2679 dx13 = _mm_sub_ps(ix1,jx3);
2680 dy13 = _mm_sub_ps(iy1,jy3);
2681 dz13 = _mm_sub_ps(iz1,jz3);
2682 dx21 = _mm_sub_ps(ix2,jx1);
2683 dy21 = _mm_sub_ps(iy2,jy1);
2684 dz21 = _mm_sub_ps(iz2,jz1);
2685 dx22 = _mm_sub_ps(ix2,jx2);
2686 dy22 = _mm_sub_ps(iy2,jy2);
2687 dz22 = _mm_sub_ps(iz2,jz2);
2688 dx23 = _mm_sub_ps(ix2,jx3);
2689 dy23 = _mm_sub_ps(iy2,jy3);
2690 dz23 = _mm_sub_ps(iz2,jz3);
2691 dx31 = _mm_sub_ps(ix3,jx1);
2692 dy31 = _mm_sub_ps(iy3,jy1);
2693 dz31 = _mm_sub_ps(iz3,jz1);
2694 dx32 = _mm_sub_ps(ix3,jx2);
2695 dy32 = _mm_sub_ps(iy3,jy2);
2696 dz32 = _mm_sub_ps(iz3,jz2);
2697 dx33 = _mm_sub_ps(ix3,jx3);
2698 dy33 = _mm_sub_ps(iy3,jy3);
2699 dz33 = _mm_sub_ps(iz3,jz3);
2701 /* Calculate squared distance and things based on it */
2702 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2703 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2704 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2705 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2706 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2707 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2708 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2709 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2710 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2711 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2713 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2714 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2715 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2716 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2717 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2718 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2719 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2720 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2721 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2722 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2724 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2725 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2726 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2727 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2728 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2729 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2730 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2731 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2732 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2733 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2735 fjx0 = _mm_setzero_ps();
2736 fjy0 = _mm_setzero_ps();
2737 fjz0 = _mm_setzero_ps();
2738 fjx1 = _mm_setzero_ps();
2739 fjy1 = _mm_setzero_ps();
2740 fjz1 = _mm_setzero_ps();
2741 fjx2 = _mm_setzero_ps();
2742 fjy2 = _mm_setzero_ps();
2743 fjz2 = _mm_setzero_ps();
2744 fjx3 = _mm_setzero_ps();
2745 fjy3 = _mm_setzero_ps();
2746 fjz3 = _mm_setzero_ps();
2748 /**************************
2749 * CALCULATE INTERACTIONS *
2750 **************************/
2752 if (gmx_mm_any_lt(rsq00,rcutoff2))
2755 r00 = _mm_mul_ps(rsq00,rinv00);
2756 r00 = _mm_andnot_ps(dummy_mask,r00);
2758 /* LENNARD-JONES DISPERSION/REPULSION */
2760 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2761 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2762 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2763 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2764 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2766 d = _mm_sub_ps(r00,rswitch);
2767 d = _mm_max_ps(d,_mm_setzero_ps());
2768 d2 = _mm_mul_ps(d,d);
2769 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2771 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2773 /* Evaluate switch function */
2774 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2775 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2776 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2780 fscal = _mm_and_ps(fscal,cutoff_mask);
2782 fscal = _mm_andnot_ps(dummy_mask,fscal);
2784 /* Calculate temporary vectorial force */
2785 tx = _mm_mul_ps(fscal,dx00);
2786 ty = _mm_mul_ps(fscal,dy00);
2787 tz = _mm_mul_ps(fscal,dz00);
2789 /* Update vectorial force */
2790 fix0 = _mm_add_ps(fix0,tx);
2791 fiy0 = _mm_add_ps(fiy0,ty);
2792 fiz0 = _mm_add_ps(fiz0,tz);
2794 fjx0 = _mm_add_ps(fjx0,tx);
2795 fjy0 = _mm_add_ps(fjy0,ty);
2796 fjz0 = _mm_add_ps(fjz0,tz);
2800 /**************************
2801 * CALCULATE INTERACTIONS *
2802 **************************/
2804 if (gmx_mm_any_lt(rsq11,rcutoff2))
2807 r11 = _mm_mul_ps(rsq11,rinv11);
2808 r11 = _mm_andnot_ps(dummy_mask,r11);
2810 /* EWALD ELECTROSTATICS */
2812 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2813 ewrt = _mm_mul_ps(r11,ewtabscale);
2814 ewitab = _mm_cvttps_epi32(ewrt);
2815 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2816 ewitab = _mm_slli_epi32(ewitab,2);
2817 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2818 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2819 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2820 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2821 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2822 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2823 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2824 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2825 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2827 d = _mm_sub_ps(r11,rswitch);
2828 d = _mm_max_ps(d,_mm_setzero_ps());
2829 d2 = _mm_mul_ps(d,d);
2830 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2832 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2834 /* Evaluate switch function */
2835 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2836 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2837 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2841 fscal = _mm_and_ps(fscal,cutoff_mask);
2843 fscal = _mm_andnot_ps(dummy_mask,fscal);
2845 /* Calculate temporary vectorial force */
2846 tx = _mm_mul_ps(fscal,dx11);
2847 ty = _mm_mul_ps(fscal,dy11);
2848 tz = _mm_mul_ps(fscal,dz11);
2850 /* Update vectorial force */
2851 fix1 = _mm_add_ps(fix1,tx);
2852 fiy1 = _mm_add_ps(fiy1,ty);
2853 fiz1 = _mm_add_ps(fiz1,tz);
2855 fjx1 = _mm_add_ps(fjx1,tx);
2856 fjy1 = _mm_add_ps(fjy1,ty);
2857 fjz1 = _mm_add_ps(fjz1,tz);
2861 /**************************
2862 * CALCULATE INTERACTIONS *
2863 **************************/
2865 if (gmx_mm_any_lt(rsq12,rcutoff2))
2868 r12 = _mm_mul_ps(rsq12,rinv12);
2869 r12 = _mm_andnot_ps(dummy_mask,r12);
2871 /* EWALD ELECTROSTATICS */
2873 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2874 ewrt = _mm_mul_ps(r12,ewtabscale);
2875 ewitab = _mm_cvttps_epi32(ewrt);
2876 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2877 ewitab = _mm_slli_epi32(ewitab,2);
2878 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2879 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2880 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2881 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2882 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2883 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2884 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2885 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2886 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2888 d = _mm_sub_ps(r12,rswitch);
2889 d = _mm_max_ps(d,_mm_setzero_ps());
2890 d2 = _mm_mul_ps(d,d);
2891 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2893 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2895 /* Evaluate switch function */
2896 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2897 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2898 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2902 fscal = _mm_and_ps(fscal,cutoff_mask);
2904 fscal = _mm_andnot_ps(dummy_mask,fscal);
2906 /* Calculate temporary vectorial force */
2907 tx = _mm_mul_ps(fscal,dx12);
2908 ty = _mm_mul_ps(fscal,dy12);
2909 tz = _mm_mul_ps(fscal,dz12);
2911 /* Update vectorial force */
2912 fix1 = _mm_add_ps(fix1,tx);
2913 fiy1 = _mm_add_ps(fiy1,ty);
2914 fiz1 = _mm_add_ps(fiz1,tz);
2916 fjx2 = _mm_add_ps(fjx2,tx);
2917 fjy2 = _mm_add_ps(fjy2,ty);
2918 fjz2 = _mm_add_ps(fjz2,tz);
2922 /**************************
2923 * CALCULATE INTERACTIONS *
2924 **************************/
2926 if (gmx_mm_any_lt(rsq13,rcutoff2))
2929 r13 = _mm_mul_ps(rsq13,rinv13);
2930 r13 = _mm_andnot_ps(dummy_mask,r13);
2932 /* EWALD ELECTROSTATICS */
2934 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2935 ewrt = _mm_mul_ps(r13,ewtabscale);
2936 ewitab = _mm_cvttps_epi32(ewrt);
2937 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2938 ewitab = _mm_slli_epi32(ewitab,2);
2939 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2940 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2941 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2942 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2943 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2944 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2945 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2946 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2947 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2949 d = _mm_sub_ps(r13,rswitch);
2950 d = _mm_max_ps(d,_mm_setzero_ps());
2951 d2 = _mm_mul_ps(d,d);
2952 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2954 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2956 /* Evaluate switch function */
2957 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2958 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2959 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2963 fscal = _mm_and_ps(fscal,cutoff_mask);
2965 fscal = _mm_andnot_ps(dummy_mask,fscal);
2967 /* Calculate temporary vectorial force */
2968 tx = _mm_mul_ps(fscal,dx13);
2969 ty = _mm_mul_ps(fscal,dy13);
2970 tz = _mm_mul_ps(fscal,dz13);
2972 /* Update vectorial force */
2973 fix1 = _mm_add_ps(fix1,tx);
2974 fiy1 = _mm_add_ps(fiy1,ty);
2975 fiz1 = _mm_add_ps(fiz1,tz);
2977 fjx3 = _mm_add_ps(fjx3,tx);
2978 fjy3 = _mm_add_ps(fjy3,ty);
2979 fjz3 = _mm_add_ps(fjz3,tz);
2983 /**************************
2984 * CALCULATE INTERACTIONS *
2985 **************************/
2987 if (gmx_mm_any_lt(rsq21,rcutoff2))
2990 r21 = _mm_mul_ps(rsq21,rinv21);
2991 r21 = _mm_andnot_ps(dummy_mask,r21);
2993 /* EWALD ELECTROSTATICS */
2995 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2996 ewrt = _mm_mul_ps(r21,ewtabscale);
2997 ewitab = _mm_cvttps_epi32(ewrt);
2998 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2999 ewitab = _mm_slli_epi32(ewitab,2);
3000 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3001 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3002 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3003 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3004 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3005 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3006 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3007 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3008 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3010 d = _mm_sub_ps(r21,rswitch);
3011 d = _mm_max_ps(d,_mm_setzero_ps());
3012 d2 = _mm_mul_ps(d,d);
3013 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3015 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3017 /* Evaluate switch function */
3018 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3019 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3020 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
3024 fscal = _mm_and_ps(fscal,cutoff_mask);
3026 fscal = _mm_andnot_ps(dummy_mask,fscal);
3028 /* Calculate temporary vectorial force */
3029 tx = _mm_mul_ps(fscal,dx21);
3030 ty = _mm_mul_ps(fscal,dy21);
3031 tz = _mm_mul_ps(fscal,dz21);
3033 /* Update vectorial force */
3034 fix2 = _mm_add_ps(fix2,tx);
3035 fiy2 = _mm_add_ps(fiy2,ty);
3036 fiz2 = _mm_add_ps(fiz2,tz);
3038 fjx1 = _mm_add_ps(fjx1,tx);
3039 fjy1 = _mm_add_ps(fjy1,ty);
3040 fjz1 = _mm_add_ps(fjz1,tz);
3044 /**************************
3045 * CALCULATE INTERACTIONS *
3046 **************************/
3048 if (gmx_mm_any_lt(rsq22,rcutoff2))
3051 r22 = _mm_mul_ps(rsq22,rinv22);
3052 r22 = _mm_andnot_ps(dummy_mask,r22);
3054 /* EWALD ELECTROSTATICS */
3056 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3057 ewrt = _mm_mul_ps(r22,ewtabscale);
3058 ewitab = _mm_cvttps_epi32(ewrt);
3059 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3060 ewitab = _mm_slli_epi32(ewitab,2);
3061 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3062 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3063 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3064 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3065 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3066 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3067 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3068 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3069 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3071 d = _mm_sub_ps(r22,rswitch);
3072 d = _mm_max_ps(d,_mm_setzero_ps());
3073 d2 = _mm_mul_ps(d,d);
3074 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3076 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3078 /* Evaluate switch function */
3079 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3080 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3081 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3085 fscal = _mm_and_ps(fscal,cutoff_mask);
3087 fscal = _mm_andnot_ps(dummy_mask,fscal);
3089 /* Calculate temporary vectorial force */
3090 tx = _mm_mul_ps(fscal,dx22);
3091 ty = _mm_mul_ps(fscal,dy22);
3092 tz = _mm_mul_ps(fscal,dz22);
3094 /* Update vectorial force */
3095 fix2 = _mm_add_ps(fix2,tx);
3096 fiy2 = _mm_add_ps(fiy2,ty);
3097 fiz2 = _mm_add_ps(fiz2,tz);
3099 fjx2 = _mm_add_ps(fjx2,tx);
3100 fjy2 = _mm_add_ps(fjy2,ty);
3101 fjz2 = _mm_add_ps(fjz2,tz);
3105 /**************************
3106 * CALCULATE INTERACTIONS *
3107 **************************/
3109 if (gmx_mm_any_lt(rsq23,rcutoff2))
3112 r23 = _mm_mul_ps(rsq23,rinv23);
3113 r23 = _mm_andnot_ps(dummy_mask,r23);
3115 /* EWALD ELECTROSTATICS */
3117 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3118 ewrt = _mm_mul_ps(r23,ewtabscale);
3119 ewitab = _mm_cvttps_epi32(ewrt);
3120 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3121 ewitab = _mm_slli_epi32(ewitab,2);
3122 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3123 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3124 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3125 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3126 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3127 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3128 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3129 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
3130 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
3132 d = _mm_sub_ps(r23,rswitch);
3133 d = _mm_max_ps(d,_mm_setzero_ps());
3134 d2 = _mm_mul_ps(d,d);
3135 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3137 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3139 /* Evaluate switch function */
3140 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3141 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
3142 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
3146 fscal = _mm_and_ps(fscal,cutoff_mask);
3148 fscal = _mm_andnot_ps(dummy_mask,fscal);
3150 /* Calculate temporary vectorial force */
3151 tx = _mm_mul_ps(fscal,dx23);
3152 ty = _mm_mul_ps(fscal,dy23);
3153 tz = _mm_mul_ps(fscal,dz23);
3155 /* Update vectorial force */
3156 fix2 = _mm_add_ps(fix2,tx);
3157 fiy2 = _mm_add_ps(fiy2,ty);
3158 fiz2 = _mm_add_ps(fiz2,tz);
3160 fjx3 = _mm_add_ps(fjx3,tx);
3161 fjy3 = _mm_add_ps(fjy3,ty);
3162 fjz3 = _mm_add_ps(fjz3,tz);
3166 /**************************
3167 * CALCULATE INTERACTIONS *
3168 **************************/
3170 if (gmx_mm_any_lt(rsq31,rcutoff2))
3173 r31 = _mm_mul_ps(rsq31,rinv31);
3174 r31 = _mm_andnot_ps(dummy_mask,r31);
3176 /* EWALD ELECTROSTATICS */
3178 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3179 ewrt = _mm_mul_ps(r31,ewtabscale);
3180 ewitab = _mm_cvttps_epi32(ewrt);
3181 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3182 ewitab = _mm_slli_epi32(ewitab,2);
3183 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3184 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3185 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3186 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3187 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3188 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3189 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3190 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
3191 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
3193 d = _mm_sub_ps(r31,rswitch);
3194 d = _mm_max_ps(d,_mm_setzero_ps());
3195 d2 = _mm_mul_ps(d,d);
3196 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3198 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3200 /* Evaluate switch function */
3201 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3202 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
3203 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
3207 fscal = _mm_and_ps(fscal,cutoff_mask);
3209 fscal = _mm_andnot_ps(dummy_mask,fscal);
3211 /* Calculate temporary vectorial force */
3212 tx = _mm_mul_ps(fscal,dx31);
3213 ty = _mm_mul_ps(fscal,dy31);
3214 tz = _mm_mul_ps(fscal,dz31);
3216 /* Update vectorial force */
3217 fix3 = _mm_add_ps(fix3,tx);
3218 fiy3 = _mm_add_ps(fiy3,ty);
3219 fiz3 = _mm_add_ps(fiz3,tz);
3221 fjx1 = _mm_add_ps(fjx1,tx);
3222 fjy1 = _mm_add_ps(fjy1,ty);
3223 fjz1 = _mm_add_ps(fjz1,tz);
3227 /**************************
3228 * CALCULATE INTERACTIONS *
3229 **************************/
3231 if (gmx_mm_any_lt(rsq32,rcutoff2))
3234 r32 = _mm_mul_ps(rsq32,rinv32);
3235 r32 = _mm_andnot_ps(dummy_mask,r32);
3237 /* EWALD ELECTROSTATICS */
3239 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3240 ewrt = _mm_mul_ps(r32,ewtabscale);
3241 ewitab = _mm_cvttps_epi32(ewrt);
3242 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3243 ewitab = _mm_slli_epi32(ewitab,2);
3244 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3245 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3246 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3247 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3248 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3249 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3250 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3251 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
3252 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
3254 d = _mm_sub_ps(r32,rswitch);
3255 d = _mm_max_ps(d,_mm_setzero_ps());
3256 d2 = _mm_mul_ps(d,d);
3257 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3259 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3261 /* Evaluate switch function */
3262 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3263 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
3264 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
3268 fscal = _mm_and_ps(fscal,cutoff_mask);
3270 fscal = _mm_andnot_ps(dummy_mask,fscal);
3272 /* Calculate temporary vectorial force */
3273 tx = _mm_mul_ps(fscal,dx32);
3274 ty = _mm_mul_ps(fscal,dy32);
3275 tz = _mm_mul_ps(fscal,dz32);
3277 /* Update vectorial force */
3278 fix3 = _mm_add_ps(fix3,tx);
3279 fiy3 = _mm_add_ps(fiy3,ty);
3280 fiz3 = _mm_add_ps(fiz3,tz);
3282 fjx2 = _mm_add_ps(fjx2,tx);
3283 fjy2 = _mm_add_ps(fjy2,ty);
3284 fjz2 = _mm_add_ps(fjz2,tz);
3288 /**************************
3289 * CALCULATE INTERACTIONS *
3290 **************************/
3292 if (gmx_mm_any_lt(rsq33,rcutoff2))
3295 r33 = _mm_mul_ps(rsq33,rinv33);
3296 r33 = _mm_andnot_ps(dummy_mask,r33);
3298 /* EWALD ELECTROSTATICS */
3300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3301 ewrt = _mm_mul_ps(r33,ewtabscale);
3302 ewitab = _mm_cvttps_epi32(ewrt);
3303 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3304 ewitab = _mm_slli_epi32(ewitab,2);
3305 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3306 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3307 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3308 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3309 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3310 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3311 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3312 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3313 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3315 d = _mm_sub_ps(r33,rswitch);
3316 d = _mm_max_ps(d,_mm_setzero_ps());
3317 d2 = _mm_mul_ps(d,d);
3318 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3320 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3322 /* Evaluate switch function */
3323 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3324 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3325 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
3329 fscal = _mm_and_ps(fscal,cutoff_mask);
3331 fscal = _mm_andnot_ps(dummy_mask,fscal);
3333 /* Calculate temporary vectorial force */
3334 tx = _mm_mul_ps(fscal,dx33);
3335 ty = _mm_mul_ps(fscal,dy33);
3336 tz = _mm_mul_ps(fscal,dz33);
3338 /* Update vectorial force */
3339 fix3 = _mm_add_ps(fix3,tx);
3340 fiy3 = _mm_add_ps(fiy3,ty);
3341 fiz3 = _mm_add_ps(fiz3,tz);
3343 fjx3 = _mm_add_ps(fjx3,tx);
3344 fjy3 = _mm_add_ps(fjy3,ty);
3345 fjz3 = _mm_add_ps(fjz3,tz);
3349 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3350 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3351 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3352 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3354 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3355 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3356 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3358 /* Inner loop uses 627 flops */
3361 /* End of innermost loop */
3363 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3364 f+i_coord_offset,fshift+i_shift_offset);
3366 /* Increment number of inner iterations */
3367 inneriter += j_index_end - j_index_start;
3369 /* Outer loop uses 24 flops */
3372 /* Increment number of outer iterations */
3375 /* Update outer/inner flops */
3377 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);