2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
118 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
120 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
121 real rswitch_scalar,d_scalar;
122 __m128 dummy_mask,cutoff_mask;
123 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
124 __m128 one = _mm_set1_ps(1.0);
125 __m128 two = _mm_set1_ps(2.0);
131 jindex = nlist->jindex;
133 shiftidx = nlist->shift;
135 shiftvec = fr->shift_vec[0];
136 fshift = fr->fshift[0];
137 facel = _mm_set1_ps(fr->epsfac);
138 charge = mdatoms->chargeA;
139 nvdwtype = fr->ntype;
141 vdwtype = mdatoms->typeA;
143 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
144 ewtab = fr->ic->tabq_coul_FDV0;
145 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
146 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
148 /* Setup water-specific parameters */
149 inr = nlist->iinr[0];
150 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
151 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
152 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
153 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
155 jq1 = _mm_set1_ps(charge[inr+1]);
156 jq2 = _mm_set1_ps(charge[inr+2]);
157 jq3 = _mm_set1_ps(charge[inr+3]);
158 vdwjidx0A = 2*vdwtype[inr+0];
159 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
160 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
161 qq11 = _mm_mul_ps(iq1,jq1);
162 qq12 = _mm_mul_ps(iq1,jq2);
163 qq13 = _mm_mul_ps(iq1,jq3);
164 qq21 = _mm_mul_ps(iq2,jq1);
165 qq22 = _mm_mul_ps(iq2,jq2);
166 qq23 = _mm_mul_ps(iq2,jq3);
167 qq31 = _mm_mul_ps(iq3,jq1);
168 qq32 = _mm_mul_ps(iq3,jq2);
169 qq33 = _mm_mul_ps(iq3,jq3);
171 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
172 rcutoff_scalar = fr->rcoulomb;
173 rcutoff = _mm_set1_ps(rcutoff_scalar);
174 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
176 rswitch_scalar = fr->rcoulomb_switch;
177 rswitch = _mm_set1_ps(rswitch_scalar);
178 /* Setup switch parameters */
179 d_scalar = rcutoff_scalar-rswitch_scalar;
180 d = _mm_set1_ps(d_scalar);
181 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
182 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
183 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
184 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
185 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
186 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
188 /* Avoid stupid compiler warnings */
189 jnrA = jnrB = jnrC = jnrD = 0;
198 for(iidx=0;iidx<4*DIM;iidx++)
203 /* Start outer loop over neighborlists */
204 for(iidx=0; iidx<nri; iidx++)
206 /* Load shift vector for this list */
207 i_shift_offset = DIM*shiftidx[iidx];
209 /* Load limits for loop over neighbors */
210 j_index_start = jindex[iidx];
211 j_index_end = jindex[iidx+1];
213 /* Get outer coordinate index */
215 i_coord_offset = DIM*inr;
217 /* Load i particle coords and add shift vector */
218 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
219 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
221 fix0 = _mm_setzero_ps();
222 fiy0 = _mm_setzero_ps();
223 fiz0 = _mm_setzero_ps();
224 fix1 = _mm_setzero_ps();
225 fiy1 = _mm_setzero_ps();
226 fiz1 = _mm_setzero_ps();
227 fix2 = _mm_setzero_ps();
228 fiy2 = _mm_setzero_ps();
229 fiz2 = _mm_setzero_ps();
230 fix3 = _mm_setzero_ps();
231 fiy3 = _mm_setzero_ps();
232 fiz3 = _mm_setzero_ps();
234 /* Reset potential sums */
235 velecsum = _mm_setzero_ps();
236 vvdwsum = _mm_setzero_ps();
238 /* Start inner kernel loop */
239 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
242 /* Get j neighbor index, and coordinate index */
247 j_coord_offsetA = DIM*jnrA;
248 j_coord_offsetB = DIM*jnrB;
249 j_coord_offsetC = DIM*jnrC;
250 j_coord_offsetD = DIM*jnrD;
252 /* load j atom coordinates */
253 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
254 x+j_coord_offsetC,x+j_coord_offsetD,
255 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
256 &jy2,&jz2,&jx3,&jy3,&jz3);
258 /* Calculate displacement vector */
259 dx00 = _mm_sub_ps(ix0,jx0);
260 dy00 = _mm_sub_ps(iy0,jy0);
261 dz00 = _mm_sub_ps(iz0,jz0);
262 dx11 = _mm_sub_ps(ix1,jx1);
263 dy11 = _mm_sub_ps(iy1,jy1);
264 dz11 = _mm_sub_ps(iz1,jz1);
265 dx12 = _mm_sub_ps(ix1,jx2);
266 dy12 = _mm_sub_ps(iy1,jy2);
267 dz12 = _mm_sub_ps(iz1,jz2);
268 dx13 = _mm_sub_ps(ix1,jx3);
269 dy13 = _mm_sub_ps(iy1,jy3);
270 dz13 = _mm_sub_ps(iz1,jz3);
271 dx21 = _mm_sub_ps(ix2,jx1);
272 dy21 = _mm_sub_ps(iy2,jy1);
273 dz21 = _mm_sub_ps(iz2,jz1);
274 dx22 = _mm_sub_ps(ix2,jx2);
275 dy22 = _mm_sub_ps(iy2,jy2);
276 dz22 = _mm_sub_ps(iz2,jz2);
277 dx23 = _mm_sub_ps(ix2,jx3);
278 dy23 = _mm_sub_ps(iy2,jy3);
279 dz23 = _mm_sub_ps(iz2,jz3);
280 dx31 = _mm_sub_ps(ix3,jx1);
281 dy31 = _mm_sub_ps(iy3,jy1);
282 dz31 = _mm_sub_ps(iz3,jz1);
283 dx32 = _mm_sub_ps(ix3,jx2);
284 dy32 = _mm_sub_ps(iy3,jy2);
285 dz32 = _mm_sub_ps(iz3,jz2);
286 dx33 = _mm_sub_ps(ix3,jx3);
287 dy33 = _mm_sub_ps(iy3,jy3);
288 dz33 = _mm_sub_ps(iz3,jz3);
290 /* Calculate squared distance and things based on it */
291 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
292 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
293 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
294 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
295 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
296 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
297 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
298 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
299 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
300 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
302 rinv00 = gmx_mm_invsqrt_ps(rsq00);
303 rinv11 = gmx_mm_invsqrt_ps(rsq11);
304 rinv12 = gmx_mm_invsqrt_ps(rsq12);
305 rinv13 = gmx_mm_invsqrt_ps(rsq13);
306 rinv21 = gmx_mm_invsqrt_ps(rsq21);
307 rinv22 = gmx_mm_invsqrt_ps(rsq22);
308 rinv23 = gmx_mm_invsqrt_ps(rsq23);
309 rinv31 = gmx_mm_invsqrt_ps(rsq31);
310 rinv32 = gmx_mm_invsqrt_ps(rsq32);
311 rinv33 = gmx_mm_invsqrt_ps(rsq33);
313 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
314 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
315 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
316 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
317 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
318 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
319 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
320 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
321 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
322 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
324 fjx0 = _mm_setzero_ps();
325 fjy0 = _mm_setzero_ps();
326 fjz0 = _mm_setzero_ps();
327 fjx1 = _mm_setzero_ps();
328 fjy1 = _mm_setzero_ps();
329 fjz1 = _mm_setzero_ps();
330 fjx2 = _mm_setzero_ps();
331 fjy2 = _mm_setzero_ps();
332 fjz2 = _mm_setzero_ps();
333 fjx3 = _mm_setzero_ps();
334 fjy3 = _mm_setzero_ps();
335 fjz3 = _mm_setzero_ps();
337 /**************************
338 * CALCULATE INTERACTIONS *
339 **************************/
341 if (gmx_mm_any_lt(rsq00,rcutoff2))
344 r00 = _mm_mul_ps(rsq00,rinv00);
346 /* LENNARD-JONES DISPERSION/REPULSION */
348 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
349 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
350 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
351 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
352 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
354 d = _mm_sub_ps(r00,rswitch);
355 d = _mm_max_ps(d,_mm_setzero_ps());
356 d2 = _mm_mul_ps(d,d);
357 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
359 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
361 /* Evaluate switch function */
362 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
363 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
364 vvdw = _mm_mul_ps(vvdw,sw);
365 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
367 /* Update potential sum for this i atom from the interaction with this j atom. */
368 vvdw = _mm_and_ps(vvdw,cutoff_mask);
369 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
373 fscal = _mm_and_ps(fscal,cutoff_mask);
375 /* Calculate temporary vectorial force */
376 tx = _mm_mul_ps(fscal,dx00);
377 ty = _mm_mul_ps(fscal,dy00);
378 tz = _mm_mul_ps(fscal,dz00);
380 /* Update vectorial force */
381 fix0 = _mm_add_ps(fix0,tx);
382 fiy0 = _mm_add_ps(fiy0,ty);
383 fiz0 = _mm_add_ps(fiz0,tz);
385 fjx0 = _mm_add_ps(fjx0,tx);
386 fjy0 = _mm_add_ps(fjy0,ty);
387 fjz0 = _mm_add_ps(fjz0,tz);
391 /**************************
392 * CALCULATE INTERACTIONS *
393 **************************/
395 if (gmx_mm_any_lt(rsq11,rcutoff2))
398 r11 = _mm_mul_ps(rsq11,rinv11);
400 /* EWALD ELECTROSTATICS */
402 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
403 ewrt = _mm_mul_ps(r11,ewtabscale);
404 ewitab = _mm_cvttps_epi32(ewrt);
405 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
406 ewitab = _mm_slli_epi32(ewitab,2);
407 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
408 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
409 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
410 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
411 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
412 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
413 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
414 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
415 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
417 d = _mm_sub_ps(r11,rswitch);
418 d = _mm_max_ps(d,_mm_setzero_ps());
419 d2 = _mm_mul_ps(d,d);
420 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
422 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
424 /* Evaluate switch function */
425 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
426 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
427 velec = _mm_mul_ps(velec,sw);
428 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velec = _mm_and_ps(velec,cutoff_mask);
432 velecsum = _mm_add_ps(velecsum,velec);
436 fscal = _mm_and_ps(fscal,cutoff_mask);
438 /* Calculate temporary vectorial force */
439 tx = _mm_mul_ps(fscal,dx11);
440 ty = _mm_mul_ps(fscal,dy11);
441 tz = _mm_mul_ps(fscal,dz11);
443 /* Update vectorial force */
444 fix1 = _mm_add_ps(fix1,tx);
445 fiy1 = _mm_add_ps(fiy1,ty);
446 fiz1 = _mm_add_ps(fiz1,tz);
448 fjx1 = _mm_add_ps(fjx1,tx);
449 fjy1 = _mm_add_ps(fjy1,ty);
450 fjz1 = _mm_add_ps(fjz1,tz);
454 /**************************
455 * CALCULATE INTERACTIONS *
456 **************************/
458 if (gmx_mm_any_lt(rsq12,rcutoff2))
461 r12 = _mm_mul_ps(rsq12,rinv12);
463 /* EWALD ELECTROSTATICS */
465 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
466 ewrt = _mm_mul_ps(r12,ewtabscale);
467 ewitab = _mm_cvttps_epi32(ewrt);
468 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
469 ewitab = _mm_slli_epi32(ewitab,2);
470 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
471 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
472 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
473 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
474 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
475 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
476 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
477 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
478 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
480 d = _mm_sub_ps(r12,rswitch);
481 d = _mm_max_ps(d,_mm_setzero_ps());
482 d2 = _mm_mul_ps(d,d);
483 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
485 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
487 /* Evaluate switch function */
488 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
489 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
490 velec = _mm_mul_ps(velec,sw);
491 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
493 /* Update potential sum for this i atom from the interaction with this j atom. */
494 velec = _mm_and_ps(velec,cutoff_mask);
495 velecsum = _mm_add_ps(velecsum,velec);
499 fscal = _mm_and_ps(fscal,cutoff_mask);
501 /* Calculate temporary vectorial force */
502 tx = _mm_mul_ps(fscal,dx12);
503 ty = _mm_mul_ps(fscal,dy12);
504 tz = _mm_mul_ps(fscal,dz12);
506 /* Update vectorial force */
507 fix1 = _mm_add_ps(fix1,tx);
508 fiy1 = _mm_add_ps(fiy1,ty);
509 fiz1 = _mm_add_ps(fiz1,tz);
511 fjx2 = _mm_add_ps(fjx2,tx);
512 fjy2 = _mm_add_ps(fjy2,ty);
513 fjz2 = _mm_add_ps(fjz2,tz);
517 /**************************
518 * CALCULATE INTERACTIONS *
519 **************************/
521 if (gmx_mm_any_lt(rsq13,rcutoff2))
524 r13 = _mm_mul_ps(rsq13,rinv13);
526 /* EWALD ELECTROSTATICS */
528 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
529 ewrt = _mm_mul_ps(r13,ewtabscale);
530 ewitab = _mm_cvttps_epi32(ewrt);
531 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
532 ewitab = _mm_slli_epi32(ewitab,2);
533 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
534 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
535 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
536 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
537 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
538 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
539 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
540 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
541 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
543 d = _mm_sub_ps(r13,rswitch);
544 d = _mm_max_ps(d,_mm_setzero_ps());
545 d2 = _mm_mul_ps(d,d);
546 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
548 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
550 /* Evaluate switch function */
551 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
552 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
553 velec = _mm_mul_ps(velec,sw);
554 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
556 /* Update potential sum for this i atom from the interaction with this j atom. */
557 velec = _mm_and_ps(velec,cutoff_mask);
558 velecsum = _mm_add_ps(velecsum,velec);
562 fscal = _mm_and_ps(fscal,cutoff_mask);
564 /* Calculate temporary vectorial force */
565 tx = _mm_mul_ps(fscal,dx13);
566 ty = _mm_mul_ps(fscal,dy13);
567 tz = _mm_mul_ps(fscal,dz13);
569 /* Update vectorial force */
570 fix1 = _mm_add_ps(fix1,tx);
571 fiy1 = _mm_add_ps(fiy1,ty);
572 fiz1 = _mm_add_ps(fiz1,tz);
574 fjx3 = _mm_add_ps(fjx3,tx);
575 fjy3 = _mm_add_ps(fjy3,ty);
576 fjz3 = _mm_add_ps(fjz3,tz);
580 /**************************
581 * CALCULATE INTERACTIONS *
582 **************************/
584 if (gmx_mm_any_lt(rsq21,rcutoff2))
587 r21 = _mm_mul_ps(rsq21,rinv21);
589 /* EWALD ELECTROSTATICS */
591 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
592 ewrt = _mm_mul_ps(r21,ewtabscale);
593 ewitab = _mm_cvttps_epi32(ewrt);
594 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
595 ewitab = _mm_slli_epi32(ewitab,2);
596 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
597 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
598 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
599 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
600 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
601 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
602 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
603 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
604 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
606 d = _mm_sub_ps(r21,rswitch);
607 d = _mm_max_ps(d,_mm_setzero_ps());
608 d2 = _mm_mul_ps(d,d);
609 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
611 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
613 /* Evaluate switch function */
614 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
615 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
616 velec = _mm_mul_ps(velec,sw);
617 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
619 /* Update potential sum for this i atom from the interaction with this j atom. */
620 velec = _mm_and_ps(velec,cutoff_mask);
621 velecsum = _mm_add_ps(velecsum,velec);
625 fscal = _mm_and_ps(fscal,cutoff_mask);
627 /* Calculate temporary vectorial force */
628 tx = _mm_mul_ps(fscal,dx21);
629 ty = _mm_mul_ps(fscal,dy21);
630 tz = _mm_mul_ps(fscal,dz21);
632 /* Update vectorial force */
633 fix2 = _mm_add_ps(fix2,tx);
634 fiy2 = _mm_add_ps(fiy2,ty);
635 fiz2 = _mm_add_ps(fiz2,tz);
637 fjx1 = _mm_add_ps(fjx1,tx);
638 fjy1 = _mm_add_ps(fjy1,ty);
639 fjz1 = _mm_add_ps(fjz1,tz);
643 /**************************
644 * CALCULATE INTERACTIONS *
645 **************************/
647 if (gmx_mm_any_lt(rsq22,rcutoff2))
650 r22 = _mm_mul_ps(rsq22,rinv22);
652 /* EWALD ELECTROSTATICS */
654 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
655 ewrt = _mm_mul_ps(r22,ewtabscale);
656 ewitab = _mm_cvttps_epi32(ewrt);
657 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
658 ewitab = _mm_slli_epi32(ewitab,2);
659 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
660 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
661 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
662 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
663 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
664 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
665 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
666 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
667 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
669 d = _mm_sub_ps(r22,rswitch);
670 d = _mm_max_ps(d,_mm_setzero_ps());
671 d2 = _mm_mul_ps(d,d);
672 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
674 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
676 /* Evaluate switch function */
677 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
678 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
679 velec = _mm_mul_ps(velec,sw);
680 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
682 /* Update potential sum for this i atom from the interaction with this j atom. */
683 velec = _mm_and_ps(velec,cutoff_mask);
684 velecsum = _mm_add_ps(velecsum,velec);
688 fscal = _mm_and_ps(fscal,cutoff_mask);
690 /* Calculate temporary vectorial force */
691 tx = _mm_mul_ps(fscal,dx22);
692 ty = _mm_mul_ps(fscal,dy22);
693 tz = _mm_mul_ps(fscal,dz22);
695 /* Update vectorial force */
696 fix2 = _mm_add_ps(fix2,tx);
697 fiy2 = _mm_add_ps(fiy2,ty);
698 fiz2 = _mm_add_ps(fiz2,tz);
700 fjx2 = _mm_add_ps(fjx2,tx);
701 fjy2 = _mm_add_ps(fjy2,ty);
702 fjz2 = _mm_add_ps(fjz2,tz);
706 /**************************
707 * CALCULATE INTERACTIONS *
708 **************************/
710 if (gmx_mm_any_lt(rsq23,rcutoff2))
713 r23 = _mm_mul_ps(rsq23,rinv23);
715 /* EWALD ELECTROSTATICS */
717 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
718 ewrt = _mm_mul_ps(r23,ewtabscale);
719 ewitab = _mm_cvttps_epi32(ewrt);
720 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
721 ewitab = _mm_slli_epi32(ewitab,2);
722 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
723 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
724 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
725 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
726 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
727 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
728 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
729 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
730 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
732 d = _mm_sub_ps(r23,rswitch);
733 d = _mm_max_ps(d,_mm_setzero_ps());
734 d2 = _mm_mul_ps(d,d);
735 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
737 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
739 /* Evaluate switch function */
740 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
741 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
742 velec = _mm_mul_ps(velec,sw);
743 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
745 /* Update potential sum for this i atom from the interaction with this j atom. */
746 velec = _mm_and_ps(velec,cutoff_mask);
747 velecsum = _mm_add_ps(velecsum,velec);
751 fscal = _mm_and_ps(fscal,cutoff_mask);
753 /* Calculate temporary vectorial force */
754 tx = _mm_mul_ps(fscal,dx23);
755 ty = _mm_mul_ps(fscal,dy23);
756 tz = _mm_mul_ps(fscal,dz23);
758 /* Update vectorial force */
759 fix2 = _mm_add_ps(fix2,tx);
760 fiy2 = _mm_add_ps(fiy2,ty);
761 fiz2 = _mm_add_ps(fiz2,tz);
763 fjx3 = _mm_add_ps(fjx3,tx);
764 fjy3 = _mm_add_ps(fjy3,ty);
765 fjz3 = _mm_add_ps(fjz3,tz);
769 /**************************
770 * CALCULATE INTERACTIONS *
771 **************************/
773 if (gmx_mm_any_lt(rsq31,rcutoff2))
776 r31 = _mm_mul_ps(rsq31,rinv31);
778 /* EWALD ELECTROSTATICS */
780 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
781 ewrt = _mm_mul_ps(r31,ewtabscale);
782 ewitab = _mm_cvttps_epi32(ewrt);
783 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
784 ewitab = _mm_slli_epi32(ewitab,2);
785 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
786 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
787 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
788 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
789 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
790 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
791 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
792 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
793 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
795 d = _mm_sub_ps(r31,rswitch);
796 d = _mm_max_ps(d,_mm_setzero_ps());
797 d2 = _mm_mul_ps(d,d);
798 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
800 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
802 /* Evaluate switch function */
803 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
804 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
805 velec = _mm_mul_ps(velec,sw);
806 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
808 /* Update potential sum for this i atom from the interaction with this j atom. */
809 velec = _mm_and_ps(velec,cutoff_mask);
810 velecsum = _mm_add_ps(velecsum,velec);
814 fscal = _mm_and_ps(fscal,cutoff_mask);
816 /* Calculate temporary vectorial force */
817 tx = _mm_mul_ps(fscal,dx31);
818 ty = _mm_mul_ps(fscal,dy31);
819 tz = _mm_mul_ps(fscal,dz31);
821 /* Update vectorial force */
822 fix3 = _mm_add_ps(fix3,tx);
823 fiy3 = _mm_add_ps(fiy3,ty);
824 fiz3 = _mm_add_ps(fiz3,tz);
826 fjx1 = _mm_add_ps(fjx1,tx);
827 fjy1 = _mm_add_ps(fjy1,ty);
828 fjz1 = _mm_add_ps(fjz1,tz);
832 /**************************
833 * CALCULATE INTERACTIONS *
834 **************************/
836 if (gmx_mm_any_lt(rsq32,rcutoff2))
839 r32 = _mm_mul_ps(rsq32,rinv32);
841 /* EWALD ELECTROSTATICS */
843 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
844 ewrt = _mm_mul_ps(r32,ewtabscale);
845 ewitab = _mm_cvttps_epi32(ewrt);
846 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
847 ewitab = _mm_slli_epi32(ewitab,2);
848 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
849 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
850 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
851 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
852 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
853 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
854 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
855 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
856 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
858 d = _mm_sub_ps(r32,rswitch);
859 d = _mm_max_ps(d,_mm_setzero_ps());
860 d2 = _mm_mul_ps(d,d);
861 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
863 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
865 /* Evaluate switch function */
866 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
867 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
868 velec = _mm_mul_ps(velec,sw);
869 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
871 /* Update potential sum for this i atom from the interaction with this j atom. */
872 velec = _mm_and_ps(velec,cutoff_mask);
873 velecsum = _mm_add_ps(velecsum,velec);
877 fscal = _mm_and_ps(fscal,cutoff_mask);
879 /* Calculate temporary vectorial force */
880 tx = _mm_mul_ps(fscal,dx32);
881 ty = _mm_mul_ps(fscal,dy32);
882 tz = _mm_mul_ps(fscal,dz32);
884 /* Update vectorial force */
885 fix3 = _mm_add_ps(fix3,tx);
886 fiy3 = _mm_add_ps(fiy3,ty);
887 fiz3 = _mm_add_ps(fiz3,tz);
889 fjx2 = _mm_add_ps(fjx2,tx);
890 fjy2 = _mm_add_ps(fjy2,ty);
891 fjz2 = _mm_add_ps(fjz2,tz);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 if (gmx_mm_any_lt(rsq33,rcutoff2))
902 r33 = _mm_mul_ps(rsq33,rinv33);
904 /* EWALD ELECTROSTATICS */
906 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
907 ewrt = _mm_mul_ps(r33,ewtabscale);
908 ewitab = _mm_cvttps_epi32(ewrt);
909 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
910 ewitab = _mm_slli_epi32(ewitab,2);
911 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
912 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
913 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
914 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
915 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
916 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
917 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
918 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
919 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
921 d = _mm_sub_ps(r33,rswitch);
922 d = _mm_max_ps(d,_mm_setzero_ps());
923 d2 = _mm_mul_ps(d,d);
924 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
926 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
928 /* Evaluate switch function */
929 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
930 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
931 velec = _mm_mul_ps(velec,sw);
932 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
934 /* Update potential sum for this i atom from the interaction with this j atom. */
935 velec = _mm_and_ps(velec,cutoff_mask);
936 velecsum = _mm_add_ps(velecsum,velec);
940 fscal = _mm_and_ps(fscal,cutoff_mask);
942 /* Calculate temporary vectorial force */
943 tx = _mm_mul_ps(fscal,dx33);
944 ty = _mm_mul_ps(fscal,dy33);
945 tz = _mm_mul_ps(fscal,dz33);
947 /* Update vectorial force */
948 fix3 = _mm_add_ps(fix3,tx);
949 fiy3 = _mm_add_ps(fiy3,ty);
950 fiz3 = _mm_add_ps(fiz3,tz);
952 fjx3 = _mm_add_ps(fjx3,tx);
953 fjy3 = _mm_add_ps(fjy3,ty);
954 fjz3 = _mm_add_ps(fjz3,tz);
958 fjptrA = f+j_coord_offsetA;
959 fjptrB = f+j_coord_offsetB;
960 fjptrC = f+j_coord_offsetC;
961 fjptrD = f+j_coord_offsetD;
963 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
964 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
965 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
967 /* Inner loop uses 647 flops */
973 /* Get j neighbor index, and coordinate index */
974 jnrlistA = jjnr[jidx];
975 jnrlistB = jjnr[jidx+1];
976 jnrlistC = jjnr[jidx+2];
977 jnrlistD = jjnr[jidx+3];
978 /* Sign of each element will be negative for non-real atoms.
979 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
980 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
982 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
983 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
984 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
985 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
986 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
987 j_coord_offsetA = DIM*jnrA;
988 j_coord_offsetB = DIM*jnrB;
989 j_coord_offsetC = DIM*jnrC;
990 j_coord_offsetD = DIM*jnrD;
992 /* load j atom coordinates */
993 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
994 x+j_coord_offsetC,x+j_coord_offsetD,
995 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
996 &jy2,&jz2,&jx3,&jy3,&jz3);
998 /* Calculate displacement vector */
999 dx00 = _mm_sub_ps(ix0,jx0);
1000 dy00 = _mm_sub_ps(iy0,jy0);
1001 dz00 = _mm_sub_ps(iz0,jz0);
1002 dx11 = _mm_sub_ps(ix1,jx1);
1003 dy11 = _mm_sub_ps(iy1,jy1);
1004 dz11 = _mm_sub_ps(iz1,jz1);
1005 dx12 = _mm_sub_ps(ix1,jx2);
1006 dy12 = _mm_sub_ps(iy1,jy2);
1007 dz12 = _mm_sub_ps(iz1,jz2);
1008 dx13 = _mm_sub_ps(ix1,jx3);
1009 dy13 = _mm_sub_ps(iy1,jy3);
1010 dz13 = _mm_sub_ps(iz1,jz3);
1011 dx21 = _mm_sub_ps(ix2,jx1);
1012 dy21 = _mm_sub_ps(iy2,jy1);
1013 dz21 = _mm_sub_ps(iz2,jz1);
1014 dx22 = _mm_sub_ps(ix2,jx2);
1015 dy22 = _mm_sub_ps(iy2,jy2);
1016 dz22 = _mm_sub_ps(iz2,jz2);
1017 dx23 = _mm_sub_ps(ix2,jx3);
1018 dy23 = _mm_sub_ps(iy2,jy3);
1019 dz23 = _mm_sub_ps(iz2,jz3);
1020 dx31 = _mm_sub_ps(ix3,jx1);
1021 dy31 = _mm_sub_ps(iy3,jy1);
1022 dz31 = _mm_sub_ps(iz3,jz1);
1023 dx32 = _mm_sub_ps(ix3,jx2);
1024 dy32 = _mm_sub_ps(iy3,jy2);
1025 dz32 = _mm_sub_ps(iz3,jz2);
1026 dx33 = _mm_sub_ps(ix3,jx3);
1027 dy33 = _mm_sub_ps(iy3,jy3);
1028 dz33 = _mm_sub_ps(iz3,jz3);
1030 /* Calculate squared distance and things based on it */
1031 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1032 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1033 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1034 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1035 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1036 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1037 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1038 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1039 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1040 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1042 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1043 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1044 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1045 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1046 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1047 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1048 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1049 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1050 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1051 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1053 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1054 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1055 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1056 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1057 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1058 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1059 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1060 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1061 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1062 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1064 fjx0 = _mm_setzero_ps();
1065 fjy0 = _mm_setzero_ps();
1066 fjz0 = _mm_setzero_ps();
1067 fjx1 = _mm_setzero_ps();
1068 fjy1 = _mm_setzero_ps();
1069 fjz1 = _mm_setzero_ps();
1070 fjx2 = _mm_setzero_ps();
1071 fjy2 = _mm_setzero_ps();
1072 fjz2 = _mm_setzero_ps();
1073 fjx3 = _mm_setzero_ps();
1074 fjy3 = _mm_setzero_ps();
1075 fjz3 = _mm_setzero_ps();
1077 /**************************
1078 * CALCULATE INTERACTIONS *
1079 **************************/
1081 if (gmx_mm_any_lt(rsq00,rcutoff2))
1084 r00 = _mm_mul_ps(rsq00,rinv00);
1085 r00 = _mm_andnot_ps(dummy_mask,r00);
1087 /* LENNARD-JONES DISPERSION/REPULSION */
1089 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1090 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1091 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1092 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1093 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1095 d = _mm_sub_ps(r00,rswitch);
1096 d = _mm_max_ps(d,_mm_setzero_ps());
1097 d2 = _mm_mul_ps(d,d);
1098 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1100 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1102 /* Evaluate switch function */
1103 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1104 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1105 vvdw = _mm_mul_ps(vvdw,sw);
1106 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1108 /* Update potential sum for this i atom from the interaction with this j atom. */
1109 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1110 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1111 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1115 fscal = _mm_and_ps(fscal,cutoff_mask);
1117 fscal = _mm_andnot_ps(dummy_mask,fscal);
1119 /* Calculate temporary vectorial force */
1120 tx = _mm_mul_ps(fscal,dx00);
1121 ty = _mm_mul_ps(fscal,dy00);
1122 tz = _mm_mul_ps(fscal,dz00);
1124 /* Update vectorial force */
1125 fix0 = _mm_add_ps(fix0,tx);
1126 fiy0 = _mm_add_ps(fiy0,ty);
1127 fiz0 = _mm_add_ps(fiz0,tz);
1129 fjx0 = _mm_add_ps(fjx0,tx);
1130 fjy0 = _mm_add_ps(fjy0,ty);
1131 fjz0 = _mm_add_ps(fjz0,tz);
1135 /**************************
1136 * CALCULATE INTERACTIONS *
1137 **************************/
1139 if (gmx_mm_any_lt(rsq11,rcutoff2))
1142 r11 = _mm_mul_ps(rsq11,rinv11);
1143 r11 = _mm_andnot_ps(dummy_mask,r11);
1145 /* EWALD ELECTROSTATICS */
1147 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1148 ewrt = _mm_mul_ps(r11,ewtabscale);
1149 ewitab = _mm_cvttps_epi32(ewrt);
1150 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1151 ewitab = _mm_slli_epi32(ewitab,2);
1152 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1153 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1154 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1155 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1156 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1157 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1158 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1159 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1160 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1162 d = _mm_sub_ps(r11,rswitch);
1163 d = _mm_max_ps(d,_mm_setzero_ps());
1164 d2 = _mm_mul_ps(d,d);
1165 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1167 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1169 /* Evaluate switch function */
1170 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1171 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1172 velec = _mm_mul_ps(velec,sw);
1173 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1175 /* Update potential sum for this i atom from the interaction with this j atom. */
1176 velec = _mm_and_ps(velec,cutoff_mask);
1177 velec = _mm_andnot_ps(dummy_mask,velec);
1178 velecsum = _mm_add_ps(velecsum,velec);
1182 fscal = _mm_and_ps(fscal,cutoff_mask);
1184 fscal = _mm_andnot_ps(dummy_mask,fscal);
1186 /* Calculate temporary vectorial force */
1187 tx = _mm_mul_ps(fscal,dx11);
1188 ty = _mm_mul_ps(fscal,dy11);
1189 tz = _mm_mul_ps(fscal,dz11);
1191 /* Update vectorial force */
1192 fix1 = _mm_add_ps(fix1,tx);
1193 fiy1 = _mm_add_ps(fiy1,ty);
1194 fiz1 = _mm_add_ps(fiz1,tz);
1196 fjx1 = _mm_add_ps(fjx1,tx);
1197 fjy1 = _mm_add_ps(fjy1,ty);
1198 fjz1 = _mm_add_ps(fjz1,tz);
1202 /**************************
1203 * CALCULATE INTERACTIONS *
1204 **************************/
1206 if (gmx_mm_any_lt(rsq12,rcutoff2))
1209 r12 = _mm_mul_ps(rsq12,rinv12);
1210 r12 = _mm_andnot_ps(dummy_mask,r12);
1212 /* EWALD ELECTROSTATICS */
1214 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1215 ewrt = _mm_mul_ps(r12,ewtabscale);
1216 ewitab = _mm_cvttps_epi32(ewrt);
1217 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1218 ewitab = _mm_slli_epi32(ewitab,2);
1219 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1220 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1221 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1222 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1223 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1224 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1225 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1226 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1227 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1229 d = _mm_sub_ps(r12,rswitch);
1230 d = _mm_max_ps(d,_mm_setzero_ps());
1231 d2 = _mm_mul_ps(d,d);
1232 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1234 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1236 /* Evaluate switch function */
1237 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1238 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1239 velec = _mm_mul_ps(velec,sw);
1240 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1242 /* Update potential sum for this i atom from the interaction with this j atom. */
1243 velec = _mm_and_ps(velec,cutoff_mask);
1244 velec = _mm_andnot_ps(dummy_mask,velec);
1245 velecsum = _mm_add_ps(velecsum,velec);
1249 fscal = _mm_and_ps(fscal,cutoff_mask);
1251 fscal = _mm_andnot_ps(dummy_mask,fscal);
1253 /* Calculate temporary vectorial force */
1254 tx = _mm_mul_ps(fscal,dx12);
1255 ty = _mm_mul_ps(fscal,dy12);
1256 tz = _mm_mul_ps(fscal,dz12);
1258 /* Update vectorial force */
1259 fix1 = _mm_add_ps(fix1,tx);
1260 fiy1 = _mm_add_ps(fiy1,ty);
1261 fiz1 = _mm_add_ps(fiz1,tz);
1263 fjx2 = _mm_add_ps(fjx2,tx);
1264 fjy2 = _mm_add_ps(fjy2,ty);
1265 fjz2 = _mm_add_ps(fjz2,tz);
1269 /**************************
1270 * CALCULATE INTERACTIONS *
1271 **************************/
1273 if (gmx_mm_any_lt(rsq13,rcutoff2))
1276 r13 = _mm_mul_ps(rsq13,rinv13);
1277 r13 = _mm_andnot_ps(dummy_mask,r13);
1279 /* EWALD ELECTROSTATICS */
1281 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1282 ewrt = _mm_mul_ps(r13,ewtabscale);
1283 ewitab = _mm_cvttps_epi32(ewrt);
1284 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1285 ewitab = _mm_slli_epi32(ewitab,2);
1286 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1287 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1288 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1289 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1290 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1291 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1292 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1293 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1294 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1296 d = _mm_sub_ps(r13,rswitch);
1297 d = _mm_max_ps(d,_mm_setzero_ps());
1298 d2 = _mm_mul_ps(d,d);
1299 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1301 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1303 /* Evaluate switch function */
1304 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1305 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1306 velec = _mm_mul_ps(velec,sw);
1307 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1309 /* Update potential sum for this i atom from the interaction with this j atom. */
1310 velec = _mm_and_ps(velec,cutoff_mask);
1311 velec = _mm_andnot_ps(dummy_mask,velec);
1312 velecsum = _mm_add_ps(velecsum,velec);
1316 fscal = _mm_and_ps(fscal,cutoff_mask);
1318 fscal = _mm_andnot_ps(dummy_mask,fscal);
1320 /* Calculate temporary vectorial force */
1321 tx = _mm_mul_ps(fscal,dx13);
1322 ty = _mm_mul_ps(fscal,dy13);
1323 tz = _mm_mul_ps(fscal,dz13);
1325 /* Update vectorial force */
1326 fix1 = _mm_add_ps(fix1,tx);
1327 fiy1 = _mm_add_ps(fiy1,ty);
1328 fiz1 = _mm_add_ps(fiz1,tz);
1330 fjx3 = _mm_add_ps(fjx3,tx);
1331 fjy3 = _mm_add_ps(fjy3,ty);
1332 fjz3 = _mm_add_ps(fjz3,tz);
1336 /**************************
1337 * CALCULATE INTERACTIONS *
1338 **************************/
1340 if (gmx_mm_any_lt(rsq21,rcutoff2))
1343 r21 = _mm_mul_ps(rsq21,rinv21);
1344 r21 = _mm_andnot_ps(dummy_mask,r21);
1346 /* EWALD ELECTROSTATICS */
1348 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1349 ewrt = _mm_mul_ps(r21,ewtabscale);
1350 ewitab = _mm_cvttps_epi32(ewrt);
1351 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1352 ewitab = _mm_slli_epi32(ewitab,2);
1353 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1354 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1355 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1356 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1357 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1358 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1359 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1360 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1361 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1363 d = _mm_sub_ps(r21,rswitch);
1364 d = _mm_max_ps(d,_mm_setzero_ps());
1365 d2 = _mm_mul_ps(d,d);
1366 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1368 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1370 /* Evaluate switch function */
1371 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1372 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1373 velec = _mm_mul_ps(velec,sw);
1374 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1376 /* Update potential sum for this i atom from the interaction with this j atom. */
1377 velec = _mm_and_ps(velec,cutoff_mask);
1378 velec = _mm_andnot_ps(dummy_mask,velec);
1379 velecsum = _mm_add_ps(velecsum,velec);
1383 fscal = _mm_and_ps(fscal,cutoff_mask);
1385 fscal = _mm_andnot_ps(dummy_mask,fscal);
1387 /* Calculate temporary vectorial force */
1388 tx = _mm_mul_ps(fscal,dx21);
1389 ty = _mm_mul_ps(fscal,dy21);
1390 tz = _mm_mul_ps(fscal,dz21);
1392 /* Update vectorial force */
1393 fix2 = _mm_add_ps(fix2,tx);
1394 fiy2 = _mm_add_ps(fiy2,ty);
1395 fiz2 = _mm_add_ps(fiz2,tz);
1397 fjx1 = _mm_add_ps(fjx1,tx);
1398 fjy1 = _mm_add_ps(fjy1,ty);
1399 fjz1 = _mm_add_ps(fjz1,tz);
1403 /**************************
1404 * CALCULATE INTERACTIONS *
1405 **************************/
1407 if (gmx_mm_any_lt(rsq22,rcutoff2))
1410 r22 = _mm_mul_ps(rsq22,rinv22);
1411 r22 = _mm_andnot_ps(dummy_mask,r22);
1413 /* EWALD ELECTROSTATICS */
1415 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1416 ewrt = _mm_mul_ps(r22,ewtabscale);
1417 ewitab = _mm_cvttps_epi32(ewrt);
1418 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1419 ewitab = _mm_slli_epi32(ewitab,2);
1420 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1421 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1422 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1423 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1424 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1425 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1426 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1427 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1428 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1430 d = _mm_sub_ps(r22,rswitch);
1431 d = _mm_max_ps(d,_mm_setzero_ps());
1432 d2 = _mm_mul_ps(d,d);
1433 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1435 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1437 /* Evaluate switch function */
1438 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1439 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1440 velec = _mm_mul_ps(velec,sw);
1441 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1443 /* Update potential sum for this i atom from the interaction with this j atom. */
1444 velec = _mm_and_ps(velec,cutoff_mask);
1445 velec = _mm_andnot_ps(dummy_mask,velec);
1446 velecsum = _mm_add_ps(velecsum,velec);
1450 fscal = _mm_and_ps(fscal,cutoff_mask);
1452 fscal = _mm_andnot_ps(dummy_mask,fscal);
1454 /* Calculate temporary vectorial force */
1455 tx = _mm_mul_ps(fscal,dx22);
1456 ty = _mm_mul_ps(fscal,dy22);
1457 tz = _mm_mul_ps(fscal,dz22);
1459 /* Update vectorial force */
1460 fix2 = _mm_add_ps(fix2,tx);
1461 fiy2 = _mm_add_ps(fiy2,ty);
1462 fiz2 = _mm_add_ps(fiz2,tz);
1464 fjx2 = _mm_add_ps(fjx2,tx);
1465 fjy2 = _mm_add_ps(fjy2,ty);
1466 fjz2 = _mm_add_ps(fjz2,tz);
1470 /**************************
1471 * CALCULATE INTERACTIONS *
1472 **************************/
1474 if (gmx_mm_any_lt(rsq23,rcutoff2))
1477 r23 = _mm_mul_ps(rsq23,rinv23);
1478 r23 = _mm_andnot_ps(dummy_mask,r23);
1480 /* EWALD ELECTROSTATICS */
1482 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1483 ewrt = _mm_mul_ps(r23,ewtabscale);
1484 ewitab = _mm_cvttps_epi32(ewrt);
1485 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1486 ewitab = _mm_slli_epi32(ewitab,2);
1487 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1488 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1489 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1490 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1491 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1492 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1493 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1494 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1495 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1497 d = _mm_sub_ps(r23,rswitch);
1498 d = _mm_max_ps(d,_mm_setzero_ps());
1499 d2 = _mm_mul_ps(d,d);
1500 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1502 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1504 /* Evaluate switch function */
1505 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1506 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1507 velec = _mm_mul_ps(velec,sw);
1508 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1510 /* Update potential sum for this i atom from the interaction with this j atom. */
1511 velec = _mm_and_ps(velec,cutoff_mask);
1512 velec = _mm_andnot_ps(dummy_mask,velec);
1513 velecsum = _mm_add_ps(velecsum,velec);
1517 fscal = _mm_and_ps(fscal,cutoff_mask);
1519 fscal = _mm_andnot_ps(dummy_mask,fscal);
1521 /* Calculate temporary vectorial force */
1522 tx = _mm_mul_ps(fscal,dx23);
1523 ty = _mm_mul_ps(fscal,dy23);
1524 tz = _mm_mul_ps(fscal,dz23);
1526 /* Update vectorial force */
1527 fix2 = _mm_add_ps(fix2,tx);
1528 fiy2 = _mm_add_ps(fiy2,ty);
1529 fiz2 = _mm_add_ps(fiz2,tz);
1531 fjx3 = _mm_add_ps(fjx3,tx);
1532 fjy3 = _mm_add_ps(fjy3,ty);
1533 fjz3 = _mm_add_ps(fjz3,tz);
1537 /**************************
1538 * CALCULATE INTERACTIONS *
1539 **************************/
1541 if (gmx_mm_any_lt(rsq31,rcutoff2))
1544 r31 = _mm_mul_ps(rsq31,rinv31);
1545 r31 = _mm_andnot_ps(dummy_mask,r31);
1547 /* EWALD ELECTROSTATICS */
1549 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1550 ewrt = _mm_mul_ps(r31,ewtabscale);
1551 ewitab = _mm_cvttps_epi32(ewrt);
1552 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1553 ewitab = _mm_slli_epi32(ewitab,2);
1554 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1555 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1556 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1557 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1558 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1559 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1560 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1561 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1562 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1564 d = _mm_sub_ps(r31,rswitch);
1565 d = _mm_max_ps(d,_mm_setzero_ps());
1566 d2 = _mm_mul_ps(d,d);
1567 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1569 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1571 /* Evaluate switch function */
1572 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1573 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1574 velec = _mm_mul_ps(velec,sw);
1575 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1577 /* Update potential sum for this i atom from the interaction with this j atom. */
1578 velec = _mm_and_ps(velec,cutoff_mask);
1579 velec = _mm_andnot_ps(dummy_mask,velec);
1580 velecsum = _mm_add_ps(velecsum,velec);
1584 fscal = _mm_and_ps(fscal,cutoff_mask);
1586 fscal = _mm_andnot_ps(dummy_mask,fscal);
1588 /* Calculate temporary vectorial force */
1589 tx = _mm_mul_ps(fscal,dx31);
1590 ty = _mm_mul_ps(fscal,dy31);
1591 tz = _mm_mul_ps(fscal,dz31);
1593 /* Update vectorial force */
1594 fix3 = _mm_add_ps(fix3,tx);
1595 fiy3 = _mm_add_ps(fiy3,ty);
1596 fiz3 = _mm_add_ps(fiz3,tz);
1598 fjx1 = _mm_add_ps(fjx1,tx);
1599 fjy1 = _mm_add_ps(fjy1,ty);
1600 fjz1 = _mm_add_ps(fjz1,tz);
1604 /**************************
1605 * CALCULATE INTERACTIONS *
1606 **************************/
1608 if (gmx_mm_any_lt(rsq32,rcutoff2))
1611 r32 = _mm_mul_ps(rsq32,rinv32);
1612 r32 = _mm_andnot_ps(dummy_mask,r32);
1614 /* EWALD ELECTROSTATICS */
1616 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1617 ewrt = _mm_mul_ps(r32,ewtabscale);
1618 ewitab = _mm_cvttps_epi32(ewrt);
1619 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1620 ewitab = _mm_slli_epi32(ewitab,2);
1621 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1622 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1623 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1624 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1625 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1626 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1627 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1628 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1629 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1631 d = _mm_sub_ps(r32,rswitch);
1632 d = _mm_max_ps(d,_mm_setzero_ps());
1633 d2 = _mm_mul_ps(d,d);
1634 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1636 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1638 /* Evaluate switch function */
1639 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1640 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1641 velec = _mm_mul_ps(velec,sw);
1642 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1644 /* Update potential sum for this i atom from the interaction with this j atom. */
1645 velec = _mm_and_ps(velec,cutoff_mask);
1646 velec = _mm_andnot_ps(dummy_mask,velec);
1647 velecsum = _mm_add_ps(velecsum,velec);
1651 fscal = _mm_and_ps(fscal,cutoff_mask);
1653 fscal = _mm_andnot_ps(dummy_mask,fscal);
1655 /* Calculate temporary vectorial force */
1656 tx = _mm_mul_ps(fscal,dx32);
1657 ty = _mm_mul_ps(fscal,dy32);
1658 tz = _mm_mul_ps(fscal,dz32);
1660 /* Update vectorial force */
1661 fix3 = _mm_add_ps(fix3,tx);
1662 fiy3 = _mm_add_ps(fiy3,ty);
1663 fiz3 = _mm_add_ps(fiz3,tz);
1665 fjx2 = _mm_add_ps(fjx2,tx);
1666 fjy2 = _mm_add_ps(fjy2,ty);
1667 fjz2 = _mm_add_ps(fjz2,tz);
1671 /**************************
1672 * CALCULATE INTERACTIONS *
1673 **************************/
1675 if (gmx_mm_any_lt(rsq33,rcutoff2))
1678 r33 = _mm_mul_ps(rsq33,rinv33);
1679 r33 = _mm_andnot_ps(dummy_mask,r33);
1681 /* EWALD ELECTROSTATICS */
1683 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1684 ewrt = _mm_mul_ps(r33,ewtabscale);
1685 ewitab = _mm_cvttps_epi32(ewrt);
1686 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1687 ewitab = _mm_slli_epi32(ewitab,2);
1688 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1689 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1690 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1691 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1692 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1693 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1694 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1695 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1696 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1698 d = _mm_sub_ps(r33,rswitch);
1699 d = _mm_max_ps(d,_mm_setzero_ps());
1700 d2 = _mm_mul_ps(d,d);
1701 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1703 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1705 /* Evaluate switch function */
1706 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1707 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1708 velec = _mm_mul_ps(velec,sw);
1709 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1711 /* Update potential sum for this i atom from the interaction with this j atom. */
1712 velec = _mm_and_ps(velec,cutoff_mask);
1713 velec = _mm_andnot_ps(dummy_mask,velec);
1714 velecsum = _mm_add_ps(velecsum,velec);
1718 fscal = _mm_and_ps(fscal,cutoff_mask);
1720 fscal = _mm_andnot_ps(dummy_mask,fscal);
1722 /* Calculate temporary vectorial force */
1723 tx = _mm_mul_ps(fscal,dx33);
1724 ty = _mm_mul_ps(fscal,dy33);
1725 tz = _mm_mul_ps(fscal,dz33);
1727 /* Update vectorial force */
1728 fix3 = _mm_add_ps(fix3,tx);
1729 fiy3 = _mm_add_ps(fiy3,ty);
1730 fiz3 = _mm_add_ps(fiz3,tz);
1732 fjx3 = _mm_add_ps(fjx3,tx);
1733 fjy3 = _mm_add_ps(fjy3,ty);
1734 fjz3 = _mm_add_ps(fjz3,tz);
1738 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1739 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1740 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1741 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1743 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1744 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1745 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1747 /* Inner loop uses 657 flops */
1750 /* End of innermost loop */
1752 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1753 f+i_coord_offset,fshift+i_shift_offset);
1756 /* Update potential energies */
1757 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1758 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1760 /* Increment number of inner iterations */
1761 inneriter += j_index_end - j_index_start;
1763 /* Outer loop uses 26 flops */
1766 /* Increment number of outer iterations */
1769 /* Update outer/inner flops */
1771 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
1774 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
1775 * Electrostatics interaction: Ewald
1776 * VdW interaction: LennardJones
1777 * Geometry: Water4-Water4
1778 * Calculate force/pot: Force
1781 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
1782 (t_nblist * gmx_restrict nlist,
1783 rvec * gmx_restrict xx,
1784 rvec * gmx_restrict ff,
1785 t_forcerec * gmx_restrict fr,
1786 t_mdatoms * gmx_restrict mdatoms,
1787 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1788 t_nrnb * gmx_restrict nrnb)
1790 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1791 * just 0 for non-waters.
1792 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1793 * jnr indices corresponding to data put in the four positions in the SIMD register.
1795 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1796 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1797 int jnrA,jnrB,jnrC,jnrD;
1798 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1799 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1800 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1801 real rcutoff_scalar;
1802 real *shiftvec,*fshift,*x,*f;
1803 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1804 real scratch[4*DIM];
1805 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1807 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1809 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1811 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1813 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1814 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1815 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1816 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1817 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1818 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1819 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1820 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1821 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1822 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1823 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1824 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1825 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1826 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1827 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1828 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1829 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1830 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1831 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1832 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1835 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1838 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1839 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1841 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1843 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1844 real rswitch_scalar,d_scalar;
1845 __m128 dummy_mask,cutoff_mask;
1846 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1847 __m128 one = _mm_set1_ps(1.0);
1848 __m128 two = _mm_set1_ps(2.0);
1854 jindex = nlist->jindex;
1856 shiftidx = nlist->shift;
1858 shiftvec = fr->shift_vec[0];
1859 fshift = fr->fshift[0];
1860 facel = _mm_set1_ps(fr->epsfac);
1861 charge = mdatoms->chargeA;
1862 nvdwtype = fr->ntype;
1863 vdwparam = fr->nbfp;
1864 vdwtype = mdatoms->typeA;
1866 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1867 ewtab = fr->ic->tabq_coul_FDV0;
1868 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1869 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1871 /* Setup water-specific parameters */
1872 inr = nlist->iinr[0];
1873 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1874 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1875 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1876 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1878 jq1 = _mm_set1_ps(charge[inr+1]);
1879 jq2 = _mm_set1_ps(charge[inr+2]);
1880 jq3 = _mm_set1_ps(charge[inr+3]);
1881 vdwjidx0A = 2*vdwtype[inr+0];
1882 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1883 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1884 qq11 = _mm_mul_ps(iq1,jq1);
1885 qq12 = _mm_mul_ps(iq1,jq2);
1886 qq13 = _mm_mul_ps(iq1,jq3);
1887 qq21 = _mm_mul_ps(iq2,jq1);
1888 qq22 = _mm_mul_ps(iq2,jq2);
1889 qq23 = _mm_mul_ps(iq2,jq3);
1890 qq31 = _mm_mul_ps(iq3,jq1);
1891 qq32 = _mm_mul_ps(iq3,jq2);
1892 qq33 = _mm_mul_ps(iq3,jq3);
1894 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1895 rcutoff_scalar = fr->rcoulomb;
1896 rcutoff = _mm_set1_ps(rcutoff_scalar);
1897 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1899 rswitch_scalar = fr->rcoulomb_switch;
1900 rswitch = _mm_set1_ps(rswitch_scalar);
1901 /* Setup switch parameters */
1902 d_scalar = rcutoff_scalar-rswitch_scalar;
1903 d = _mm_set1_ps(d_scalar);
1904 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1905 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1906 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1907 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1908 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1909 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1911 /* Avoid stupid compiler warnings */
1912 jnrA = jnrB = jnrC = jnrD = 0;
1913 j_coord_offsetA = 0;
1914 j_coord_offsetB = 0;
1915 j_coord_offsetC = 0;
1916 j_coord_offsetD = 0;
1921 for(iidx=0;iidx<4*DIM;iidx++)
1923 scratch[iidx] = 0.0;
1926 /* Start outer loop over neighborlists */
1927 for(iidx=0; iidx<nri; iidx++)
1929 /* Load shift vector for this list */
1930 i_shift_offset = DIM*shiftidx[iidx];
1932 /* Load limits for loop over neighbors */
1933 j_index_start = jindex[iidx];
1934 j_index_end = jindex[iidx+1];
1936 /* Get outer coordinate index */
1938 i_coord_offset = DIM*inr;
1940 /* Load i particle coords and add shift vector */
1941 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1942 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1944 fix0 = _mm_setzero_ps();
1945 fiy0 = _mm_setzero_ps();
1946 fiz0 = _mm_setzero_ps();
1947 fix1 = _mm_setzero_ps();
1948 fiy1 = _mm_setzero_ps();
1949 fiz1 = _mm_setzero_ps();
1950 fix2 = _mm_setzero_ps();
1951 fiy2 = _mm_setzero_ps();
1952 fiz2 = _mm_setzero_ps();
1953 fix3 = _mm_setzero_ps();
1954 fiy3 = _mm_setzero_ps();
1955 fiz3 = _mm_setzero_ps();
1957 /* Start inner kernel loop */
1958 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1961 /* Get j neighbor index, and coordinate index */
1963 jnrB = jjnr[jidx+1];
1964 jnrC = jjnr[jidx+2];
1965 jnrD = jjnr[jidx+3];
1966 j_coord_offsetA = DIM*jnrA;
1967 j_coord_offsetB = DIM*jnrB;
1968 j_coord_offsetC = DIM*jnrC;
1969 j_coord_offsetD = DIM*jnrD;
1971 /* load j atom coordinates */
1972 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1973 x+j_coord_offsetC,x+j_coord_offsetD,
1974 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1975 &jy2,&jz2,&jx3,&jy3,&jz3);
1977 /* Calculate displacement vector */
1978 dx00 = _mm_sub_ps(ix0,jx0);
1979 dy00 = _mm_sub_ps(iy0,jy0);
1980 dz00 = _mm_sub_ps(iz0,jz0);
1981 dx11 = _mm_sub_ps(ix1,jx1);
1982 dy11 = _mm_sub_ps(iy1,jy1);
1983 dz11 = _mm_sub_ps(iz1,jz1);
1984 dx12 = _mm_sub_ps(ix1,jx2);
1985 dy12 = _mm_sub_ps(iy1,jy2);
1986 dz12 = _mm_sub_ps(iz1,jz2);
1987 dx13 = _mm_sub_ps(ix1,jx3);
1988 dy13 = _mm_sub_ps(iy1,jy3);
1989 dz13 = _mm_sub_ps(iz1,jz3);
1990 dx21 = _mm_sub_ps(ix2,jx1);
1991 dy21 = _mm_sub_ps(iy2,jy1);
1992 dz21 = _mm_sub_ps(iz2,jz1);
1993 dx22 = _mm_sub_ps(ix2,jx2);
1994 dy22 = _mm_sub_ps(iy2,jy2);
1995 dz22 = _mm_sub_ps(iz2,jz2);
1996 dx23 = _mm_sub_ps(ix2,jx3);
1997 dy23 = _mm_sub_ps(iy2,jy3);
1998 dz23 = _mm_sub_ps(iz2,jz3);
1999 dx31 = _mm_sub_ps(ix3,jx1);
2000 dy31 = _mm_sub_ps(iy3,jy1);
2001 dz31 = _mm_sub_ps(iz3,jz1);
2002 dx32 = _mm_sub_ps(ix3,jx2);
2003 dy32 = _mm_sub_ps(iy3,jy2);
2004 dz32 = _mm_sub_ps(iz3,jz2);
2005 dx33 = _mm_sub_ps(ix3,jx3);
2006 dy33 = _mm_sub_ps(iy3,jy3);
2007 dz33 = _mm_sub_ps(iz3,jz3);
2009 /* Calculate squared distance and things based on it */
2010 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2011 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2012 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2013 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2014 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2015 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2016 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2017 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2018 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2019 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2021 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2022 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2023 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2024 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2025 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2026 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2027 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2028 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2029 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2030 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2032 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2033 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2034 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2035 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2036 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2037 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2038 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2039 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2040 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2041 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2043 fjx0 = _mm_setzero_ps();
2044 fjy0 = _mm_setzero_ps();
2045 fjz0 = _mm_setzero_ps();
2046 fjx1 = _mm_setzero_ps();
2047 fjy1 = _mm_setzero_ps();
2048 fjz1 = _mm_setzero_ps();
2049 fjx2 = _mm_setzero_ps();
2050 fjy2 = _mm_setzero_ps();
2051 fjz2 = _mm_setzero_ps();
2052 fjx3 = _mm_setzero_ps();
2053 fjy3 = _mm_setzero_ps();
2054 fjz3 = _mm_setzero_ps();
2056 /**************************
2057 * CALCULATE INTERACTIONS *
2058 **************************/
2060 if (gmx_mm_any_lt(rsq00,rcutoff2))
2063 r00 = _mm_mul_ps(rsq00,rinv00);
2065 /* LENNARD-JONES DISPERSION/REPULSION */
2067 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2068 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2069 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2070 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2071 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2073 d = _mm_sub_ps(r00,rswitch);
2074 d = _mm_max_ps(d,_mm_setzero_ps());
2075 d2 = _mm_mul_ps(d,d);
2076 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2078 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2080 /* Evaluate switch function */
2081 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2082 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2083 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2087 fscal = _mm_and_ps(fscal,cutoff_mask);
2089 /* Calculate temporary vectorial force */
2090 tx = _mm_mul_ps(fscal,dx00);
2091 ty = _mm_mul_ps(fscal,dy00);
2092 tz = _mm_mul_ps(fscal,dz00);
2094 /* Update vectorial force */
2095 fix0 = _mm_add_ps(fix0,tx);
2096 fiy0 = _mm_add_ps(fiy0,ty);
2097 fiz0 = _mm_add_ps(fiz0,tz);
2099 fjx0 = _mm_add_ps(fjx0,tx);
2100 fjy0 = _mm_add_ps(fjy0,ty);
2101 fjz0 = _mm_add_ps(fjz0,tz);
2105 /**************************
2106 * CALCULATE INTERACTIONS *
2107 **************************/
2109 if (gmx_mm_any_lt(rsq11,rcutoff2))
2112 r11 = _mm_mul_ps(rsq11,rinv11);
2114 /* EWALD ELECTROSTATICS */
2116 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2117 ewrt = _mm_mul_ps(r11,ewtabscale);
2118 ewitab = _mm_cvttps_epi32(ewrt);
2119 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2120 ewitab = _mm_slli_epi32(ewitab,2);
2121 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2122 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2123 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2124 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2125 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2126 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2127 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2128 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2129 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2131 d = _mm_sub_ps(r11,rswitch);
2132 d = _mm_max_ps(d,_mm_setzero_ps());
2133 d2 = _mm_mul_ps(d,d);
2134 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2136 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2138 /* Evaluate switch function */
2139 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2140 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2141 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2145 fscal = _mm_and_ps(fscal,cutoff_mask);
2147 /* Calculate temporary vectorial force */
2148 tx = _mm_mul_ps(fscal,dx11);
2149 ty = _mm_mul_ps(fscal,dy11);
2150 tz = _mm_mul_ps(fscal,dz11);
2152 /* Update vectorial force */
2153 fix1 = _mm_add_ps(fix1,tx);
2154 fiy1 = _mm_add_ps(fiy1,ty);
2155 fiz1 = _mm_add_ps(fiz1,tz);
2157 fjx1 = _mm_add_ps(fjx1,tx);
2158 fjy1 = _mm_add_ps(fjy1,ty);
2159 fjz1 = _mm_add_ps(fjz1,tz);
2163 /**************************
2164 * CALCULATE INTERACTIONS *
2165 **************************/
2167 if (gmx_mm_any_lt(rsq12,rcutoff2))
2170 r12 = _mm_mul_ps(rsq12,rinv12);
2172 /* EWALD ELECTROSTATICS */
2174 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2175 ewrt = _mm_mul_ps(r12,ewtabscale);
2176 ewitab = _mm_cvttps_epi32(ewrt);
2177 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2178 ewitab = _mm_slli_epi32(ewitab,2);
2179 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2180 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2181 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2182 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2183 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2184 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2185 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2186 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2187 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2189 d = _mm_sub_ps(r12,rswitch);
2190 d = _mm_max_ps(d,_mm_setzero_ps());
2191 d2 = _mm_mul_ps(d,d);
2192 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2194 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2196 /* Evaluate switch function */
2197 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2198 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2199 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2203 fscal = _mm_and_ps(fscal,cutoff_mask);
2205 /* Calculate temporary vectorial force */
2206 tx = _mm_mul_ps(fscal,dx12);
2207 ty = _mm_mul_ps(fscal,dy12);
2208 tz = _mm_mul_ps(fscal,dz12);
2210 /* Update vectorial force */
2211 fix1 = _mm_add_ps(fix1,tx);
2212 fiy1 = _mm_add_ps(fiy1,ty);
2213 fiz1 = _mm_add_ps(fiz1,tz);
2215 fjx2 = _mm_add_ps(fjx2,tx);
2216 fjy2 = _mm_add_ps(fjy2,ty);
2217 fjz2 = _mm_add_ps(fjz2,tz);
2221 /**************************
2222 * CALCULATE INTERACTIONS *
2223 **************************/
2225 if (gmx_mm_any_lt(rsq13,rcutoff2))
2228 r13 = _mm_mul_ps(rsq13,rinv13);
2230 /* EWALD ELECTROSTATICS */
2232 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2233 ewrt = _mm_mul_ps(r13,ewtabscale);
2234 ewitab = _mm_cvttps_epi32(ewrt);
2235 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2236 ewitab = _mm_slli_epi32(ewitab,2);
2237 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2238 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2239 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2240 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2241 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2242 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2243 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2244 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2245 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2247 d = _mm_sub_ps(r13,rswitch);
2248 d = _mm_max_ps(d,_mm_setzero_ps());
2249 d2 = _mm_mul_ps(d,d);
2250 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2252 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2254 /* Evaluate switch function */
2255 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2256 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2257 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2261 fscal = _mm_and_ps(fscal,cutoff_mask);
2263 /* Calculate temporary vectorial force */
2264 tx = _mm_mul_ps(fscal,dx13);
2265 ty = _mm_mul_ps(fscal,dy13);
2266 tz = _mm_mul_ps(fscal,dz13);
2268 /* Update vectorial force */
2269 fix1 = _mm_add_ps(fix1,tx);
2270 fiy1 = _mm_add_ps(fiy1,ty);
2271 fiz1 = _mm_add_ps(fiz1,tz);
2273 fjx3 = _mm_add_ps(fjx3,tx);
2274 fjy3 = _mm_add_ps(fjy3,ty);
2275 fjz3 = _mm_add_ps(fjz3,tz);
2279 /**************************
2280 * CALCULATE INTERACTIONS *
2281 **************************/
2283 if (gmx_mm_any_lt(rsq21,rcutoff2))
2286 r21 = _mm_mul_ps(rsq21,rinv21);
2288 /* EWALD ELECTROSTATICS */
2290 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2291 ewrt = _mm_mul_ps(r21,ewtabscale);
2292 ewitab = _mm_cvttps_epi32(ewrt);
2293 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2294 ewitab = _mm_slli_epi32(ewitab,2);
2295 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2296 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2297 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2298 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2299 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2300 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2301 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2302 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2303 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2305 d = _mm_sub_ps(r21,rswitch);
2306 d = _mm_max_ps(d,_mm_setzero_ps());
2307 d2 = _mm_mul_ps(d,d);
2308 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2310 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2312 /* Evaluate switch function */
2313 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2314 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2315 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2319 fscal = _mm_and_ps(fscal,cutoff_mask);
2321 /* Calculate temporary vectorial force */
2322 tx = _mm_mul_ps(fscal,dx21);
2323 ty = _mm_mul_ps(fscal,dy21);
2324 tz = _mm_mul_ps(fscal,dz21);
2326 /* Update vectorial force */
2327 fix2 = _mm_add_ps(fix2,tx);
2328 fiy2 = _mm_add_ps(fiy2,ty);
2329 fiz2 = _mm_add_ps(fiz2,tz);
2331 fjx1 = _mm_add_ps(fjx1,tx);
2332 fjy1 = _mm_add_ps(fjy1,ty);
2333 fjz1 = _mm_add_ps(fjz1,tz);
2337 /**************************
2338 * CALCULATE INTERACTIONS *
2339 **************************/
2341 if (gmx_mm_any_lt(rsq22,rcutoff2))
2344 r22 = _mm_mul_ps(rsq22,rinv22);
2346 /* EWALD ELECTROSTATICS */
2348 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2349 ewrt = _mm_mul_ps(r22,ewtabscale);
2350 ewitab = _mm_cvttps_epi32(ewrt);
2351 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2352 ewitab = _mm_slli_epi32(ewitab,2);
2353 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2354 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2355 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2356 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2357 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2358 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2359 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2360 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2361 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2363 d = _mm_sub_ps(r22,rswitch);
2364 d = _mm_max_ps(d,_mm_setzero_ps());
2365 d2 = _mm_mul_ps(d,d);
2366 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2368 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2370 /* Evaluate switch function */
2371 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2372 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2373 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2377 fscal = _mm_and_ps(fscal,cutoff_mask);
2379 /* Calculate temporary vectorial force */
2380 tx = _mm_mul_ps(fscal,dx22);
2381 ty = _mm_mul_ps(fscal,dy22);
2382 tz = _mm_mul_ps(fscal,dz22);
2384 /* Update vectorial force */
2385 fix2 = _mm_add_ps(fix2,tx);
2386 fiy2 = _mm_add_ps(fiy2,ty);
2387 fiz2 = _mm_add_ps(fiz2,tz);
2389 fjx2 = _mm_add_ps(fjx2,tx);
2390 fjy2 = _mm_add_ps(fjy2,ty);
2391 fjz2 = _mm_add_ps(fjz2,tz);
2395 /**************************
2396 * CALCULATE INTERACTIONS *
2397 **************************/
2399 if (gmx_mm_any_lt(rsq23,rcutoff2))
2402 r23 = _mm_mul_ps(rsq23,rinv23);
2404 /* EWALD ELECTROSTATICS */
2406 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2407 ewrt = _mm_mul_ps(r23,ewtabscale);
2408 ewitab = _mm_cvttps_epi32(ewrt);
2409 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2410 ewitab = _mm_slli_epi32(ewitab,2);
2411 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2412 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2413 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2414 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2415 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2416 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2417 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2418 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2419 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2421 d = _mm_sub_ps(r23,rswitch);
2422 d = _mm_max_ps(d,_mm_setzero_ps());
2423 d2 = _mm_mul_ps(d,d);
2424 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2426 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2428 /* Evaluate switch function */
2429 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2430 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2431 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2435 fscal = _mm_and_ps(fscal,cutoff_mask);
2437 /* Calculate temporary vectorial force */
2438 tx = _mm_mul_ps(fscal,dx23);
2439 ty = _mm_mul_ps(fscal,dy23);
2440 tz = _mm_mul_ps(fscal,dz23);
2442 /* Update vectorial force */
2443 fix2 = _mm_add_ps(fix2,tx);
2444 fiy2 = _mm_add_ps(fiy2,ty);
2445 fiz2 = _mm_add_ps(fiz2,tz);
2447 fjx3 = _mm_add_ps(fjx3,tx);
2448 fjy3 = _mm_add_ps(fjy3,ty);
2449 fjz3 = _mm_add_ps(fjz3,tz);
2453 /**************************
2454 * CALCULATE INTERACTIONS *
2455 **************************/
2457 if (gmx_mm_any_lt(rsq31,rcutoff2))
2460 r31 = _mm_mul_ps(rsq31,rinv31);
2462 /* EWALD ELECTROSTATICS */
2464 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2465 ewrt = _mm_mul_ps(r31,ewtabscale);
2466 ewitab = _mm_cvttps_epi32(ewrt);
2467 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2468 ewitab = _mm_slli_epi32(ewitab,2);
2469 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2470 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2471 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2472 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2473 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2474 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2475 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2476 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2477 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2479 d = _mm_sub_ps(r31,rswitch);
2480 d = _mm_max_ps(d,_mm_setzero_ps());
2481 d2 = _mm_mul_ps(d,d);
2482 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2484 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2486 /* Evaluate switch function */
2487 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2488 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2489 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2493 fscal = _mm_and_ps(fscal,cutoff_mask);
2495 /* Calculate temporary vectorial force */
2496 tx = _mm_mul_ps(fscal,dx31);
2497 ty = _mm_mul_ps(fscal,dy31);
2498 tz = _mm_mul_ps(fscal,dz31);
2500 /* Update vectorial force */
2501 fix3 = _mm_add_ps(fix3,tx);
2502 fiy3 = _mm_add_ps(fiy3,ty);
2503 fiz3 = _mm_add_ps(fiz3,tz);
2505 fjx1 = _mm_add_ps(fjx1,tx);
2506 fjy1 = _mm_add_ps(fjy1,ty);
2507 fjz1 = _mm_add_ps(fjz1,tz);
2511 /**************************
2512 * CALCULATE INTERACTIONS *
2513 **************************/
2515 if (gmx_mm_any_lt(rsq32,rcutoff2))
2518 r32 = _mm_mul_ps(rsq32,rinv32);
2520 /* EWALD ELECTROSTATICS */
2522 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2523 ewrt = _mm_mul_ps(r32,ewtabscale);
2524 ewitab = _mm_cvttps_epi32(ewrt);
2525 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2526 ewitab = _mm_slli_epi32(ewitab,2);
2527 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2528 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2529 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2530 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2531 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2532 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2533 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2534 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2535 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2537 d = _mm_sub_ps(r32,rswitch);
2538 d = _mm_max_ps(d,_mm_setzero_ps());
2539 d2 = _mm_mul_ps(d,d);
2540 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2542 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2544 /* Evaluate switch function */
2545 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2546 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2547 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2551 fscal = _mm_and_ps(fscal,cutoff_mask);
2553 /* Calculate temporary vectorial force */
2554 tx = _mm_mul_ps(fscal,dx32);
2555 ty = _mm_mul_ps(fscal,dy32);
2556 tz = _mm_mul_ps(fscal,dz32);
2558 /* Update vectorial force */
2559 fix3 = _mm_add_ps(fix3,tx);
2560 fiy3 = _mm_add_ps(fiy3,ty);
2561 fiz3 = _mm_add_ps(fiz3,tz);
2563 fjx2 = _mm_add_ps(fjx2,tx);
2564 fjy2 = _mm_add_ps(fjy2,ty);
2565 fjz2 = _mm_add_ps(fjz2,tz);
2569 /**************************
2570 * CALCULATE INTERACTIONS *
2571 **************************/
2573 if (gmx_mm_any_lt(rsq33,rcutoff2))
2576 r33 = _mm_mul_ps(rsq33,rinv33);
2578 /* EWALD ELECTROSTATICS */
2580 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2581 ewrt = _mm_mul_ps(r33,ewtabscale);
2582 ewitab = _mm_cvttps_epi32(ewrt);
2583 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2584 ewitab = _mm_slli_epi32(ewitab,2);
2585 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2586 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2587 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2588 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2589 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2590 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2591 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2592 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2593 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2595 d = _mm_sub_ps(r33,rswitch);
2596 d = _mm_max_ps(d,_mm_setzero_ps());
2597 d2 = _mm_mul_ps(d,d);
2598 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2600 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2602 /* Evaluate switch function */
2603 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2604 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2605 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2609 fscal = _mm_and_ps(fscal,cutoff_mask);
2611 /* Calculate temporary vectorial force */
2612 tx = _mm_mul_ps(fscal,dx33);
2613 ty = _mm_mul_ps(fscal,dy33);
2614 tz = _mm_mul_ps(fscal,dz33);
2616 /* Update vectorial force */
2617 fix3 = _mm_add_ps(fix3,tx);
2618 fiy3 = _mm_add_ps(fiy3,ty);
2619 fiz3 = _mm_add_ps(fiz3,tz);
2621 fjx3 = _mm_add_ps(fjx3,tx);
2622 fjy3 = _mm_add_ps(fjy3,ty);
2623 fjz3 = _mm_add_ps(fjz3,tz);
2627 fjptrA = f+j_coord_offsetA;
2628 fjptrB = f+j_coord_offsetB;
2629 fjptrC = f+j_coord_offsetC;
2630 fjptrD = f+j_coord_offsetD;
2632 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2633 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2634 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2636 /* Inner loop uses 617 flops */
2639 if(jidx<j_index_end)
2642 /* Get j neighbor index, and coordinate index */
2643 jnrlistA = jjnr[jidx];
2644 jnrlistB = jjnr[jidx+1];
2645 jnrlistC = jjnr[jidx+2];
2646 jnrlistD = jjnr[jidx+3];
2647 /* Sign of each element will be negative for non-real atoms.
2648 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2649 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2651 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2652 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2653 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2654 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2655 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2656 j_coord_offsetA = DIM*jnrA;
2657 j_coord_offsetB = DIM*jnrB;
2658 j_coord_offsetC = DIM*jnrC;
2659 j_coord_offsetD = DIM*jnrD;
2661 /* load j atom coordinates */
2662 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2663 x+j_coord_offsetC,x+j_coord_offsetD,
2664 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2665 &jy2,&jz2,&jx3,&jy3,&jz3);
2667 /* Calculate displacement vector */
2668 dx00 = _mm_sub_ps(ix0,jx0);
2669 dy00 = _mm_sub_ps(iy0,jy0);
2670 dz00 = _mm_sub_ps(iz0,jz0);
2671 dx11 = _mm_sub_ps(ix1,jx1);
2672 dy11 = _mm_sub_ps(iy1,jy1);
2673 dz11 = _mm_sub_ps(iz1,jz1);
2674 dx12 = _mm_sub_ps(ix1,jx2);
2675 dy12 = _mm_sub_ps(iy1,jy2);
2676 dz12 = _mm_sub_ps(iz1,jz2);
2677 dx13 = _mm_sub_ps(ix1,jx3);
2678 dy13 = _mm_sub_ps(iy1,jy3);
2679 dz13 = _mm_sub_ps(iz1,jz3);
2680 dx21 = _mm_sub_ps(ix2,jx1);
2681 dy21 = _mm_sub_ps(iy2,jy1);
2682 dz21 = _mm_sub_ps(iz2,jz1);
2683 dx22 = _mm_sub_ps(ix2,jx2);
2684 dy22 = _mm_sub_ps(iy2,jy2);
2685 dz22 = _mm_sub_ps(iz2,jz2);
2686 dx23 = _mm_sub_ps(ix2,jx3);
2687 dy23 = _mm_sub_ps(iy2,jy3);
2688 dz23 = _mm_sub_ps(iz2,jz3);
2689 dx31 = _mm_sub_ps(ix3,jx1);
2690 dy31 = _mm_sub_ps(iy3,jy1);
2691 dz31 = _mm_sub_ps(iz3,jz1);
2692 dx32 = _mm_sub_ps(ix3,jx2);
2693 dy32 = _mm_sub_ps(iy3,jy2);
2694 dz32 = _mm_sub_ps(iz3,jz2);
2695 dx33 = _mm_sub_ps(ix3,jx3);
2696 dy33 = _mm_sub_ps(iy3,jy3);
2697 dz33 = _mm_sub_ps(iz3,jz3);
2699 /* Calculate squared distance and things based on it */
2700 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2701 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2702 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2703 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2704 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2705 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2706 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2707 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2708 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2709 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2711 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2712 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2713 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2714 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2715 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2716 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2717 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2718 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2719 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2720 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2722 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2723 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2724 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2725 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2726 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2727 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2728 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2729 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2730 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2731 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2733 fjx0 = _mm_setzero_ps();
2734 fjy0 = _mm_setzero_ps();
2735 fjz0 = _mm_setzero_ps();
2736 fjx1 = _mm_setzero_ps();
2737 fjy1 = _mm_setzero_ps();
2738 fjz1 = _mm_setzero_ps();
2739 fjx2 = _mm_setzero_ps();
2740 fjy2 = _mm_setzero_ps();
2741 fjz2 = _mm_setzero_ps();
2742 fjx3 = _mm_setzero_ps();
2743 fjy3 = _mm_setzero_ps();
2744 fjz3 = _mm_setzero_ps();
2746 /**************************
2747 * CALCULATE INTERACTIONS *
2748 **************************/
2750 if (gmx_mm_any_lt(rsq00,rcutoff2))
2753 r00 = _mm_mul_ps(rsq00,rinv00);
2754 r00 = _mm_andnot_ps(dummy_mask,r00);
2756 /* LENNARD-JONES DISPERSION/REPULSION */
2758 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2759 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2760 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2761 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2762 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2764 d = _mm_sub_ps(r00,rswitch);
2765 d = _mm_max_ps(d,_mm_setzero_ps());
2766 d2 = _mm_mul_ps(d,d);
2767 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2769 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2771 /* Evaluate switch function */
2772 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2773 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2774 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2778 fscal = _mm_and_ps(fscal,cutoff_mask);
2780 fscal = _mm_andnot_ps(dummy_mask,fscal);
2782 /* Calculate temporary vectorial force */
2783 tx = _mm_mul_ps(fscal,dx00);
2784 ty = _mm_mul_ps(fscal,dy00);
2785 tz = _mm_mul_ps(fscal,dz00);
2787 /* Update vectorial force */
2788 fix0 = _mm_add_ps(fix0,tx);
2789 fiy0 = _mm_add_ps(fiy0,ty);
2790 fiz0 = _mm_add_ps(fiz0,tz);
2792 fjx0 = _mm_add_ps(fjx0,tx);
2793 fjy0 = _mm_add_ps(fjy0,ty);
2794 fjz0 = _mm_add_ps(fjz0,tz);
2798 /**************************
2799 * CALCULATE INTERACTIONS *
2800 **************************/
2802 if (gmx_mm_any_lt(rsq11,rcutoff2))
2805 r11 = _mm_mul_ps(rsq11,rinv11);
2806 r11 = _mm_andnot_ps(dummy_mask,r11);
2808 /* EWALD ELECTROSTATICS */
2810 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2811 ewrt = _mm_mul_ps(r11,ewtabscale);
2812 ewitab = _mm_cvttps_epi32(ewrt);
2813 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2814 ewitab = _mm_slli_epi32(ewitab,2);
2815 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2816 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2817 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2818 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2819 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2820 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2821 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2822 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2823 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2825 d = _mm_sub_ps(r11,rswitch);
2826 d = _mm_max_ps(d,_mm_setzero_ps());
2827 d2 = _mm_mul_ps(d,d);
2828 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2830 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2832 /* Evaluate switch function */
2833 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2834 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2835 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2839 fscal = _mm_and_ps(fscal,cutoff_mask);
2841 fscal = _mm_andnot_ps(dummy_mask,fscal);
2843 /* Calculate temporary vectorial force */
2844 tx = _mm_mul_ps(fscal,dx11);
2845 ty = _mm_mul_ps(fscal,dy11);
2846 tz = _mm_mul_ps(fscal,dz11);
2848 /* Update vectorial force */
2849 fix1 = _mm_add_ps(fix1,tx);
2850 fiy1 = _mm_add_ps(fiy1,ty);
2851 fiz1 = _mm_add_ps(fiz1,tz);
2853 fjx1 = _mm_add_ps(fjx1,tx);
2854 fjy1 = _mm_add_ps(fjy1,ty);
2855 fjz1 = _mm_add_ps(fjz1,tz);
2859 /**************************
2860 * CALCULATE INTERACTIONS *
2861 **************************/
2863 if (gmx_mm_any_lt(rsq12,rcutoff2))
2866 r12 = _mm_mul_ps(rsq12,rinv12);
2867 r12 = _mm_andnot_ps(dummy_mask,r12);
2869 /* EWALD ELECTROSTATICS */
2871 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2872 ewrt = _mm_mul_ps(r12,ewtabscale);
2873 ewitab = _mm_cvttps_epi32(ewrt);
2874 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2875 ewitab = _mm_slli_epi32(ewitab,2);
2876 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2877 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2878 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2879 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2880 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2881 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2882 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2883 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2884 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2886 d = _mm_sub_ps(r12,rswitch);
2887 d = _mm_max_ps(d,_mm_setzero_ps());
2888 d2 = _mm_mul_ps(d,d);
2889 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2891 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2893 /* Evaluate switch function */
2894 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2895 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2896 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2900 fscal = _mm_and_ps(fscal,cutoff_mask);
2902 fscal = _mm_andnot_ps(dummy_mask,fscal);
2904 /* Calculate temporary vectorial force */
2905 tx = _mm_mul_ps(fscal,dx12);
2906 ty = _mm_mul_ps(fscal,dy12);
2907 tz = _mm_mul_ps(fscal,dz12);
2909 /* Update vectorial force */
2910 fix1 = _mm_add_ps(fix1,tx);
2911 fiy1 = _mm_add_ps(fiy1,ty);
2912 fiz1 = _mm_add_ps(fiz1,tz);
2914 fjx2 = _mm_add_ps(fjx2,tx);
2915 fjy2 = _mm_add_ps(fjy2,ty);
2916 fjz2 = _mm_add_ps(fjz2,tz);
2920 /**************************
2921 * CALCULATE INTERACTIONS *
2922 **************************/
2924 if (gmx_mm_any_lt(rsq13,rcutoff2))
2927 r13 = _mm_mul_ps(rsq13,rinv13);
2928 r13 = _mm_andnot_ps(dummy_mask,r13);
2930 /* EWALD ELECTROSTATICS */
2932 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2933 ewrt = _mm_mul_ps(r13,ewtabscale);
2934 ewitab = _mm_cvttps_epi32(ewrt);
2935 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2936 ewitab = _mm_slli_epi32(ewitab,2);
2937 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2938 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2939 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2940 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2941 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2942 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2943 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2944 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2945 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2947 d = _mm_sub_ps(r13,rswitch);
2948 d = _mm_max_ps(d,_mm_setzero_ps());
2949 d2 = _mm_mul_ps(d,d);
2950 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2952 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2954 /* Evaluate switch function */
2955 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2956 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2957 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2961 fscal = _mm_and_ps(fscal,cutoff_mask);
2963 fscal = _mm_andnot_ps(dummy_mask,fscal);
2965 /* Calculate temporary vectorial force */
2966 tx = _mm_mul_ps(fscal,dx13);
2967 ty = _mm_mul_ps(fscal,dy13);
2968 tz = _mm_mul_ps(fscal,dz13);
2970 /* Update vectorial force */
2971 fix1 = _mm_add_ps(fix1,tx);
2972 fiy1 = _mm_add_ps(fiy1,ty);
2973 fiz1 = _mm_add_ps(fiz1,tz);
2975 fjx3 = _mm_add_ps(fjx3,tx);
2976 fjy3 = _mm_add_ps(fjy3,ty);
2977 fjz3 = _mm_add_ps(fjz3,tz);
2981 /**************************
2982 * CALCULATE INTERACTIONS *
2983 **************************/
2985 if (gmx_mm_any_lt(rsq21,rcutoff2))
2988 r21 = _mm_mul_ps(rsq21,rinv21);
2989 r21 = _mm_andnot_ps(dummy_mask,r21);
2991 /* EWALD ELECTROSTATICS */
2993 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2994 ewrt = _mm_mul_ps(r21,ewtabscale);
2995 ewitab = _mm_cvttps_epi32(ewrt);
2996 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2997 ewitab = _mm_slli_epi32(ewitab,2);
2998 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2999 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3000 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3001 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3002 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3003 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3004 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3005 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3006 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3008 d = _mm_sub_ps(r21,rswitch);
3009 d = _mm_max_ps(d,_mm_setzero_ps());
3010 d2 = _mm_mul_ps(d,d);
3011 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3013 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3015 /* Evaluate switch function */
3016 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3017 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3018 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
3022 fscal = _mm_and_ps(fscal,cutoff_mask);
3024 fscal = _mm_andnot_ps(dummy_mask,fscal);
3026 /* Calculate temporary vectorial force */
3027 tx = _mm_mul_ps(fscal,dx21);
3028 ty = _mm_mul_ps(fscal,dy21);
3029 tz = _mm_mul_ps(fscal,dz21);
3031 /* Update vectorial force */
3032 fix2 = _mm_add_ps(fix2,tx);
3033 fiy2 = _mm_add_ps(fiy2,ty);
3034 fiz2 = _mm_add_ps(fiz2,tz);
3036 fjx1 = _mm_add_ps(fjx1,tx);
3037 fjy1 = _mm_add_ps(fjy1,ty);
3038 fjz1 = _mm_add_ps(fjz1,tz);
3042 /**************************
3043 * CALCULATE INTERACTIONS *
3044 **************************/
3046 if (gmx_mm_any_lt(rsq22,rcutoff2))
3049 r22 = _mm_mul_ps(rsq22,rinv22);
3050 r22 = _mm_andnot_ps(dummy_mask,r22);
3052 /* EWALD ELECTROSTATICS */
3054 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3055 ewrt = _mm_mul_ps(r22,ewtabscale);
3056 ewitab = _mm_cvttps_epi32(ewrt);
3057 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3058 ewitab = _mm_slli_epi32(ewitab,2);
3059 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3060 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3061 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3062 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3063 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3064 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3065 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3066 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3067 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3069 d = _mm_sub_ps(r22,rswitch);
3070 d = _mm_max_ps(d,_mm_setzero_ps());
3071 d2 = _mm_mul_ps(d,d);
3072 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3074 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3076 /* Evaluate switch function */
3077 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3078 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3079 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3083 fscal = _mm_and_ps(fscal,cutoff_mask);
3085 fscal = _mm_andnot_ps(dummy_mask,fscal);
3087 /* Calculate temporary vectorial force */
3088 tx = _mm_mul_ps(fscal,dx22);
3089 ty = _mm_mul_ps(fscal,dy22);
3090 tz = _mm_mul_ps(fscal,dz22);
3092 /* Update vectorial force */
3093 fix2 = _mm_add_ps(fix2,tx);
3094 fiy2 = _mm_add_ps(fiy2,ty);
3095 fiz2 = _mm_add_ps(fiz2,tz);
3097 fjx2 = _mm_add_ps(fjx2,tx);
3098 fjy2 = _mm_add_ps(fjy2,ty);
3099 fjz2 = _mm_add_ps(fjz2,tz);
3103 /**************************
3104 * CALCULATE INTERACTIONS *
3105 **************************/
3107 if (gmx_mm_any_lt(rsq23,rcutoff2))
3110 r23 = _mm_mul_ps(rsq23,rinv23);
3111 r23 = _mm_andnot_ps(dummy_mask,r23);
3113 /* EWALD ELECTROSTATICS */
3115 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3116 ewrt = _mm_mul_ps(r23,ewtabscale);
3117 ewitab = _mm_cvttps_epi32(ewrt);
3118 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3119 ewitab = _mm_slli_epi32(ewitab,2);
3120 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3121 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3122 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3123 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3124 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3125 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3126 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3127 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
3128 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
3130 d = _mm_sub_ps(r23,rswitch);
3131 d = _mm_max_ps(d,_mm_setzero_ps());
3132 d2 = _mm_mul_ps(d,d);
3133 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3135 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3137 /* Evaluate switch function */
3138 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3139 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
3140 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
3144 fscal = _mm_and_ps(fscal,cutoff_mask);
3146 fscal = _mm_andnot_ps(dummy_mask,fscal);
3148 /* Calculate temporary vectorial force */
3149 tx = _mm_mul_ps(fscal,dx23);
3150 ty = _mm_mul_ps(fscal,dy23);
3151 tz = _mm_mul_ps(fscal,dz23);
3153 /* Update vectorial force */
3154 fix2 = _mm_add_ps(fix2,tx);
3155 fiy2 = _mm_add_ps(fiy2,ty);
3156 fiz2 = _mm_add_ps(fiz2,tz);
3158 fjx3 = _mm_add_ps(fjx3,tx);
3159 fjy3 = _mm_add_ps(fjy3,ty);
3160 fjz3 = _mm_add_ps(fjz3,tz);
3164 /**************************
3165 * CALCULATE INTERACTIONS *
3166 **************************/
3168 if (gmx_mm_any_lt(rsq31,rcutoff2))
3171 r31 = _mm_mul_ps(rsq31,rinv31);
3172 r31 = _mm_andnot_ps(dummy_mask,r31);
3174 /* EWALD ELECTROSTATICS */
3176 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3177 ewrt = _mm_mul_ps(r31,ewtabscale);
3178 ewitab = _mm_cvttps_epi32(ewrt);
3179 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3180 ewitab = _mm_slli_epi32(ewitab,2);
3181 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3182 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3183 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3184 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3185 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3186 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3187 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3188 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
3189 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
3191 d = _mm_sub_ps(r31,rswitch);
3192 d = _mm_max_ps(d,_mm_setzero_ps());
3193 d2 = _mm_mul_ps(d,d);
3194 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3196 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3198 /* Evaluate switch function */
3199 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3200 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
3201 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
3205 fscal = _mm_and_ps(fscal,cutoff_mask);
3207 fscal = _mm_andnot_ps(dummy_mask,fscal);
3209 /* Calculate temporary vectorial force */
3210 tx = _mm_mul_ps(fscal,dx31);
3211 ty = _mm_mul_ps(fscal,dy31);
3212 tz = _mm_mul_ps(fscal,dz31);
3214 /* Update vectorial force */
3215 fix3 = _mm_add_ps(fix3,tx);
3216 fiy3 = _mm_add_ps(fiy3,ty);
3217 fiz3 = _mm_add_ps(fiz3,tz);
3219 fjx1 = _mm_add_ps(fjx1,tx);
3220 fjy1 = _mm_add_ps(fjy1,ty);
3221 fjz1 = _mm_add_ps(fjz1,tz);
3225 /**************************
3226 * CALCULATE INTERACTIONS *
3227 **************************/
3229 if (gmx_mm_any_lt(rsq32,rcutoff2))
3232 r32 = _mm_mul_ps(rsq32,rinv32);
3233 r32 = _mm_andnot_ps(dummy_mask,r32);
3235 /* EWALD ELECTROSTATICS */
3237 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3238 ewrt = _mm_mul_ps(r32,ewtabscale);
3239 ewitab = _mm_cvttps_epi32(ewrt);
3240 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3241 ewitab = _mm_slli_epi32(ewitab,2);
3242 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3243 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3244 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3245 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3246 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3247 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3248 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3249 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
3250 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
3252 d = _mm_sub_ps(r32,rswitch);
3253 d = _mm_max_ps(d,_mm_setzero_ps());
3254 d2 = _mm_mul_ps(d,d);
3255 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3257 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3259 /* Evaluate switch function */
3260 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3261 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
3262 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
3266 fscal = _mm_and_ps(fscal,cutoff_mask);
3268 fscal = _mm_andnot_ps(dummy_mask,fscal);
3270 /* Calculate temporary vectorial force */
3271 tx = _mm_mul_ps(fscal,dx32);
3272 ty = _mm_mul_ps(fscal,dy32);
3273 tz = _mm_mul_ps(fscal,dz32);
3275 /* Update vectorial force */
3276 fix3 = _mm_add_ps(fix3,tx);
3277 fiy3 = _mm_add_ps(fiy3,ty);
3278 fiz3 = _mm_add_ps(fiz3,tz);
3280 fjx2 = _mm_add_ps(fjx2,tx);
3281 fjy2 = _mm_add_ps(fjy2,ty);
3282 fjz2 = _mm_add_ps(fjz2,tz);
3286 /**************************
3287 * CALCULATE INTERACTIONS *
3288 **************************/
3290 if (gmx_mm_any_lt(rsq33,rcutoff2))
3293 r33 = _mm_mul_ps(rsq33,rinv33);
3294 r33 = _mm_andnot_ps(dummy_mask,r33);
3296 /* EWALD ELECTROSTATICS */
3298 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3299 ewrt = _mm_mul_ps(r33,ewtabscale);
3300 ewitab = _mm_cvttps_epi32(ewrt);
3301 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3302 ewitab = _mm_slli_epi32(ewitab,2);
3303 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3304 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3305 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3306 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3307 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3308 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3309 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3310 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3311 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3313 d = _mm_sub_ps(r33,rswitch);
3314 d = _mm_max_ps(d,_mm_setzero_ps());
3315 d2 = _mm_mul_ps(d,d);
3316 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3318 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3320 /* Evaluate switch function */
3321 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3322 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3323 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
3327 fscal = _mm_and_ps(fscal,cutoff_mask);
3329 fscal = _mm_andnot_ps(dummy_mask,fscal);
3331 /* Calculate temporary vectorial force */
3332 tx = _mm_mul_ps(fscal,dx33);
3333 ty = _mm_mul_ps(fscal,dy33);
3334 tz = _mm_mul_ps(fscal,dz33);
3336 /* Update vectorial force */
3337 fix3 = _mm_add_ps(fix3,tx);
3338 fiy3 = _mm_add_ps(fiy3,ty);
3339 fiz3 = _mm_add_ps(fiz3,tz);
3341 fjx3 = _mm_add_ps(fjx3,tx);
3342 fjy3 = _mm_add_ps(fjy3,ty);
3343 fjz3 = _mm_add_ps(fjz3,tz);
3347 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3348 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3349 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3350 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3352 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3353 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3354 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3356 /* Inner loop uses 627 flops */
3359 /* End of innermost loop */
3361 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3362 f+i_coord_offset,fshift+i_shift_offset);
3364 /* Increment number of inner iterations */
3365 inneriter += j_index_end - j_index_start;
3367 /* Outer loop uses 24 flops */
3370 /* Increment number of outer iterations */
3373 /* Update outer/inner flops */
3375 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);