2 * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse4_1_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
72 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
74 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
75 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
76 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
77 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
78 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
79 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
80 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
81 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
82 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
83 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
84 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
85 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
86 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
87 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
88 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
89 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
95 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
96 real rswitch_scalar,d_scalar;
97 __m128 dummy_mask,cutoff_mask;
98 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
99 __m128 one = _mm_set1_ps(1.0);
100 __m128 two = _mm_set1_ps(2.0);
106 jindex = nlist->jindex;
108 shiftidx = nlist->shift;
110 shiftvec = fr->shift_vec[0];
111 fshift = fr->fshift[0];
112 facel = _mm_set1_ps(fr->epsfac);
113 charge = mdatoms->chargeA;
115 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
116 ewtab = fr->ic->tabq_coul_FDV0;
117 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
118 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
120 /* Setup water-specific parameters */
121 inr = nlist->iinr[0];
122 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
123 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
124 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
126 jq1 = _mm_set1_ps(charge[inr+1]);
127 jq2 = _mm_set1_ps(charge[inr+2]);
128 jq3 = _mm_set1_ps(charge[inr+3]);
129 qq11 = _mm_mul_ps(iq1,jq1);
130 qq12 = _mm_mul_ps(iq1,jq2);
131 qq13 = _mm_mul_ps(iq1,jq3);
132 qq21 = _mm_mul_ps(iq2,jq1);
133 qq22 = _mm_mul_ps(iq2,jq2);
134 qq23 = _mm_mul_ps(iq2,jq3);
135 qq31 = _mm_mul_ps(iq3,jq1);
136 qq32 = _mm_mul_ps(iq3,jq2);
137 qq33 = _mm_mul_ps(iq3,jq3);
139 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
140 rcutoff_scalar = fr->rcoulomb;
141 rcutoff = _mm_set1_ps(rcutoff_scalar);
142 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
144 rswitch_scalar = fr->rcoulomb_switch;
145 rswitch = _mm_set1_ps(rswitch_scalar);
146 /* Setup switch parameters */
147 d_scalar = rcutoff_scalar-rswitch_scalar;
148 d = _mm_set1_ps(d_scalar);
149 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
150 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
151 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
152 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
153 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
154 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
156 /* Avoid stupid compiler warnings */
157 jnrA = jnrB = jnrC = jnrD = 0;
166 for(iidx=0;iidx<4*DIM;iidx++)
171 /* Start outer loop over neighborlists */
172 for(iidx=0; iidx<nri; iidx++)
174 /* Load shift vector for this list */
175 i_shift_offset = DIM*shiftidx[iidx];
177 /* Load limits for loop over neighbors */
178 j_index_start = jindex[iidx];
179 j_index_end = jindex[iidx+1];
181 /* Get outer coordinate index */
183 i_coord_offset = DIM*inr;
185 /* Load i particle coords and add shift vector */
186 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
187 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
189 fix1 = _mm_setzero_ps();
190 fiy1 = _mm_setzero_ps();
191 fiz1 = _mm_setzero_ps();
192 fix2 = _mm_setzero_ps();
193 fiy2 = _mm_setzero_ps();
194 fiz2 = _mm_setzero_ps();
195 fix3 = _mm_setzero_ps();
196 fiy3 = _mm_setzero_ps();
197 fiz3 = _mm_setzero_ps();
199 /* Reset potential sums */
200 velecsum = _mm_setzero_ps();
202 /* Start inner kernel loop */
203 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
206 /* Get j neighbor index, and coordinate index */
211 j_coord_offsetA = DIM*jnrA;
212 j_coord_offsetB = DIM*jnrB;
213 j_coord_offsetC = DIM*jnrC;
214 j_coord_offsetD = DIM*jnrD;
216 /* load j atom coordinates */
217 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
218 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
219 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
221 /* Calculate displacement vector */
222 dx11 = _mm_sub_ps(ix1,jx1);
223 dy11 = _mm_sub_ps(iy1,jy1);
224 dz11 = _mm_sub_ps(iz1,jz1);
225 dx12 = _mm_sub_ps(ix1,jx2);
226 dy12 = _mm_sub_ps(iy1,jy2);
227 dz12 = _mm_sub_ps(iz1,jz2);
228 dx13 = _mm_sub_ps(ix1,jx3);
229 dy13 = _mm_sub_ps(iy1,jy3);
230 dz13 = _mm_sub_ps(iz1,jz3);
231 dx21 = _mm_sub_ps(ix2,jx1);
232 dy21 = _mm_sub_ps(iy2,jy1);
233 dz21 = _mm_sub_ps(iz2,jz1);
234 dx22 = _mm_sub_ps(ix2,jx2);
235 dy22 = _mm_sub_ps(iy2,jy2);
236 dz22 = _mm_sub_ps(iz2,jz2);
237 dx23 = _mm_sub_ps(ix2,jx3);
238 dy23 = _mm_sub_ps(iy2,jy3);
239 dz23 = _mm_sub_ps(iz2,jz3);
240 dx31 = _mm_sub_ps(ix3,jx1);
241 dy31 = _mm_sub_ps(iy3,jy1);
242 dz31 = _mm_sub_ps(iz3,jz1);
243 dx32 = _mm_sub_ps(ix3,jx2);
244 dy32 = _mm_sub_ps(iy3,jy2);
245 dz32 = _mm_sub_ps(iz3,jz2);
246 dx33 = _mm_sub_ps(ix3,jx3);
247 dy33 = _mm_sub_ps(iy3,jy3);
248 dz33 = _mm_sub_ps(iz3,jz3);
250 /* Calculate squared distance and things based on it */
251 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
252 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
253 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
254 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
255 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
256 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
257 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
258 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
259 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
261 rinv11 = gmx_mm_invsqrt_ps(rsq11);
262 rinv12 = gmx_mm_invsqrt_ps(rsq12);
263 rinv13 = gmx_mm_invsqrt_ps(rsq13);
264 rinv21 = gmx_mm_invsqrt_ps(rsq21);
265 rinv22 = gmx_mm_invsqrt_ps(rsq22);
266 rinv23 = gmx_mm_invsqrt_ps(rsq23);
267 rinv31 = gmx_mm_invsqrt_ps(rsq31);
268 rinv32 = gmx_mm_invsqrt_ps(rsq32);
269 rinv33 = gmx_mm_invsqrt_ps(rsq33);
271 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
272 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
273 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
274 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
275 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
276 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
277 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
278 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
279 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
281 fjx1 = _mm_setzero_ps();
282 fjy1 = _mm_setzero_ps();
283 fjz1 = _mm_setzero_ps();
284 fjx2 = _mm_setzero_ps();
285 fjy2 = _mm_setzero_ps();
286 fjz2 = _mm_setzero_ps();
287 fjx3 = _mm_setzero_ps();
288 fjy3 = _mm_setzero_ps();
289 fjz3 = _mm_setzero_ps();
291 /**************************
292 * CALCULATE INTERACTIONS *
293 **************************/
295 if (gmx_mm_any_lt(rsq11,rcutoff2))
298 r11 = _mm_mul_ps(rsq11,rinv11);
300 /* EWALD ELECTROSTATICS */
302 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
303 ewrt = _mm_mul_ps(r11,ewtabscale);
304 ewitab = _mm_cvttps_epi32(ewrt);
305 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
306 ewitab = _mm_slli_epi32(ewitab,2);
307 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
308 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
309 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
310 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
311 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
312 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
313 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
314 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
315 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
317 d = _mm_sub_ps(r11,rswitch);
318 d = _mm_max_ps(d,_mm_setzero_ps());
319 d2 = _mm_mul_ps(d,d);
320 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
322 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
324 /* Evaluate switch function */
325 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
326 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
327 velec = _mm_mul_ps(velec,sw);
328 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
330 /* Update potential sum for this i atom from the interaction with this j atom. */
331 velec = _mm_and_ps(velec,cutoff_mask);
332 velecsum = _mm_add_ps(velecsum,velec);
336 fscal = _mm_and_ps(fscal,cutoff_mask);
338 /* Calculate temporary vectorial force */
339 tx = _mm_mul_ps(fscal,dx11);
340 ty = _mm_mul_ps(fscal,dy11);
341 tz = _mm_mul_ps(fscal,dz11);
343 /* Update vectorial force */
344 fix1 = _mm_add_ps(fix1,tx);
345 fiy1 = _mm_add_ps(fiy1,ty);
346 fiz1 = _mm_add_ps(fiz1,tz);
348 fjx1 = _mm_add_ps(fjx1,tx);
349 fjy1 = _mm_add_ps(fjy1,ty);
350 fjz1 = _mm_add_ps(fjz1,tz);
354 /**************************
355 * CALCULATE INTERACTIONS *
356 **************************/
358 if (gmx_mm_any_lt(rsq12,rcutoff2))
361 r12 = _mm_mul_ps(rsq12,rinv12);
363 /* EWALD ELECTROSTATICS */
365 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
366 ewrt = _mm_mul_ps(r12,ewtabscale);
367 ewitab = _mm_cvttps_epi32(ewrt);
368 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
369 ewitab = _mm_slli_epi32(ewitab,2);
370 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
371 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
372 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
373 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
374 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
375 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
376 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
377 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
378 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
380 d = _mm_sub_ps(r12,rswitch);
381 d = _mm_max_ps(d,_mm_setzero_ps());
382 d2 = _mm_mul_ps(d,d);
383 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
385 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
387 /* Evaluate switch function */
388 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
389 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
390 velec = _mm_mul_ps(velec,sw);
391 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velec = _mm_and_ps(velec,cutoff_mask);
395 velecsum = _mm_add_ps(velecsum,velec);
399 fscal = _mm_and_ps(fscal,cutoff_mask);
401 /* Calculate temporary vectorial force */
402 tx = _mm_mul_ps(fscal,dx12);
403 ty = _mm_mul_ps(fscal,dy12);
404 tz = _mm_mul_ps(fscal,dz12);
406 /* Update vectorial force */
407 fix1 = _mm_add_ps(fix1,tx);
408 fiy1 = _mm_add_ps(fiy1,ty);
409 fiz1 = _mm_add_ps(fiz1,tz);
411 fjx2 = _mm_add_ps(fjx2,tx);
412 fjy2 = _mm_add_ps(fjy2,ty);
413 fjz2 = _mm_add_ps(fjz2,tz);
417 /**************************
418 * CALCULATE INTERACTIONS *
419 **************************/
421 if (gmx_mm_any_lt(rsq13,rcutoff2))
424 r13 = _mm_mul_ps(rsq13,rinv13);
426 /* EWALD ELECTROSTATICS */
428 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
429 ewrt = _mm_mul_ps(r13,ewtabscale);
430 ewitab = _mm_cvttps_epi32(ewrt);
431 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
432 ewitab = _mm_slli_epi32(ewitab,2);
433 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
434 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
435 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
436 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
437 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
438 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
439 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
440 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
441 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
443 d = _mm_sub_ps(r13,rswitch);
444 d = _mm_max_ps(d,_mm_setzero_ps());
445 d2 = _mm_mul_ps(d,d);
446 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
448 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
450 /* Evaluate switch function */
451 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
452 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
453 velec = _mm_mul_ps(velec,sw);
454 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
456 /* Update potential sum for this i atom from the interaction with this j atom. */
457 velec = _mm_and_ps(velec,cutoff_mask);
458 velecsum = _mm_add_ps(velecsum,velec);
462 fscal = _mm_and_ps(fscal,cutoff_mask);
464 /* Calculate temporary vectorial force */
465 tx = _mm_mul_ps(fscal,dx13);
466 ty = _mm_mul_ps(fscal,dy13);
467 tz = _mm_mul_ps(fscal,dz13);
469 /* Update vectorial force */
470 fix1 = _mm_add_ps(fix1,tx);
471 fiy1 = _mm_add_ps(fiy1,ty);
472 fiz1 = _mm_add_ps(fiz1,tz);
474 fjx3 = _mm_add_ps(fjx3,tx);
475 fjy3 = _mm_add_ps(fjy3,ty);
476 fjz3 = _mm_add_ps(fjz3,tz);
480 /**************************
481 * CALCULATE INTERACTIONS *
482 **************************/
484 if (gmx_mm_any_lt(rsq21,rcutoff2))
487 r21 = _mm_mul_ps(rsq21,rinv21);
489 /* EWALD ELECTROSTATICS */
491 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
492 ewrt = _mm_mul_ps(r21,ewtabscale);
493 ewitab = _mm_cvttps_epi32(ewrt);
494 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
495 ewitab = _mm_slli_epi32(ewitab,2);
496 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
497 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
498 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
499 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
500 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
501 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
502 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
503 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
504 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
506 d = _mm_sub_ps(r21,rswitch);
507 d = _mm_max_ps(d,_mm_setzero_ps());
508 d2 = _mm_mul_ps(d,d);
509 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
511 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
513 /* Evaluate switch function */
514 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
515 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
516 velec = _mm_mul_ps(velec,sw);
517 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
519 /* Update potential sum for this i atom from the interaction with this j atom. */
520 velec = _mm_and_ps(velec,cutoff_mask);
521 velecsum = _mm_add_ps(velecsum,velec);
525 fscal = _mm_and_ps(fscal,cutoff_mask);
527 /* Calculate temporary vectorial force */
528 tx = _mm_mul_ps(fscal,dx21);
529 ty = _mm_mul_ps(fscal,dy21);
530 tz = _mm_mul_ps(fscal,dz21);
532 /* Update vectorial force */
533 fix2 = _mm_add_ps(fix2,tx);
534 fiy2 = _mm_add_ps(fiy2,ty);
535 fiz2 = _mm_add_ps(fiz2,tz);
537 fjx1 = _mm_add_ps(fjx1,tx);
538 fjy1 = _mm_add_ps(fjy1,ty);
539 fjz1 = _mm_add_ps(fjz1,tz);
543 /**************************
544 * CALCULATE INTERACTIONS *
545 **************************/
547 if (gmx_mm_any_lt(rsq22,rcutoff2))
550 r22 = _mm_mul_ps(rsq22,rinv22);
552 /* EWALD ELECTROSTATICS */
554 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
555 ewrt = _mm_mul_ps(r22,ewtabscale);
556 ewitab = _mm_cvttps_epi32(ewrt);
557 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
558 ewitab = _mm_slli_epi32(ewitab,2);
559 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
560 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
561 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
562 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
563 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
564 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
565 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
566 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
567 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
569 d = _mm_sub_ps(r22,rswitch);
570 d = _mm_max_ps(d,_mm_setzero_ps());
571 d2 = _mm_mul_ps(d,d);
572 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
574 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
576 /* Evaluate switch function */
577 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
578 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
579 velec = _mm_mul_ps(velec,sw);
580 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
582 /* Update potential sum for this i atom from the interaction with this j atom. */
583 velec = _mm_and_ps(velec,cutoff_mask);
584 velecsum = _mm_add_ps(velecsum,velec);
588 fscal = _mm_and_ps(fscal,cutoff_mask);
590 /* Calculate temporary vectorial force */
591 tx = _mm_mul_ps(fscal,dx22);
592 ty = _mm_mul_ps(fscal,dy22);
593 tz = _mm_mul_ps(fscal,dz22);
595 /* Update vectorial force */
596 fix2 = _mm_add_ps(fix2,tx);
597 fiy2 = _mm_add_ps(fiy2,ty);
598 fiz2 = _mm_add_ps(fiz2,tz);
600 fjx2 = _mm_add_ps(fjx2,tx);
601 fjy2 = _mm_add_ps(fjy2,ty);
602 fjz2 = _mm_add_ps(fjz2,tz);
606 /**************************
607 * CALCULATE INTERACTIONS *
608 **************************/
610 if (gmx_mm_any_lt(rsq23,rcutoff2))
613 r23 = _mm_mul_ps(rsq23,rinv23);
615 /* EWALD ELECTROSTATICS */
617 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
618 ewrt = _mm_mul_ps(r23,ewtabscale);
619 ewitab = _mm_cvttps_epi32(ewrt);
620 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
621 ewitab = _mm_slli_epi32(ewitab,2);
622 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
623 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
624 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
625 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
626 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
627 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
628 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
629 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
630 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
632 d = _mm_sub_ps(r23,rswitch);
633 d = _mm_max_ps(d,_mm_setzero_ps());
634 d2 = _mm_mul_ps(d,d);
635 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
637 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
639 /* Evaluate switch function */
640 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
641 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
642 velec = _mm_mul_ps(velec,sw);
643 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
645 /* Update potential sum for this i atom from the interaction with this j atom. */
646 velec = _mm_and_ps(velec,cutoff_mask);
647 velecsum = _mm_add_ps(velecsum,velec);
651 fscal = _mm_and_ps(fscal,cutoff_mask);
653 /* Calculate temporary vectorial force */
654 tx = _mm_mul_ps(fscal,dx23);
655 ty = _mm_mul_ps(fscal,dy23);
656 tz = _mm_mul_ps(fscal,dz23);
658 /* Update vectorial force */
659 fix2 = _mm_add_ps(fix2,tx);
660 fiy2 = _mm_add_ps(fiy2,ty);
661 fiz2 = _mm_add_ps(fiz2,tz);
663 fjx3 = _mm_add_ps(fjx3,tx);
664 fjy3 = _mm_add_ps(fjy3,ty);
665 fjz3 = _mm_add_ps(fjz3,tz);
669 /**************************
670 * CALCULATE INTERACTIONS *
671 **************************/
673 if (gmx_mm_any_lt(rsq31,rcutoff2))
676 r31 = _mm_mul_ps(rsq31,rinv31);
678 /* EWALD ELECTROSTATICS */
680 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
681 ewrt = _mm_mul_ps(r31,ewtabscale);
682 ewitab = _mm_cvttps_epi32(ewrt);
683 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
684 ewitab = _mm_slli_epi32(ewitab,2);
685 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
686 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
687 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
688 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
689 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
690 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
691 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
692 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
693 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
695 d = _mm_sub_ps(r31,rswitch);
696 d = _mm_max_ps(d,_mm_setzero_ps());
697 d2 = _mm_mul_ps(d,d);
698 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
700 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
702 /* Evaluate switch function */
703 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
704 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
705 velec = _mm_mul_ps(velec,sw);
706 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
708 /* Update potential sum for this i atom from the interaction with this j atom. */
709 velec = _mm_and_ps(velec,cutoff_mask);
710 velecsum = _mm_add_ps(velecsum,velec);
714 fscal = _mm_and_ps(fscal,cutoff_mask);
716 /* Calculate temporary vectorial force */
717 tx = _mm_mul_ps(fscal,dx31);
718 ty = _mm_mul_ps(fscal,dy31);
719 tz = _mm_mul_ps(fscal,dz31);
721 /* Update vectorial force */
722 fix3 = _mm_add_ps(fix3,tx);
723 fiy3 = _mm_add_ps(fiy3,ty);
724 fiz3 = _mm_add_ps(fiz3,tz);
726 fjx1 = _mm_add_ps(fjx1,tx);
727 fjy1 = _mm_add_ps(fjy1,ty);
728 fjz1 = _mm_add_ps(fjz1,tz);
732 /**************************
733 * CALCULATE INTERACTIONS *
734 **************************/
736 if (gmx_mm_any_lt(rsq32,rcutoff2))
739 r32 = _mm_mul_ps(rsq32,rinv32);
741 /* EWALD ELECTROSTATICS */
743 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
744 ewrt = _mm_mul_ps(r32,ewtabscale);
745 ewitab = _mm_cvttps_epi32(ewrt);
746 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
747 ewitab = _mm_slli_epi32(ewitab,2);
748 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
749 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
750 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
751 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
752 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
753 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
754 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
755 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
756 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
758 d = _mm_sub_ps(r32,rswitch);
759 d = _mm_max_ps(d,_mm_setzero_ps());
760 d2 = _mm_mul_ps(d,d);
761 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
763 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
765 /* Evaluate switch function */
766 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
767 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
768 velec = _mm_mul_ps(velec,sw);
769 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
771 /* Update potential sum for this i atom from the interaction with this j atom. */
772 velec = _mm_and_ps(velec,cutoff_mask);
773 velecsum = _mm_add_ps(velecsum,velec);
777 fscal = _mm_and_ps(fscal,cutoff_mask);
779 /* Calculate temporary vectorial force */
780 tx = _mm_mul_ps(fscal,dx32);
781 ty = _mm_mul_ps(fscal,dy32);
782 tz = _mm_mul_ps(fscal,dz32);
784 /* Update vectorial force */
785 fix3 = _mm_add_ps(fix3,tx);
786 fiy3 = _mm_add_ps(fiy3,ty);
787 fiz3 = _mm_add_ps(fiz3,tz);
789 fjx2 = _mm_add_ps(fjx2,tx);
790 fjy2 = _mm_add_ps(fjy2,ty);
791 fjz2 = _mm_add_ps(fjz2,tz);
795 /**************************
796 * CALCULATE INTERACTIONS *
797 **************************/
799 if (gmx_mm_any_lt(rsq33,rcutoff2))
802 r33 = _mm_mul_ps(rsq33,rinv33);
804 /* EWALD ELECTROSTATICS */
806 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
807 ewrt = _mm_mul_ps(r33,ewtabscale);
808 ewitab = _mm_cvttps_epi32(ewrt);
809 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
810 ewitab = _mm_slli_epi32(ewitab,2);
811 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
812 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
813 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
814 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
815 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
816 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
817 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
818 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
819 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
821 d = _mm_sub_ps(r33,rswitch);
822 d = _mm_max_ps(d,_mm_setzero_ps());
823 d2 = _mm_mul_ps(d,d);
824 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
826 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
828 /* Evaluate switch function */
829 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
830 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
831 velec = _mm_mul_ps(velec,sw);
832 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
834 /* Update potential sum for this i atom from the interaction with this j atom. */
835 velec = _mm_and_ps(velec,cutoff_mask);
836 velecsum = _mm_add_ps(velecsum,velec);
840 fscal = _mm_and_ps(fscal,cutoff_mask);
842 /* Calculate temporary vectorial force */
843 tx = _mm_mul_ps(fscal,dx33);
844 ty = _mm_mul_ps(fscal,dy33);
845 tz = _mm_mul_ps(fscal,dz33);
847 /* Update vectorial force */
848 fix3 = _mm_add_ps(fix3,tx);
849 fiy3 = _mm_add_ps(fiy3,ty);
850 fiz3 = _mm_add_ps(fiz3,tz);
852 fjx3 = _mm_add_ps(fjx3,tx);
853 fjy3 = _mm_add_ps(fjy3,ty);
854 fjz3 = _mm_add_ps(fjz3,tz);
858 fjptrA = f+j_coord_offsetA;
859 fjptrB = f+j_coord_offsetB;
860 fjptrC = f+j_coord_offsetC;
861 fjptrD = f+j_coord_offsetD;
863 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
864 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
866 /* Inner loop uses 585 flops */
872 /* Get j neighbor index, and coordinate index */
873 jnrlistA = jjnr[jidx];
874 jnrlistB = jjnr[jidx+1];
875 jnrlistC = jjnr[jidx+2];
876 jnrlistD = jjnr[jidx+3];
877 /* Sign of each element will be negative for non-real atoms.
878 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
879 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
881 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
882 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
883 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
884 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
885 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
886 j_coord_offsetA = DIM*jnrA;
887 j_coord_offsetB = DIM*jnrB;
888 j_coord_offsetC = DIM*jnrC;
889 j_coord_offsetD = DIM*jnrD;
891 /* load j atom coordinates */
892 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
893 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
894 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
896 /* Calculate displacement vector */
897 dx11 = _mm_sub_ps(ix1,jx1);
898 dy11 = _mm_sub_ps(iy1,jy1);
899 dz11 = _mm_sub_ps(iz1,jz1);
900 dx12 = _mm_sub_ps(ix1,jx2);
901 dy12 = _mm_sub_ps(iy1,jy2);
902 dz12 = _mm_sub_ps(iz1,jz2);
903 dx13 = _mm_sub_ps(ix1,jx3);
904 dy13 = _mm_sub_ps(iy1,jy3);
905 dz13 = _mm_sub_ps(iz1,jz3);
906 dx21 = _mm_sub_ps(ix2,jx1);
907 dy21 = _mm_sub_ps(iy2,jy1);
908 dz21 = _mm_sub_ps(iz2,jz1);
909 dx22 = _mm_sub_ps(ix2,jx2);
910 dy22 = _mm_sub_ps(iy2,jy2);
911 dz22 = _mm_sub_ps(iz2,jz2);
912 dx23 = _mm_sub_ps(ix2,jx3);
913 dy23 = _mm_sub_ps(iy2,jy3);
914 dz23 = _mm_sub_ps(iz2,jz3);
915 dx31 = _mm_sub_ps(ix3,jx1);
916 dy31 = _mm_sub_ps(iy3,jy1);
917 dz31 = _mm_sub_ps(iz3,jz1);
918 dx32 = _mm_sub_ps(ix3,jx2);
919 dy32 = _mm_sub_ps(iy3,jy2);
920 dz32 = _mm_sub_ps(iz3,jz2);
921 dx33 = _mm_sub_ps(ix3,jx3);
922 dy33 = _mm_sub_ps(iy3,jy3);
923 dz33 = _mm_sub_ps(iz3,jz3);
925 /* Calculate squared distance and things based on it */
926 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
927 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
928 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
929 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
930 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
931 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
932 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
933 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
934 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
936 rinv11 = gmx_mm_invsqrt_ps(rsq11);
937 rinv12 = gmx_mm_invsqrt_ps(rsq12);
938 rinv13 = gmx_mm_invsqrt_ps(rsq13);
939 rinv21 = gmx_mm_invsqrt_ps(rsq21);
940 rinv22 = gmx_mm_invsqrt_ps(rsq22);
941 rinv23 = gmx_mm_invsqrt_ps(rsq23);
942 rinv31 = gmx_mm_invsqrt_ps(rsq31);
943 rinv32 = gmx_mm_invsqrt_ps(rsq32);
944 rinv33 = gmx_mm_invsqrt_ps(rsq33);
946 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
947 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
948 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
949 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
950 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
951 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
952 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
953 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
954 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
956 fjx1 = _mm_setzero_ps();
957 fjy1 = _mm_setzero_ps();
958 fjz1 = _mm_setzero_ps();
959 fjx2 = _mm_setzero_ps();
960 fjy2 = _mm_setzero_ps();
961 fjz2 = _mm_setzero_ps();
962 fjx3 = _mm_setzero_ps();
963 fjy3 = _mm_setzero_ps();
964 fjz3 = _mm_setzero_ps();
966 /**************************
967 * CALCULATE INTERACTIONS *
968 **************************/
970 if (gmx_mm_any_lt(rsq11,rcutoff2))
973 r11 = _mm_mul_ps(rsq11,rinv11);
974 r11 = _mm_andnot_ps(dummy_mask,r11);
976 /* EWALD ELECTROSTATICS */
978 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
979 ewrt = _mm_mul_ps(r11,ewtabscale);
980 ewitab = _mm_cvttps_epi32(ewrt);
981 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
982 ewitab = _mm_slli_epi32(ewitab,2);
983 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
984 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
985 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
986 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
987 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
988 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
989 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
990 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
991 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
993 d = _mm_sub_ps(r11,rswitch);
994 d = _mm_max_ps(d,_mm_setzero_ps());
995 d2 = _mm_mul_ps(d,d);
996 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
998 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1000 /* Evaluate switch function */
1001 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1002 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1003 velec = _mm_mul_ps(velec,sw);
1004 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1006 /* Update potential sum for this i atom from the interaction with this j atom. */
1007 velec = _mm_and_ps(velec,cutoff_mask);
1008 velec = _mm_andnot_ps(dummy_mask,velec);
1009 velecsum = _mm_add_ps(velecsum,velec);
1013 fscal = _mm_and_ps(fscal,cutoff_mask);
1015 fscal = _mm_andnot_ps(dummy_mask,fscal);
1017 /* Calculate temporary vectorial force */
1018 tx = _mm_mul_ps(fscal,dx11);
1019 ty = _mm_mul_ps(fscal,dy11);
1020 tz = _mm_mul_ps(fscal,dz11);
1022 /* Update vectorial force */
1023 fix1 = _mm_add_ps(fix1,tx);
1024 fiy1 = _mm_add_ps(fiy1,ty);
1025 fiz1 = _mm_add_ps(fiz1,tz);
1027 fjx1 = _mm_add_ps(fjx1,tx);
1028 fjy1 = _mm_add_ps(fjy1,ty);
1029 fjz1 = _mm_add_ps(fjz1,tz);
1033 /**************************
1034 * CALCULATE INTERACTIONS *
1035 **************************/
1037 if (gmx_mm_any_lt(rsq12,rcutoff2))
1040 r12 = _mm_mul_ps(rsq12,rinv12);
1041 r12 = _mm_andnot_ps(dummy_mask,r12);
1043 /* EWALD ELECTROSTATICS */
1045 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1046 ewrt = _mm_mul_ps(r12,ewtabscale);
1047 ewitab = _mm_cvttps_epi32(ewrt);
1048 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1049 ewitab = _mm_slli_epi32(ewitab,2);
1050 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1051 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1052 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1053 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1054 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1055 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1056 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1057 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1058 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1060 d = _mm_sub_ps(r12,rswitch);
1061 d = _mm_max_ps(d,_mm_setzero_ps());
1062 d2 = _mm_mul_ps(d,d);
1063 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1065 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1067 /* Evaluate switch function */
1068 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1069 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1070 velec = _mm_mul_ps(velec,sw);
1071 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1073 /* Update potential sum for this i atom from the interaction with this j atom. */
1074 velec = _mm_and_ps(velec,cutoff_mask);
1075 velec = _mm_andnot_ps(dummy_mask,velec);
1076 velecsum = _mm_add_ps(velecsum,velec);
1080 fscal = _mm_and_ps(fscal,cutoff_mask);
1082 fscal = _mm_andnot_ps(dummy_mask,fscal);
1084 /* Calculate temporary vectorial force */
1085 tx = _mm_mul_ps(fscal,dx12);
1086 ty = _mm_mul_ps(fscal,dy12);
1087 tz = _mm_mul_ps(fscal,dz12);
1089 /* Update vectorial force */
1090 fix1 = _mm_add_ps(fix1,tx);
1091 fiy1 = _mm_add_ps(fiy1,ty);
1092 fiz1 = _mm_add_ps(fiz1,tz);
1094 fjx2 = _mm_add_ps(fjx2,tx);
1095 fjy2 = _mm_add_ps(fjy2,ty);
1096 fjz2 = _mm_add_ps(fjz2,tz);
1100 /**************************
1101 * CALCULATE INTERACTIONS *
1102 **************************/
1104 if (gmx_mm_any_lt(rsq13,rcutoff2))
1107 r13 = _mm_mul_ps(rsq13,rinv13);
1108 r13 = _mm_andnot_ps(dummy_mask,r13);
1110 /* EWALD ELECTROSTATICS */
1112 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1113 ewrt = _mm_mul_ps(r13,ewtabscale);
1114 ewitab = _mm_cvttps_epi32(ewrt);
1115 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1116 ewitab = _mm_slli_epi32(ewitab,2);
1117 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1118 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1119 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1120 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1121 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1122 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1123 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1124 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1125 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1127 d = _mm_sub_ps(r13,rswitch);
1128 d = _mm_max_ps(d,_mm_setzero_ps());
1129 d2 = _mm_mul_ps(d,d);
1130 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1132 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1134 /* Evaluate switch function */
1135 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1136 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1137 velec = _mm_mul_ps(velec,sw);
1138 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1140 /* Update potential sum for this i atom from the interaction with this j atom. */
1141 velec = _mm_and_ps(velec,cutoff_mask);
1142 velec = _mm_andnot_ps(dummy_mask,velec);
1143 velecsum = _mm_add_ps(velecsum,velec);
1147 fscal = _mm_and_ps(fscal,cutoff_mask);
1149 fscal = _mm_andnot_ps(dummy_mask,fscal);
1151 /* Calculate temporary vectorial force */
1152 tx = _mm_mul_ps(fscal,dx13);
1153 ty = _mm_mul_ps(fscal,dy13);
1154 tz = _mm_mul_ps(fscal,dz13);
1156 /* Update vectorial force */
1157 fix1 = _mm_add_ps(fix1,tx);
1158 fiy1 = _mm_add_ps(fiy1,ty);
1159 fiz1 = _mm_add_ps(fiz1,tz);
1161 fjx3 = _mm_add_ps(fjx3,tx);
1162 fjy3 = _mm_add_ps(fjy3,ty);
1163 fjz3 = _mm_add_ps(fjz3,tz);
1167 /**************************
1168 * CALCULATE INTERACTIONS *
1169 **************************/
1171 if (gmx_mm_any_lt(rsq21,rcutoff2))
1174 r21 = _mm_mul_ps(rsq21,rinv21);
1175 r21 = _mm_andnot_ps(dummy_mask,r21);
1177 /* EWALD ELECTROSTATICS */
1179 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1180 ewrt = _mm_mul_ps(r21,ewtabscale);
1181 ewitab = _mm_cvttps_epi32(ewrt);
1182 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1183 ewitab = _mm_slli_epi32(ewitab,2);
1184 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1185 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1186 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1187 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1188 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1189 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1190 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1191 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1192 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1194 d = _mm_sub_ps(r21,rswitch);
1195 d = _mm_max_ps(d,_mm_setzero_ps());
1196 d2 = _mm_mul_ps(d,d);
1197 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1199 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1201 /* Evaluate switch function */
1202 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1203 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1204 velec = _mm_mul_ps(velec,sw);
1205 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1207 /* Update potential sum for this i atom from the interaction with this j atom. */
1208 velec = _mm_and_ps(velec,cutoff_mask);
1209 velec = _mm_andnot_ps(dummy_mask,velec);
1210 velecsum = _mm_add_ps(velecsum,velec);
1214 fscal = _mm_and_ps(fscal,cutoff_mask);
1216 fscal = _mm_andnot_ps(dummy_mask,fscal);
1218 /* Calculate temporary vectorial force */
1219 tx = _mm_mul_ps(fscal,dx21);
1220 ty = _mm_mul_ps(fscal,dy21);
1221 tz = _mm_mul_ps(fscal,dz21);
1223 /* Update vectorial force */
1224 fix2 = _mm_add_ps(fix2,tx);
1225 fiy2 = _mm_add_ps(fiy2,ty);
1226 fiz2 = _mm_add_ps(fiz2,tz);
1228 fjx1 = _mm_add_ps(fjx1,tx);
1229 fjy1 = _mm_add_ps(fjy1,ty);
1230 fjz1 = _mm_add_ps(fjz1,tz);
1234 /**************************
1235 * CALCULATE INTERACTIONS *
1236 **************************/
1238 if (gmx_mm_any_lt(rsq22,rcutoff2))
1241 r22 = _mm_mul_ps(rsq22,rinv22);
1242 r22 = _mm_andnot_ps(dummy_mask,r22);
1244 /* EWALD ELECTROSTATICS */
1246 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1247 ewrt = _mm_mul_ps(r22,ewtabscale);
1248 ewitab = _mm_cvttps_epi32(ewrt);
1249 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1250 ewitab = _mm_slli_epi32(ewitab,2);
1251 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1252 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1253 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1254 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1255 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1256 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1257 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1258 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1259 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1261 d = _mm_sub_ps(r22,rswitch);
1262 d = _mm_max_ps(d,_mm_setzero_ps());
1263 d2 = _mm_mul_ps(d,d);
1264 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1266 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1268 /* Evaluate switch function */
1269 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1270 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1271 velec = _mm_mul_ps(velec,sw);
1272 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1274 /* Update potential sum for this i atom from the interaction with this j atom. */
1275 velec = _mm_and_ps(velec,cutoff_mask);
1276 velec = _mm_andnot_ps(dummy_mask,velec);
1277 velecsum = _mm_add_ps(velecsum,velec);
1281 fscal = _mm_and_ps(fscal,cutoff_mask);
1283 fscal = _mm_andnot_ps(dummy_mask,fscal);
1285 /* Calculate temporary vectorial force */
1286 tx = _mm_mul_ps(fscal,dx22);
1287 ty = _mm_mul_ps(fscal,dy22);
1288 tz = _mm_mul_ps(fscal,dz22);
1290 /* Update vectorial force */
1291 fix2 = _mm_add_ps(fix2,tx);
1292 fiy2 = _mm_add_ps(fiy2,ty);
1293 fiz2 = _mm_add_ps(fiz2,tz);
1295 fjx2 = _mm_add_ps(fjx2,tx);
1296 fjy2 = _mm_add_ps(fjy2,ty);
1297 fjz2 = _mm_add_ps(fjz2,tz);
1301 /**************************
1302 * CALCULATE INTERACTIONS *
1303 **************************/
1305 if (gmx_mm_any_lt(rsq23,rcutoff2))
1308 r23 = _mm_mul_ps(rsq23,rinv23);
1309 r23 = _mm_andnot_ps(dummy_mask,r23);
1311 /* EWALD ELECTROSTATICS */
1313 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1314 ewrt = _mm_mul_ps(r23,ewtabscale);
1315 ewitab = _mm_cvttps_epi32(ewrt);
1316 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1317 ewitab = _mm_slli_epi32(ewitab,2);
1318 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1319 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1320 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1321 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1322 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1323 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1324 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1325 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1326 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1328 d = _mm_sub_ps(r23,rswitch);
1329 d = _mm_max_ps(d,_mm_setzero_ps());
1330 d2 = _mm_mul_ps(d,d);
1331 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1333 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1335 /* Evaluate switch function */
1336 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1337 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1338 velec = _mm_mul_ps(velec,sw);
1339 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1341 /* Update potential sum for this i atom from the interaction with this j atom. */
1342 velec = _mm_and_ps(velec,cutoff_mask);
1343 velec = _mm_andnot_ps(dummy_mask,velec);
1344 velecsum = _mm_add_ps(velecsum,velec);
1348 fscal = _mm_and_ps(fscal,cutoff_mask);
1350 fscal = _mm_andnot_ps(dummy_mask,fscal);
1352 /* Calculate temporary vectorial force */
1353 tx = _mm_mul_ps(fscal,dx23);
1354 ty = _mm_mul_ps(fscal,dy23);
1355 tz = _mm_mul_ps(fscal,dz23);
1357 /* Update vectorial force */
1358 fix2 = _mm_add_ps(fix2,tx);
1359 fiy2 = _mm_add_ps(fiy2,ty);
1360 fiz2 = _mm_add_ps(fiz2,tz);
1362 fjx3 = _mm_add_ps(fjx3,tx);
1363 fjy3 = _mm_add_ps(fjy3,ty);
1364 fjz3 = _mm_add_ps(fjz3,tz);
1368 /**************************
1369 * CALCULATE INTERACTIONS *
1370 **************************/
1372 if (gmx_mm_any_lt(rsq31,rcutoff2))
1375 r31 = _mm_mul_ps(rsq31,rinv31);
1376 r31 = _mm_andnot_ps(dummy_mask,r31);
1378 /* EWALD ELECTROSTATICS */
1380 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1381 ewrt = _mm_mul_ps(r31,ewtabscale);
1382 ewitab = _mm_cvttps_epi32(ewrt);
1383 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1384 ewitab = _mm_slli_epi32(ewitab,2);
1385 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1386 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1387 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1388 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1389 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1390 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1391 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1392 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1393 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1395 d = _mm_sub_ps(r31,rswitch);
1396 d = _mm_max_ps(d,_mm_setzero_ps());
1397 d2 = _mm_mul_ps(d,d);
1398 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1400 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1402 /* Evaluate switch function */
1403 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1404 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1405 velec = _mm_mul_ps(velec,sw);
1406 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1408 /* Update potential sum for this i atom from the interaction with this j atom. */
1409 velec = _mm_and_ps(velec,cutoff_mask);
1410 velec = _mm_andnot_ps(dummy_mask,velec);
1411 velecsum = _mm_add_ps(velecsum,velec);
1415 fscal = _mm_and_ps(fscal,cutoff_mask);
1417 fscal = _mm_andnot_ps(dummy_mask,fscal);
1419 /* Calculate temporary vectorial force */
1420 tx = _mm_mul_ps(fscal,dx31);
1421 ty = _mm_mul_ps(fscal,dy31);
1422 tz = _mm_mul_ps(fscal,dz31);
1424 /* Update vectorial force */
1425 fix3 = _mm_add_ps(fix3,tx);
1426 fiy3 = _mm_add_ps(fiy3,ty);
1427 fiz3 = _mm_add_ps(fiz3,tz);
1429 fjx1 = _mm_add_ps(fjx1,tx);
1430 fjy1 = _mm_add_ps(fjy1,ty);
1431 fjz1 = _mm_add_ps(fjz1,tz);
1435 /**************************
1436 * CALCULATE INTERACTIONS *
1437 **************************/
1439 if (gmx_mm_any_lt(rsq32,rcutoff2))
1442 r32 = _mm_mul_ps(rsq32,rinv32);
1443 r32 = _mm_andnot_ps(dummy_mask,r32);
1445 /* EWALD ELECTROSTATICS */
1447 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1448 ewrt = _mm_mul_ps(r32,ewtabscale);
1449 ewitab = _mm_cvttps_epi32(ewrt);
1450 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1451 ewitab = _mm_slli_epi32(ewitab,2);
1452 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1453 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1454 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1455 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1456 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1457 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1458 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1459 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1460 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1462 d = _mm_sub_ps(r32,rswitch);
1463 d = _mm_max_ps(d,_mm_setzero_ps());
1464 d2 = _mm_mul_ps(d,d);
1465 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1467 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1469 /* Evaluate switch function */
1470 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1471 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1472 velec = _mm_mul_ps(velec,sw);
1473 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1475 /* Update potential sum for this i atom from the interaction with this j atom. */
1476 velec = _mm_and_ps(velec,cutoff_mask);
1477 velec = _mm_andnot_ps(dummy_mask,velec);
1478 velecsum = _mm_add_ps(velecsum,velec);
1482 fscal = _mm_and_ps(fscal,cutoff_mask);
1484 fscal = _mm_andnot_ps(dummy_mask,fscal);
1486 /* Calculate temporary vectorial force */
1487 tx = _mm_mul_ps(fscal,dx32);
1488 ty = _mm_mul_ps(fscal,dy32);
1489 tz = _mm_mul_ps(fscal,dz32);
1491 /* Update vectorial force */
1492 fix3 = _mm_add_ps(fix3,tx);
1493 fiy3 = _mm_add_ps(fiy3,ty);
1494 fiz3 = _mm_add_ps(fiz3,tz);
1496 fjx2 = _mm_add_ps(fjx2,tx);
1497 fjy2 = _mm_add_ps(fjy2,ty);
1498 fjz2 = _mm_add_ps(fjz2,tz);
1502 /**************************
1503 * CALCULATE INTERACTIONS *
1504 **************************/
1506 if (gmx_mm_any_lt(rsq33,rcutoff2))
1509 r33 = _mm_mul_ps(rsq33,rinv33);
1510 r33 = _mm_andnot_ps(dummy_mask,r33);
1512 /* EWALD ELECTROSTATICS */
1514 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1515 ewrt = _mm_mul_ps(r33,ewtabscale);
1516 ewitab = _mm_cvttps_epi32(ewrt);
1517 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1518 ewitab = _mm_slli_epi32(ewitab,2);
1519 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1520 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1521 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1522 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1523 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1524 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1525 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1526 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1527 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1529 d = _mm_sub_ps(r33,rswitch);
1530 d = _mm_max_ps(d,_mm_setzero_ps());
1531 d2 = _mm_mul_ps(d,d);
1532 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1534 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1536 /* Evaluate switch function */
1537 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1538 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1539 velec = _mm_mul_ps(velec,sw);
1540 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1542 /* Update potential sum for this i atom from the interaction with this j atom. */
1543 velec = _mm_and_ps(velec,cutoff_mask);
1544 velec = _mm_andnot_ps(dummy_mask,velec);
1545 velecsum = _mm_add_ps(velecsum,velec);
1549 fscal = _mm_and_ps(fscal,cutoff_mask);
1551 fscal = _mm_andnot_ps(dummy_mask,fscal);
1553 /* Calculate temporary vectorial force */
1554 tx = _mm_mul_ps(fscal,dx33);
1555 ty = _mm_mul_ps(fscal,dy33);
1556 tz = _mm_mul_ps(fscal,dz33);
1558 /* Update vectorial force */
1559 fix3 = _mm_add_ps(fix3,tx);
1560 fiy3 = _mm_add_ps(fiy3,ty);
1561 fiz3 = _mm_add_ps(fiz3,tz);
1563 fjx3 = _mm_add_ps(fjx3,tx);
1564 fjy3 = _mm_add_ps(fjy3,ty);
1565 fjz3 = _mm_add_ps(fjz3,tz);
1569 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1570 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1571 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1572 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1574 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1575 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1577 /* Inner loop uses 594 flops */
1580 /* End of innermost loop */
1582 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1583 f+i_coord_offset+DIM,fshift+i_shift_offset);
1586 /* Update potential energies */
1587 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1589 /* Increment number of inner iterations */
1590 inneriter += j_index_end - j_index_start;
1592 /* Outer loop uses 19 flops */
1595 /* Increment number of outer iterations */
1598 /* Update outer/inner flops */
1600 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*594);
1603 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_single
1604 * Electrostatics interaction: Ewald
1605 * VdW interaction: None
1606 * Geometry: Water4-Water4
1607 * Calculate force/pot: Force
1610 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse4_1_single
1611 (t_nblist * gmx_restrict nlist,
1612 rvec * gmx_restrict xx,
1613 rvec * gmx_restrict ff,
1614 t_forcerec * gmx_restrict fr,
1615 t_mdatoms * gmx_restrict mdatoms,
1616 nb_kernel_data_t * gmx_restrict kernel_data,
1617 t_nrnb * gmx_restrict nrnb)
1619 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1620 * just 0 for non-waters.
1621 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1622 * jnr indices corresponding to data put in the four positions in the SIMD register.
1624 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1625 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1626 int jnrA,jnrB,jnrC,jnrD;
1627 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1628 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1629 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1630 real rcutoff_scalar;
1631 real *shiftvec,*fshift,*x,*f;
1632 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1633 real scratch[4*DIM];
1634 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1636 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1638 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1640 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1641 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1642 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1643 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1644 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1645 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1646 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1647 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1648 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1649 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1650 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1651 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1652 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1653 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1654 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1655 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1656 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1659 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1661 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1662 real rswitch_scalar,d_scalar;
1663 __m128 dummy_mask,cutoff_mask;
1664 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1665 __m128 one = _mm_set1_ps(1.0);
1666 __m128 two = _mm_set1_ps(2.0);
1672 jindex = nlist->jindex;
1674 shiftidx = nlist->shift;
1676 shiftvec = fr->shift_vec[0];
1677 fshift = fr->fshift[0];
1678 facel = _mm_set1_ps(fr->epsfac);
1679 charge = mdatoms->chargeA;
1681 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1682 ewtab = fr->ic->tabq_coul_FDV0;
1683 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1684 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1686 /* Setup water-specific parameters */
1687 inr = nlist->iinr[0];
1688 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1689 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1690 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1692 jq1 = _mm_set1_ps(charge[inr+1]);
1693 jq2 = _mm_set1_ps(charge[inr+2]);
1694 jq3 = _mm_set1_ps(charge[inr+3]);
1695 qq11 = _mm_mul_ps(iq1,jq1);
1696 qq12 = _mm_mul_ps(iq1,jq2);
1697 qq13 = _mm_mul_ps(iq1,jq3);
1698 qq21 = _mm_mul_ps(iq2,jq1);
1699 qq22 = _mm_mul_ps(iq2,jq2);
1700 qq23 = _mm_mul_ps(iq2,jq3);
1701 qq31 = _mm_mul_ps(iq3,jq1);
1702 qq32 = _mm_mul_ps(iq3,jq2);
1703 qq33 = _mm_mul_ps(iq3,jq3);
1705 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1706 rcutoff_scalar = fr->rcoulomb;
1707 rcutoff = _mm_set1_ps(rcutoff_scalar);
1708 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1710 rswitch_scalar = fr->rcoulomb_switch;
1711 rswitch = _mm_set1_ps(rswitch_scalar);
1712 /* Setup switch parameters */
1713 d_scalar = rcutoff_scalar-rswitch_scalar;
1714 d = _mm_set1_ps(d_scalar);
1715 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1716 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1717 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1718 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1719 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1720 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1722 /* Avoid stupid compiler warnings */
1723 jnrA = jnrB = jnrC = jnrD = 0;
1724 j_coord_offsetA = 0;
1725 j_coord_offsetB = 0;
1726 j_coord_offsetC = 0;
1727 j_coord_offsetD = 0;
1732 for(iidx=0;iidx<4*DIM;iidx++)
1734 scratch[iidx] = 0.0;
1737 /* Start outer loop over neighborlists */
1738 for(iidx=0; iidx<nri; iidx++)
1740 /* Load shift vector for this list */
1741 i_shift_offset = DIM*shiftidx[iidx];
1743 /* Load limits for loop over neighbors */
1744 j_index_start = jindex[iidx];
1745 j_index_end = jindex[iidx+1];
1747 /* Get outer coordinate index */
1749 i_coord_offset = DIM*inr;
1751 /* Load i particle coords and add shift vector */
1752 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1753 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1755 fix1 = _mm_setzero_ps();
1756 fiy1 = _mm_setzero_ps();
1757 fiz1 = _mm_setzero_ps();
1758 fix2 = _mm_setzero_ps();
1759 fiy2 = _mm_setzero_ps();
1760 fiz2 = _mm_setzero_ps();
1761 fix3 = _mm_setzero_ps();
1762 fiy3 = _mm_setzero_ps();
1763 fiz3 = _mm_setzero_ps();
1765 /* Start inner kernel loop */
1766 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1769 /* Get j neighbor index, and coordinate index */
1771 jnrB = jjnr[jidx+1];
1772 jnrC = jjnr[jidx+2];
1773 jnrD = jjnr[jidx+3];
1774 j_coord_offsetA = DIM*jnrA;
1775 j_coord_offsetB = DIM*jnrB;
1776 j_coord_offsetC = DIM*jnrC;
1777 j_coord_offsetD = DIM*jnrD;
1779 /* load j atom coordinates */
1780 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1781 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1782 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1784 /* Calculate displacement vector */
1785 dx11 = _mm_sub_ps(ix1,jx1);
1786 dy11 = _mm_sub_ps(iy1,jy1);
1787 dz11 = _mm_sub_ps(iz1,jz1);
1788 dx12 = _mm_sub_ps(ix1,jx2);
1789 dy12 = _mm_sub_ps(iy1,jy2);
1790 dz12 = _mm_sub_ps(iz1,jz2);
1791 dx13 = _mm_sub_ps(ix1,jx3);
1792 dy13 = _mm_sub_ps(iy1,jy3);
1793 dz13 = _mm_sub_ps(iz1,jz3);
1794 dx21 = _mm_sub_ps(ix2,jx1);
1795 dy21 = _mm_sub_ps(iy2,jy1);
1796 dz21 = _mm_sub_ps(iz2,jz1);
1797 dx22 = _mm_sub_ps(ix2,jx2);
1798 dy22 = _mm_sub_ps(iy2,jy2);
1799 dz22 = _mm_sub_ps(iz2,jz2);
1800 dx23 = _mm_sub_ps(ix2,jx3);
1801 dy23 = _mm_sub_ps(iy2,jy3);
1802 dz23 = _mm_sub_ps(iz2,jz3);
1803 dx31 = _mm_sub_ps(ix3,jx1);
1804 dy31 = _mm_sub_ps(iy3,jy1);
1805 dz31 = _mm_sub_ps(iz3,jz1);
1806 dx32 = _mm_sub_ps(ix3,jx2);
1807 dy32 = _mm_sub_ps(iy3,jy2);
1808 dz32 = _mm_sub_ps(iz3,jz2);
1809 dx33 = _mm_sub_ps(ix3,jx3);
1810 dy33 = _mm_sub_ps(iy3,jy3);
1811 dz33 = _mm_sub_ps(iz3,jz3);
1813 /* Calculate squared distance and things based on it */
1814 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1815 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1816 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1817 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1818 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1819 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1820 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1821 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1822 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1824 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1825 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1826 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1827 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1828 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1829 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1830 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1831 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1832 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1834 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1835 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1836 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1837 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1838 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1839 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1840 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1841 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1842 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1844 fjx1 = _mm_setzero_ps();
1845 fjy1 = _mm_setzero_ps();
1846 fjz1 = _mm_setzero_ps();
1847 fjx2 = _mm_setzero_ps();
1848 fjy2 = _mm_setzero_ps();
1849 fjz2 = _mm_setzero_ps();
1850 fjx3 = _mm_setzero_ps();
1851 fjy3 = _mm_setzero_ps();
1852 fjz3 = _mm_setzero_ps();
1854 /**************************
1855 * CALCULATE INTERACTIONS *
1856 **************************/
1858 if (gmx_mm_any_lt(rsq11,rcutoff2))
1861 r11 = _mm_mul_ps(rsq11,rinv11);
1863 /* EWALD ELECTROSTATICS */
1865 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1866 ewrt = _mm_mul_ps(r11,ewtabscale);
1867 ewitab = _mm_cvttps_epi32(ewrt);
1868 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1869 ewitab = _mm_slli_epi32(ewitab,2);
1870 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1871 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1872 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1873 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1874 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1875 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1876 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1877 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1878 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1880 d = _mm_sub_ps(r11,rswitch);
1881 d = _mm_max_ps(d,_mm_setzero_ps());
1882 d2 = _mm_mul_ps(d,d);
1883 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1885 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1887 /* Evaluate switch function */
1888 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1889 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1890 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1894 fscal = _mm_and_ps(fscal,cutoff_mask);
1896 /* Calculate temporary vectorial force */
1897 tx = _mm_mul_ps(fscal,dx11);
1898 ty = _mm_mul_ps(fscal,dy11);
1899 tz = _mm_mul_ps(fscal,dz11);
1901 /* Update vectorial force */
1902 fix1 = _mm_add_ps(fix1,tx);
1903 fiy1 = _mm_add_ps(fiy1,ty);
1904 fiz1 = _mm_add_ps(fiz1,tz);
1906 fjx1 = _mm_add_ps(fjx1,tx);
1907 fjy1 = _mm_add_ps(fjy1,ty);
1908 fjz1 = _mm_add_ps(fjz1,tz);
1912 /**************************
1913 * CALCULATE INTERACTIONS *
1914 **************************/
1916 if (gmx_mm_any_lt(rsq12,rcutoff2))
1919 r12 = _mm_mul_ps(rsq12,rinv12);
1921 /* EWALD ELECTROSTATICS */
1923 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1924 ewrt = _mm_mul_ps(r12,ewtabscale);
1925 ewitab = _mm_cvttps_epi32(ewrt);
1926 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1927 ewitab = _mm_slli_epi32(ewitab,2);
1928 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1929 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1930 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1931 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1932 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1933 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1934 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1935 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1936 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1938 d = _mm_sub_ps(r12,rswitch);
1939 d = _mm_max_ps(d,_mm_setzero_ps());
1940 d2 = _mm_mul_ps(d,d);
1941 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1943 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1945 /* Evaluate switch function */
1946 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1947 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1948 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1952 fscal = _mm_and_ps(fscal,cutoff_mask);
1954 /* Calculate temporary vectorial force */
1955 tx = _mm_mul_ps(fscal,dx12);
1956 ty = _mm_mul_ps(fscal,dy12);
1957 tz = _mm_mul_ps(fscal,dz12);
1959 /* Update vectorial force */
1960 fix1 = _mm_add_ps(fix1,tx);
1961 fiy1 = _mm_add_ps(fiy1,ty);
1962 fiz1 = _mm_add_ps(fiz1,tz);
1964 fjx2 = _mm_add_ps(fjx2,tx);
1965 fjy2 = _mm_add_ps(fjy2,ty);
1966 fjz2 = _mm_add_ps(fjz2,tz);
1970 /**************************
1971 * CALCULATE INTERACTIONS *
1972 **************************/
1974 if (gmx_mm_any_lt(rsq13,rcutoff2))
1977 r13 = _mm_mul_ps(rsq13,rinv13);
1979 /* EWALD ELECTROSTATICS */
1981 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1982 ewrt = _mm_mul_ps(r13,ewtabscale);
1983 ewitab = _mm_cvttps_epi32(ewrt);
1984 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1985 ewitab = _mm_slli_epi32(ewitab,2);
1986 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1987 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1988 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1989 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1990 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1991 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1992 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1993 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1994 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1996 d = _mm_sub_ps(r13,rswitch);
1997 d = _mm_max_ps(d,_mm_setzero_ps());
1998 d2 = _mm_mul_ps(d,d);
1999 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2001 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2003 /* Evaluate switch function */
2004 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2005 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2006 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2010 fscal = _mm_and_ps(fscal,cutoff_mask);
2012 /* Calculate temporary vectorial force */
2013 tx = _mm_mul_ps(fscal,dx13);
2014 ty = _mm_mul_ps(fscal,dy13);
2015 tz = _mm_mul_ps(fscal,dz13);
2017 /* Update vectorial force */
2018 fix1 = _mm_add_ps(fix1,tx);
2019 fiy1 = _mm_add_ps(fiy1,ty);
2020 fiz1 = _mm_add_ps(fiz1,tz);
2022 fjx3 = _mm_add_ps(fjx3,tx);
2023 fjy3 = _mm_add_ps(fjy3,ty);
2024 fjz3 = _mm_add_ps(fjz3,tz);
2028 /**************************
2029 * CALCULATE INTERACTIONS *
2030 **************************/
2032 if (gmx_mm_any_lt(rsq21,rcutoff2))
2035 r21 = _mm_mul_ps(rsq21,rinv21);
2037 /* EWALD ELECTROSTATICS */
2039 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2040 ewrt = _mm_mul_ps(r21,ewtabscale);
2041 ewitab = _mm_cvttps_epi32(ewrt);
2042 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2043 ewitab = _mm_slli_epi32(ewitab,2);
2044 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2045 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2046 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2047 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2048 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2049 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2050 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2051 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2052 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2054 d = _mm_sub_ps(r21,rswitch);
2055 d = _mm_max_ps(d,_mm_setzero_ps());
2056 d2 = _mm_mul_ps(d,d);
2057 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2059 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2061 /* Evaluate switch function */
2062 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2063 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2064 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2068 fscal = _mm_and_ps(fscal,cutoff_mask);
2070 /* Calculate temporary vectorial force */
2071 tx = _mm_mul_ps(fscal,dx21);
2072 ty = _mm_mul_ps(fscal,dy21);
2073 tz = _mm_mul_ps(fscal,dz21);
2075 /* Update vectorial force */
2076 fix2 = _mm_add_ps(fix2,tx);
2077 fiy2 = _mm_add_ps(fiy2,ty);
2078 fiz2 = _mm_add_ps(fiz2,tz);
2080 fjx1 = _mm_add_ps(fjx1,tx);
2081 fjy1 = _mm_add_ps(fjy1,ty);
2082 fjz1 = _mm_add_ps(fjz1,tz);
2086 /**************************
2087 * CALCULATE INTERACTIONS *
2088 **************************/
2090 if (gmx_mm_any_lt(rsq22,rcutoff2))
2093 r22 = _mm_mul_ps(rsq22,rinv22);
2095 /* EWALD ELECTROSTATICS */
2097 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2098 ewrt = _mm_mul_ps(r22,ewtabscale);
2099 ewitab = _mm_cvttps_epi32(ewrt);
2100 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2101 ewitab = _mm_slli_epi32(ewitab,2);
2102 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2103 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2104 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2105 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2106 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2107 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2108 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2109 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2110 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2112 d = _mm_sub_ps(r22,rswitch);
2113 d = _mm_max_ps(d,_mm_setzero_ps());
2114 d2 = _mm_mul_ps(d,d);
2115 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2117 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2119 /* Evaluate switch function */
2120 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2121 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2122 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2126 fscal = _mm_and_ps(fscal,cutoff_mask);
2128 /* Calculate temporary vectorial force */
2129 tx = _mm_mul_ps(fscal,dx22);
2130 ty = _mm_mul_ps(fscal,dy22);
2131 tz = _mm_mul_ps(fscal,dz22);
2133 /* Update vectorial force */
2134 fix2 = _mm_add_ps(fix2,tx);
2135 fiy2 = _mm_add_ps(fiy2,ty);
2136 fiz2 = _mm_add_ps(fiz2,tz);
2138 fjx2 = _mm_add_ps(fjx2,tx);
2139 fjy2 = _mm_add_ps(fjy2,ty);
2140 fjz2 = _mm_add_ps(fjz2,tz);
2144 /**************************
2145 * CALCULATE INTERACTIONS *
2146 **************************/
2148 if (gmx_mm_any_lt(rsq23,rcutoff2))
2151 r23 = _mm_mul_ps(rsq23,rinv23);
2153 /* EWALD ELECTROSTATICS */
2155 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2156 ewrt = _mm_mul_ps(r23,ewtabscale);
2157 ewitab = _mm_cvttps_epi32(ewrt);
2158 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2159 ewitab = _mm_slli_epi32(ewitab,2);
2160 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2161 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2162 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2163 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2164 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2165 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2166 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2167 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2168 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2170 d = _mm_sub_ps(r23,rswitch);
2171 d = _mm_max_ps(d,_mm_setzero_ps());
2172 d2 = _mm_mul_ps(d,d);
2173 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2175 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2177 /* Evaluate switch function */
2178 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2179 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2180 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2184 fscal = _mm_and_ps(fscal,cutoff_mask);
2186 /* Calculate temporary vectorial force */
2187 tx = _mm_mul_ps(fscal,dx23);
2188 ty = _mm_mul_ps(fscal,dy23);
2189 tz = _mm_mul_ps(fscal,dz23);
2191 /* Update vectorial force */
2192 fix2 = _mm_add_ps(fix2,tx);
2193 fiy2 = _mm_add_ps(fiy2,ty);
2194 fiz2 = _mm_add_ps(fiz2,tz);
2196 fjx3 = _mm_add_ps(fjx3,tx);
2197 fjy3 = _mm_add_ps(fjy3,ty);
2198 fjz3 = _mm_add_ps(fjz3,tz);
2202 /**************************
2203 * CALCULATE INTERACTIONS *
2204 **************************/
2206 if (gmx_mm_any_lt(rsq31,rcutoff2))
2209 r31 = _mm_mul_ps(rsq31,rinv31);
2211 /* EWALD ELECTROSTATICS */
2213 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2214 ewrt = _mm_mul_ps(r31,ewtabscale);
2215 ewitab = _mm_cvttps_epi32(ewrt);
2216 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2217 ewitab = _mm_slli_epi32(ewitab,2);
2218 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2219 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2220 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2221 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2222 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2223 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2224 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2225 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2226 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2228 d = _mm_sub_ps(r31,rswitch);
2229 d = _mm_max_ps(d,_mm_setzero_ps());
2230 d2 = _mm_mul_ps(d,d);
2231 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2233 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2235 /* Evaluate switch function */
2236 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2237 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2238 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2242 fscal = _mm_and_ps(fscal,cutoff_mask);
2244 /* Calculate temporary vectorial force */
2245 tx = _mm_mul_ps(fscal,dx31);
2246 ty = _mm_mul_ps(fscal,dy31);
2247 tz = _mm_mul_ps(fscal,dz31);
2249 /* Update vectorial force */
2250 fix3 = _mm_add_ps(fix3,tx);
2251 fiy3 = _mm_add_ps(fiy3,ty);
2252 fiz3 = _mm_add_ps(fiz3,tz);
2254 fjx1 = _mm_add_ps(fjx1,tx);
2255 fjy1 = _mm_add_ps(fjy1,ty);
2256 fjz1 = _mm_add_ps(fjz1,tz);
2260 /**************************
2261 * CALCULATE INTERACTIONS *
2262 **************************/
2264 if (gmx_mm_any_lt(rsq32,rcutoff2))
2267 r32 = _mm_mul_ps(rsq32,rinv32);
2269 /* EWALD ELECTROSTATICS */
2271 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2272 ewrt = _mm_mul_ps(r32,ewtabscale);
2273 ewitab = _mm_cvttps_epi32(ewrt);
2274 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2275 ewitab = _mm_slli_epi32(ewitab,2);
2276 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2277 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2278 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2279 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2280 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2281 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2282 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2283 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2284 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2286 d = _mm_sub_ps(r32,rswitch);
2287 d = _mm_max_ps(d,_mm_setzero_ps());
2288 d2 = _mm_mul_ps(d,d);
2289 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2291 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2293 /* Evaluate switch function */
2294 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2295 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2296 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2300 fscal = _mm_and_ps(fscal,cutoff_mask);
2302 /* Calculate temporary vectorial force */
2303 tx = _mm_mul_ps(fscal,dx32);
2304 ty = _mm_mul_ps(fscal,dy32);
2305 tz = _mm_mul_ps(fscal,dz32);
2307 /* Update vectorial force */
2308 fix3 = _mm_add_ps(fix3,tx);
2309 fiy3 = _mm_add_ps(fiy3,ty);
2310 fiz3 = _mm_add_ps(fiz3,tz);
2312 fjx2 = _mm_add_ps(fjx2,tx);
2313 fjy2 = _mm_add_ps(fjy2,ty);
2314 fjz2 = _mm_add_ps(fjz2,tz);
2318 /**************************
2319 * CALCULATE INTERACTIONS *
2320 **************************/
2322 if (gmx_mm_any_lt(rsq33,rcutoff2))
2325 r33 = _mm_mul_ps(rsq33,rinv33);
2327 /* EWALD ELECTROSTATICS */
2329 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2330 ewrt = _mm_mul_ps(r33,ewtabscale);
2331 ewitab = _mm_cvttps_epi32(ewrt);
2332 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2333 ewitab = _mm_slli_epi32(ewitab,2);
2334 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2335 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2336 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2337 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2338 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2339 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2340 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2341 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2342 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2344 d = _mm_sub_ps(r33,rswitch);
2345 d = _mm_max_ps(d,_mm_setzero_ps());
2346 d2 = _mm_mul_ps(d,d);
2347 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2349 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2351 /* Evaluate switch function */
2352 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2353 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2354 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2358 fscal = _mm_and_ps(fscal,cutoff_mask);
2360 /* Calculate temporary vectorial force */
2361 tx = _mm_mul_ps(fscal,dx33);
2362 ty = _mm_mul_ps(fscal,dy33);
2363 tz = _mm_mul_ps(fscal,dz33);
2365 /* Update vectorial force */
2366 fix3 = _mm_add_ps(fix3,tx);
2367 fiy3 = _mm_add_ps(fiy3,ty);
2368 fiz3 = _mm_add_ps(fiz3,tz);
2370 fjx3 = _mm_add_ps(fjx3,tx);
2371 fjy3 = _mm_add_ps(fjy3,ty);
2372 fjz3 = _mm_add_ps(fjz3,tz);
2376 fjptrA = f+j_coord_offsetA;
2377 fjptrB = f+j_coord_offsetB;
2378 fjptrC = f+j_coord_offsetC;
2379 fjptrD = f+j_coord_offsetD;
2381 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2382 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2384 /* Inner loop uses 558 flops */
2387 if(jidx<j_index_end)
2390 /* Get j neighbor index, and coordinate index */
2391 jnrlistA = jjnr[jidx];
2392 jnrlistB = jjnr[jidx+1];
2393 jnrlistC = jjnr[jidx+2];
2394 jnrlistD = jjnr[jidx+3];
2395 /* Sign of each element will be negative for non-real atoms.
2396 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2397 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2399 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2400 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2401 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2402 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2403 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2404 j_coord_offsetA = DIM*jnrA;
2405 j_coord_offsetB = DIM*jnrB;
2406 j_coord_offsetC = DIM*jnrC;
2407 j_coord_offsetD = DIM*jnrD;
2409 /* load j atom coordinates */
2410 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2411 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2412 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2414 /* Calculate displacement vector */
2415 dx11 = _mm_sub_ps(ix1,jx1);
2416 dy11 = _mm_sub_ps(iy1,jy1);
2417 dz11 = _mm_sub_ps(iz1,jz1);
2418 dx12 = _mm_sub_ps(ix1,jx2);
2419 dy12 = _mm_sub_ps(iy1,jy2);
2420 dz12 = _mm_sub_ps(iz1,jz2);
2421 dx13 = _mm_sub_ps(ix1,jx3);
2422 dy13 = _mm_sub_ps(iy1,jy3);
2423 dz13 = _mm_sub_ps(iz1,jz3);
2424 dx21 = _mm_sub_ps(ix2,jx1);
2425 dy21 = _mm_sub_ps(iy2,jy1);
2426 dz21 = _mm_sub_ps(iz2,jz1);
2427 dx22 = _mm_sub_ps(ix2,jx2);
2428 dy22 = _mm_sub_ps(iy2,jy2);
2429 dz22 = _mm_sub_ps(iz2,jz2);
2430 dx23 = _mm_sub_ps(ix2,jx3);
2431 dy23 = _mm_sub_ps(iy2,jy3);
2432 dz23 = _mm_sub_ps(iz2,jz3);
2433 dx31 = _mm_sub_ps(ix3,jx1);
2434 dy31 = _mm_sub_ps(iy3,jy1);
2435 dz31 = _mm_sub_ps(iz3,jz1);
2436 dx32 = _mm_sub_ps(ix3,jx2);
2437 dy32 = _mm_sub_ps(iy3,jy2);
2438 dz32 = _mm_sub_ps(iz3,jz2);
2439 dx33 = _mm_sub_ps(ix3,jx3);
2440 dy33 = _mm_sub_ps(iy3,jy3);
2441 dz33 = _mm_sub_ps(iz3,jz3);
2443 /* Calculate squared distance and things based on it */
2444 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2445 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2446 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2447 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2448 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2449 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2450 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2451 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2452 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2454 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2455 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2456 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2457 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2458 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2459 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2460 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2461 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2462 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2464 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2465 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2466 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2467 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2468 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2469 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2470 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2471 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2472 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2474 fjx1 = _mm_setzero_ps();
2475 fjy1 = _mm_setzero_ps();
2476 fjz1 = _mm_setzero_ps();
2477 fjx2 = _mm_setzero_ps();
2478 fjy2 = _mm_setzero_ps();
2479 fjz2 = _mm_setzero_ps();
2480 fjx3 = _mm_setzero_ps();
2481 fjy3 = _mm_setzero_ps();
2482 fjz3 = _mm_setzero_ps();
2484 /**************************
2485 * CALCULATE INTERACTIONS *
2486 **************************/
2488 if (gmx_mm_any_lt(rsq11,rcutoff2))
2491 r11 = _mm_mul_ps(rsq11,rinv11);
2492 r11 = _mm_andnot_ps(dummy_mask,r11);
2494 /* EWALD ELECTROSTATICS */
2496 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2497 ewrt = _mm_mul_ps(r11,ewtabscale);
2498 ewitab = _mm_cvttps_epi32(ewrt);
2499 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2500 ewitab = _mm_slli_epi32(ewitab,2);
2501 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2502 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2503 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2504 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2505 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2506 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2507 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2508 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2509 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2511 d = _mm_sub_ps(r11,rswitch);
2512 d = _mm_max_ps(d,_mm_setzero_ps());
2513 d2 = _mm_mul_ps(d,d);
2514 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2516 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2518 /* Evaluate switch function */
2519 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2520 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2521 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2525 fscal = _mm_and_ps(fscal,cutoff_mask);
2527 fscal = _mm_andnot_ps(dummy_mask,fscal);
2529 /* Calculate temporary vectorial force */
2530 tx = _mm_mul_ps(fscal,dx11);
2531 ty = _mm_mul_ps(fscal,dy11);
2532 tz = _mm_mul_ps(fscal,dz11);
2534 /* Update vectorial force */
2535 fix1 = _mm_add_ps(fix1,tx);
2536 fiy1 = _mm_add_ps(fiy1,ty);
2537 fiz1 = _mm_add_ps(fiz1,tz);
2539 fjx1 = _mm_add_ps(fjx1,tx);
2540 fjy1 = _mm_add_ps(fjy1,ty);
2541 fjz1 = _mm_add_ps(fjz1,tz);
2545 /**************************
2546 * CALCULATE INTERACTIONS *
2547 **************************/
2549 if (gmx_mm_any_lt(rsq12,rcutoff2))
2552 r12 = _mm_mul_ps(rsq12,rinv12);
2553 r12 = _mm_andnot_ps(dummy_mask,r12);
2555 /* EWALD ELECTROSTATICS */
2557 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2558 ewrt = _mm_mul_ps(r12,ewtabscale);
2559 ewitab = _mm_cvttps_epi32(ewrt);
2560 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2561 ewitab = _mm_slli_epi32(ewitab,2);
2562 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2563 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2564 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2565 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2566 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2567 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2568 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2569 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2570 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2572 d = _mm_sub_ps(r12,rswitch);
2573 d = _mm_max_ps(d,_mm_setzero_ps());
2574 d2 = _mm_mul_ps(d,d);
2575 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2577 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2579 /* Evaluate switch function */
2580 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2581 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2582 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2586 fscal = _mm_and_ps(fscal,cutoff_mask);
2588 fscal = _mm_andnot_ps(dummy_mask,fscal);
2590 /* Calculate temporary vectorial force */
2591 tx = _mm_mul_ps(fscal,dx12);
2592 ty = _mm_mul_ps(fscal,dy12);
2593 tz = _mm_mul_ps(fscal,dz12);
2595 /* Update vectorial force */
2596 fix1 = _mm_add_ps(fix1,tx);
2597 fiy1 = _mm_add_ps(fiy1,ty);
2598 fiz1 = _mm_add_ps(fiz1,tz);
2600 fjx2 = _mm_add_ps(fjx2,tx);
2601 fjy2 = _mm_add_ps(fjy2,ty);
2602 fjz2 = _mm_add_ps(fjz2,tz);
2606 /**************************
2607 * CALCULATE INTERACTIONS *
2608 **************************/
2610 if (gmx_mm_any_lt(rsq13,rcutoff2))
2613 r13 = _mm_mul_ps(rsq13,rinv13);
2614 r13 = _mm_andnot_ps(dummy_mask,r13);
2616 /* EWALD ELECTROSTATICS */
2618 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2619 ewrt = _mm_mul_ps(r13,ewtabscale);
2620 ewitab = _mm_cvttps_epi32(ewrt);
2621 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2622 ewitab = _mm_slli_epi32(ewitab,2);
2623 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2624 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2625 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2626 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2627 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2628 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2629 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2630 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2631 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2633 d = _mm_sub_ps(r13,rswitch);
2634 d = _mm_max_ps(d,_mm_setzero_ps());
2635 d2 = _mm_mul_ps(d,d);
2636 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2638 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2640 /* Evaluate switch function */
2641 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2642 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2643 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2647 fscal = _mm_and_ps(fscal,cutoff_mask);
2649 fscal = _mm_andnot_ps(dummy_mask,fscal);
2651 /* Calculate temporary vectorial force */
2652 tx = _mm_mul_ps(fscal,dx13);
2653 ty = _mm_mul_ps(fscal,dy13);
2654 tz = _mm_mul_ps(fscal,dz13);
2656 /* Update vectorial force */
2657 fix1 = _mm_add_ps(fix1,tx);
2658 fiy1 = _mm_add_ps(fiy1,ty);
2659 fiz1 = _mm_add_ps(fiz1,tz);
2661 fjx3 = _mm_add_ps(fjx3,tx);
2662 fjy3 = _mm_add_ps(fjy3,ty);
2663 fjz3 = _mm_add_ps(fjz3,tz);
2667 /**************************
2668 * CALCULATE INTERACTIONS *
2669 **************************/
2671 if (gmx_mm_any_lt(rsq21,rcutoff2))
2674 r21 = _mm_mul_ps(rsq21,rinv21);
2675 r21 = _mm_andnot_ps(dummy_mask,r21);
2677 /* EWALD ELECTROSTATICS */
2679 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2680 ewrt = _mm_mul_ps(r21,ewtabscale);
2681 ewitab = _mm_cvttps_epi32(ewrt);
2682 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2683 ewitab = _mm_slli_epi32(ewitab,2);
2684 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2685 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2686 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2687 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2688 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2689 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2690 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2691 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2692 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2694 d = _mm_sub_ps(r21,rswitch);
2695 d = _mm_max_ps(d,_mm_setzero_ps());
2696 d2 = _mm_mul_ps(d,d);
2697 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2699 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2701 /* Evaluate switch function */
2702 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2703 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2704 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2708 fscal = _mm_and_ps(fscal,cutoff_mask);
2710 fscal = _mm_andnot_ps(dummy_mask,fscal);
2712 /* Calculate temporary vectorial force */
2713 tx = _mm_mul_ps(fscal,dx21);
2714 ty = _mm_mul_ps(fscal,dy21);
2715 tz = _mm_mul_ps(fscal,dz21);
2717 /* Update vectorial force */
2718 fix2 = _mm_add_ps(fix2,tx);
2719 fiy2 = _mm_add_ps(fiy2,ty);
2720 fiz2 = _mm_add_ps(fiz2,tz);
2722 fjx1 = _mm_add_ps(fjx1,tx);
2723 fjy1 = _mm_add_ps(fjy1,ty);
2724 fjz1 = _mm_add_ps(fjz1,tz);
2728 /**************************
2729 * CALCULATE INTERACTIONS *
2730 **************************/
2732 if (gmx_mm_any_lt(rsq22,rcutoff2))
2735 r22 = _mm_mul_ps(rsq22,rinv22);
2736 r22 = _mm_andnot_ps(dummy_mask,r22);
2738 /* EWALD ELECTROSTATICS */
2740 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2741 ewrt = _mm_mul_ps(r22,ewtabscale);
2742 ewitab = _mm_cvttps_epi32(ewrt);
2743 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2744 ewitab = _mm_slli_epi32(ewitab,2);
2745 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2746 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2747 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2748 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2749 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2750 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2751 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2752 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2753 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2755 d = _mm_sub_ps(r22,rswitch);
2756 d = _mm_max_ps(d,_mm_setzero_ps());
2757 d2 = _mm_mul_ps(d,d);
2758 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2760 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2762 /* Evaluate switch function */
2763 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2764 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2765 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2769 fscal = _mm_and_ps(fscal,cutoff_mask);
2771 fscal = _mm_andnot_ps(dummy_mask,fscal);
2773 /* Calculate temporary vectorial force */
2774 tx = _mm_mul_ps(fscal,dx22);
2775 ty = _mm_mul_ps(fscal,dy22);
2776 tz = _mm_mul_ps(fscal,dz22);
2778 /* Update vectorial force */
2779 fix2 = _mm_add_ps(fix2,tx);
2780 fiy2 = _mm_add_ps(fiy2,ty);
2781 fiz2 = _mm_add_ps(fiz2,tz);
2783 fjx2 = _mm_add_ps(fjx2,tx);
2784 fjy2 = _mm_add_ps(fjy2,ty);
2785 fjz2 = _mm_add_ps(fjz2,tz);
2789 /**************************
2790 * CALCULATE INTERACTIONS *
2791 **************************/
2793 if (gmx_mm_any_lt(rsq23,rcutoff2))
2796 r23 = _mm_mul_ps(rsq23,rinv23);
2797 r23 = _mm_andnot_ps(dummy_mask,r23);
2799 /* EWALD ELECTROSTATICS */
2801 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2802 ewrt = _mm_mul_ps(r23,ewtabscale);
2803 ewitab = _mm_cvttps_epi32(ewrt);
2804 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2805 ewitab = _mm_slli_epi32(ewitab,2);
2806 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2807 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2808 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2809 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2810 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2811 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2812 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2813 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2814 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2816 d = _mm_sub_ps(r23,rswitch);
2817 d = _mm_max_ps(d,_mm_setzero_ps());
2818 d2 = _mm_mul_ps(d,d);
2819 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2821 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2823 /* Evaluate switch function */
2824 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2825 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2826 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2830 fscal = _mm_and_ps(fscal,cutoff_mask);
2832 fscal = _mm_andnot_ps(dummy_mask,fscal);
2834 /* Calculate temporary vectorial force */
2835 tx = _mm_mul_ps(fscal,dx23);
2836 ty = _mm_mul_ps(fscal,dy23);
2837 tz = _mm_mul_ps(fscal,dz23);
2839 /* Update vectorial force */
2840 fix2 = _mm_add_ps(fix2,tx);
2841 fiy2 = _mm_add_ps(fiy2,ty);
2842 fiz2 = _mm_add_ps(fiz2,tz);
2844 fjx3 = _mm_add_ps(fjx3,tx);
2845 fjy3 = _mm_add_ps(fjy3,ty);
2846 fjz3 = _mm_add_ps(fjz3,tz);
2850 /**************************
2851 * CALCULATE INTERACTIONS *
2852 **************************/
2854 if (gmx_mm_any_lt(rsq31,rcutoff2))
2857 r31 = _mm_mul_ps(rsq31,rinv31);
2858 r31 = _mm_andnot_ps(dummy_mask,r31);
2860 /* EWALD ELECTROSTATICS */
2862 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2863 ewrt = _mm_mul_ps(r31,ewtabscale);
2864 ewitab = _mm_cvttps_epi32(ewrt);
2865 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2866 ewitab = _mm_slli_epi32(ewitab,2);
2867 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2868 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2869 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2870 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2871 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2872 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2873 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2874 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2875 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2877 d = _mm_sub_ps(r31,rswitch);
2878 d = _mm_max_ps(d,_mm_setzero_ps());
2879 d2 = _mm_mul_ps(d,d);
2880 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2882 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2884 /* Evaluate switch function */
2885 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2886 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2887 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2891 fscal = _mm_and_ps(fscal,cutoff_mask);
2893 fscal = _mm_andnot_ps(dummy_mask,fscal);
2895 /* Calculate temporary vectorial force */
2896 tx = _mm_mul_ps(fscal,dx31);
2897 ty = _mm_mul_ps(fscal,dy31);
2898 tz = _mm_mul_ps(fscal,dz31);
2900 /* Update vectorial force */
2901 fix3 = _mm_add_ps(fix3,tx);
2902 fiy3 = _mm_add_ps(fiy3,ty);
2903 fiz3 = _mm_add_ps(fiz3,tz);
2905 fjx1 = _mm_add_ps(fjx1,tx);
2906 fjy1 = _mm_add_ps(fjy1,ty);
2907 fjz1 = _mm_add_ps(fjz1,tz);
2911 /**************************
2912 * CALCULATE INTERACTIONS *
2913 **************************/
2915 if (gmx_mm_any_lt(rsq32,rcutoff2))
2918 r32 = _mm_mul_ps(rsq32,rinv32);
2919 r32 = _mm_andnot_ps(dummy_mask,r32);
2921 /* EWALD ELECTROSTATICS */
2923 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2924 ewrt = _mm_mul_ps(r32,ewtabscale);
2925 ewitab = _mm_cvttps_epi32(ewrt);
2926 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2927 ewitab = _mm_slli_epi32(ewitab,2);
2928 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2929 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2930 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2931 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2932 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2933 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2934 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2935 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2936 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2938 d = _mm_sub_ps(r32,rswitch);
2939 d = _mm_max_ps(d,_mm_setzero_ps());
2940 d2 = _mm_mul_ps(d,d);
2941 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2943 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2945 /* Evaluate switch function */
2946 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2947 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2948 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2952 fscal = _mm_and_ps(fscal,cutoff_mask);
2954 fscal = _mm_andnot_ps(dummy_mask,fscal);
2956 /* Calculate temporary vectorial force */
2957 tx = _mm_mul_ps(fscal,dx32);
2958 ty = _mm_mul_ps(fscal,dy32);
2959 tz = _mm_mul_ps(fscal,dz32);
2961 /* Update vectorial force */
2962 fix3 = _mm_add_ps(fix3,tx);
2963 fiy3 = _mm_add_ps(fiy3,ty);
2964 fiz3 = _mm_add_ps(fiz3,tz);
2966 fjx2 = _mm_add_ps(fjx2,tx);
2967 fjy2 = _mm_add_ps(fjy2,ty);
2968 fjz2 = _mm_add_ps(fjz2,tz);
2972 /**************************
2973 * CALCULATE INTERACTIONS *
2974 **************************/
2976 if (gmx_mm_any_lt(rsq33,rcutoff2))
2979 r33 = _mm_mul_ps(rsq33,rinv33);
2980 r33 = _mm_andnot_ps(dummy_mask,r33);
2982 /* EWALD ELECTROSTATICS */
2984 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2985 ewrt = _mm_mul_ps(r33,ewtabscale);
2986 ewitab = _mm_cvttps_epi32(ewrt);
2987 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2988 ewitab = _mm_slli_epi32(ewitab,2);
2989 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2990 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2991 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2992 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2993 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2994 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2995 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2996 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2997 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2999 d = _mm_sub_ps(r33,rswitch);
3000 d = _mm_max_ps(d,_mm_setzero_ps());
3001 d2 = _mm_mul_ps(d,d);
3002 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3004 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3006 /* Evaluate switch function */
3007 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3008 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3009 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
3013 fscal = _mm_and_ps(fscal,cutoff_mask);
3015 fscal = _mm_andnot_ps(dummy_mask,fscal);
3017 /* Calculate temporary vectorial force */
3018 tx = _mm_mul_ps(fscal,dx33);
3019 ty = _mm_mul_ps(fscal,dy33);
3020 tz = _mm_mul_ps(fscal,dz33);
3022 /* Update vectorial force */
3023 fix3 = _mm_add_ps(fix3,tx);
3024 fiy3 = _mm_add_ps(fiy3,ty);
3025 fiz3 = _mm_add_ps(fiz3,tz);
3027 fjx3 = _mm_add_ps(fjx3,tx);
3028 fjy3 = _mm_add_ps(fjy3,ty);
3029 fjz3 = _mm_add_ps(fjz3,tz);
3033 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3034 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3035 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3036 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3038 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
3039 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3041 /* Inner loop uses 567 flops */
3044 /* End of innermost loop */
3046 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3047 f+i_coord_offset+DIM,fshift+i_shift_offset);
3049 /* Increment number of inner iterations */
3050 inneriter += j_index_end - j_index_start;
3052 /* Outer loop uses 18 flops */
3055 /* Increment number of outer iterations */
3058 /* Update outer/inner flops */
3060 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*567);