2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
95 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
96 real rswitch_scalar,d_scalar;
97 __m128 dummy_mask,cutoff_mask;
98 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
99 __m128 one = _mm_set1_ps(1.0);
100 __m128 two = _mm_set1_ps(2.0);
106 jindex = nlist->jindex;
108 shiftidx = nlist->shift;
110 shiftvec = fr->shift_vec[0];
111 fshift = fr->fshift[0];
112 facel = _mm_set1_ps(fr->epsfac);
113 charge = mdatoms->chargeA;
115 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
116 ewtab = fr->ic->tabq_coul_FDV0;
117 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
118 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
120 /* Setup water-specific parameters */
121 inr = nlist->iinr[0];
122 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
123 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
124 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
126 jq0 = _mm_set1_ps(charge[inr+0]);
127 jq1 = _mm_set1_ps(charge[inr+1]);
128 jq2 = _mm_set1_ps(charge[inr+2]);
129 qq00 = _mm_mul_ps(iq0,jq0);
130 qq01 = _mm_mul_ps(iq0,jq1);
131 qq02 = _mm_mul_ps(iq0,jq2);
132 qq10 = _mm_mul_ps(iq1,jq0);
133 qq11 = _mm_mul_ps(iq1,jq1);
134 qq12 = _mm_mul_ps(iq1,jq2);
135 qq20 = _mm_mul_ps(iq2,jq0);
136 qq21 = _mm_mul_ps(iq2,jq1);
137 qq22 = _mm_mul_ps(iq2,jq2);
139 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
140 rcutoff_scalar = fr->rcoulomb;
141 rcutoff = _mm_set1_ps(rcutoff_scalar);
142 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
144 rswitch_scalar = fr->rcoulomb_switch;
145 rswitch = _mm_set1_ps(rswitch_scalar);
146 /* Setup switch parameters */
147 d_scalar = rcutoff_scalar-rswitch_scalar;
148 d = _mm_set1_ps(d_scalar);
149 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
150 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
151 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
152 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
153 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
154 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
156 /* Avoid stupid compiler warnings */
157 jnrA = jnrB = jnrC = jnrD = 0;
166 for(iidx=0;iidx<4*DIM;iidx++)
171 /* Start outer loop over neighborlists */
172 for(iidx=0; iidx<nri; iidx++)
174 /* Load shift vector for this list */
175 i_shift_offset = DIM*shiftidx[iidx];
177 /* Load limits for loop over neighbors */
178 j_index_start = jindex[iidx];
179 j_index_end = jindex[iidx+1];
181 /* Get outer coordinate index */
183 i_coord_offset = DIM*inr;
185 /* Load i particle coords and add shift vector */
186 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
187 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
189 fix0 = _mm_setzero_ps();
190 fiy0 = _mm_setzero_ps();
191 fiz0 = _mm_setzero_ps();
192 fix1 = _mm_setzero_ps();
193 fiy1 = _mm_setzero_ps();
194 fiz1 = _mm_setzero_ps();
195 fix2 = _mm_setzero_ps();
196 fiy2 = _mm_setzero_ps();
197 fiz2 = _mm_setzero_ps();
199 /* Reset potential sums */
200 velecsum = _mm_setzero_ps();
202 /* Start inner kernel loop */
203 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
206 /* Get j neighbor index, and coordinate index */
211 j_coord_offsetA = DIM*jnrA;
212 j_coord_offsetB = DIM*jnrB;
213 j_coord_offsetC = DIM*jnrC;
214 j_coord_offsetD = DIM*jnrD;
216 /* load j atom coordinates */
217 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
218 x+j_coord_offsetC,x+j_coord_offsetD,
219 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
221 /* Calculate displacement vector */
222 dx00 = _mm_sub_ps(ix0,jx0);
223 dy00 = _mm_sub_ps(iy0,jy0);
224 dz00 = _mm_sub_ps(iz0,jz0);
225 dx01 = _mm_sub_ps(ix0,jx1);
226 dy01 = _mm_sub_ps(iy0,jy1);
227 dz01 = _mm_sub_ps(iz0,jz1);
228 dx02 = _mm_sub_ps(ix0,jx2);
229 dy02 = _mm_sub_ps(iy0,jy2);
230 dz02 = _mm_sub_ps(iz0,jz2);
231 dx10 = _mm_sub_ps(ix1,jx0);
232 dy10 = _mm_sub_ps(iy1,jy0);
233 dz10 = _mm_sub_ps(iz1,jz0);
234 dx11 = _mm_sub_ps(ix1,jx1);
235 dy11 = _mm_sub_ps(iy1,jy1);
236 dz11 = _mm_sub_ps(iz1,jz1);
237 dx12 = _mm_sub_ps(ix1,jx2);
238 dy12 = _mm_sub_ps(iy1,jy2);
239 dz12 = _mm_sub_ps(iz1,jz2);
240 dx20 = _mm_sub_ps(ix2,jx0);
241 dy20 = _mm_sub_ps(iy2,jy0);
242 dz20 = _mm_sub_ps(iz2,jz0);
243 dx21 = _mm_sub_ps(ix2,jx1);
244 dy21 = _mm_sub_ps(iy2,jy1);
245 dz21 = _mm_sub_ps(iz2,jz1);
246 dx22 = _mm_sub_ps(ix2,jx2);
247 dy22 = _mm_sub_ps(iy2,jy2);
248 dz22 = _mm_sub_ps(iz2,jz2);
250 /* Calculate squared distance and things based on it */
251 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
252 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
253 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
254 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
255 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
256 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
257 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
258 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
259 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
261 rinv00 = gmx_mm_invsqrt_ps(rsq00);
262 rinv01 = gmx_mm_invsqrt_ps(rsq01);
263 rinv02 = gmx_mm_invsqrt_ps(rsq02);
264 rinv10 = gmx_mm_invsqrt_ps(rsq10);
265 rinv11 = gmx_mm_invsqrt_ps(rsq11);
266 rinv12 = gmx_mm_invsqrt_ps(rsq12);
267 rinv20 = gmx_mm_invsqrt_ps(rsq20);
268 rinv21 = gmx_mm_invsqrt_ps(rsq21);
269 rinv22 = gmx_mm_invsqrt_ps(rsq22);
271 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
272 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
273 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
274 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
275 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
276 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
277 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
278 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
279 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
281 fjx0 = _mm_setzero_ps();
282 fjy0 = _mm_setzero_ps();
283 fjz0 = _mm_setzero_ps();
284 fjx1 = _mm_setzero_ps();
285 fjy1 = _mm_setzero_ps();
286 fjz1 = _mm_setzero_ps();
287 fjx2 = _mm_setzero_ps();
288 fjy2 = _mm_setzero_ps();
289 fjz2 = _mm_setzero_ps();
291 /**************************
292 * CALCULATE INTERACTIONS *
293 **************************/
295 if (gmx_mm_any_lt(rsq00,rcutoff2))
298 r00 = _mm_mul_ps(rsq00,rinv00);
300 /* EWALD ELECTROSTATICS */
302 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
303 ewrt = _mm_mul_ps(r00,ewtabscale);
304 ewitab = _mm_cvttps_epi32(ewrt);
305 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
306 ewitab = _mm_slli_epi32(ewitab,2);
307 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
308 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
309 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
310 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
311 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
312 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
313 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
314 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
315 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
317 d = _mm_sub_ps(r00,rswitch);
318 d = _mm_max_ps(d,_mm_setzero_ps());
319 d2 = _mm_mul_ps(d,d);
320 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
322 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
324 /* Evaluate switch function */
325 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
326 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
327 velec = _mm_mul_ps(velec,sw);
328 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
330 /* Update potential sum for this i atom from the interaction with this j atom. */
331 velec = _mm_and_ps(velec,cutoff_mask);
332 velecsum = _mm_add_ps(velecsum,velec);
336 fscal = _mm_and_ps(fscal,cutoff_mask);
338 /* Calculate temporary vectorial force */
339 tx = _mm_mul_ps(fscal,dx00);
340 ty = _mm_mul_ps(fscal,dy00);
341 tz = _mm_mul_ps(fscal,dz00);
343 /* Update vectorial force */
344 fix0 = _mm_add_ps(fix0,tx);
345 fiy0 = _mm_add_ps(fiy0,ty);
346 fiz0 = _mm_add_ps(fiz0,tz);
348 fjx0 = _mm_add_ps(fjx0,tx);
349 fjy0 = _mm_add_ps(fjy0,ty);
350 fjz0 = _mm_add_ps(fjz0,tz);
354 /**************************
355 * CALCULATE INTERACTIONS *
356 **************************/
358 if (gmx_mm_any_lt(rsq01,rcutoff2))
361 r01 = _mm_mul_ps(rsq01,rinv01);
363 /* EWALD ELECTROSTATICS */
365 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
366 ewrt = _mm_mul_ps(r01,ewtabscale);
367 ewitab = _mm_cvttps_epi32(ewrt);
368 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
369 ewitab = _mm_slli_epi32(ewitab,2);
370 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
371 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
372 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
373 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
374 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
375 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
376 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
377 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
378 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
380 d = _mm_sub_ps(r01,rswitch);
381 d = _mm_max_ps(d,_mm_setzero_ps());
382 d2 = _mm_mul_ps(d,d);
383 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
385 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
387 /* Evaluate switch function */
388 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
389 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
390 velec = _mm_mul_ps(velec,sw);
391 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velec = _mm_and_ps(velec,cutoff_mask);
395 velecsum = _mm_add_ps(velecsum,velec);
399 fscal = _mm_and_ps(fscal,cutoff_mask);
401 /* Calculate temporary vectorial force */
402 tx = _mm_mul_ps(fscal,dx01);
403 ty = _mm_mul_ps(fscal,dy01);
404 tz = _mm_mul_ps(fscal,dz01);
406 /* Update vectorial force */
407 fix0 = _mm_add_ps(fix0,tx);
408 fiy0 = _mm_add_ps(fiy0,ty);
409 fiz0 = _mm_add_ps(fiz0,tz);
411 fjx1 = _mm_add_ps(fjx1,tx);
412 fjy1 = _mm_add_ps(fjy1,ty);
413 fjz1 = _mm_add_ps(fjz1,tz);
417 /**************************
418 * CALCULATE INTERACTIONS *
419 **************************/
421 if (gmx_mm_any_lt(rsq02,rcutoff2))
424 r02 = _mm_mul_ps(rsq02,rinv02);
426 /* EWALD ELECTROSTATICS */
428 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
429 ewrt = _mm_mul_ps(r02,ewtabscale);
430 ewitab = _mm_cvttps_epi32(ewrt);
431 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
432 ewitab = _mm_slli_epi32(ewitab,2);
433 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
434 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
435 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
436 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
437 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
438 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
439 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
440 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
441 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
443 d = _mm_sub_ps(r02,rswitch);
444 d = _mm_max_ps(d,_mm_setzero_ps());
445 d2 = _mm_mul_ps(d,d);
446 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
448 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
450 /* Evaluate switch function */
451 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
452 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
453 velec = _mm_mul_ps(velec,sw);
454 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
456 /* Update potential sum for this i atom from the interaction with this j atom. */
457 velec = _mm_and_ps(velec,cutoff_mask);
458 velecsum = _mm_add_ps(velecsum,velec);
462 fscal = _mm_and_ps(fscal,cutoff_mask);
464 /* Calculate temporary vectorial force */
465 tx = _mm_mul_ps(fscal,dx02);
466 ty = _mm_mul_ps(fscal,dy02);
467 tz = _mm_mul_ps(fscal,dz02);
469 /* Update vectorial force */
470 fix0 = _mm_add_ps(fix0,tx);
471 fiy0 = _mm_add_ps(fiy0,ty);
472 fiz0 = _mm_add_ps(fiz0,tz);
474 fjx2 = _mm_add_ps(fjx2,tx);
475 fjy2 = _mm_add_ps(fjy2,ty);
476 fjz2 = _mm_add_ps(fjz2,tz);
480 /**************************
481 * CALCULATE INTERACTIONS *
482 **************************/
484 if (gmx_mm_any_lt(rsq10,rcutoff2))
487 r10 = _mm_mul_ps(rsq10,rinv10);
489 /* EWALD ELECTROSTATICS */
491 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
492 ewrt = _mm_mul_ps(r10,ewtabscale);
493 ewitab = _mm_cvttps_epi32(ewrt);
494 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
495 ewitab = _mm_slli_epi32(ewitab,2);
496 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
497 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
498 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
499 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
500 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
501 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
502 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
503 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
504 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
506 d = _mm_sub_ps(r10,rswitch);
507 d = _mm_max_ps(d,_mm_setzero_ps());
508 d2 = _mm_mul_ps(d,d);
509 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
511 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
513 /* Evaluate switch function */
514 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
515 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
516 velec = _mm_mul_ps(velec,sw);
517 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
519 /* Update potential sum for this i atom from the interaction with this j atom. */
520 velec = _mm_and_ps(velec,cutoff_mask);
521 velecsum = _mm_add_ps(velecsum,velec);
525 fscal = _mm_and_ps(fscal,cutoff_mask);
527 /* Calculate temporary vectorial force */
528 tx = _mm_mul_ps(fscal,dx10);
529 ty = _mm_mul_ps(fscal,dy10);
530 tz = _mm_mul_ps(fscal,dz10);
532 /* Update vectorial force */
533 fix1 = _mm_add_ps(fix1,tx);
534 fiy1 = _mm_add_ps(fiy1,ty);
535 fiz1 = _mm_add_ps(fiz1,tz);
537 fjx0 = _mm_add_ps(fjx0,tx);
538 fjy0 = _mm_add_ps(fjy0,ty);
539 fjz0 = _mm_add_ps(fjz0,tz);
543 /**************************
544 * CALCULATE INTERACTIONS *
545 **************************/
547 if (gmx_mm_any_lt(rsq11,rcutoff2))
550 r11 = _mm_mul_ps(rsq11,rinv11);
552 /* EWALD ELECTROSTATICS */
554 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
555 ewrt = _mm_mul_ps(r11,ewtabscale);
556 ewitab = _mm_cvttps_epi32(ewrt);
557 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
558 ewitab = _mm_slli_epi32(ewitab,2);
559 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
560 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
561 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
562 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
563 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
564 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
565 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
566 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
567 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
569 d = _mm_sub_ps(r11,rswitch);
570 d = _mm_max_ps(d,_mm_setzero_ps());
571 d2 = _mm_mul_ps(d,d);
572 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
574 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
576 /* Evaluate switch function */
577 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
578 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
579 velec = _mm_mul_ps(velec,sw);
580 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
582 /* Update potential sum for this i atom from the interaction with this j atom. */
583 velec = _mm_and_ps(velec,cutoff_mask);
584 velecsum = _mm_add_ps(velecsum,velec);
588 fscal = _mm_and_ps(fscal,cutoff_mask);
590 /* Calculate temporary vectorial force */
591 tx = _mm_mul_ps(fscal,dx11);
592 ty = _mm_mul_ps(fscal,dy11);
593 tz = _mm_mul_ps(fscal,dz11);
595 /* Update vectorial force */
596 fix1 = _mm_add_ps(fix1,tx);
597 fiy1 = _mm_add_ps(fiy1,ty);
598 fiz1 = _mm_add_ps(fiz1,tz);
600 fjx1 = _mm_add_ps(fjx1,tx);
601 fjy1 = _mm_add_ps(fjy1,ty);
602 fjz1 = _mm_add_ps(fjz1,tz);
606 /**************************
607 * CALCULATE INTERACTIONS *
608 **************************/
610 if (gmx_mm_any_lt(rsq12,rcutoff2))
613 r12 = _mm_mul_ps(rsq12,rinv12);
615 /* EWALD ELECTROSTATICS */
617 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
618 ewrt = _mm_mul_ps(r12,ewtabscale);
619 ewitab = _mm_cvttps_epi32(ewrt);
620 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
621 ewitab = _mm_slli_epi32(ewitab,2);
622 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
623 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
624 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
625 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
626 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
627 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
628 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
629 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
630 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
632 d = _mm_sub_ps(r12,rswitch);
633 d = _mm_max_ps(d,_mm_setzero_ps());
634 d2 = _mm_mul_ps(d,d);
635 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
637 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
639 /* Evaluate switch function */
640 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
641 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
642 velec = _mm_mul_ps(velec,sw);
643 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
645 /* Update potential sum for this i atom from the interaction with this j atom. */
646 velec = _mm_and_ps(velec,cutoff_mask);
647 velecsum = _mm_add_ps(velecsum,velec);
651 fscal = _mm_and_ps(fscal,cutoff_mask);
653 /* Calculate temporary vectorial force */
654 tx = _mm_mul_ps(fscal,dx12);
655 ty = _mm_mul_ps(fscal,dy12);
656 tz = _mm_mul_ps(fscal,dz12);
658 /* Update vectorial force */
659 fix1 = _mm_add_ps(fix1,tx);
660 fiy1 = _mm_add_ps(fiy1,ty);
661 fiz1 = _mm_add_ps(fiz1,tz);
663 fjx2 = _mm_add_ps(fjx2,tx);
664 fjy2 = _mm_add_ps(fjy2,ty);
665 fjz2 = _mm_add_ps(fjz2,tz);
669 /**************************
670 * CALCULATE INTERACTIONS *
671 **************************/
673 if (gmx_mm_any_lt(rsq20,rcutoff2))
676 r20 = _mm_mul_ps(rsq20,rinv20);
678 /* EWALD ELECTROSTATICS */
680 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
681 ewrt = _mm_mul_ps(r20,ewtabscale);
682 ewitab = _mm_cvttps_epi32(ewrt);
683 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
684 ewitab = _mm_slli_epi32(ewitab,2);
685 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
686 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
687 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
688 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
689 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
690 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
691 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
692 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
693 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
695 d = _mm_sub_ps(r20,rswitch);
696 d = _mm_max_ps(d,_mm_setzero_ps());
697 d2 = _mm_mul_ps(d,d);
698 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
700 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
702 /* Evaluate switch function */
703 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
704 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
705 velec = _mm_mul_ps(velec,sw);
706 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
708 /* Update potential sum for this i atom from the interaction with this j atom. */
709 velec = _mm_and_ps(velec,cutoff_mask);
710 velecsum = _mm_add_ps(velecsum,velec);
714 fscal = _mm_and_ps(fscal,cutoff_mask);
716 /* Calculate temporary vectorial force */
717 tx = _mm_mul_ps(fscal,dx20);
718 ty = _mm_mul_ps(fscal,dy20);
719 tz = _mm_mul_ps(fscal,dz20);
721 /* Update vectorial force */
722 fix2 = _mm_add_ps(fix2,tx);
723 fiy2 = _mm_add_ps(fiy2,ty);
724 fiz2 = _mm_add_ps(fiz2,tz);
726 fjx0 = _mm_add_ps(fjx0,tx);
727 fjy0 = _mm_add_ps(fjy0,ty);
728 fjz0 = _mm_add_ps(fjz0,tz);
732 /**************************
733 * CALCULATE INTERACTIONS *
734 **************************/
736 if (gmx_mm_any_lt(rsq21,rcutoff2))
739 r21 = _mm_mul_ps(rsq21,rinv21);
741 /* EWALD ELECTROSTATICS */
743 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
744 ewrt = _mm_mul_ps(r21,ewtabscale);
745 ewitab = _mm_cvttps_epi32(ewrt);
746 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
747 ewitab = _mm_slli_epi32(ewitab,2);
748 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
749 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
750 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
751 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
752 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
753 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
754 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
755 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
756 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
758 d = _mm_sub_ps(r21,rswitch);
759 d = _mm_max_ps(d,_mm_setzero_ps());
760 d2 = _mm_mul_ps(d,d);
761 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
763 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
765 /* Evaluate switch function */
766 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
767 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
768 velec = _mm_mul_ps(velec,sw);
769 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
771 /* Update potential sum for this i atom from the interaction with this j atom. */
772 velec = _mm_and_ps(velec,cutoff_mask);
773 velecsum = _mm_add_ps(velecsum,velec);
777 fscal = _mm_and_ps(fscal,cutoff_mask);
779 /* Calculate temporary vectorial force */
780 tx = _mm_mul_ps(fscal,dx21);
781 ty = _mm_mul_ps(fscal,dy21);
782 tz = _mm_mul_ps(fscal,dz21);
784 /* Update vectorial force */
785 fix2 = _mm_add_ps(fix2,tx);
786 fiy2 = _mm_add_ps(fiy2,ty);
787 fiz2 = _mm_add_ps(fiz2,tz);
789 fjx1 = _mm_add_ps(fjx1,tx);
790 fjy1 = _mm_add_ps(fjy1,ty);
791 fjz1 = _mm_add_ps(fjz1,tz);
795 /**************************
796 * CALCULATE INTERACTIONS *
797 **************************/
799 if (gmx_mm_any_lt(rsq22,rcutoff2))
802 r22 = _mm_mul_ps(rsq22,rinv22);
804 /* EWALD ELECTROSTATICS */
806 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
807 ewrt = _mm_mul_ps(r22,ewtabscale);
808 ewitab = _mm_cvttps_epi32(ewrt);
809 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
810 ewitab = _mm_slli_epi32(ewitab,2);
811 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
812 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
813 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
814 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
815 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
816 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
817 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
818 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
819 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
821 d = _mm_sub_ps(r22,rswitch);
822 d = _mm_max_ps(d,_mm_setzero_ps());
823 d2 = _mm_mul_ps(d,d);
824 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
826 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
828 /* Evaluate switch function */
829 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
830 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
831 velec = _mm_mul_ps(velec,sw);
832 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
834 /* Update potential sum for this i atom from the interaction with this j atom. */
835 velec = _mm_and_ps(velec,cutoff_mask);
836 velecsum = _mm_add_ps(velecsum,velec);
840 fscal = _mm_and_ps(fscal,cutoff_mask);
842 /* Calculate temporary vectorial force */
843 tx = _mm_mul_ps(fscal,dx22);
844 ty = _mm_mul_ps(fscal,dy22);
845 tz = _mm_mul_ps(fscal,dz22);
847 /* Update vectorial force */
848 fix2 = _mm_add_ps(fix2,tx);
849 fiy2 = _mm_add_ps(fiy2,ty);
850 fiz2 = _mm_add_ps(fiz2,tz);
852 fjx2 = _mm_add_ps(fjx2,tx);
853 fjy2 = _mm_add_ps(fjy2,ty);
854 fjz2 = _mm_add_ps(fjz2,tz);
858 fjptrA = f+j_coord_offsetA;
859 fjptrB = f+j_coord_offsetB;
860 fjptrC = f+j_coord_offsetC;
861 fjptrD = f+j_coord_offsetD;
863 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
864 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
866 /* Inner loop uses 585 flops */
872 /* Get j neighbor index, and coordinate index */
873 jnrlistA = jjnr[jidx];
874 jnrlistB = jjnr[jidx+1];
875 jnrlistC = jjnr[jidx+2];
876 jnrlistD = jjnr[jidx+3];
877 /* Sign of each element will be negative for non-real atoms.
878 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
879 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
881 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
882 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
883 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
884 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
885 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
886 j_coord_offsetA = DIM*jnrA;
887 j_coord_offsetB = DIM*jnrB;
888 j_coord_offsetC = DIM*jnrC;
889 j_coord_offsetD = DIM*jnrD;
891 /* load j atom coordinates */
892 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
893 x+j_coord_offsetC,x+j_coord_offsetD,
894 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
896 /* Calculate displacement vector */
897 dx00 = _mm_sub_ps(ix0,jx0);
898 dy00 = _mm_sub_ps(iy0,jy0);
899 dz00 = _mm_sub_ps(iz0,jz0);
900 dx01 = _mm_sub_ps(ix0,jx1);
901 dy01 = _mm_sub_ps(iy0,jy1);
902 dz01 = _mm_sub_ps(iz0,jz1);
903 dx02 = _mm_sub_ps(ix0,jx2);
904 dy02 = _mm_sub_ps(iy0,jy2);
905 dz02 = _mm_sub_ps(iz0,jz2);
906 dx10 = _mm_sub_ps(ix1,jx0);
907 dy10 = _mm_sub_ps(iy1,jy0);
908 dz10 = _mm_sub_ps(iz1,jz0);
909 dx11 = _mm_sub_ps(ix1,jx1);
910 dy11 = _mm_sub_ps(iy1,jy1);
911 dz11 = _mm_sub_ps(iz1,jz1);
912 dx12 = _mm_sub_ps(ix1,jx2);
913 dy12 = _mm_sub_ps(iy1,jy2);
914 dz12 = _mm_sub_ps(iz1,jz2);
915 dx20 = _mm_sub_ps(ix2,jx0);
916 dy20 = _mm_sub_ps(iy2,jy0);
917 dz20 = _mm_sub_ps(iz2,jz0);
918 dx21 = _mm_sub_ps(ix2,jx1);
919 dy21 = _mm_sub_ps(iy2,jy1);
920 dz21 = _mm_sub_ps(iz2,jz1);
921 dx22 = _mm_sub_ps(ix2,jx2);
922 dy22 = _mm_sub_ps(iy2,jy2);
923 dz22 = _mm_sub_ps(iz2,jz2);
925 /* Calculate squared distance and things based on it */
926 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
927 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
928 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
929 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
930 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
931 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
932 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
933 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
934 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
936 rinv00 = gmx_mm_invsqrt_ps(rsq00);
937 rinv01 = gmx_mm_invsqrt_ps(rsq01);
938 rinv02 = gmx_mm_invsqrt_ps(rsq02);
939 rinv10 = gmx_mm_invsqrt_ps(rsq10);
940 rinv11 = gmx_mm_invsqrt_ps(rsq11);
941 rinv12 = gmx_mm_invsqrt_ps(rsq12);
942 rinv20 = gmx_mm_invsqrt_ps(rsq20);
943 rinv21 = gmx_mm_invsqrt_ps(rsq21);
944 rinv22 = gmx_mm_invsqrt_ps(rsq22);
946 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
947 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
948 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
949 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
950 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
951 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
952 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
953 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
954 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
956 fjx0 = _mm_setzero_ps();
957 fjy0 = _mm_setzero_ps();
958 fjz0 = _mm_setzero_ps();
959 fjx1 = _mm_setzero_ps();
960 fjy1 = _mm_setzero_ps();
961 fjz1 = _mm_setzero_ps();
962 fjx2 = _mm_setzero_ps();
963 fjy2 = _mm_setzero_ps();
964 fjz2 = _mm_setzero_ps();
966 /**************************
967 * CALCULATE INTERACTIONS *
968 **************************/
970 if (gmx_mm_any_lt(rsq00,rcutoff2))
973 r00 = _mm_mul_ps(rsq00,rinv00);
974 r00 = _mm_andnot_ps(dummy_mask,r00);
976 /* EWALD ELECTROSTATICS */
978 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
979 ewrt = _mm_mul_ps(r00,ewtabscale);
980 ewitab = _mm_cvttps_epi32(ewrt);
981 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
982 ewitab = _mm_slli_epi32(ewitab,2);
983 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
984 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
985 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
986 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
987 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
988 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
989 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
990 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
991 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
993 d = _mm_sub_ps(r00,rswitch);
994 d = _mm_max_ps(d,_mm_setzero_ps());
995 d2 = _mm_mul_ps(d,d);
996 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
998 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1000 /* Evaluate switch function */
1001 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1002 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1003 velec = _mm_mul_ps(velec,sw);
1004 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1006 /* Update potential sum for this i atom from the interaction with this j atom. */
1007 velec = _mm_and_ps(velec,cutoff_mask);
1008 velec = _mm_andnot_ps(dummy_mask,velec);
1009 velecsum = _mm_add_ps(velecsum,velec);
1013 fscal = _mm_and_ps(fscal,cutoff_mask);
1015 fscal = _mm_andnot_ps(dummy_mask,fscal);
1017 /* Calculate temporary vectorial force */
1018 tx = _mm_mul_ps(fscal,dx00);
1019 ty = _mm_mul_ps(fscal,dy00);
1020 tz = _mm_mul_ps(fscal,dz00);
1022 /* Update vectorial force */
1023 fix0 = _mm_add_ps(fix0,tx);
1024 fiy0 = _mm_add_ps(fiy0,ty);
1025 fiz0 = _mm_add_ps(fiz0,tz);
1027 fjx0 = _mm_add_ps(fjx0,tx);
1028 fjy0 = _mm_add_ps(fjy0,ty);
1029 fjz0 = _mm_add_ps(fjz0,tz);
1033 /**************************
1034 * CALCULATE INTERACTIONS *
1035 **************************/
1037 if (gmx_mm_any_lt(rsq01,rcutoff2))
1040 r01 = _mm_mul_ps(rsq01,rinv01);
1041 r01 = _mm_andnot_ps(dummy_mask,r01);
1043 /* EWALD ELECTROSTATICS */
1045 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1046 ewrt = _mm_mul_ps(r01,ewtabscale);
1047 ewitab = _mm_cvttps_epi32(ewrt);
1048 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1049 ewitab = _mm_slli_epi32(ewitab,2);
1050 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1051 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1052 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1053 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1054 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1055 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1056 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1057 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1058 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1060 d = _mm_sub_ps(r01,rswitch);
1061 d = _mm_max_ps(d,_mm_setzero_ps());
1062 d2 = _mm_mul_ps(d,d);
1063 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1065 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1067 /* Evaluate switch function */
1068 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1069 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1070 velec = _mm_mul_ps(velec,sw);
1071 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1073 /* Update potential sum for this i atom from the interaction with this j atom. */
1074 velec = _mm_and_ps(velec,cutoff_mask);
1075 velec = _mm_andnot_ps(dummy_mask,velec);
1076 velecsum = _mm_add_ps(velecsum,velec);
1080 fscal = _mm_and_ps(fscal,cutoff_mask);
1082 fscal = _mm_andnot_ps(dummy_mask,fscal);
1084 /* Calculate temporary vectorial force */
1085 tx = _mm_mul_ps(fscal,dx01);
1086 ty = _mm_mul_ps(fscal,dy01);
1087 tz = _mm_mul_ps(fscal,dz01);
1089 /* Update vectorial force */
1090 fix0 = _mm_add_ps(fix0,tx);
1091 fiy0 = _mm_add_ps(fiy0,ty);
1092 fiz0 = _mm_add_ps(fiz0,tz);
1094 fjx1 = _mm_add_ps(fjx1,tx);
1095 fjy1 = _mm_add_ps(fjy1,ty);
1096 fjz1 = _mm_add_ps(fjz1,tz);
1100 /**************************
1101 * CALCULATE INTERACTIONS *
1102 **************************/
1104 if (gmx_mm_any_lt(rsq02,rcutoff2))
1107 r02 = _mm_mul_ps(rsq02,rinv02);
1108 r02 = _mm_andnot_ps(dummy_mask,r02);
1110 /* EWALD ELECTROSTATICS */
1112 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1113 ewrt = _mm_mul_ps(r02,ewtabscale);
1114 ewitab = _mm_cvttps_epi32(ewrt);
1115 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1116 ewitab = _mm_slli_epi32(ewitab,2);
1117 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1118 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1119 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1120 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1121 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1122 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1123 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1124 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1125 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1127 d = _mm_sub_ps(r02,rswitch);
1128 d = _mm_max_ps(d,_mm_setzero_ps());
1129 d2 = _mm_mul_ps(d,d);
1130 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1132 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1134 /* Evaluate switch function */
1135 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1136 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1137 velec = _mm_mul_ps(velec,sw);
1138 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1140 /* Update potential sum for this i atom from the interaction with this j atom. */
1141 velec = _mm_and_ps(velec,cutoff_mask);
1142 velec = _mm_andnot_ps(dummy_mask,velec);
1143 velecsum = _mm_add_ps(velecsum,velec);
1147 fscal = _mm_and_ps(fscal,cutoff_mask);
1149 fscal = _mm_andnot_ps(dummy_mask,fscal);
1151 /* Calculate temporary vectorial force */
1152 tx = _mm_mul_ps(fscal,dx02);
1153 ty = _mm_mul_ps(fscal,dy02);
1154 tz = _mm_mul_ps(fscal,dz02);
1156 /* Update vectorial force */
1157 fix0 = _mm_add_ps(fix0,tx);
1158 fiy0 = _mm_add_ps(fiy0,ty);
1159 fiz0 = _mm_add_ps(fiz0,tz);
1161 fjx2 = _mm_add_ps(fjx2,tx);
1162 fjy2 = _mm_add_ps(fjy2,ty);
1163 fjz2 = _mm_add_ps(fjz2,tz);
1167 /**************************
1168 * CALCULATE INTERACTIONS *
1169 **************************/
1171 if (gmx_mm_any_lt(rsq10,rcutoff2))
1174 r10 = _mm_mul_ps(rsq10,rinv10);
1175 r10 = _mm_andnot_ps(dummy_mask,r10);
1177 /* EWALD ELECTROSTATICS */
1179 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1180 ewrt = _mm_mul_ps(r10,ewtabscale);
1181 ewitab = _mm_cvttps_epi32(ewrt);
1182 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1183 ewitab = _mm_slli_epi32(ewitab,2);
1184 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1185 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1186 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1187 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1188 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1189 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1190 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1191 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1192 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1194 d = _mm_sub_ps(r10,rswitch);
1195 d = _mm_max_ps(d,_mm_setzero_ps());
1196 d2 = _mm_mul_ps(d,d);
1197 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1199 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1201 /* Evaluate switch function */
1202 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1203 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1204 velec = _mm_mul_ps(velec,sw);
1205 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1207 /* Update potential sum for this i atom from the interaction with this j atom. */
1208 velec = _mm_and_ps(velec,cutoff_mask);
1209 velec = _mm_andnot_ps(dummy_mask,velec);
1210 velecsum = _mm_add_ps(velecsum,velec);
1214 fscal = _mm_and_ps(fscal,cutoff_mask);
1216 fscal = _mm_andnot_ps(dummy_mask,fscal);
1218 /* Calculate temporary vectorial force */
1219 tx = _mm_mul_ps(fscal,dx10);
1220 ty = _mm_mul_ps(fscal,dy10);
1221 tz = _mm_mul_ps(fscal,dz10);
1223 /* Update vectorial force */
1224 fix1 = _mm_add_ps(fix1,tx);
1225 fiy1 = _mm_add_ps(fiy1,ty);
1226 fiz1 = _mm_add_ps(fiz1,tz);
1228 fjx0 = _mm_add_ps(fjx0,tx);
1229 fjy0 = _mm_add_ps(fjy0,ty);
1230 fjz0 = _mm_add_ps(fjz0,tz);
1234 /**************************
1235 * CALCULATE INTERACTIONS *
1236 **************************/
1238 if (gmx_mm_any_lt(rsq11,rcutoff2))
1241 r11 = _mm_mul_ps(rsq11,rinv11);
1242 r11 = _mm_andnot_ps(dummy_mask,r11);
1244 /* EWALD ELECTROSTATICS */
1246 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1247 ewrt = _mm_mul_ps(r11,ewtabscale);
1248 ewitab = _mm_cvttps_epi32(ewrt);
1249 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1250 ewitab = _mm_slli_epi32(ewitab,2);
1251 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1252 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1253 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1254 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1255 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1256 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1257 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1258 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1259 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1261 d = _mm_sub_ps(r11,rswitch);
1262 d = _mm_max_ps(d,_mm_setzero_ps());
1263 d2 = _mm_mul_ps(d,d);
1264 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1266 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1268 /* Evaluate switch function */
1269 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1270 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1271 velec = _mm_mul_ps(velec,sw);
1272 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1274 /* Update potential sum for this i atom from the interaction with this j atom. */
1275 velec = _mm_and_ps(velec,cutoff_mask);
1276 velec = _mm_andnot_ps(dummy_mask,velec);
1277 velecsum = _mm_add_ps(velecsum,velec);
1281 fscal = _mm_and_ps(fscal,cutoff_mask);
1283 fscal = _mm_andnot_ps(dummy_mask,fscal);
1285 /* Calculate temporary vectorial force */
1286 tx = _mm_mul_ps(fscal,dx11);
1287 ty = _mm_mul_ps(fscal,dy11);
1288 tz = _mm_mul_ps(fscal,dz11);
1290 /* Update vectorial force */
1291 fix1 = _mm_add_ps(fix1,tx);
1292 fiy1 = _mm_add_ps(fiy1,ty);
1293 fiz1 = _mm_add_ps(fiz1,tz);
1295 fjx1 = _mm_add_ps(fjx1,tx);
1296 fjy1 = _mm_add_ps(fjy1,ty);
1297 fjz1 = _mm_add_ps(fjz1,tz);
1301 /**************************
1302 * CALCULATE INTERACTIONS *
1303 **************************/
1305 if (gmx_mm_any_lt(rsq12,rcutoff2))
1308 r12 = _mm_mul_ps(rsq12,rinv12);
1309 r12 = _mm_andnot_ps(dummy_mask,r12);
1311 /* EWALD ELECTROSTATICS */
1313 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1314 ewrt = _mm_mul_ps(r12,ewtabscale);
1315 ewitab = _mm_cvttps_epi32(ewrt);
1316 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1317 ewitab = _mm_slli_epi32(ewitab,2);
1318 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1319 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1320 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1321 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1322 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1323 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1324 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1325 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1326 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1328 d = _mm_sub_ps(r12,rswitch);
1329 d = _mm_max_ps(d,_mm_setzero_ps());
1330 d2 = _mm_mul_ps(d,d);
1331 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1333 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1335 /* Evaluate switch function */
1336 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1337 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1338 velec = _mm_mul_ps(velec,sw);
1339 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1341 /* Update potential sum for this i atom from the interaction with this j atom. */
1342 velec = _mm_and_ps(velec,cutoff_mask);
1343 velec = _mm_andnot_ps(dummy_mask,velec);
1344 velecsum = _mm_add_ps(velecsum,velec);
1348 fscal = _mm_and_ps(fscal,cutoff_mask);
1350 fscal = _mm_andnot_ps(dummy_mask,fscal);
1352 /* Calculate temporary vectorial force */
1353 tx = _mm_mul_ps(fscal,dx12);
1354 ty = _mm_mul_ps(fscal,dy12);
1355 tz = _mm_mul_ps(fscal,dz12);
1357 /* Update vectorial force */
1358 fix1 = _mm_add_ps(fix1,tx);
1359 fiy1 = _mm_add_ps(fiy1,ty);
1360 fiz1 = _mm_add_ps(fiz1,tz);
1362 fjx2 = _mm_add_ps(fjx2,tx);
1363 fjy2 = _mm_add_ps(fjy2,ty);
1364 fjz2 = _mm_add_ps(fjz2,tz);
1368 /**************************
1369 * CALCULATE INTERACTIONS *
1370 **************************/
1372 if (gmx_mm_any_lt(rsq20,rcutoff2))
1375 r20 = _mm_mul_ps(rsq20,rinv20);
1376 r20 = _mm_andnot_ps(dummy_mask,r20);
1378 /* EWALD ELECTROSTATICS */
1380 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1381 ewrt = _mm_mul_ps(r20,ewtabscale);
1382 ewitab = _mm_cvttps_epi32(ewrt);
1383 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1384 ewitab = _mm_slli_epi32(ewitab,2);
1385 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1386 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1387 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1388 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1389 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1390 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1391 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1392 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1393 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1395 d = _mm_sub_ps(r20,rswitch);
1396 d = _mm_max_ps(d,_mm_setzero_ps());
1397 d2 = _mm_mul_ps(d,d);
1398 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1400 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1402 /* Evaluate switch function */
1403 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1404 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1405 velec = _mm_mul_ps(velec,sw);
1406 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1408 /* Update potential sum for this i atom from the interaction with this j atom. */
1409 velec = _mm_and_ps(velec,cutoff_mask);
1410 velec = _mm_andnot_ps(dummy_mask,velec);
1411 velecsum = _mm_add_ps(velecsum,velec);
1415 fscal = _mm_and_ps(fscal,cutoff_mask);
1417 fscal = _mm_andnot_ps(dummy_mask,fscal);
1419 /* Calculate temporary vectorial force */
1420 tx = _mm_mul_ps(fscal,dx20);
1421 ty = _mm_mul_ps(fscal,dy20);
1422 tz = _mm_mul_ps(fscal,dz20);
1424 /* Update vectorial force */
1425 fix2 = _mm_add_ps(fix2,tx);
1426 fiy2 = _mm_add_ps(fiy2,ty);
1427 fiz2 = _mm_add_ps(fiz2,tz);
1429 fjx0 = _mm_add_ps(fjx0,tx);
1430 fjy0 = _mm_add_ps(fjy0,ty);
1431 fjz0 = _mm_add_ps(fjz0,tz);
1435 /**************************
1436 * CALCULATE INTERACTIONS *
1437 **************************/
1439 if (gmx_mm_any_lt(rsq21,rcutoff2))
1442 r21 = _mm_mul_ps(rsq21,rinv21);
1443 r21 = _mm_andnot_ps(dummy_mask,r21);
1445 /* EWALD ELECTROSTATICS */
1447 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1448 ewrt = _mm_mul_ps(r21,ewtabscale);
1449 ewitab = _mm_cvttps_epi32(ewrt);
1450 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1451 ewitab = _mm_slli_epi32(ewitab,2);
1452 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1453 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1454 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1455 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1456 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1457 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1458 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1459 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1460 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1462 d = _mm_sub_ps(r21,rswitch);
1463 d = _mm_max_ps(d,_mm_setzero_ps());
1464 d2 = _mm_mul_ps(d,d);
1465 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1467 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1469 /* Evaluate switch function */
1470 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1471 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1472 velec = _mm_mul_ps(velec,sw);
1473 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1475 /* Update potential sum for this i atom from the interaction with this j atom. */
1476 velec = _mm_and_ps(velec,cutoff_mask);
1477 velec = _mm_andnot_ps(dummy_mask,velec);
1478 velecsum = _mm_add_ps(velecsum,velec);
1482 fscal = _mm_and_ps(fscal,cutoff_mask);
1484 fscal = _mm_andnot_ps(dummy_mask,fscal);
1486 /* Calculate temporary vectorial force */
1487 tx = _mm_mul_ps(fscal,dx21);
1488 ty = _mm_mul_ps(fscal,dy21);
1489 tz = _mm_mul_ps(fscal,dz21);
1491 /* Update vectorial force */
1492 fix2 = _mm_add_ps(fix2,tx);
1493 fiy2 = _mm_add_ps(fiy2,ty);
1494 fiz2 = _mm_add_ps(fiz2,tz);
1496 fjx1 = _mm_add_ps(fjx1,tx);
1497 fjy1 = _mm_add_ps(fjy1,ty);
1498 fjz1 = _mm_add_ps(fjz1,tz);
1502 /**************************
1503 * CALCULATE INTERACTIONS *
1504 **************************/
1506 if (gmx_mm_any_lt(rsq22,rcutoff2))
1509 r22 = _mm_mul_ps(rsq22,rinv22);
1510 r22 = _mm_andnot_ps(dummy_mask,r22);
1512 /* EWALD ELECTROSTATICS */
1514 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1515 ewrt = _mm_mul_ps(r22,ewtabscale);
1516 ewitab = _mm_cvttps_epi32(ewrt);
1517 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1518 ewitab = _mm_slli_epi32(ewitab,2);
1519 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1520 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1521 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1522 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1523 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1524 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1525 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1526 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1527 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1529 d = _mm_sub_ps(r22,rswitch);
1530 d = _mm_max_ps(d,_mm_setzero_ps());
1531 d2 = _mm_mul_ps(d,d);
1532 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1534 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1536 /* Evaluate switch function */
1537 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1538 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1539 velec = _mm_mul_ps(velec,sw);
1540 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1542 /* Update potential sum for this i atom from the interaction with this j atom. */
1543 velec = _mm_and_ps(velec,cutoff_mask);
1544 velec = _mm_andnot_ps(dummy_mask,velec);
1545 velecsum = _mm_add_ps(velecsum,velec);
1549 fscal = _mm_and_ps(fscal,cutoff_mask);
1551 fscal = _mm_andnot_ps(dummy_mask,fscal);
1553 /* Calculate temporary vectorial force */
1554 tx = _mm_mul_ps(fscal,dx22);
1555 ty = _mm_mul_ps(fscal,dy22);
1556 tz = _mm_mul_ps(fscal,dz22);
1558 /* Update vectorial force */
1559 fix2 = _mm_add_ps(fix2,tx);
1560 fiy2 = _mm_add_ps(fiy2,ty);
1561 fiz2 = _mm_add_ps(fiz2,tz);
1563 fjx2 = _mm_add_ps(fjx2,tx);
1564 fjy2 = _mm_add_ps(fjy2,ty);
1565 fjz2 = _mm_add_ps(fjz2,tz);
1569 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1570 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1571 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1572 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1574 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1575 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1577 /* Inner loop uses 594 flops */
1580 /* End of innermost loop */
1582 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1583 f+i_coord_offset,fshift+i_shift_offset);
1586 /* Update potential energies */
1587 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1589 /* Increment number of inner iterations */
1590 inneriter += j_index_end - j_index_start;
1592 /* Outer loop uses 19 flops */
1595 /* Increment number of outer iterations */
1598 /* Update outer/inner flops */
1600 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*594);
1603 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
1604 * Electrostatics interaction: Ewald
1605 * VdW interaction: None
1606 * Geometry: Water3-Water3
1607 * Calculate force/pot: Force
1610 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
1611 (t_nblist * gmx_restrict nlist,
1612 rvec * gmx_restrict xx,
1613 rvec * gmx_restrict ff,
1614 t_forcerec * gmx_restrict fr,
1615 t_mdatoms * gmx_restrict mdatoms,
1616 nb_kernel_data_t * gmx_restrict kernel_data,
1617 t_nrnb * gmx_restrict nrnb)
1619 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1620 * just 0 for non-waters.
1621 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1622 * jnr indices corresponding to data put in the four positions in the SIMD register.
1624 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1625 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1626 int jnrA,jnrB,jnrC,jnrD;
1627 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1628 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1629 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1630 real rcutoff_scalar;
1631 real *shiftvec,*fshift,*x,*f;
1632 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1633 real scratch[4*DIM];
1634 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1636 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1638 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1640 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1641 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1642 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1643 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1644 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1645 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1646 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1647 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1648 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1649 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1650 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1651 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1652 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1653 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1654 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1655 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1656 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1659 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1661 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1662 real rswitch_scalar,d_scalar;
1663 __m128 dummy_mask,cutoff_mask;
1664 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1665 __m128 one = _mm_set1_ps(1.0);
1666 __m128 two = _mm_set1_ps(2.0);
1672 jindex = nlist->jindex;
1674 shiftidx = nlist->shift;
1676 shiftvec = fr->shift_vec[0];
1677 fshift = fr->fshift[0];
1678 facel = _mm_set1_ps(fr->epsfac);
1679 charge = mdatoms->chargeA;
1681 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1682 ewtab = fr->ic->tabq_coul_FDV0;
1683 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1684 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1686 /* Setup water-specific parameters */
1687 inr = nlist->iinr[0];
1688 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1689 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1690 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1692 jq0 = _mm_set1_ps(charge[inr+0]);
1693 jq1 = _mm_set1_ps(charge[inr+1]);
1694 jq2 = _mm_set1_ps(charge[inr+2]);
1695 qq00 = _mm_mul_ps(iq0,jq0);
1696 qq01 = _mm_mul_ps(iq0,jq1);
1697 qq02 = _mm_mul_ps(iq0,jq2);
1698 qq10 = _mm_mul_ps(iq1,jq0);
1699 qq11 = _mm_mul_ps(iq1,jq1);
1700 qq12 = _mm_mul_ps(iq1,jq2);
1701 qq20 = _mm_mul_ps(iq2,jq0);
1702 qq21 = _mm_mul_ps(iq2,jq1);
1703 qq22 = _mm_mul_ps(iq2,jq2);
1705 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1706 rcutoff_scalar = fr->rcoulomb;
1707 rcutoff = _mm_set1_ps(rcutoff_scalar);
1708 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1710 rswitch_scalar = fr->rcoulomb_switch;
1711 rswitch = _mm_set1_ps(rswitch_scalar);
1712 /* Setup switch parameters */
1713 d_scalar = rcutoff_scalar-rswitch_scalar;
1714 d = _mm_set1_ps(d_scalar);
1715 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1716 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1717 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1718 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1719 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1720 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1722 /* Avoid stupid compiler warnings */
1723 jnrA = jnrB = jnrC = jnrD = 0;
1724 j_coord_offsetA = 0;
1725 j_coord_offsetB = 0;
1726 j_coord_offsetC = 0;
1727 j_coord_offsetD = 0;
1732 for(iidx=0;iidx<4*DIM;iidx++)
1734 scratch[iidx] = 0.0;
1737 /* Start outer loop over neighborlists */
1738 for(iidx=0; iidx<nri; iidx++)
1740 /* Load shift vector for this list */
1741 i_shift_offset = DIM*shiftidx[iidx];
1743 /* Load limits for loop over neighbors */
1744 j_index_start = jindex[iidx];
1745 j_index_end = jindex[iidx+1];
1747 /* Get outer coordinate index */
1749 i_coord_offset = DIM*inr;
1751 /* Load i particle coords and add shift vector */
1752 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1753 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1755 fix0 = _mm_setzero_ps();
1756 fiy0 = _mm_setzero_ps();
1757 fiz0 = _mm_setzero_ps();
1758 fix1 = _mm_setzero_ps();
1759 fiy1 = _mm_setzero_ps();
1760 fiz1 = _mm_setzero_ps();
1761 fix2 = _mm_setzero_ps();
1762 fiy2 = _mm_setzero_ps();
1763 fiz2 = _mm_setzero_ps();
1765 /* Start inner kernel loop */
1766 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1769 /* Get j neighbor index, and coordinate index */
1771 jnrB = jjnr[jidx+1];
1772 jnrC = jjnr[jidx+2];
1773 jnrD = jjnr[jidx+3];
1774 j_coord_offsetA = DIM*jnrA;
1775 j_coord_offsetB = DIM*jnrB;
1776 j_coord_offsetC = DIM*jnrC;
1777 j_coord_offsetD = DIM*jnrD;
1779 /* load j atom coordinates */
1780 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1781 x+j_coord_offsetC,x+j_coord_offsetD,
1782 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1784 /* Calculate displacement vector */
1785 dx00 = _mm_sub_ps(ix0,jx0);
1786 dy00 = _mm_sub_ps(iy0,jy0);
1787 dz00 = _mm_sub_ps(iz0,jz0);
1788 dx01 = _mm_sub_ps(ix0,jx1);
1789 dy01 = _mm_sub_ps(iy0,jy1);
1790 dz01 = _mm_sub_ps(iz0,jz1);
1791 dx02 = _mm_sub_ps(ix0,jx2);
1792 dy02 = _mm_sub_ps(iy0,jy2);
1793 dz02 = _mm_sub_ps(iz0,jz2);
1794 dx10 = _mm_sub_ps(ix1,jx0);
1795 dy10 = _mm_sub_ps(iy1,jy0);
1796 dz10 = _mm_sub_ps(iz1,jz0);
1797 dx11 = _mm_sub_ps(ix1,jx1);
1798 dy11 = _mm_sub_ps(iy1,jy1);
1799 dz11 = _mm_sub_ps(iz1,jz1);
1800 dx12 = _mm_sub_ps(ix1,jx2);
1801 dy12 = _mm_sub_ps(iy1,jy2);
1802 dz12 = _mm_sub_ps(iz1,jz2);
1803 dx20 = _mm_sub_ps(ix2,jx0);
1804 dy20 = _mm_sub_ps(iy2,jy0);
1805 dz20 = _mm_sub_ps(iz2,jz0);
1806 dx21 = _mm_sub_ps(ix2,jx1);
1807 dy21 = _mm_sub_ps(iy2,jy1);
1808 dz21 = _mm_sub_ps(iz2,jz1);
1809 dx22 = _mm_sub_ps(ix2,jx2);
1810 dy22 = _mm_sub_ps(iy2,jy2);
1811 dz22 = _mm_sub_ps(iz2,jz2);
1813 /* Calculate squared distance and things based on it */
1814 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1815 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1816 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1817 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1818 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1819 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1820 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1821 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1822 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1824 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1825 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1826 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1827 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1828 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1829 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1830 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1831 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1832 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1834 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1835 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1836 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1837 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1838 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1839 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1840 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1841 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1842 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1844 fjx0 = _mm_setzero_ps();
1845 fjy0 = _mm_setzero_ps();
1846 fjz0 = _mm_setzero_ps();
1847 fjx1 = _mm_setzero_ps();
1848 fjy1 = _mm_setzero_ps();
1849 fjz1 = _mm_setzero_ps();
1850 fjx2 = _mm_setzero_ps();
1851 fjy2 = _mm_setzero_ps();
1852 fjz2 = _mm_setzero_ps();
1854 /**************************
1855 * CALCULATE INTERACTIONS *
1856 **************************/
1858 if (gmx_mm_any_lt(rsq00,rcutoff2))
1861 r00 = _mm_mul_ps(rsq00,rinv00);
1863 /* EWALD ELECTROSTATICS */
1865 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1866 ewrt = _mm_mul_ps(r00,ewtabscale);
1867 ewitab = _mm_cvttps_epi32(ewrt);
1868 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1869 ewitab = _mm_slli_epi32(ewitab,2);
1870 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1871 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1872 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1873 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1874 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1875 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1876 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1877 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1878 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1880 d = _mm_sub_ps(r00,rswitch);
1881 d = _mm_max_ps(d,_mm_setzero_ps());
1882 d2 = _mm_mul_ps(d,d);
1883 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1885 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1887 /* Evaluate switch function */
1888 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1889 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1890 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1894 fscal = _mm_and_ps(fscal,cutoff_mask);
1896 /* Calculate temporary vectorial force */
1897 tx = _mm_mul_ps(fscal,dx00);
1898 ty = _mm_mul_ps(fscal,dy00);
1899 tz = _mm_mul_ps(fscal,dz00);
1901 /* Update vectorial force */
1902 fix0 = _mm_add_ps(fix0,tx);
1903 fiy0 = _mm_add_ps(fiy0,ty);
1904 fiz0 = _mm_add_ps(fiz0,tz);
1906 fjx0 = _mm_add_ps(fjx0,tx);
1907 fjy0 = _mm_add_ps(fjy0,ty);
1908 fjz0 = _mm_add_ps(fjz0,tz);
1912 /**************************
1913 * CALCULATE INTERACTIONS *
1914 **************************/
1916 if (gmx_mm_any_lt(rsq01,rcutoff2))
1919 r01 = _mm_mul_ps(rsq01,rinv01);
1921 /* EWALD ELECTROSTATICS */
1923 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1924 ewrt = _mm_mul_ps(r01,ewtabscale);
1925 ewitab = _mm_cvttps_epi32(ewrt);
1926 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1927 ewitab = _mm_slli_epi32(ewitab,2);
1928 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1929 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1930 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1931 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1932 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1933 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1934 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1935 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1936 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1938 d = _mm_sub_ps(r01,rswitch);
1939 d = _mm_max_ps(d,_mm_setzero_ps());
1940 d2 = _mm_mul_ps(d,d);
1941 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1943 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1945 /* Evaluate switch function */
1946 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1947 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1948 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1952 fscal = _mm_and_ps(fscal,cutoff_mask);
1954 /* Calculate temporary vectorial force */
1955 tx = _mm_mul_ps(fscal,dx01);
1956 ty = _mm_mul_ps(fscal,dy01);
1957 tz = _mm_mul_ps(fscal,dz01);
1959 /* Update vectorial force */
1960 fix0 = _mm_add_ps(fix0,tx);
1961 fiy0 = _mm_add_ps(fiy0,ty);
1962 fiz0 = _mm_add_ps(fiz0,tz);
1964 fjx1 = _mm_add_ps(fjx1,tx);
1965 fjy1 = _mm_add_ps(fjy1,ty);
1966 fjz1 = _mm_add_ps(fjz1,tz);
1970 /**************************
1971 * CALCULATE INTERACTIONS *
1972 **************************/
1974 if (gmx_mm_any_lt(rsq02,rcutoff2))
1977 r02 = _mm_mul_ps(rsq02,rinv02);
1979 /* EWALD ELECTROSTATICS */
1981 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1982 ewrt = _mm_mul_ps(r02,ewtabscale);
1983 ewitab = _mm_cvttps_epi32(ewrt);
1984 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1985 ewitab = _mm_slli_epi32(ewitab,2);
1986 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1987 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1988 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1989 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1990 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1991 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1992 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1993 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1994 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1996 d = _mm_sub_ps(r02,rswitch);
1997 d = _mm_max_ps(d,_mm_setzero_ps());
1998 d2 = _mm_mul_ps(d,d);
1999 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2001 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2003 /* Evaluate switch function */
2004 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2005 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2006 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2010 fscal = _mm_and_ps(fscal,cutoff_mask);
2012 /* Calculate temporary vectorial force */
2013 tx = _mm_mul_ps(fscal,dx02);
2014 ty = _mm_mul_ps(fscal,dy02);
2015 tz = _mm_mul_ps(fscal,dz02);
2017 /* Update vectorial force */
2018 fix0 = _mm_add_ps(fix0,tx);
2019 fiy0 = _mm_add_ps(fiy0,ty);
2020 fiz0 = _mm_add_ps(fiz0,tz);
2022 fjx2 = _mm_add_ps(fjx2,tx);
2023 fjy2 = _mm_add_ps(fjy2,ty);
2024 fjz2 = _mm_add_ps(fjz2,tz);
2028 /**************************
2029 * CALCULATE INTERACTIONS *
2030 **************************/
2032 if (gmx_mm_any_lt(rsq10,rcutoff2))
2035 r10 = _mm_mul_ps(rsq10,rinv10);
2037 /* EWALD ELECTROSTATICS */
2039 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2040 ewrt = _mm_mul_ps(r10,ewtabscale);
2041 ewitab = _mm_cvttps_epi32(ewrt);
2042 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2043 ewitab = _mm_slli_epi32(ewitab,2);
2044 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2045 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2046 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2047 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2048 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2049 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2050 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2051 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2052 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2054 d = _mm_sub_ps(r10,rswitch);
2055 d = _mm_max_ps(d,_mm_setzero_ps());
2056 d2 = _mm_mul_ps(d,d);
2057 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2059 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2061 /* Evaluate switch function */
2062 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2063 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2064 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2068 fscal = _mm_and_ps(fscal,cutoff_mask);
2070 /* Calculate temporary vectorial force */
2071 tx = _mm_mul_ps(fscal,dx10);
2072 ty = _mm_mul_ps(fscal,dy10);
2073 tz = _mm_mul_ps(fscal,dz10);
2075 /* Update vectorial force */
2076 fix1 = _mm_add_ps(fix1,tx);
2077 fiy1 = _mm_add_ps(fiy1,ty);
2078 fiz1 = _mm_add_ps(fiz1,tz);
2080 fjx0 = _mm_add_ps(fjx0,tx);
2081 fjy0 = _mm_add_ps(fjy0,ty);
2082 fjz0 = _mm_add_ps(fjz0,tz);
2086 /**************************
2087 * CALCULATE INTERACTIONS *
2088 **************************/
2090 if (gmx_mm_any_lt(rsq11,rcutoff2))
2093 r11 = _mm_mul_ps(rsq11,rinv11);
2095 /* EWALD ELECTROSTATICS */
2097 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2098 ewrt = _mm_mul_ps(r11,ewtabscale);
2099 ewitab = _mm_cvttps_epi32(ewrt);
2100 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2101 ewitab = _mm_slli_epi32(ewitab,2);
2102 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2103 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2104 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2105 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2106 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2107 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2108 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2109 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2110 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2112 d = _mm_sub_ps(r11,rswitch);
2113 d = _mm_max_ps(d,_mm_setzero_ps());
2114 d2 = _mm_mul_ps(d,d);
2115 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2117 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2119 /* Evaluate switch function */
2120 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2121 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2122 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2126 fscal = _mm_and_ps(fscal,cutoff_mask);
2128 /* Calculate temporary vectorial force */
2129 tx = _mm_mul_ps(fscal,dx11);
2130 ty = _mm_mul_ps(fscal,dy11);
2131 tz = _mm_mul_ps(fscal,dz11);
2133 /* Update vectorial force */
2134 fix1 = _mm_add_ps(fix1,tx);
2135 fiy1 = _mm_add_ps(fiy1,ty);
2136 fiz1 = _mm_add_ps(fiz1,tz);
2138 fjx1 = _mm_add_ps(fjx1,tx);
2139 fjy1 = _mm_add_ps(fjy1,ty);
2140 fjz1 = _mm_add_ps(fjz1,tz);
2144 /**************************
2145 * CALCULATE INTERACTIONS *
2146 **************************/
2148 if (gmx_mm_any_lt(rsq12,rcutoff2))
2151 r12 = _mm_mul_ps(rsq12,rinv12);
2153 /* EWALD ELECTROSTATICS */
2155 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2156 ewrt = _mm_mul_ps(r12,ewtabscale);
2157 ewitab = _mm_cvttps_epi32(ewrt);
2158 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2159 ewitab = _mm_slli_epi32(ewitab,2);
2160 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2161 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2162 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2163 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2164 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2165 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2166 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2167 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2168 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2170 d = _mm_sub_ps(r12,rswitch);
2171 d = _mm_max_ps(d,_mm_setzero_ps());
2172 d2 = _mm_mul_ps(d,d);
2173 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2175 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2177 /* Evaluate switch function */
2178 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2179 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2180 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2184 fscal = _mm_and_ps(fscal,cutoff_mask);
2186 /* Calculate temporary vectorial force */
2187 tx = _mm_mul_ps(fscal,dx12);
2188 ty = _mm_mul_ps(fscal,dy12);
2189 tz = _mm_mul_ps(fscal,dz12);
2191 /* Update vectorial force */
2192 fix1 = _mm_add_ps(fix1,tx);
2193 fiy1 = _mm_add_ps(fiy1,ty);
2194 fiz1 = _mm_add_ps(fiz1,tz);
2196 fjx2 = _mm_add_ps(fjx2,tx);
2197 fjy2 = _mm_add_ps(fjy2,ty);
2198 fjz2 = _mm_add_ps(fjz2,tz);
2202 /**************************
2203 * CALCULATE INTERACTIONS *
2204 **************************/
2206 if (gmx_mm_any_lt(rsq20,rcutoff2))
2209 r20 = _mm_mul_ps(rsq20,rinv20);
2211 /* EWALD ELECTROSTATICS */
2213 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2214 ewrt = _mm_mul_ps(r20,ewtabscale);
2215 ewitab = _mm_cvttps_epi32(ewrt);
2216 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2217 ewitab = _mm_slli_epi32(ewitab,2);
2218 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2219 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2220 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2221 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2222 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2223 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2224 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2225 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2226 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2228 d = _mm_sub_ps(r20,rswitch);
2229 d = _mm_max_ps(d,_mm_setzero_ps());
2230 d2 = _mm_mul_ps(d,d);
2231 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2233 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2235 /* Evaluate switch function */
2236 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2237 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2238 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2242 fscal = _mm_and_ps(fscal,cutoff_mask);
2244 /* Calculate temporary vectorial force */
2245 tx = _mm_mul_ps(fscal,dx20);
2246 ty = _mm_mul_ps(fscal,dy20);
2247 tz = _mm_mul_ps(fscal,dz20);
2249 /* Update vectorial force */
2250 fix2 = _mm_add_ps(fix2,tx);
2251 fiy2 = _mm_add_ps(fiy2,ty);
2252 fiz2 = _mm_add_ps(fiz2,tz);
2254 fjx0 = _mm_add_ps(fjx0,tx);
2255 fjy0 = _mm_add_ps(fjy0,ty);
2256 fjz0 = _mm_add_ps(fjz0,tz);
2260 /**************************
2261 * CALCULATE INTERACTIONS *
2262 **************************/
2264 if (gmx_mm_any_lt(rsq21,rcutoff2))
2267 r21 = _mm_mul_ps(rsq21,rinv21);
2269 /* EWALD ELECTROSTATICS */
2271 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2272 ewrt = _mm_mul_ps(r21,ewtabscale);
2273 ewitab = _mm_cvttps_epi32(ewrt);
2274 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2275 ewitab = _mm_slli_epi32(ewitab,2);
2276 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2277 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2278 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2279 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2280 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2281 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2282 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2283 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2284 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2286 d = _mm_sub_ps(r21,rswitch);
2287 d = _mm_max_ps(d,_mm_setzero_ps());
2288 d2 = _mm_mul_ps(d,d);
2289 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2291 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2293 /* Evaluate switch function */
2294 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2295 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2296 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2300 fscal = _mm_and_ps(fscal,cutoff_mask);
2302 /* Calculate temporary vectorial force */
2303 tx = _mm_mul_ps(fscal,dx21);
2304 ty = _mm_mul_ps(fscal,dy21);
2305 tz = _mm_mul_ps(fscal,dz21);
2307 /* Update vectorial force */
2308 fix2 = _mm_add_ps(fix2,tx);
2309 fiy2 = _mm_add_ps(fiy2,ty);
2310 fiz2 = _mm_add_ps(fiz2,tz);
2312 fjx1 = _mm_add_ps(fjx1,tx);
2313 fjy1 = _mm_add_ps(fjy1,ty);
2314 fjz1 = _mm_add_ps(fjz1,tz);
2318 /**************************
2319 * CALCULATE INTERACTIONS *
2320 **************************/
2322 if (gmx_mm_any_lt(rsq22,rcutoff2))
2325 r22 = _mm_mul_ps(rsq22,rinv22);
2327 /* EWALD ELECTROSTATICS */
2329 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2330 ewrt = _mm_mul_ps(r22,ewtabscale);
2331 ewitab = _mm_cvttps_epi32(ewrt);
2332 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2333 ewitab = _mm_slli_epi32(ewitab,2);
2334 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2335 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2336 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2337 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2338 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2339 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2340 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2341 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2342 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2344 d = _mm_sub_ps(r22,rswitch);
2345 d = _mm_max_ps(d,_mm_setzero_ps());
2346 d2 = _mm_mul_ps(d,d);
2347 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2349 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2351 /* Evaluate switch function */
2352 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2353 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2354 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2358 fscal = _mm_and_ps(fscal,cutoff_mask);
2360 /* Calculate temporary vectorial force */
2361 tx = _mm_mul_ps(fscal,dx22);
2362 ty = _mm_mul_ps(fscal,dy22);
2363 tz = _mm_mul_ps(fscal,dz22);
2365 /* Update vectorial force */
2366 fix2 = _mm_add_ps(fix2,tx);
2367 fiy2 = _mm_add_ps(fiy2,ty);
2368 fiz2 = _mm_add_ps(fiz2,tz);
2370 fjx2 = _mm_add_ps(fjx2,tx);
2371 fjy2 = _mm_add_ps(fjy2,ty);
2372 fjz2 = _mm_add_ps(fjz2,tz);
2376 fjptrA = f+j_coord_offsetA;
2377 fjptrB = f+j_coord_offsetB;
2378 fjptrC = f+j_coord_offsetC;
2379 fjptrD = f+j_coord_offsetD;
2381 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2382 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2384 /* Inner loop uses 558 flops */
2387 if(jidx<j_index_end)
2390 /* Get j neighbor index, and coordinate index */
2391 jnrlistA = jjnr[jidx];
2392 jnrlistB = jjnr[jidx+1];
2393 jnrlistC = jjnr[jidx+2];
2394 jnrlistD = jjnr[jidx+3];
2395 /* Sign of each element will be negative for non-real atoms.
2396 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2397 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2399 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2400 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2401 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2402 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2403 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2404 j_coord_offsetA = DIM*jnrA;
2405 j_coord_offsetB = DIM*jnrB;
2406 j_coord_offsetC = DIM*jnrC;
2407 j_coord_offsetD = DIM*jnrD;
2409 /* load j atom coordinates */
2410 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2411 x+j_coord_offsetC,x+j_coord_offsetD,
2412 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2414 /* Calculate displacement vector */
2415 dx00 = _mm_sub_ps(ix0,jx0);
2416 dy00 = _mm_sub_ps(iy0,jy0);
2417 dz00 = _mm_sub_ps(iz0,jz0);
2418 dx01 = _mm_sub_ps(ix0,jx1);
2419 dy01 = _mm_sub_ps(iy0,jy1);
2420 dz01 = _mm_sub_ps(iz0,jz1);
2421 dx02 = _mm_sub_ps(ix0,jx2);
2422 dy02 = _mm_sub_ps(iy0,jy2);
2423 dz02 = _mm_sub_ps(iz0,jz2);
2424 dx10 = _mm_sub_ps(ix1,jx0);
2425 dy10 = _mm_sub_ps(iy1,jy0);
2426 dz10 = _mm_sub_ps(iz1,jz0);
2427 dx11 = _mm_sub_ps(ix1,jx1);
2428 dy11 = _mm_sub_ps(iy1,jy1);
2429 dz11 = _mm_sub_ps(iz1,jz1);
2430 dx12 = _mm_sub_ps(ix1,jx2);
2431 dy12 = _mm_sub_ps(iy1,jy2);
2432 dz12 = _mm_sub_ps(iz1,jz2);
2433 dx20 = _mm_sub_ps(ix2,jx0);
2434 dy20 = _mm_sub_ps(iy2,jy0);
2435 dz20 = _mm_sub_ps(iz2,jz0);
2436 dx21 = _mm_sub_ps(ix2,jx1);
2437 dy21 = _mm_sub_ps(iy2,jy1);
2438 dz21 = _mm_sub_ps(iz2,jz1);
2439 dx22 = _mm_sub_ps(ix2,jx2);
2440 dy22 = _mm_sub_ps(iy2,jy2);
2441 dz22 = _mm_sub_ps(iz2,jz2);
2443 /* Calculate squared distance and things based on it */
2444 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2445 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2446 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2447 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2448 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2449 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2450 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2451 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2452 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2454 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2455 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2456 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2457 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2458 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2459 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2460 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2461 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2462 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2464 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2465 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2466 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2467 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2468 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2469 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2470 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2471 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2472 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2474 fjx0 = _mm_setzero_ps();
2475 fjy0 = _mm_setzero_ps();
2476 fjz0 = _mm_setzero_ps();
2477 fjx1 = _mm_setzero_ps();
2478 fjy1 = _mm_setzero_ps();
2479 fjz1 = _mm_setzero_ps();
2480 fjx2 = _mm_setzero_ps();
2481 fjy2 = _mm_setzero_ps();
2482 fjz2 = _mm_setzero_ps();
2484 /**************************
2485 * CALCULATE INTERACTIONS *
2486 **************************/
2488 if (gmx_mm_any_lt(rsq00,rcutoff2))
2491 r00 = _mm_mul_ps(rsq00,rinv00);
2492 r00 = _mm_andnot_ps(dummy_mask,r00);
2494 /* EWALD ELECTROSTATICS */
2496 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2497 ewrt = _mm_mul_ps(r00,ewtabscale);
2498 ewitab = _mm_cvttps_epi32(ewrt);
2499 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2500 ewitab = _mm_slli_epi32(ewitab,2);
2501 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2502 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2503 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2504 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2505 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2506 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2507 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2508 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
2509 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2511 d = _mm_sub_ps(r00,rswitch);
2512 d = _mm_max_ps(d,_mm_setzero_ps());
2513 d2 = _mm_mul_ps(d,d);
2514 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2516 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2518 /* Evaluate switch function */
2519 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2520 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2521 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2525 fscal = _mm_and_ps(fscal,cutoff_mask);
2527 fscal = _mm_andnot_ps(dummy_mask,fscal);
2529 /* Calculate temporary vectorial force */
2530 tx = _mm_mul_ps(fscal,dx00);
2531 ty = _mm_mul_ps(fscal,dy00);
2532 tz = _mm_mul_ps(fscal,dz00);
2534 /* Update vectorial force */
2535 fix0 = _mm_add_ps(fix0,tx);
2536 fiy0 = _mm_add_ps(fiy0,ty);
2537 fiz0 = _mm_add_ps(fiz0,tz);
2539 fjx0 = _mm_add_ps(fjx0,tx);
2540 fjy0 = _mm_add_ps(fjy0,ty);
2541 fjz0 = _mm_add_ps(fjz0,tz);
2545 /**************************
2546 * CALCULATE INTERACTIONS *
2547 **************************/
2549 if (gmx_mm_any_lt(rsq01,rcutoff2))
2552 r01 = _mm_mul_ps(rsq01,rinv01);
2553 r01 = _mm_andnot_ps(dummy_mask,r01);
2555 /* EWALD ELECTROSTATICS */
2557 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2558 ewrt = _mm_mul_ps(r01,ewtabscale);
2559 ewitab = _mm_cvttps_epi32(ewrt);
2560 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2561 ewitab = _mm_slli_epi32(ewitab,2);
2562 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2563 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2564 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2565 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2566 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2567 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2568 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2569 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2570 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2572 d = _mm_sub_ps(r01,rswitch);
2573 d = _mm_max_ps(d,_mm_setzero_ps());
2574 d2 = _mm_mul_ps(d,d);
2575 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2577 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2579 /* Evaluate switch function */
2580 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2581 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2582 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2586 fscal = _mm_and_ps(fscal,cutoff_mask);
2588 fscal = _mm_andnot_ps(dummy_mask,fscal);
2590 /* Calculate temporary vectorial force */
2591 tx = _mm_mul_ps(fscal,dx01);
2592 ty = _mm_mul_ps(fscal,dy01);
2593 tz = _mm_mul_ps(fscal,dz01);
2595 /* Update vectorial force */
2596 fix0 = _mm_add_ps(fix0,tx);
2597 fiy0 = _mm_add_ps(fiy0,ty);
2598 fiz0 = _mm_add_ps(fiz0,tz);
2600 fjx1 = _mm_add_ps(fjx1,tx);
2601 fjy1 = _mm_add_ps(fjy1,ty);
2602 fjz1 = _mm_add_ps(fjz1,tz);
2606 /**************************
2607 * CALCULATE INTERACTIONS *
2608 **************************/
2610 if (gmx_mm_any_lt(rsq02,rcutoff2))
2613 r02 = _mm_mul_ps(rsq02,rinv02);
2614 r02 = _mm_andnot_ps(dummy_mask,r02);
2616 /* EWALD ELECTROSTATICS */
2618 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2619 ewrt = _mm_mul_ps(r02,ewtabscale);
2620 ewitab = _mm_cvttps_epi32(ewrt);
2621 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2622 ewitab = _mm_slli_epi32(ewitab,2);
2623 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2624 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2625 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2626 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2627 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2628 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2629 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2630 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2631 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2633 d = _mm_sub_ps(r02,rswitch);
2634 d = _mm_max_ps(d,_mm_setzero_ps());
2635 d2 = _mm_mul_ps(d,d);
2636 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2638 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2640 /* Evaluate switch function */
2641 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2642 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2643 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2647 fscal = _mm_and_ps(fscal,cutoff_mask);
2649 fscal = _mm_andnot_ps(dummy_mask,fscal);
2651 /* Calculate temporary vectorial force */
2652 tx = _mm_mul_ps(fscal,dx02);
2653 ty = _mm_mul_ps(fscal,dy02);
2654 tz = _mm_mul_ps(fscal,dz02);
2656 /* Update vectorial force */
2657 fix0 = _mm_add_ps(fix0,tx);
2658 fiy0 = _mm_add_ps(fiy0,ty);
2659 fiz0 = _mm_add_ps(fiz0,tz);
2661 fjx2 = _mm_add_ps(fjx2,tx);
2662 fjy2 = _mm_add_ps(fjy2,ty);
2663 fjz2 = _mm_add_ps(fjz2,tz);
2667 /**************************
2668 * CALCULATE INTERACTIONS *
2669 **************************/
2671 if (gmx_mm_any_lt(rsq10,rcutoff2))
2674 r10 = _mm_mul_ps(rsq10,rinv10);
2675 r10 = _mm_andnot_ps(dummy_mask,r10);
2677 /* EWALD ELECTROSTATICS */
2679 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2680 ewrt = _mm_mul_ps(r10,ewtabscale);
2681 ewitab = _mm_cvttps_epi32(ewrt);
2682 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2683 ewitab = _mm_slli_epi32(ewitab,2);
2684 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2685 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2686 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2687 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2688 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2689 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2690 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2691 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2692 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2694 d = _mm_sub_ps(r10,rswitch);
2695 d = _mm_max_ps(d,_mm_setzero_ps());
2696 d2 = _mm_mul_ps(d,d);
2697 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2699 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2701 /* Evaluate switch function */
2702 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2703 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2704 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2708 fscal = _mm_and_ps(fscal,cutoff_mask);
2710 fscal = _mm_andnot_ps(dummy_mask,fscal);
2712 /* Calculate temporary vectorial force */
2713 tx = _mm_mul_ps(fscal,dx10);
2714 ty = _mm_mul_ps(fscal,dy10);
2715 tz = _mm_mul_ps(fscal,dz10);
2717 /* Update vectorial force */
2718 fix1 = _mm_add_ps(fix1,tx);
2719 fiy1 = _mm_add_ps(fiy1,ty);
2720 fiz1 = _mm_add_ps(fiz1,tz);
2722 fjx0 = _mm_add_ps(fjx0,tx);
2723 fjy0 = _mm_add_ps(fjy0,ty);
2724 fjz0 = _mm_add_ps(fjz0,tz);
2728 /**************************
2729 * CALCULATE INTERACTIONS *
2730 **************************/
2732 if (gmx_mm_any_lt(rsq11,rcutoff2))
2735 r11 = _mm_mul_ps(rsq11,rinv11);
2736 r11 = _mm_andnot_ps(dummy_mask,r11);
2738 /* EWALD ELECTROSTATICS */
2740 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2741 ewrt = _mm_mul_ps(r11,ewtabscale);
2742 ewitab = _mm_cvttps_epi32(ewrt);
2743 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2744 ewitab = _mm_slli_epi32(ewitab,2);
2745 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2746 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2747 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2748 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2749 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2750 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2751 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2752 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2753 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2755 d = _mm_sub_ps(r11,rswitch);
2756 d = _mm_max_ps(d,_mm_setzero_ps());
2757 d2 = _mm_mul_ps(d,d);
2758 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2760 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2762 /* Evaluate switch function */
2763 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2764 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2765 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2769 fscal = _mm_and_ps(fscal,cutoff_mask);
2771 fscal = _mm_andnot_ps(dummy_mask,fscal);
2773 /* Calculate temporary vectorial force */
2774 tx = _mm_mul_ps(fscal,dx11);
2775 ty = _mm_mul_ps(fscal,dy11);
2776 tz = _mm_mul_ps(fscal,dz11);
2778 /* Update vectorial force */
2779 fix1 = _mm_add_ps(fix1,tx);
2780 fiy1 = _mm_add_ps(fiy1,ty);
2781 fiz1 = _mm_add_ps(fiz1,tz);
2783 fjx1 = _mm_add_ps(fjx1,tx);
2784 fjy1 = _mm_add_ps(fjy1,ty);
2785 fjz1 = _mm_add_ps(fjz1,tz);
2789 /**************************
2790 * CALCULATE INTERACTIONS *
2791 **************************/
2793 if (gmx_mm_any_lt(rsq12,rcutoff2))
2796 r12 = _mm_mul_ps(rsq12,rinv12);
2797 r12 = _mm_andnot_ps(dummy_mask,r12);
2799 /* EWALD ELECTROSTATICS */
2801 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2802 ewrt = _mm_mul_ps(r12,ewtabscale);
2803 ewitab = _mm_cvttps_epi32(ewrt);
2804 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2805 ewitab = _mm_slli_epi32(ewitab,2);
2806 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2807 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2808 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2809 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2810 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2811 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2812 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2813 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2814 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2816 d = _mm_sub_ps(r12,rswitch);
2817 d = _mm_max_ps(d,_mm_setzero_ps());
2818 d2 = _mm_mul_ps(d,d);
2819 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2821 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2823 /* Evaluate switch function */
2824 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2825 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2826 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2830 fscal = _mm_and_ps(fscal,cutoff_mask);
2832 fscal = _mm_andnot_ps(dummy_mask,fscal);
2834 /* Calculate temporary vectorial force */
2835 tx = _mm_mul_ps(fscal,dx12);
2836 ty = _mm_mul_ps(fscal,dy12);
2837 tz = _mm_mul_ps(fscal,dz12);
2839 /* Update vectorial force */
2840 fix1 = _mm_add_ps(fix1,tx);
2841 fiy1 = _mm_add_ps(fiy1,ty);
2842 fiz1 = _mm_add_ps(fiz1,tz);
2844 fjx2 = _mm_add_ps(fjx2,tx);
2845 fjy2 = _mm_add_ps(fjy2,ty);
2846 fjz2 = _mm_add_ps(fjz2,tz);
2850 /**************************
2851 * CALCULATE INTERACTIONS *
2852 **************************/
2854 if (gmx_mm_any_lt(rsq20,rcutoff2))
2857 r20 = _mm_mul_ps(rsq20,rinv20);
2858 r20 = _mm_andnot_ps(dummy_mask,r20);
2860 /* EWALD ELECTROSTATICS */
2862 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2863 ewrt = _mm_mul_ps(r20,ewtabscale);
2864 ewitab = _mm_cvttps_epi32(ewrt);
2865 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2866 ewitab = _mm_slli_epi32(ewitab,2);
2867 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2868 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2869 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2870 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2871 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2872 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2873 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2874 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2875 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2877 d = _mm_sub_ps(r20,rswitch);
2878 d = _mm_max_ps(d,_mm_setzero_ps());
2879 d2 = _mm_mul_ps(d,d);
2880 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2882 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2884 /* Evaluate switch function */
2885 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2886 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2887 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2891 fscal = _mm_and_ps(fscal,cutoff_mask);
2893 fscal = _mm_andnot_ps(dummy_mask,fscal);
2895 /* Calculate temporary vectorial force */
2896 tx = _mm_mul_ps(fscal,dx20);
2897 ty = _mm_mul_ps(fscal,dy20);
2898 tz = _mm_mul_ps(fscal,dz20);
2900 /* Update vectorial force */
2901 fix2 = _mm_add_ps(fix2,tx);
2902 fiy2 = _mm_add_ps(fiy2,ty);
2903 fiz2 = _mm_add_ps(fiz2,tz);
2905 fjx0 = _mm_add_ps(fjx0,tx);
2906 fjy0 = _mm_add_ps(fjy0,ty);
2907 fjz0 = _mm_add_ps(fjz0,tz);
2911 /**************************
2912 * CALCULATE INTERACTIONS *
2913 **************************/
2915 if (gmx_mm_any_lt(rsq21,rcutoff2))
2918 r21 = _mm_mul_ps(rsq21,rinv21);
2919 r21 = _mm_andnot_ps(dummy_mask,r21);
2921 /* EWALD ELECTROSTATICS */
2923 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2924 ewrt = _mm_mul_ps(r21,ewtabscale);
2925 ewitab = _mm_cvttps_epi32(ewrt);
2926 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2927 ewitab = _mm_slli_epi32(ewitab,2);
2928 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2929 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2930 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2931 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2932 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2933 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2934 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2935 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2936 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2938 d = _mm_sub_ps(r21,rswitch);
2939 d = _mm_max_ps(d,_mm_setzero_ps());
2940 d2 = _mm_mul_ps(d,d);
2941 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2943 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2945 /* Evaluate switch function */
2946 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2947 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2948 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2952 fscal = _mm_and_ps(fscal,cutoff_mask);
2954 fscal = _mm_andnot_ps(dummy_mask,fscal);
2956 /* Calculate temporary vectorial force */
2957 tx = _mm_mul_ps(fscal,dx21);
2958 ty = _mm_mul_ps(fscal,dy21);
2959 tz = _mm_mul_ps(fscal,dz21);
2961 /* Update vectorial force */
2962 fix2 = _mm_add_ps(fix2,tx);
2963 fiy2 = _mm_add_ps(fiy2,ty);
2964 fiz2 = _mm_add_ps(fiz2,tz);
2966 fjx1 = _mm_add_ps(fjx1,tx);
2967 fjy1 = _mm_add_ps(fjy1,ty);
2968 fjz1 = _mm_add_ps(fjz1,tz);
2972 /**************************
2973 * CALCULATE INTERACTIONS *
2974 **************************/
2976 if (gmx_mm_any_lt(rsq22,rcutoff2))
2979 r22 = _mm_mul_ps(rsq22,rinv22);
2980 r22 = _mm_andnot_ps(dummy_mask,r22);
2982 /* EWALD ELECTROSTATICS */
2984 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2985 ewrt = _mm_mul_ps(r22,ewtabscale);
2986 ewitab = _mm_cvttps_epi32(ewrt);
2987 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2988 ewitab = _mm_slli_epi32(ewitab,2);
2989 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2990 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2991 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2992 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2993 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2994 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2995 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2996 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2997 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2999 d = _mm_sub_ps(r22,rswitch);
3000 d = _mm_max_ps(d,_mm_setzero_ps());
3001 d2 = _mm_mul_ps(d,d);
3002 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3004 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3006 /* Evaluate switch function */
3007 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3008 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3009 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3013 fscal = _mm_and_ps(fscal,cutoff_mask);
3015 fscal = _mm_andnot_ps(dummy_mask,fscal);
3017 /* Calculate temporary vectorial force */
3018 tx = _mm_mul_ps(fscal,dx22);
3019 ty = _mm_mul_ps(fscal,dy22);
3020 tz = _mm_mul_ps(fscal,dz22);
3022 /* Update vectorial force */
3023 fix2 = _mm_add_ps(fix2,tx);
3024 fiy2 = _mm_add_ps(fiy2,ty);
3025 fiz2 = _mm_add_ps(fiz2,tz);
3027 fjx2 = _mm_add_ps(fjx2,tx);
3028 fjy2 = _mm_add_ps(fjy2,ty);
3029 fjz2 = _mm_add_ps(fjz2,tz);
3033 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3034 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3035 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3036 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3038 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3039 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3041 /* Inner loop uses 567 flops */
3044 /* End of innermost loop */
3046 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3047 f+i_coord_offset,fshift+i_shift_offset);
3049 /* Increment number of inner iterations */
3050 inneriter += j_index_end - j_index_start;
3052 /* Outer loop uses 18 flops */
3055 /* Increment number of outer iterations */
3058 /* Update outer/inner flops */
3060 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*567);