2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
109 real rswitch_scalar,d_scalar;
110 __m128 dummy_mask,cutoff_mask;
111 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
112 __m128 one = _mm_set1_ps(1.0);
113 __m128 two = _mm_set1_ps(2.0);
119 jindex = nlist->jindex;
121 shiftidx = nlist->shift;
123 shiftvec = fr->shift_vec[0];
124 fshift = fr->fshift[0];
125 facel = _mm_set1_ps(fr->ic->epsfac);
126 charge = mdatoms->chargeA;
128 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
129 ewtab = fr->ic->tabq_coul_FDV0;
130 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
131 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
133 /* Setup water-specific parameters */
134 inr = nlist->iinr[0];
135 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
136 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
139 jq0 = _mm_set1_ps(charge[inr+0]);
140 jq1 = _mm_set1_ps(charge[inr+1]);
141 jq2 = _mm_set1_ps(charge[inr+2]);
142 qq00 = _mm_mul_ps(iq0,jq0);
143 qq01 = _mm_mul_ps(iq0,jq1);
144 qq02 = _mm_mul_ps(iq0,jq2);
145 qq10 = _mm_mul_ps(iq1,jq0);
146 qq11 = _mm_mul_ps(iq1,jq1);
147 qq12 = _mm_mul_ps(iq1,jq2);
148 qq20 = _mm_mul_ps(iq2,jq0);
149 qq21 = _mm_mul_ps(iq2,jq1);
150 qq22 = _mm_mul_ps(iq2,jq2);
152 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
153 rcutoff_scalar = fr->ic->rcoulomb;
154 rcutoff = _mm_set1_ps(rcutoff_scalar);
155 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
157 rswitch_scalar = fr->ic->rcoulomb_switch;
158 rswitch = _mm_set1_ps(rswitch_scalar);
159 /* Setup switch parameters */
160 d_scalar = rcutoff_scalar-rswitch_scalar;
161 d = _mm_set1_ps(d_scalar);
162 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
163 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
165 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
166 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
167 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
169 /* Avoid stupid compiler warnings */
170 jnrA = jnrB = jnrC = jnrD = 0;
179 for(iidx=0;iidx<4*DIM;iidx++)
184 /* Start outer loop over neighborlists */
185 for(iidx=0; iidx<nri; iidx++)
187 /* Load shift vector for this list */
188 i_shift_offset = DIM*shiftidx[iidx];
190 /* Load limits for loop over neighbors */
191 j_index_start = jindex[iidx];
192 j_index_end = jindex[iidx+1];
194 /* Get outer coordinate index */
196 i_coord_offset = DIM*inr;
198 /* Load i particle coords and add shift vector */
199 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
200 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
202 fix0 = _mm_setzero_ps();
203 fiy0 = _mm_setzero_ps();
204 fiz0 = _mm_setzero_ps();
205 fix1 = _mm_setzero_ps();
206 fiy1 = _mm_setzero_ps();
207 fiz1 = _mm_setzero_ps();
208 fix2 = _mm_setzero_ps();
209 fiy2 = _mm_setzero_ps();
210 fiz2 = _mm_setzero_ps();
212 /* Reset potential sums */
213 velecsum = _mm_setzero_ps();
215 /* Start inner kernel loop */
216 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
219 /* Get j neighbor index, and coordinate index */
224 j_coord_offsetA = DIM*jnrA;
225 j_coord_offsetB = DIM*jnrB;
226 j_coord_offsetC = DIM*jnrC;
227 j_coord_offsetD = DIM*jnrD;
229 /* load j atom coordinates */
230 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
231 x+j_coord_offsetC,x+j_coord_offsetD,
232 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
234 /* Calculate displacement vector */
235 dx00 = _mm_sub_ps(ix0,jx0);
236 dy00 = _mm_sub_ps(iy0,jy0);
237 dz00 = _mm_sub_ps(iz0,jz0);
238 dx01 = _mm_sub_ps(ix0,jx1);
239 dy01 = _mm_sub_ps(iy0,jy1);
240 dz01 = _mm_sub_ps(iz0,jz1);
241 dx02 = _mm_sub_ps(ix0,jx2);
242 dy02 = _mm_sub_ps(iy0,jy2);
243 dz02 = _mm_sub_ps(iz0,jz2);
244 dx10 = _mm_sub_ps(ix1,jx0);
245 dy10 = _mm_sub_ps(iy1,jy0);
246 dz10 = _mm_sub_ps(iz1,jz0);
247 dx11 = _mm_sub_ps(ix1,jx1);
248 dy11 = _mm_sub_ps(iy1,jy1);
249 dz11 = _mm_sub_ps(iz1,jz1);
250 dx12 = _mm_sub_ps(ix1,jx2);
251 dy12 = _mm_sub_ps(iy1,jy2);
252 dz12 = _mm_sub_ps(iz1,jz2);
253 dx20 = _mm_sub_ps(ix2,jx0);
254 dy20 = _mm_sub_ps(iy2,jy0);
255 dz20 = _mm_sub_ps(iz2,jz0);
256 dx21 = _mm_sub_ps(ix2,jx1);
257 dy21 = _mm_sub_ps(iy2,jy1);
258 dz21 = _mm_sub_ps(iz2,jz1);
259 dx22 = _mm_sub_ps(ix2,jx2);
260 dy22 = _mm_sub_ps(iy2,jy2);
261 dz22 = _mm_sub_ps(iz2,jz2);
263 /* Calculate squared distance and things based on it */
264 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
265 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
266 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
267 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
268 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
269 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
270 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
271 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
272 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
274 rinv00 = sse2_invsqrt_f(rsq00);
275 rinv01 = sse2_invsqrt_f(rsq01);
276 rinv02 = sse2_invsqrt_f(rsq02);
277 rinv10 = sse2_invsqrt_f(rsq10);
278 rinv11 = sse2_invsqrt_f(rsq11);
279 rinv12 = sse2_invsqrt_f(rsq12);
280 rinv20 = sse2_invsqrt_f(rsq20);
281 rinv21 = sse2_invsqrt_f(rsq21);
282 rinv22 = sse2_invsqrt_f(rsq22);
284 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
285 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
286 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
287 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
288 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
289 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
290 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
291 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
292 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
294 fjx0 = _mm_setzero_ps();
295 fjy0 = _mm_setzero_ps();
296 fjz0 = _mm_setzero_ps();
297 fjx1 = _mm_setzero_ps();
298 fjy1 = _mm_setzero_ps();
299 fjz1 = _mm_setzero_ps();
300 fjx2 = _mm_setzero_ps();
301 fjy2 = _mm_setzero_ps();
302 fjz2 = _mm_setzero_ps();
304 /**************************
305 * CALCULATE INTERACTIONS *
306 **************************/
308 if (gmx_mm_any_lt(rsq00,rcutoff2))
311 r00 = _mm_mul_ps(rsq00,rinv00);
313 /* EWALD ELECTROSTATICS */
315 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
316 ewrt = _mm_mul_ps(r00,ewtabscale);
317 ewitab = _mm_cvttps_epi32(ewrt);
318 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
319 ewitab = _mm_slli_epi32(ewitab,2);
320 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
321 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
322 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
323 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
324 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
325 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
326 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
327 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
328 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
330 d = _mm_sub_ps(r00,rswitch);
331 d = _mm_max_ps(d,_mm_setzero_ps());
332 d2 = _mm_mul_ps(d,d);
333 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
335 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
337 /* Evaluate switch function */
338 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
339 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
340 velec = _mm_mul_ps(velec,sw);
341 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 velec = _mm_and_ps(velec,cutoff_mask);
345 velecsum = _mm_add_ps(velecsum,velec);
349 fscal = _mm_and_ps(fscal,cutoff_mask);
351 /* Calculate temporary vectorial force */
352 tx = _mm_mul_ps(fscal,dx00);
353 ty = _mm_mul_ps(fscal,dy00);
354 tz = _mm_mul_ps(fscal,dz00);
356 /* Update vectorial force */
357 fix0 = _mm_add_ps(fix0,tx);
358 fiy0 = _mm_add_ps(fiy0,ty);
359 fiz0 = _mm_add_ps(fiz0,tz);
361 fjx0 = _mm_add_ps(fjx0,tx);
362 fjy0 = _mm_add_ps(fjy0,ty);
363 fjz0 = _mm_add_ps(fjz0,tz);
367 /**************************
368 * CALCULATE INTERACTIONS *
369 **************************/
371 if (gmx_mm_any_lt(rsq01,rcutoff2))
374 r01 = _mm_mul_ps(rsq01,rinv01);
376 /* EWALD ELECTROSTATICS */
378 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
379 ewrt = _mm_mul_ps(r01,ewtabscale);
380 ewitab = _mm_cvttps_epi32(ewrt);
381 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
382 ewitab = _mm_slli_epi32(ewitab,2);
383 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
384 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
385 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
386 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
387 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
388 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
389 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
390 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
391 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
393 d = _mm_sub_ps(r01,rswitch);
394 d = _mm_max_ps(d,_mm_setzero_ps());
395 d2 = _mm_mul_ps(d,d);
396 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
398 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
400 /* Evaluate switch function */
401 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
402 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
403 velec = _mm_mul_ps(velec,sw);
404 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
406 /* Update potential sum for this i atom from the interaction with this j atom. */
407 velec = _mm_and_ps(velec,cutoff_mask);
408 velecsum = _mm_add_ps(velecsum,velec);
412 fscal = _mm_and_ps(fscal,cutoff_mask);
414 /* Calculate temporary vectorial force */
415 tx = _mm_mul_ps(fscal,dx01);
416 ty = _mm_mul_ps(fscal,dy01);
417 tz = _mm_mul_ps(fscal,dz01);
419 /* Update vectorial force */
420 fix0 = _mm_add_ps(fix0,tx);
421 fiy0 = _mm_add_ps(fiy0,ty);
422 fiz0 = _mm_add_ps(fiz0,tz);
424 fjx1 = _mm_add_ps(fjx1,tx);
425 fjy1 = _mm_add_ps(fjy1,ty);
426 fjz1 = _mm_add_ps(fjz1,tz);
430 /**************************
431 * CALCULATE INTERACTIONS *
432 **************************/
434 if (gmx_mm_any_lt(rsq02,rcutoff2))
437 r02 = _mm_mul_ps(rsq02,rinv02);
439 /* EWALD ELECTROSTATICS */
441 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
442 ewrt = _mm_mul_ps(r02,ewtabscale);
443 ewitab = _mm_cvttps_epi32(ewrt);
444 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
445 ewitab = _mm_slli_epi32(ewitab,2);
446 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
447 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
448 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
449 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
450 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
451 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
452 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
453 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
454 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
456 d = _mm_sub_ps(r02,rswitch);
457 d = _mm_max_ps(d,_mm_setzero_ps());
458 d2 = _mm_mul_ps(d,d);
459 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
461 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
463 /* Evaluate switch function */
464 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
465 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
466 velec = _mm_mul_ps(velec,sw);
467 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
469 /* Update potential sum for this i atom from the interaction with this j atom. */
470 velec = _mm_and_ps(velec,cutoff_mask);
471 velecsum = _mm_add_ps(velecsum,velec);
475 fscal = _mm_and_ps(fscal,cutoff_mask);
477 /* Calculate temporary vectorial force */
478 tx = _mm_mul_ps(fscal,dx02);
479 ty = _mm_mul_ps(fscal,dy02);
480 tz = _mm_mul_ps(fscal,dz02);
482 /* Update vectorial force */
483 fix0 = _mm_add_ps(fix0,tx);
484 fiy0 = _mm_add_ps(fiy0,ty);
485 fiz0 = _mm_add_ps(fiz0,tz);
487 fjx2 = _mm_add_ps(fjx2,tx);
488 fjy2 = _mm_add_ps(fjy2,ty);
489 fjz2 = _mm_add_ps(fjz2,tz);
493 /**************************
494 * CALCULATE INTERACTIONS *
495 **************************/
497 if (gmx_mm_any_lt(rsq10,rcutoff2))
500 r10 = _mm_mul_ps(rsq10,rinv10);
502 /* EWALD ELECTROSTATICS */
504 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
505 ewrt = _mm_mul_ps(r10,ewtabscale);
506 ewitab = _mm_cvttps_epi32(ewrt);
507 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
508 ewitab = _mm_slli_epi32(ewitab,2);
509 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
510 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
511 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
512 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
513 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
514 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
515 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
516 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
517 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
519 d = _mm_sub_ps(r10,rswitch);
520 d = _mm_max_ps(d,_mm_setzero_ps());
521 d2 = _mm_mul_ps(d,d);
522 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
524 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
526 /* Evaluate switch function */
527 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
528 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
529 velec = _mm_mul_ps(velec,sw);
530 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
532 /* Update potential sum for this i atom from the interaction with this j atom. */
533 velec = _mm_and_ps(velec,cutoff_mask);
534 velecsum = _mm_add_ps(velecsum,velec);
538 fscal = _mm_and_ps(fscal,cutoff_mask);
540 /* Calculate temporary vectorial force */
541 tx = _mm_mul_ps(fscal,dx10);
542 ty = _mm_mul_ps(fscal,dy10);
543 tz = _mm_mul_ps(fscal,dz10);
545 /* Update vectorial force */
546 fix1 = _mm_add_ps(fix1,tx);
547 fiy1 = _mm_add_ps(fiy1,ty);
548 fiz1 = _mm_add_ps(fiz1,tz);
550 fjx0 = _mm_add_ps(fjx0,tx);
551 fjy0 = _mm_add_ps(fjy0,ty);
552 fjz0 = _mm_add_ps(fjz0,tz);
556 /**************************
557 * CALCULATE INTERACTIONS *
558 **************************/
560 if (gmx_mm_any_lt(rsq11,rcutoff2))
563 r11 = _mm_mul_ps(rsq11,rinv11);
565 /* EWALD ELECTROSTATICS */
567 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
568 ewrt = _mm_mul_ps(r11,ewtabscale);
569 ewitab = _mm_cvttps_epi32(ewrt);
570 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
571 ewitab = _mm_slli_epi32(ewitab,2);
572 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
573 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
574 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
575 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
576 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
577 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
578 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
579 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
580 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
582 d = _mm_sub_ps(r11,rswitch);
583 d = _mm_max_ps(d,_mm_setzero_ps());
584 d2 = _mm_mul_ps(d,d);
585 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
587 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
589 /* Evaluate switch function */
590 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
591 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
592 velec = _mm_mul_ps(velec,sw);
593 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
595 /* Update potential sum for this i atom from the interaction with this j atom. */
596 velec = _mm_and_ps(velec,cutoff_mask);
597 velecsum = _mm_add_ps(velecsum,velec);
601 fscal = _mm_and_ps(fscal,cutoff_mask);
603 /* Calculate temporary vectorial force */
604 tx = _mm_mul_ps(fscal,dx11);
605 ty = _mm_mul_ps(fscal,dy11);
606 tz = _mm_mul_ps(fscal,dz11);
608 /* Update vectorial force */
609 fix1 = _mm_add_ps(fix1,tx);
610 fiy1 = _mm_add_ps(fiy1,ty);
611 fiz1 = _mm_add_ps(fiz1,tz);
613 fjx1 = _mm_add_ps(fjx1,tx);
614 fjy1 = _mm_add_ps(fjy1,ty);
615 fjz1 = _mm_add_ps(fjz1,tz);
619 /**************************
620 * CALCULATE INTERACTIONS *
621 **************************/
623 if (gmx_mm_any_lt(rsq12,rcutoff2))
626 r12 = _mm_mul_ps(rsq12,rinv12);
628 /* EWALD ELECTROSTATICS */
630 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
631 ewrt = _mm_mul_ps(r12,ewtabscale);
632 ewitab = _mm_cvttps_epi32(ewrt);
633 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
634 ewitab = _mm_slli_epi32(ewitab,2);
635 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
636 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
637 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
638 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
639 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
640 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
641 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
642 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
643 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
645 d = _mm_sub_ps(r12,rswitch);
646 d = _mm_max_ps(d,_mm_setzero_ps());
647 d2 = _mm_mul_ps(d,d);
648 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
650 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
652 /* Evaluate switch function */
653 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
654 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
655 velec = _mm_mul_ps(velec,sw);
656 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
658 /* Update potential sum for this i atom from the interaction with this j atom. */
659 velec = _mm_and_ps(velec,cutoff_mask);
660 velecsum = _mm_add_ps(velecsum,velec);
664 fscal = _mm_and_ps(fscal,cutoff_mask);
666 /* Calculate temporary vectorial force */
667 tx = _mm_mul_ps(fscal,dx12);
668 ty = _mm_mul_ps(fscal,dy12);
669 tz = _mm_mul_ps(fscal,dz12);
671 /* Update vectorial force */
672 fix1 = _mm_add_ps(fix1,tx);
673 fiy1 = _mm_add_ps(fiy1,ty);
674 fiz1 = _mm_add_ps(fiz1,tz);
676 fjx2 = _mm_add_ps(fjx2,tx);
677 fjy2 = _mm_add_ps(fjy2,ty);
678 fjz2 = _mm_add_ps(fjz2,tz);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 if (gmx_mm_any_lt(rsq20,rcutoff2))
689 r20 = _mm_mul_ps(rsq20,rinv20);
691 /* EWALD ELECTROSTATICS */
693 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
694 ewrt = _mm_mul_ps(r20,ewtabscale);
695 ewitab = _mm_cvttps_epi32(ewrt);
696 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
697 ewitab = _mm_slli_epi32(ewitab,2);
698 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
699 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
700 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
701 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
702 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
703 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
704 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
705 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
706 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
708 d = _mm_sub_ps(r20,rswitch);
709 d = _mm_max_ps(d,_mm_setzero_ps());
710 d2 = _mm_mul_ps(d,d);
711 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
713 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
715 /* Evaluate switch function */
716 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
717 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
718 velec = _mm_mul_ps(velec,sw);
719 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
721 /* Update potential sum for this i atom from the interaction with this j atom. */
722 velec = _mm_and_ps(velec,cutoff_mask);
723 velecsum = _mm_add_ps(velecsum,velec);
727 fscal = _mm_and_ps(fscal,cutoff_mask);
729 /* Calculate temporary vectorial force */
730 tx = _mm_mul_ps(fscal,dx20);
731 ty = _mm_mul_ps(fscal,dy20);
732 tz = _mm_mul_ps(fscal,dz20);
734 /* Update vectorial force */
735 fix2 = _mm_add_ps(fix2,tx);
736 fiy2 = _mm_add_ps(fiy2,ty);
737 fiz2 = _mm_add_ps(fiz2,tz);
739 fjx0 = _mm_add_ps(fjx0,tx);
740 fjy0 = _mm_add_ps(fjy0,ty);
741 fjz0 = _mm_add_ps(fjz0,tz);
745 /**************************
746 * CALCULATE INTERACTIONS *
747 **************************/
749 if (gmx_mm_any_lt(rsq21,rcutoff2))
752 r21 = _mm_mul_ps(rsq21,rinv21);
754 /* EWALD ELECTROSTATICS */
756 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
757 ewrt = _mm_mul_ps(r21,ewtabscale);
758 ewitab = _mm_cvttps_epi32(ewrt);
759 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
760 ewitab = _mm_slli_epi32(ewitab,2);
761 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
762 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
763 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
764 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
765 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
766 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
767 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
768 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
769 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
771 d = _mm_sub_ps(r21,rswitch);
772 d = _mm_max_ps(d,_mm_setzero_ps());
773 d2 = _mm_mul_ps(d,d);
774 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
776 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
778 /* Evaluate switch function */
779 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
780 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
781 velec = _mm_mul_ps(velec,sw);
782 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
784 /* Update potential sum for this i atom from the interaction with this j atom. */
785 velec = _mm_and_ps(velec,cutoff_mask);
786 velecsum = _mm_add_ps(velecsum,velec);
790 fscal = _mm_and_ps(fscal,cutoff_mask);
792 /* Calculate temporary vectorial force */
793 tx = _mm_mul_ps(fscal,dx21);
794 ty = _mm_mul_ps(fscal,dy21);
795 tz = _mm_mul_ps(fscal,dz21);
797 /* Update vectorial force */
798 fix2 = _mm_add_ps(fix2,tx);
799 fiy2 = _mm_add_ps(fiy2,ty);
800 fiz2 = _mm_add_ps(fiz2,tz);
802 fjx1 = _mm_add_ps(fjx1,tx);
803 fjy1 = _mm_add_ps(fjy1,ty);
804 fjz1 = _mm_add_ps(fjz1,tz);
808 /**************************
809 * CALCULATE INTERACTIONS *
810 **************************/
812 if (gmx_mm_any_lt(rsq22,rcutoff2))
815 r22 = _mm_mul_ps(rsq22,rinv22);
817 /* EWALD ELECTROSTATICS */
819 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
820 ewrt = _mm_mul_ps(r22,ewtabscale);
821 ewitab = _mm_cvttps_epi32(ewrt);
822 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
823 ewitab = _mm_slli_epi32(ewitab,2);
824 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
825 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
826 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
827 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
828 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
829 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
830 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
831 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
832 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
834 d = _mm_sub_ps(r22,rswitch);
835 d = _mm_max_ps(d,_mm_setzero_ps());
836 d2 = _mm_mul_ps(d,d);
837 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
839 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
841 /* Evaluate switch function */
842 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
843 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
844 velec = _mm_mul_ps(velec,sw);
845 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
847 /* Update potential sum for this i atom from the interaction with this j atom. */
848 velec = _mm_and_ps(velec,cutoff_mask);
849 velecsum = _mm_add_ps(velecsum,velec);
853 fscal = _mm_and_ps(fscal,cutoff_mask);
855 /* Calculate temporary vectorial force */
856 tx = _mm_mul_ps(fscal,dx22);
857 ty = _mm_mul_ps(fscal,dy22);
858 tz = _mm_mul_ps(fscal,dz22);
860 /* Update vectorial force */
861 fix2 = _mm_add_ps(fix2,tx);
862 fiy2 = _mm_add_ps(fiy2,ty);
863 fiz2 = _mm_add_ps(fiz2,tz);
865 fjx2 = _mm_add_ps(fjx2,tx);
866 fjy2 = _mm_add_ps(fjy2,ty);
867 fjz2 = _mm_add_ps(fjz2,tz);
871 fjptrA = f+j_coord_offsetA;
872 fjptrB = f+j_coord_offsetB;
873 fjptrC = f+j_coord_offsetC;
874 fjptrD = f+j_coord_offsetD;
876 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
877 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
879 /* Inner loop uses 585 flops */
885 /* Get j neighbor index, and coordinate index */
886 jnrlistA = jjnr[jidx];
887 jnrlistB = jjnr[jidx+1];
888 jnrlistC = jjnr[jidx+2];
889 jnrlistD = jjnr[jidx+3];
890 /* Sign of each element will be negative for non-real atoms.
891 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
892 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
894 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
895 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
896 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
897 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
898 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
899 j_coord_offsetA = DIM*jnrA;
900 j_coord_offsetB = DIM*jnrB;
901 j_coord_offsetC = DIM*jnrC;
902 j_coord_offsetD = DIM*jnrD;
904 /* load j atom coordinates */
905 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
906 x+j_coord_offsetC,x+j_coord_offsetD,
907 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
909 /* Calculate displacement vector */
910 dx00 = _mm_sub_ps(ix0,jx0);
911 dy00 = _mm_sub_ps(iy0,jy0);
912 dz00 = _mm_sub_ps(iz0,jz0);
913 dx01 = _mm_sub_ps(ix0,jx1);
914 dy01 = _mm_sub_ps(iy0,jy1);
915 dz01 = _mm_sub_ps(iz0,jz1);
916 dx02 = _mm_sub_ps(ix0,jx2);
917 dy02 = _mm_sub_ps(iy0,jy2);
918 dz02 = _mm_sub_ps(iz0,jz2);
919 dx10 = _mm_sub_ps(ix1,jx0);
920 dy10 = _mm_sub_ps(iy1,jy0);
921 dz10 = _mm_sub_ps(iz1,jz0);
922 dx11 = _mm_sub_ps(ix1,jx1);
923 dy11 = _mm_sub_ps(iy1,jy1);
924 dz11 = _mm_sub_ps(iz1,jz1);
925 dx12 = _mm_sub_ps(ix1,jx2);
926 dy12 = _mm_sub_ps(iy1,jy2);
927 dz12 = _mm_sub_ps(iz1,jz2);
928 dx20 = _mm_sub_ps(ix2,jx0);
929 dy20 = _mm_sub_ps(iy2,jy0);
930 dz20 = _mm_sub_ps(iz2,jz0);
931 dx21 = _mm_sub_ps(ix2,jx1);
932 dy21 = _mm_sub_ps(iy2,jy1);
933 dz21 = _mm_sub_ps(iz2,jz1);
934 dx22 = _mm_sub_ps(ix2,jx2);
935 dy22 = _mm_sub_ps(iy2,jy2);
936 dz22 = _mm_sub_ps(iz2,jz2);
938 /* Calculate squared distance and things based on it */
939 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
940 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
941 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
942 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
943 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
944 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
945 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
946 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
947 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
949 rinv00 = sse2_invsqrt_f(rsq00);
950 rinv01 = sse2_invsqrt_f(rsq01);
951 rinv02 = sse2_invsqrt_f(rsq02);
952 rinv10 = sse2_invsqrt_f(rsq10);
953 rinv11 = sse2_invsqrt_f(rsq11);
954 rinv12 = sse2_invsqrt_f(rsq12);
955 rinv20 = sse2_invsqrt_f(rsq20);
956 rinv21 = sse2_invsqrt_f(rsq21);
957 rinv22 = sse2_invsqrt_f(rsq22);
959 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
960 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
961 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
962 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
963 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
964 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
965 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
966 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
967 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
969 fjx0 = _mm_setzero_ps();
970 fjy0 = _mm_setzero_ps();
971 fjz0 = _mm_setzero_ps();
972 fjx1 = _mm_setzero_ps();
973 fjy1 = _mm_setzero_ps();
974 fjz1 = _mm_setzero_ps();
975 fjx2 = _mm_setzero_ps();
976 fjy2 = _mm_setzero_ps();
977 fjz2 = _mm_setzero_ps();
979 /**************************
980 * CALCULATE INTERACTIONS *
981 **************************/
983 if (gmx_mm_any_lt(rsq00,rcutoff2))
986 r00 = _mm_mul_ps(rsq00,rinv00);
987 r00 = _mm_andnot_ps(dummy_mask,r00);
989 /* EWALD ELECTROSTATICS */
991 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
992 ewrt = _mm_mul_ps(r00,ewtabscale);
993 ewitab = _mm_cvttps_epi32(ewrt);
994 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
995 ewitab = _mm_slli_epi32(ewitab,2);
996 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
997 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
998 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
999 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1000 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1001 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1002 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1003 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1004 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1006 d = _mm_sub_ps(r00,rswitch);
1007 d = _mm_max_ps(d,_mm_setzero_ps());
1008 d2 = _mm_mul_ps(d,d);
1009 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1011 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1013 /* Evaluate switch function */
1014 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1015 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1016 velec = _mm_mul_ps(velec,sw);
1017 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1019 /* Update potential sum for this i atom from the interaction with this j atom. */
1020 velec = _mm_and_ps(velec,cutoff_mask);
1021 velec = _mm_andnot_ps(dummy_mask,velec);
1022 velecsum = _mm_add_ps(velecsum,velec);
1026 fscal = _mm_and_ps(fscal,cutoff_mask);
1028 fscal = _mm_andnot_ps(dummy_mask,fscal);
1030 /* Calculate temporary vectorial force */
1031 tx = _mm_mul_ps(fscal,dx00);
1032 ty = _mm_mul_ps(fscal,dy00);
1033 tz = _mm_mul_ps(fscal,dz00);
1035 /* Update vectorial force */
1036 fix0 = _mm_add_ps(fix0,tx);
1037 fiy0 = _mm_add_ps(fiy0,ty);
1038 fiz0 = _mm_add_ps(fiz0,tz);
1040 fjx0 = _mm_add_ps(fjx0,tx);
1041 fjy0 = _mm_add_ps(fjy0,ty);
1042 fjz0 = _mm_add_ps(fjz0,tz);
1046 /**************************
1047 * CALCULATE INTERACTIONS *
1048 **************************/
1050 if (gmx_mm_any_lt(rsq01,rcutoff2))
1053 r01 = _mm_mul_ps(rsq01,rinv01);
1054 r01 = _mm_andnot_ps(dummy_mask,r01);
1056 /* EWALD ELECTROSTATICS */
1058 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1059 ewrt = _mm_mul_ps(r01,ewtabscale);
1060 ewitab = _mm_cvttps_epi32(ewrt);
1061 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1062 ewitab = _mm_slli_epi32(ewitab,2);
1063 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1064 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1065 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1066 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1067 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1068 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1069 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1070 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1071 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1073 d = _mm_sub_ps(r01,rswitch);
1074 d = _mm_max_ps(d,_mm_setzero_ps());
1075 d2 = _mm_mul_ps(d,d);
1076 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1078 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1080 /* Evaluate switch function */
1081 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1082 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1083 velec = _mm_mul_ps(velec,sw);
1084 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1086 /* Update potential sum for this i atom from the interaction with this j atom. */
1087 velec = _mm_and_ps(velec,cutoff_mask);
1088 velec = _mm_andnot_ps(dummy_mask,velec);
1089 velecsum = _mm_add_ps(velecsum,velec);
1093 fscal = _mm_and_ps(fscal,cutoff_mask);
1095 fscal = _mm_andnot_ps(dummy_mask,fscal);
1097 /* Calculate temporary vectorial force */
1098 tx = _mm_mul_ps(fscal,dx01);
1099 ty = _mm_mul_ps(fscal,dy01);
1100 tz = _mm_mul_ps(fscal,dz01);
1102 /* Update vectorial force */
1103 fix0 = _mm_add_ps(fix0,tx);
1104 fiy0 = _mm_add_ps(fiy0,ty);
1105 fiz0 = _mm_add_ps(fiz0,tz);
1107 fjx1 = _mm_add_ps(fjx1,tx);
1108 fjy1 = _mm_add_ps(fjy1,ty);
1109 fjz1 = _mm_add_ps(fjz1,tz);
1113 /**************************
1114 * CALCULATE INTERACTIONS *
1115 **************************/
1117 if (gmx_mm_any_lt(rsq02,rcutoff2))
1120 r02 = _mm_mul_ps(rsq02,rinv02);
1121 r02 = _mm_andnot_ps(dummy_mask,r02);
1123 /* EWALD ELECTROSTATICS */
1125 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1126 ewrt = _mm_mul_ps(r02,ewtabscale);
1127 ewitab = _mm_cvttps_epi32(ewrt);
1128 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1129 ewitab = _mm_slli_epi32(ewitab,2);
1130 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1131 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1132 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1133 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1134 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1135 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1136 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1137 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1138 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1140 d = _mm_sub_ps(r02,rswitch);
1141 d = _mm_max_ps(d,_mm_setzero_ps());
1142 d2 = _mm_mul_ps(d,d);
1143 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1145 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1147 /* Evaluate switch function */
1148 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1149 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1150 velec = _mm_mul_ps(velec,sw);
1151 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1153 /* Update potential sum for this i atom from the interaction with this j atom. */
1154 velec = _mm_and_ps(velec,cutoff_mask);
1155 velec = _mm_andnot_ps(dummy_mask,velec);
1156 velecsum = _mm_add_ps(velecsum,velec);
1160 fscal = _mm_and_ps(fscal,cutoff_mask);
1162 fscal = _mm_andnot_ps(dummy_mask,fscal);
1164 /* Calculate temporary vectorial force */
1165 tx = _mm_mul_ps(fscal,dx02);
1166 ty = _mm_mul_ps(fscal,dy02);
1167 tz = _mm_mul_ps(fscal,dz02);
1169 /* Update vectorial force */
1170 fix0 = _mm_add_ps(fix0,tx);
1171 fiy0 = _mm_add_ps(fiy0,ty);
1172 fiz0 = _mm_add_ps(fiz0,tz);
1174 fjx2 = _mm_add_ps(fjx2,tx);
1175 fjy2 = _mm_add_ps(fjy2,ty);
1176 fjz2 = _mm_add_ps(fjz2,tz);
1180 /**************************
1181 * CALCULATE INTERACTIONS *
1182 **************************/
1184 if (gmx_mm_any_lt(rsq10,rcutoff2))
1187 r10 = _mm_mul_ps(rsq10,rinv10);
1188 r10 = _mm_andnot_ps(dummy_mask,r10);
1190 /* EWALD ELECTROSTATICS */
1192 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1193 ewrt = _mm_mul_ps(r10,ewtabscale);
1194 ewitab = _mm_cvttps_epi32(ewrt);
1195 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1196 ewitab = _mm_slli_epi32(ewitab,2);
1197 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1198 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1199 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1200 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1201 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1202 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1203 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1204 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1205 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1207 d = _mm_sub_ps(r10,rswitch);
1208 d = _mm_max_ps(d,_mm_setzero_ps());
1209 d2 = _mm_mul_ps(d,d);
1210 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1212 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1214 /* Evaluate switch function */
1215 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1216 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1217 velec = _mm_mul_ps(velec,sw);
1218 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1220 /* Update potential sum for this i atom from the interaction with this j atom. */
1221 velec = _mm_and_ps(velec,cutoff_mask);
1222 velec = _mm_andnot_ps(dummy_mask,velec);
1223 velecsum = _mm_add_ps(velecsum,velec);
1227 fscal = _mm_and_ps(fscal,cutoff_mask);
1229 fscal = _mm_andnot_ps(dummy_mask,fscal);
1231 /* Calculate temporary vectorial force */
1232 tx = _mm_mul_ps(fscal,dx10);
1233 ty = _mm_mul_ps(fscal,dy10);
1234 tz = _mm_mul_ps(fscal,dz10);
1236 /* Update vectorial force */
1237 fix1 = _mm_add_ps(fix1,tx);
1238 fiy1 = _mm_add_ps(fiy1,ty);
1239 fiz1 = _mm_add_ps(fiz1,tz);
1241 fjx0 = _mm_add_ps(fjx0,tx);
1242 fjy0 = _mm_add_ps(fjy0,ty);
1243 fjz0 = _mm_add_ps(fjz0,tz);
1247 /**************************
1248 * CALCULATE INTERACTIONS *
1249 **************************/
1251 if (gmx_mm_any_lt(rsq11,rcutoff2))
1254 r11 = _mm_mul_ps(rsq11,rinv11);
1255 r11 = _mm_andnot_ps(dummy_mask,r11);
1257 /* EWALD ELECTROSTATICS */
1259 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1260 ewrt = _mm_mul_ps(r11,ewtabscale);
1261 ewitab = _mm_cvttps_epi32(ewrt);
1262 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1263 ewitab = _mm_slli_epi32(ewitab,2);
1264 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1265 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1266 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1267 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1268 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1269 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1270 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1271 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1272 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1274 d = _mm_sub_ps(r11,rswitch);
1275 d = _mm_max_ps(d,_mm_setzero_ps());
1276 d2 = _mm_mul_ps(d,d);
1277 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1279 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1281 /* Evaluate switch function */
1282 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1283 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1284 velec = _mm_mul_ps(velec,sw);
1285 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1287 /* Update potential sum for this i atom from the interaction with this j atom. */
1288 velec = _mm_and_ps(velec,cutoff_mask);
1289 velec = _mm_andnot_ps(dummy_mask,velec);
1290 velecsum = _mm_add_ps(velecsum,velec);
1294 fscal = _mm_and_ps(fscal,cutoff_mask);
1296 fscal = _mm_andnot_ps(dummy_mask,fscal);
1298 /* Calculate temporary vectorial force */
1299 tx = _mm_mul_ps(fscal,dx11);
1300 ty = _mm_mul_ps(fscal,dy11);
1301 tz = _mm_mul_ps(fscal,dz11);
1303 /* Update vectorial force */
1304 fix1 = _mm_add_ps(fix1,tx);
1305 fiy1 = _mm_add_ps(fiy1,ty);
1306 fiz1 = _mm_add_ps(fiz1,tz);
1308 fjx1 = _mm_add_ps(fjx1,tx);
1309 fjy1 = _mm_add_ps(fjy1,ty);
1310 fjz1 = _mm_add_ps(fjz1,tz);
1314 /**************************
1315 * CALCULATE INTERACTIONS *
1316 **************************/
1318 if (gmx_mm_any_lt(rsq12,rcutoff2))
1321 r12 = _mm_mul_ps(rsq12,rinv12);
1322 r12 = _mm_andnot_ps(dummy_mask,r12);
1324 /* EWALD ELECTROSTATICS */
1326 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1327 ewrt = _mm_mul_ps(r12,ewtabscale);
1328 ewitab = _mm_cvttps_epi32(ewrt);
1329 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1330 ewitab = _mm_slli_epi32(ewitab,2);
1331 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1332 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1333 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1334 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1335 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1336 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1337 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1338 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1339 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1341 d = _mm_sub_ps(r12,rswitch);
1342 d = _mm_max_ps(d,_mm_setzero_ps());
1343 d2 = _mm_mul_ps(d,d);
1344 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1346 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1348 /* Evaluate switch function */
1349 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1350 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1351 velec = _mm_mul_ps(velec,sw);
1352 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1354 /* Update potential sum for this i atom from the interaction with this j atom. */
1355 velec = _mm_and_ps(velec,cutoff_mask);
1356 velec = _mm_andnot_ps(dummy_mask,velec);
1357 velecsum = _mm_add_ps(velecsum,velec);
1361 fscal = _mm_and_ps(fscal,cutoff_mask);
1363 fscal = _mm_andnot_ps(dummy_mask,fscal);
1365 /* Calculate temporary vectorial force */
1366 tx = _mm_mul_ps(fscal,dx12);
1367 ty = _mm_mul_ps(fscal,dy12);
1368 tz = _mm_mul_ps(fscal,dz12);
1370 /* Update vectorial force */
1371 fix1 = _mm_add_ps(fix1,tx);
1372 fiy1 = _mm_add_ps(fiy1,ty);
1373 fiz1 = _mm_add_ps(fiz1,tz);
1375 fjx2 = _mm_add_ps(fjx2,tx);
1376 fjy2 = _mm_add_ps(fjy2,ty);
1377 fjz2 = _mm_add_ps(fjz2,tz);
1381 /**************************
1382 * CALCULATE INTERACTIONS *
1383 **************************/
1385 if (gmx_mm_any_lt(rsq20,rcutoff2))
1388 r20 = _mm_mul_ps(rsq20,rinv20);
1389 r20 = _mm_andnot_ps(dummy_mask,r20);
1391 /* EWALD ELECTROSTATICS */
1393 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1394 ewrt = _mm_mul_ps(r20,ewtabscale);
1395 ewitab = _mm_cvttps_epi32(ewrt);
1396 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1397 ewitab = _mm_slli_epi32(ewitab,2);
1398 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1399 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1400 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1401 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1402 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1403 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1404 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1405 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1406 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1408 d = _mm_sub_ps(r20,rswitch);
1409 d = _mm_max_ps(d,_mm_setzero_ps());
1410 d2 = _mm_mul_ps(d,d);
1411 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1413 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1415 /* Evaluate switch function */
1416 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1417 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1418 velec = _mm_mul_ps(velec,sw);
1419 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1421 /* Update potential sum for this i atom from the interaction with this j atom. */
1422 velec = _mm_and_ps(velec,cutoff_mask);
1423 velec = _mm_andnot_ps(dummy_mask,velec);
1424 velecsum = _mm_add_ps(velecsum,velec);
1428 fscal = _mm_and_ps(fscal,cutoff_mask);
1430 fscal = _mm_andnot_ps(dummy_mask,fscal);
1432 /* Calculate temporary vectorial force */
1433 tx = _mm_mul_ps(fscal,dx20);
1434 ty = _mm_mul_ps(fscal,dy20);
1435 tz = _mm_mul_ps(fscal,dz20);
1437 /* Update vectorial force */
1438 fix2 = _mm_add_ps(fix2,tx);
1439 fiy2 = _mm_add_ps(fiy2,ty);
1440 fiz2 = _mm_add_ps(fiz2,tz);
1442 fjx0 = _mm_add_ps(fjx0,tx);
1443 fjy0 = _mm_add_ps(fjy0,ty);
1444 fjz0 = _mm_add_ps(fjz0,tz);
1448 /**************************
1449 * CALCULATE INTERACTIONS *
1450 **************************/
1452 if (gmx_mm_any_lt(rsq21,rcutoff2))
1455 r21 = _mm_mul_ps(rsq21,rinv21);
1456 r21 = _mm_andnot_ps(dummy_mask,r21);
1458 /* EWALD ELECTROSTATICS */
1460 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1461 ewrt = _mm_mul_ps(r21,ewtabscale);
1462 ewitab = _mm_cvttps_epi32(ewrt);
1463 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1464 ewitab = _mm_slli_epi32(ewitab,2);
1465 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1466 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1467 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1468 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1469 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1470 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1471 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1472 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1473 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1475 d = _mm_sub_ps(r21,rswitch);
1476 d = _mm_max_ps(d,_mm_setzero_ps());
1477 d2 = _mm_mul_ps(d,d);
1478 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1480 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1482 /* Evaluate switch function */
1483 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1484 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1485 velec = _mm_mul_ps(velec,sw);
1486 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1488 /* Update potential sum for this i atom from the interaction with this j atom. */
1489 velec = _mm_and_ps(velec,cutoff_mask);
1490 velec = _mm_andnot_ps(dummy_mask,velec);
1491 velecsum = _mm_add_ps(velecsum,velec);
1495 fscal = _mm_and_ps(fscal,cutoff_mask);
1497 fscal = _mm_andnot_ps(dummy_mask,fscal);
1499 /* Calculate temporary vectorial force */
1500 tx = _mm_mul_ps(fscal,dx21);
1501 ty = _mm_mul_ps(fscal,dy21);
1502 tz = _mm_mul_ps(fscal,dz21);
1504 /* Update vectorial force */
1505 fix2 = _mm_add_ps(fix2,tx);
1506 fiy2 = _mm_add_ps(fiy2,ty);
1507 fiz2 = _mm_add_ps(fiz2,tz);
1509 fjx1 = _mm_add_ps(fjx1,tx);
1510 fjy1 = _mm_add_ps(fjy1,ty);
1511 fjz1 = _mm_add_ps(fjz1,tz);
1515 /**************************
1516 * CALCULATE INTERACTIONS *
1517 **************************/
1519 if (gmx_mm_any_lt(rsq22,rcutoff2))
1522 r22 = _mm_mul_ps(rsq22,rinv22);
1523 r22 = _mm_andnot_ps(dummy_mask,r22);
1525 /* EWALD ELECTROSTATICS */
1527 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1528 ewrt = _mm_mul_ps(r22,ewtabscale);
1529 ewitab = _mm_cvttps_epi32(ewrt);
1530 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1531 ewitab = _mm_slli_epi32(ewitab,2);
1532 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1533 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1534 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1535 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1536 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1537 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1538 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1539 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1540 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1542 d = _mm_sub_ps(r22,rswitch);
1543 d = _mm_max_ps(d,_mm_setzero_ps());
1544 d2 = _mm_mul_ps(d,d);
1545 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1547 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1549 /* Evaluate switch function */
1550 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1551 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1552 velec = _mm_mul_ps(velec,sw);
1553 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1555 /* Update potential sum for this i atom from the interaction with this j atom. */
1556 velec = _mm_and_ps(velec,cutoff_mask);
1557 velec = _mm_andnot_ps(dummy_mask,velec);
1558 velecsum = _mm_add_ps(velecsum,velec);
1562 fscal = _mm_and_ps(fscal,cutoff_mask);
1564 fscal = _mm_andnot_ps(dummy_mask,fscal);
1566 /* Calculate temporary vectorial force */
1567 tx = _mm_mul_ps(fscal,dx22);
1568 ty = _mm_mul_ps(fscal,dy22);
1569 tz = _mm_mul_ps(fscal,dz22);
1571 /* Update vectorial force */
1572 fix2 = _mm_add_ps(fix2,tx);
1573 fiy2 = _mm_add_ps(fiy2,ty);
1574 fiz2 = _mm_add_ps(fiz2,tz);
1576 fjx2 = _mm_add_ps(fjx2,tx);
1577 fjy2 = _mm_add_ps(fjy2,ty);
1578 fjz2 = _mm_add_ps(fjz2,tz);
1582 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1583 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1584 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1585 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1587 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1588 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1590 /* Inner loop uses 594 flops */
1593 /* End of innermost loop */
1595 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1596 f+i_coord_offset,fshift+i_shift_offset);
1599 /* Update potential energies */
1600 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1602 /* Increment number of inner iterations */
1603 inneriter += j_index_end - j_index_start;
1605 /* Outer loop uses 19 flops */
1608 /* Increment number of outer iterations */
1611 /* Update outer/inner flops */
1613 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*594);
1616 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
1617 * Electrostatics interaction: Ewald
1618 * VdW interaction: None
1619 * Geometry: Water3-Water3
1620 * Calculate force/pot: Force
1623 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
1624 (t_nblist * gmx_restrict nlist,
1625 rvec * gmx_restrict xx,
1626 rvec * gmx_restrict ff,
1627 struct t_forcerec * gmx_restrict fr,
1628 t_mdatoms * gmx_restrict mdatoms,
1629 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1630 t_nrnb * gmx_restrict nrnb)
1632 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1633 * just 0 for non-waters.
1634 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1635 * jnr indices corresponding to data put in the four positions in the SIMD register.
1637 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1638 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1639 int jnrA,jnrB,jnrC,jnrD;
1640 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1641 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1642 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1643 real rcutoff_scalar;
1644 real *shiftvec,*fshift,*x,*f;
1645 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1646 real scratch[4*DIM];
1647 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1649 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1651 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1653 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1654 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1655 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1656 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1657 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1658 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1659 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1660 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1661 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1662 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1663 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1664 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1665 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1666 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1667 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1668 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1669 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1672 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1674 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1675 real rswitch_scalar,d_scalar;
1676 __m128 dummy_mask,cutoff_mask;
1677 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1678 __m128 one = _mm_set1_ps(1.0);
1679 __m128 two = _mm_set1_ps(2.0);
1685 jindex = nlist->jindex;
1687 shiftidx = nlist->shift;
1689 shiftvec = fr->shift_vec[0];
1690 fshift = fr->fshift[0];
1691 facel = _mm_set1_ps(fr->ic->epsfac);
1692 charge = mdatoms->chargeA;
1694 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1695 ewtab = fr->ic->tabq_coul_FDV0;
1696 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1697 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1699 /* Setup water-specific parameters */
1700 inr = nlist->iinr[0];
1701 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1702 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1703 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1705 jq0 = _mm_set1_ps(charge[inr+0]);
1706 jq1 = _mm_set1_ps(charge[inr+1]);
1707 jq2 = _mm_set1_ps(charge[inr+2]);
1708 qq00 = _mm_mul_ps(iq0,jq0);
1709 qq01 = _mm_mul_ps(iq0,jq1);
1710 qq02 = _mm_mul_ps(iq0,jq2);
1711 qq10 = _mm_mul_ps(iq1,jq0);
1712 qq11 = _mm_mul_ps(iq1,jq1);
1713 qq12 = _mm_mul_ps(iq1,jq2);
1714 qq20 = _mm_mul_ps(iq2,jq0);
1715 qq21 = _mm_mul_ps(iq2,jq1);
1716 qq22 = _mm_mul_ps(iq2,jq2);
1718 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1719 rcutoff_scalar = fr->ic->rcoulomb;
1720 rcutoff = _mm_set1_ps(rcutoff_scalar);
1721 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1723 rswitch_scalar = fr->ic->rcoulomb_switch;
1724 rswitch = _mm_set1_ps(rswitch_scalar);
1725 /* Setup switch parameters */
1726 d_scalar = rcutoff_scalar-rswitch_scalar;
1727 d = _mm_set1_ps(d_scalar);
1728 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1729 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1730 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1731 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1732 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1733 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1735 /* Avoid stupid compiler warnings */
1736 jnrA = jnrB = jnrC = jnrD = 0;
1737 j_coord_offsetA = 0;
1738 j_coord_offsetB = 0;
1739 j_coord_offsetC = 0;
1740 j_coord_offsetD = 0;
1745 for(iidx=0;iidx<4*DIM;iidx++)
1747 scratch[iidx] = 0.0;
1750 /* Start outer loop over neighborlists */
1751 for(iidx=0; iidx<nri; iidx++)
1753 /* Load shift vector for this list */
1754 i_shift_offset = DIM*shiftidx[iidx];
1756 /* Load limits for loop over neighbors */
1757 j_index_start = jindex[iidx];
1758 j_index_end = jindex[iidx+1];
1760 /* Get outer coordinate index */
1762 i_coord_offset = DIM*inr;
1764 /* Load i particle coords and add shift vector */
1765 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1766 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1768 fix0 = _mm_setzero_ps();
1769 fiy0 = _mm_setzero_ps();
1770 fiz0 = _mm_setzero_ps();
1771 fix1 = _mm_setzero_ps();
1772 fiy1 = _mm_setzero_ps();
1773 fiz1 = _mm_setzero_ps();
1774 fix2 = _mm_setzero_ps();
1775 fiy2 = _mm_setzero_ps();
1776 fiz2 = _mm_setzero_ps();
1778 /* Start inner kernel loop */
1779 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1782 /* Get j neighbor index, and coordinate index */
1784 jnrB = jjnr[jidx+1];
1785 jnrC = jjnr[jidx+2];
1786 jnrD = jjnr[jidx+3];
1787 j_coord_offsetA = DIM*jnrA;
1788 j_coord_offsetB = DIM*jnrB;
1789 j_coord_offsetC = DIM*jnrC;
1790 j_coord_offsetD = DIM*jnrD;
1792 /* load j atom coordinates */
1793 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1794 x+j_coord_offsetC,x+j_coord_offsetD,
1795 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1797 /* Calculate displacement vector */
1798 dx00 = _mm_sub_ps(ix0,jx0);
1799 dy00 = _mm_sub_ps(iy0,jy0);
1800 dz00 = _mm_sub_ps(iz0,jz0);
1801 dx01 = _mm_sub_ps(ix0,jx1);
1802 dy01 = _mm_sub_ps(iy0,jy1);
1803 dz01 = _mm_sub_ps(iz0,jz1);
1804 dx02 = _mm_sub_ps(ix0,jx2);
1805 dy02 = _mm_sub_ps(iy0,jy2);
1806 dz02 = _mm_sub_ps(iz0,jz2);
1807 dx10 = _mm_sub_ps(ix1,jx0);
1808 dy10 = _mm_sub_ps(iy1,jy0);
1809 dz10 = _mm_sub_ps(iz1,jz0);
1810 dx11 = _mm_sub_ps(ix1,jx1);
1811 dy11 = _mm_sub_ps(iy1,jy1);
1812 dz11 = _mm_sub_ps(iz1,jz1);
1813 dx12 = _mm_sub_ps(ix1,jx2);
1814 dy12 = _mm_sub_ps(iy1,jy2);
1815 dz12 = _mm_sub_ps(iz1,jz2);
1816 dx20 = _mm_sub_ps(ix2,jx0);
1817 dy20 = _mm_sub_ps(iy2,jy0);
1818 dz20 = _mm_sub_ps(iz2,jz0);
1819 dx21 = _mm_sub_ps(ix2,jx1);
1820 dy21 = _mm_sub_ps(iy2,jy1);
1821 dz21 = _mm_sub_ps(iz2,jz1);
1822 dx22 = _mm_sub_ps(ix2,jx2);
1823 dy22 = _mm_sub_ps(iy2,jy2);
1824 dz22 = _mm_sub_ps(iz2,jz2);
1826 /* Calculate squared distance and things based on it */
1827 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1828 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1829 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1830 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1831 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1832 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1833 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1834 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1835 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1837 rinv00 = sse2_invsqrt_f(rsq00);
1838 rinv01 = sse2_invsqrt_f(rsq01);
1839 rinv02 = sse2_invsqrt_f(rsq02);
1840 rinv10 = sse2_invsqrt_f(rsq10);
1841 rinv11 = sse2_invsqrt_f(rsq11);
1842 rinv12 = sse2_invsqrt_f(rsq12);
1843 rinv20 = sse2_invsqrt_f(rsq20);
1844 rinv21 = sse2_invsqrt_f(rsq21);
1845 rinv22 = sse2_invsqrt_f(rsq22);
1847 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1848 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1849 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1850 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1851 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1852 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1853 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1854 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1855 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1857 fjx0 = _mm_setzero_ps();
1858 fjy0 = _mm_setzero_ps();
1859 fjz0 = _mm_setzero_ps();
1860 fjx1 = _mm_setzero_ps();
1861 fjy1 = _mm_setzero_ps();
1862 fjz1 = _mm_setzero_ps();
1863 fjx2 = _mm_setzero_ps();
1864 fjy2 = _mm_setzero_ps();
1865 fjz2 = _mm_setzero_ps();
1867 /**************************
1868 * CALCULATE INTERACTIONS *
1869 **************************/
1871 if (gmx_mm_any_lt(rsq00,rcutoff2))
1874 r00 = _mm_mul_ps(rsq00,rinv00);
1876 /* EWALD ELECTROSTATICS */
1878 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1879 ewrt = _mm_mul_ps(r00,ewtabscale);
1880 ewitab = _mm_cvttps_epi32(ewrt);
1881 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1882 ewitab = _mm_slli_epi32(ewitab,2);
1883 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1884 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1885 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1886 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1887 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1888 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1889 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1890 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1891 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1893 d = _mm_sub_ps(r00,rswitch);
1894 d = _mm_max_ps(d,_mm_setzero_ps());
1895 d2 = _mm_mul_ps(d,d);
1896 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1898 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1900 /* Evaluate switch function */
1901 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1902 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1903 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1907 fscal = _mm_and_ps(fscal,cutoff_mask);
1909 /* Calculate temporary vectorial force */
1910 tx = _mm_mul_ps(fscal,dx00);
1911 ty = _mm_mul_ps(fscal,dy00);
1912 tz = _mm_mul_ps(fscal,dz00);
1914 /* Update vectorial force */
1915 fix0 = _mm_add_ps(fix0,tx);
1916 fiy0 = _mm_add_ps(fiy0,ty);
1917 fiz0 = _mm_add_ps(fiz0,tz);
1919 fjx0 = _mm_add_ps(fjx0,tx);
1920 fjy0 = _mm_add_ps(fjy0,ty);
1921 fjz0 = _mm_add_ps(fjz0,tz);
1925 /**************************
1926 * CALCULATE INTERACTIONS *
1927 **************************/
1929 if (gmx_mm_any_lt(rsq01,rcutoff2))
1932 r01 = _mm_mul_ps(rsq01,rinv01);
1934 /* EWALD ELECTROSTATICS */
1936 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1937 ewrt = _mm_mul_ps(r01,ewtabscale);
1938 ewitab = _mm_cvttps_epi32(ewrt);
1939 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1940 ewitab = _mm_slli_epi32(ewitab,2);
1941 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1942 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1943 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1944 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1945 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1946 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1947 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1948 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1949 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1951 d = _mm_sub_ps(r01,rswitch);
1952 d = _mm_max_ps(d,_mm_setzero_ps());
1953 d2 = _mm_mul_ps(d,d);
1954 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1956 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1958 /* Evaluate switch function */
1959 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1960 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1961 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1965 fscal = _mm_and_ps(fscal,cutoff_mask);
1967 /* Calculate temporary vectorial force */
1968 tx = _mm_mul_ps(fscal,dx01);
1969 ty = _mm_mul_ps(fscal,dy01);
1970 tz = _mm_mul_ps(fscal,dz01);
1972 /* Update vectorial force */
1973 fix0 = _mm_add_ps(fix0,tx);
1974 fiy0 = _mm_add_ps(fiy0,ty);
1975 fiz0 = _mm_add_ps(fiz0,tz);
1977 fjx1 = _mm_add_ps(fjx1,tx);
1978 fjy1 = _mm_add_ps(fjy1,ty);
1979 fjz1 = _mm_add_ps(fjz1,tz);
1983 /**************************
1984 * CALCULATE INTERACTIONS *
1985 **************************/
1987 if (gmx_mm_any_lt(rsq02,rcutoff2))
1990 r02 = _mm_mul_ps(rsq02,rinv02);
1992 /* EWALD ELECTROSTATICS */
1994 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1995 ewrt = _mm_mul_ps(r02,ewtabscale);
1996 ewitab = _mm_cvttps_epi32(ewrt);
1997 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1998 ewitab = _mm_slli_epi32(ewitab,2);
1999 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2000 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2001 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2002 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2003 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2004 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2005 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2006 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2007 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2009 d = _mm_sub_ps(r02,rswitch);
2010 d = _mm_max_ps(d,_mm_setzero_ps());
2011 d2 = _mm_mul_ps(d,d);
2012 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2014 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2016 /* Evaluate switch function */
2017 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2018 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2019 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2023 fscal = _mm_and_ps(fscal,cutoff_mask);
2025 /* Calculate temporary vectorial force */
2026 tx = _mm_mul_ps(fscal,dx02);
2027 ty = _mm_mul_ps(fscal,dy02);
2028 tz = _mm_mul_ps(fscal,dz02);
2030 /* Update vectorial force */
2031 fix0 = _mm_add_ps(fix0,tx);
2032 fiy0 = _mm_add_ps(fiy0,ty);
2033 fiz0 = _mm_add_ps(fiz0,tz);
2035 fjx2 = _mm_add_ps(fjx2,tx);
2036 fjy2 = _mm_add_ps(fjy2,ty);
2037 fjz2 = _mm_add_ps(fjz2,tz);
2041 /**************************
2042 * CALCULATE INTERACTIONS *
2043 **************************/
2045 if (gmx_mm_any_lt(rsq10,rcutoff2))
2048 r10 = _mm_mul_ps(rsq10,rinv10);
2050 /* EWALD ELECTROSTATICS */
2052 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2053 ewrt = _mm_mul_ps(r10,ewtabscale);
2054 ewitab = _mm_cvttps_epi32(ewrt);
2055 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2056 ewitab = _mm_slli_epi32(ewitab,2);
2057 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2058 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2059 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2060 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2061 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2062 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2063 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2064 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2065 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2067 d = _mm_sub_ps(r10,rswitch);
2068 d = _mm_max_ps(d,_mm_setzero_ps());
2069 d2 = _mm_mul_ps(d,d);
2070 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2072 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2074 /* Evaluate switch function */
2075 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2076 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2077 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2081 fscal = _mm_and_ps(fscal,cutoff_mask);
2083 /* Calculate temporary vectorial force */
2084 tx = _mm_mul_ps(fscal,dx10);
2085 ty = _mm_mul_ps(fscal,dy10);
2086 tz = _mm_mul_ps(fscal,dz10);
2088 /* Update vectorial force */
2089 fix1 = _mm_add_ps(fix1,tx);
2090 fiy1 = _mm_add_ps(fiy1,ty);
2091 fiz1 = _mm_add_ps(fiz1,tz);
2093 fjx0 = _mm_add_ps(fjx0,tx);
2094 fjy0 = _mm_add_ps(fjy0,ty);
2095 fjz0 = _mm_add_ps(fjz0,tz);
2099 /**************************
2100 * CALCULATE INTERACTIONS *
2101 **************************/
2103 if (gmx_mm_any_lt(rsq11,rcutoff2))
2106 r11 = _mm_mul_ps(rsq11,rinv11);
2108 /* EWALD ELECTROSTATICS */
2110 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2111 ewrt = _mm_mul_ps(r11,ewtabscale);
2112 ewitab = _mm_cvttps_epi32(ewrt);
2113 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2114 ewitab = _mm_slli_epi32(ewitab,2);
2115 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2116 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2117 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2118 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2119 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2120 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2121 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2122 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2123 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2125 d = _mm_sub_ps(r11,rswitch);
2126 d = _mm_max_ps(d,_mm_setzero_ps());
2127 d2 = _mm_mul_ps(d,d);
2128 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2130 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2132 /* Evaluate switch function */
2133 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2134 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2135 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2139 fscal = _mm_and_ps(fscal,cutoff_mask);
2141 /* Calculate temporary vectorial force */
2142 tx = _mm_mul_ps(fscal,dx11);
2143 ty = _mm_mul_ps(fscal,dy11);
2144 tz = _mm_mul_ps(fscal,dz11);
2146 /* Update vectorial force */
2147 fix1 = _mm_add_ps(fix1,tx);
2148 fiy1 = _mm_add_ps(fiy1,ty);
2149 fiz1 = _mm_add_ps(fiz1,tz);
2151 fjx1 = _mm_add_ps(fjx1,tx);
2152 fjy1 = _mm_add_ps(fjy1,ty);
2153 fjz1 = _mm_add_ps(fjz1,tz);
2157 /**************************
2158 * CALCULATE INTERACTIONS *
2159 **************************/
2161 if (gmx_mm_any_lt(rsq12,rcutoff2))
2164 r12 = _mm_mul_ps(rsq12,rinv12);
2166 /* EWALD ELECTROSTATICS */
2168 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2169 ewrt = _mm_mul_ps(r12,ewtabscale);
2170 ewitab = _mm_cvttps_epi32(ewrt);
2171 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2172 ewitab = _mm_slli_epi32(ewitab,2);
2173 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2174 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2175 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2176 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2177 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2178 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2179 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2180 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2181 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2183 d = _mm_sub_ps(r12,rswitch);
2184 d = _mm_max_ps(d,_mm_setzero_ps());
2185 d2 = _mm_mul_ps(d,d);
2186 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2188 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2190 /* Evaluate switch function */
2191 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2192 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2193 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2197 fscal = _mm_and_ps(fscal,cutoff_mask);
2199 /* Calculate temporary vectorial force */
2200 tx = _mm_mul_ps(fscal,dx12);
2201 ty = _mm_mul_ps(fscal,dy12);
2202 tz = _mm_mul_ps(fscal,dz12);
2204 /* Update vectorial force */
2205 fix1 = _mm_add_ps(fix1,tx);
2206 fiy1 = _mm_add_ps(fiy1,ty);
2207 fiz1 = _mm_add_ps(fiz1,tz);
2209 fjx2 = _mm_add_ps(fjx2,tx);
2210 fjy2 = _mm_add_ps(fjy2,ty);
2211 fjz2 = _mm_add_ps(fjz2,tz);
2215 /**************************
2216 * CALCULATE INTERACTIONS *
2217 **************************/
2219 if (gmx_mm_any_lt(rsq20,rcutoff2))
2222 r20 = _mm_mul_ps(rsq20,rinv20);
2224 /* EWALD ELECTROSTATICS */
2226 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2227 ewrt = _mm_mul_ps(r20,ewtabscale);
2228 ewitab = _mm_cvttps_epi32(ewrt);
2229 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2230 ewitab = _mm_slli_epi32(ewitab,2);
2231 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2232 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2233 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2234 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2235 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2236 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2237 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2238 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2239 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2241 d = _mm_sub_ps(r20,rswitch);
2242 d = _mm_max_ps(d,_mm_setzero_ps());
2243 d2 = _mm_mul_ps(d,d);
2244 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2246 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2248 /* Evaluate switch function */
2249 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2250 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2251 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2255 fscal = _mm_and_ps(fscal,cutoff_mask);
2257 /* Calculate temporary vectorial force */
2258 tx = _mm_mul_ps(fscal,dx20);
2259 ty = _mm_mul_ps(fscal,dy20);
2260 tz = _mm_mul_ps(fscal,dz20);
2262 /* Update vectorial force */
2263 fix2 = _mm_add_ps(fix2,tx);
2264 fiy2 = _mm_add_ps(fiy2,ty);
2265 fiz2 = _mm_add_ps(fiz2,tz);
2267 fjx0 = _mm_add_ps(fjx0,tx);
2268 fjy0 = _mm_add_ps(fjy0,ty);
2269 fjz0 = _mm_add_ps(fjz0,tz);
2273 /**************************
2274 * CALCULATE INTERACTIONS *
2275 **************************/
2277 if (gmx_mm_any_lt(rsq21,rcutoff2))
2280 r21 = _mm_mul_ps(rsq21,rinv21);
2282 /* EWALD ELECTROSTATICS */
2284 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2285 ewrt = _mm_mul_ps(r21,ewtabscale);
2286 ewitab = _mm_cvttps_epi32(ewrt);
2287 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2288 ewitab = _mm_slli_epi32(ewitab,2);
2289 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2290 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2291 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2292 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2293 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2294 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2295 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2296 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2297 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2299 d = _mm_sub_ps(r21,rswitch);
2300 d = _mm_max_ps(d,_mm_setzero_ps());
2301 d2 = _mm_mul_ps(d,d);
2302 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2304 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2306 /* Evaluate switch function */
2307 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2308 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2309 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2313 fscal = _mm_and_ps(fscal,cutoff_mask);
2315 /* Calculate temporary vectorial force */
2316 tx = _mm_mul_ps(fscal,dx21);
2317 ty = _mm_mul_ps(fscal,dy21);
2318 tz = _mm_mul_ps(fscal,dz21);
2320 /* Update vectorial force */
2321 fix2 = _mm_add_ps(fix2,tx);
2322 fiy2 = _mm_add_ps(fiy2,ty);
2323 fiz2 = _mm_add_ps(fiz2,tz);
2325 fjx1 = _mm_add_ps(fjx1,tx);
2326 fjy1 = _mm_add_ps(fjy1,ty);
2327 fjz1 = _mm_add_ps(fjz1,tz);
2331 /**************************
2332 * CALCULATE INTERACTIONS *
2333 **************************/
2335 if (gmx_mm_any_lt(rsq22,rcutoff2))
2338 r22 = _mm_mul_ps(rsq22,rinv22);
2340 /* EWALD ELECTROSTATICS */
2342 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2343 ewrt = _mm_mul_ps(r22,ewtabscale);
2344 ewitab = _mm_cvttps_epi32(ewrt);
2345 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2346 ewitab = _mm_slli_epi32(ewitab,2);
2347 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2348 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2349 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2350 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2351 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2352 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2353 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2354 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2355 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2357 d = _mm_sub_ps(r22,rswitch);
2358 d = _mm_max_ps(d,_mm_setzero_ps());
2359 d2 = _mm_mul_ps(d,d);
2360 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2362 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2364 /* Evaluate switch function */
2365 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2366 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2367 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2371 fscal = _mm_and_ps(fscal,cutoff_mask);
2373 /* Calculate temporary vectorial force */
2374 tx = _mm_mul_ps(fscal,dx22);
2375 ty = _mm_mul_ps(fscal,dy22);
2376 tz = _mm_mul_ps(fscal,dz22);
2378 /* Update vectorial force */
2379 fix2 = _mm_add_ps(fix2,tx);
2380 fiy2 = _mm_add_ps(fiy2,ty);
2381 fiz2 = _mm_add_ps(fiz2,tz);
2383 fjx2 = _mm_add_ps(fjx2,tx);
2384 fjy2 = _mm_add_ps(fjy2,ty);
2385 fjz2 = _mm_add_ps(fjz2,tz);
2389 fjptrA = f+j_coord_offsetA;
2390 fjptrB = f+j_coord_offsetB;
2391 fjptrC = f+j_coord_offsetC;
2392 fjptrD = f+j_coord_offsetD;
2394 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2395 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2397 /* Inner loop uses 558 flops */
2400 if(jidx<j_index_end)
2403 /* Get j neighbor index, and coordinate index */
2404 jnrlistA = jjnr[jidx];
2405 jnrlistB = jjnr[jidx+1];
2406 jnrlistC = jjnr[jidx+2];
2407 jnrlistD = jjnr[jidx+3];
2408 /* Sign of each element will be negative for non-real atoms.
2409 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2410 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2412 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2413 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2414 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2415 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2416 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2417 j_coord_offsetA = DIM*jnrA;
2418 j_coord_offsetB = DIM*jnrB;
2419 j_coord_offsetC = DIM*jnrC;
2420 j_coord_offsetD = DIM*jnrD;
2422 /* load j atom coordinates */
2423 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2424 x+j_coord_offsetC,x+j_coord_offsetD,
2425 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2427 /* Calculate displacement vector */
2428 dx00 = _mm_sub_ps(ix0,jx0);
2429 dy00 = _mm_sub_ps(iy0,jy0);
2430 dz00 = _mm_sub_ps(iz0,jz0);
2431 dx01 = _mm_sub_ps(ix0,jx1);
2432 dy01 = _mm_sub_ps(iy0,jy1);
2433 dz01 = _mm_sub_ps(iz0,jz1);
2434 dx02 = _mm_sub_ps(ix0,jx2);
2435 dy02 = _mm_sub_ps(iy0,jy2);
2436 dz02 = _mm_sub_ps(iz0,jz2);
2437 dx10 = _mm_sub_ps(ix1,jx0);
2438 dy10 = _mm_sub_ps(iy1,jy0);
2439 dz10 = _mm_sub_ps(iz1,jz0);
2440 dx11 = _mm_sub_ps(ix1,jx1);
2441 dy11 = _mm_sub_ps(iy1,jy1);
2442 dz11 = _mm_sub_ps(iz1,jz1);
2443 dx12 = _mm_sub_ps(ix1,jx2);
2444 dy12 = _mm_sub_ps(iy1,jy2);
2445 dz12 = _mm_sub_ps(iz1,jz2);
2446 dx20 = _mm_sub_ps(ix2,jx0);
2447 dy20 = _mm_sub_ps(iy2,jy0);
2448 dz20 = _mm_sub_ps(iz2,jz0);
2449 dx21 = _mm_sub_ps(ix2,jx1);
2450 dy21 = _mm_sub_ps(iy2,jy1);
2451 dz21 = _mm_sub_ps(iz2,jz1);
2452 dx22 = _mm_sub_ps(ix2,jx2);
2453 dy22 = _mm_sub_ps(iy2,jy2);
2454 dz22 = _mm_sub_ps(iz2,jz2);
2456 /* Calculate squared distance and things based on it */
2457 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2458 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2459 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2460 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2461 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2462 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2463 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2464 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2465 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2467 rinv00 = sse2_invsqrt_f(rsq00);
2468 rinv01 = sse2_invsqrt_f(rsq01);
2469 rinv02 = sse2_invsqrt_f(rsq02);
2470 rinv10 = sse2_invsqrt_f(rsq10);
2471 rinv11 = sse2_invsqrt_f(rsq11);
2472 rinv12 = sse2_invsqrt_f(rsq12);
2473 rinv20 = sse2_invsqrt_f(rsq20);
2474 rinv21 = sse2_invsqrt_f(rsq21);
2475 rinv22 = sse2_invsqrt_f(rsq22);
2477 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2478 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2479 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2480 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2481 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2482 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2483 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2484 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2485 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2487 fjx0 = _mm_setzero_ps();
2488 fjy0 = _mm_setzero_ps();
2489 fjz0 = _mm_setzero_ps();
2490 fjx1 = _mm_setzero_ps();
2491 fjy1 = _mm_setzero_ps();
2492 fjz1 = _mm_setzero_ps();
2493 fjx2 = _mm_setzero_ps();
2494 fjy2 = _mm_setzero_ps();
2495 fjz2 = _mm_setzero_ps();
2497 /**************************
2498 * CALCULATE INTERACTIONS *
2499 **************************/
2501 if (gmx_mm_any_lt(rsq00,rcutoff2))
2504 r00 = _mm_mul_ps(rsq00,rinv00);
2505 r00 = _mm_andnot_ps(dummy_mask,r00);
2507 /* EWALD ELECTROSTATICS */
2509 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2510 ewrt = _mm_mul_ps(r00,ewtabscale);
2511 ewitab = _mm_cvttps_epi32(ewrt);
2512 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2513 ewitab = _mm_slli_epi32(ewitab,2);
2514 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2515 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2516 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2517 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2518 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2519 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2520 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2521 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
2522 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2524 d = _mm_sub_ps(r00,rswitch);
2525 d = _mm_max_ps(d,_mm_setzero_ps());
2526 d2 = _mm_mul_ps(d,d);
2527 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2529 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2531 /* Evaluate switch function */
2532 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2533 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2534 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2538 fscal = _mm_and_ps(fscal,cutoff_mask);
2540 fscal = _mm_andnot_ps(dummy_mask,fscal);
2542 /* Calculate temporary vectorial force */
2543 tx = _mm_mul_ps(fscal,dx00);
2544 ty = _mm_mul_ps(fscal,dy00);
2545 tz = _mm_mul_ps(fscal,dz00);
2547 /* Update vectorial force */
2548 fix0 = _mm_add_ps(fix0,tx);
2549 fiy0 = _mm_add_ps(fiy0,ty);
2550 fiz0 = _mm_add_ps(fiz0,tz);
2552 fjx0 = _mm_add_ps(fjx0,tx);
2553 fjy0 = _mm_add_ps(fjy0,ty);
2554 fjz0 = _mm_add_ps(fjz0,tz);
2558 /**************************
2559 * CALCULATE INTERACTIONS *
2560 **************************/
2562 if (gmx_mm_any_lt(rsq01,rcutoff2))
2565 r01 = _mm_mul_ps(rsq01,rinv01);
2566 r01 = _mm_andnot_ps(dummy_mask,r01);
2568 /* EWALD ELECTROSTATICS */
2570 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2571 ewrt = _mm_mul_ps(r01,ewtabscale);
2572 ewitab = _mm_cvttps_epi32(ewrt);
2573 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2574 ewitab = _mm_slli_epi32(ewitab,2);
2575 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2576 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2577 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2578 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2579 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2580 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2581 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2582 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2583 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2585 d = _mm_sub_ps(r01,rswitch);
2586 d = _mm_max_ps(d,_mm_setzero_ps());
2587 d2 = _mm_mul_ps(d,d);
2588 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2590 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2592 /* Evaluate switch function */
2593 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2594 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2595 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2599 fscal = _mm_and_ps(fscal,cutoff_mask);
2601 fscal = _mm_andnot_ps(dummy_mask,fscal);
2603 /* Calculate temporary vectorial force */
2604 tx = _mm_mul_ps(fscal,dx01);
2605 ty = _mm_mul_ps(fscal,dy01);
2606 tz = _mm_mul_ps(fscal,dz01);
2608 /* Update vectorial force */
2609 fix0 = _mm_add_ps(fix0,tx);
2610 fiy0 = _mm_add_ps(fiy0,ty);
2611 fiz0 = _mm_add_ps(fiz0,tz);
2613 fjx1 = _mm_add_ps(fjx1,tx);
2614 fjy1 = _mm_add_ps(fjy1,ty);
2615 fjz1 = _mm_add_ps(fjz1,tz);
2619 /**************************
2620 * CALCULATE INTERACTIONS *
2621 **************************/
2623 if (gmx_mm_any_lt(rsq02,rcutoff2))
2626 r02 = _mm_mul_ps(rsq02,rinv02);
2627 r02 = _mm_andnot_ps(dummy_mask,r02);
2629 /* EWALD ELECTROSTATICS */
2631 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2632 ewrt = _mm_mul_ps(r02,ewtabscale);
2633 ewitab = _mm_cvttps_epi32(ewrt);
2634 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2635 ewitab = _mm_slli_epi32(ewitab,2);
2636 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2637 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2638 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2639 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2640 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2641 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2642 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2643 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2644 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2646 d = _mm_sub_ps(r02,rswitch);
2647 d = _mm_max_ps(d,_mm_setzero_ps());
2648 d2 = _mm_mul_ps(d,d);
2649 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2651 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2653 /* Evaluate switch function */
2654 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2655 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2656 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2660 fscal = _mm_and_ps(fscal,cutoff_mask);
2662 fscal = _mm_andnot_ps(dummy_mask,fscal);
2664 /* Calculate temporary vectorial force */
2665 tx = _mm_mul_ps(fscal,dx02);
2666 ty = _mm_mul_ps(fscal,dy02);
2667 tz = _mm_mul_ps(fscal,dz02);
2669 /* Update vectorial force */
2670 fix0 = _mm_add_ps(fix0,tx);
2671 fiy0 = _mm_add_ps(fiy0,ty);
2672 fiz0 = _mm_add_ps(fiz0,tz);
2674 fjx2 = _mm_add_ps(fjx2,tx);
2675 fjy2 = _mm_add_ps(fjy2,ty);
2676 fjz2 = _mm_add_ps(fjz2,tz);
2680 /**************************
2681 * CALCULATE INTERACTIONS *
2682 **************************/
2684 if (gmx_mm_any_lt(rsq10,rcutoff2))
2687 r10 = _mm_mul_ps(rsq10,rinv10);
2688 r10 = _mm_andnot_ps(dummy_mask,r10);
2690 /* EWALD ELECTROSTATICS */
2692 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2693 ewrt = _mm_mul_ps(r10,ewtabscale);
2694 ewitab = _mm_cvttps_epi32(ewrt);
2695 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2696 ewitab = _mm_slli_epi32(ewitab,2);
2697 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2698 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2699 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2700 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2701 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2702 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2703 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2704 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2705 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2707 d = _mm_sub_ps(r10,rswitch);
2708 d = _mm_max_ps(d,_mm_setzero_ps());
2709 d2 = _mm_mul_ps(d,d);
2710 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2712 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2714 /* Evaluate switch function */
2715 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2716 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2717 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2721 fscal = _mm_and_ps(fscal,cutoff_mask);
2723 fscal = _mm_andnot_ps(dummy_mask,fscal);
2725 /* Calculate temporary vectorial force */
2726 tx = _mm_mul_ps(fscal,dx10);
2727 ty = _mm_mul_ps(fscal,dy10);
2728 tz = _mm_mul_ps(fscal,dz10);
2730 /* Update vectorial force */
2731 fix1 = _mm_add_ps(fix1,tx);
2732 fiy1 = _mm_add_ps(fiy1,ty);
2733 fiz1 = _mm_add_ps(fiz1,tz);
2735 fjx0 = _mm_add_ps(fjx0,tx);
2736 fjy0 = _mm_add_ps(fjy0,ty);
2737 fjz0 = _mm_add_ps(fjz0,tz);
2741 /**************************
2742 * CALCULATE INTERACTIONS *
2743 **************************/
2745 if (gmx_mm_any_lt(rsq11,rcutoff2))
2748 r11 = _mm_mul_ps(rsq11,rinv11);
2749 r11 = _mm_andnot_ps(dummy_mask,r11);
2751 /* EWALD ELECTROSTATICS */
2753 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2754 ewrt = _mm_mul_ps(r11,ewtabscale);
2755 ewitab = _mm_cvttps_epi32(ewrt);
2756 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2757 ewitab = _mm_slli_epi32(ewitab,2);
2758 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2759 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2760 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2761 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2762 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2763 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2764 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2765 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2766 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2768 d = _mm_sub_ps(r11,rswitch);
2769 d = _mm_max_ps(d,_mm_setzero_ps());
2770 d2 = _mm_mul_ps(d,d);
2771 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2773 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2775 /* Evaluate switch function */
2776 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2777 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2778 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2782 fscal = _mm_and_ps(fscal,cutoff_mask);
2784 fscal = _mm_andnot_ps(dummy_mask,fscal);
2786 /* Calculate temporary vectorial force */
2787 tx = _mm_mul_ps(fscal,dx11);
2788 ty = _mm_mul_ps(fscal,dy11);
2789 tz = _mm_mul_ps(fscal,dz11);
2791 /* Update vectorial force */
2792 fix1 = _mm_add_ps(fix1,tx);
2793 fiy1 = _mm_add_ps(fiy1,ty);
2794 fiz1 = _mm_add_ps(fiz1,tz);
2796 fjx1 = _mm_add_ps(fjx1,tx);
2797 fjy1 = _mm_add_ps(fjy1,ty);
2798 fjz1 = _mm_add_ps(fjz1,tz);
2802 /**************************
2803 * CALCULATE INTERACTIONS *
2804 **************************/
2806 if (gmx_mm_any_lt(rsq12,rcutoff2))
2809 r12 = _mm_mul_ps(rsq12,rinv12);
2810 r12 = _mm_andnot_ps(dummy_mask,r12);
2812 /* EWALD ELECTROSTATICS */
2814 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2815 ewrt = _mm_mul_ps(r12,ewtabscale);
2816 ewitab = _mm_cvttps_epi32(ewrt);
2817 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2818 ewitab = _mm_slli_epi32(ewitab,2);
2819 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2820 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2821 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2822 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2823 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2824 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2825 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2826 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2827 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2829 d = _mm_sub_ps(r12,rswitch);
2830 d = _mm_max_ps(d,_mm_setzero_ps());
2831 d2 = _mm_mul_ps(d,d);
2832 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2834 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2836 /* Evaluate switch function */
2837 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2838 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2839 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2843 fscal = _mm_and_ps(fscal,cutoff_mask);
2845 fscal = _mm_andnot_ps(dummy_mask,fscal);
2847 /* Calculate temporary vectorial force */
2848 tx = _mm_mul_ps(fscal,dx12);
2849 ty = _mm_mul_ps(fscal,dy12);
2850 tz = _mm_mul_ps(fscal,dz12);
2852 /* Update vectorial force */
2853 fix1 = _mm_add_ps(fix1,tx);
2854 fiy1 = _mm_add_ps(fiy1,ty);
2855 fiz1 = _mm_add_ps(fiz1,tz);
2857 fjx2 = _mm_add_ps(fjx2,tx);
2858 fjy2 = _mm_add_ps(fjy2,ty);
2859 fjz2 = _mm_add_ps(fjz2,tz);
2863 /**************************
2864 * CALCULATE INTERACTIONS *
2865 **************************/
2867 if (gmx_mm_any_lt(rsq20,rcutoff2))
2870 r20 = _mm_mul_ps(rsq20,rinv20);
2871 r20 = _mm_andnot_ps(dummy_mask,r20);
2873 /* EWALD ELECTROSTATICS */
2875 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2876 ewrt = _mm_mul_ps(r20,ewtabscale);
2877 ewitab = _mm_cvttps_epi32(ewrt);
2878 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2879 ewitab = _mm_slli_epi32(ewitab,2);
2880 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2881 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2882 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2883 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2884 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2885 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2886 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2887 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2888 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2890 d = _mm_sub_ps(r20,rswitch);
2891 d = _mm_max_ps(d,_mm_setzero_ps());
2892 d2 = _mm_mul_ps(d,d);
2893 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2895 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2897 /* Evaluate switch function */
2898 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2899 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2900 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2904 fscal = _mm_and_ps(fscal,cutoff_mask);
2906 fscal = _mm_andnot_ps(dummy_mask,fscal);
2908 /* Calculate temporary vectorial force */
2909 tx = _mm_mul_ps(fscal,dx20);
2910 ty = _mm_mul_ps(fscal,dy20);
2911 tz = _mm_mul_ps(fscal,dz20);
2913 /* Update vectorial force */
2914 fix2 = _mm_add_ps(fix2,tx);
2915 fiy2 = _mm_add_ps(fiy2,ty);
2916 fiz2 = _mm_add_ps(fiz2,tz);
2918 fjx0 = _mm_add_ps(fjx0,tx);
2919 fjy0 = _mm_add_ps(fjy0,ty);
2920 fjz0 = _mm_add_ps(fjz0,tz);
2924 /**************************
2925 * CALCULATE INTERACTIONS *
2926 **************************/
2928 if (gmx_mm_any_lt(rsq21,rcutoff2))
2931 r21 = _mm_mul_ps(rsq21,rinv21);
2932 r21 = _mm_andnot_ps(dummy_mask,r21);
2934 /* EWALD ELECTROSTATICS */
2936 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2937 ewrt = _mm_mul_ps(r21,ewtabscale);
2938 ewitab = _mm_cvttps_epi32(ewrt);
2939 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2940 ewitab = _mm_slli_epi32(ewitab,2);
2941 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2942 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2943 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2944 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2945 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2946 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2947 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2948 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2949 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2951 d = _mm_sub_ps(r21,rswitch);
2952 d = _mm_max_ps(d,_mm_setzero_ps());
2953 d2 = _mm_mul_ps(d,d);
2954 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2956 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2958 /* Evaluate switch function */
2959 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2960 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2961 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2965 fscal = _mm_and_ps(fscal,cutoff_mask);
2967 fscal = _mm_andnot_ps(dummy_mask,fscal);
2969 /* Calculate temporary vectorial force */
2970 tx = _mm_mul_ps(fscal,dx21);
2971 ty = _mm_mul_ps(fscal,dy21);
2972 tz = _mm_mul_ps(fscal,dz21);
2974 /* Update vectorial force */
2975 fix2 = _mm_add_ps(fix2,tx);
2976 fiy2 = _mm_add_ps(fiy2,ty);
2977 fiz2 = _mm_add_ps(fiz2,tz);
2979 fjx1 = _mm_add_ps(fjx1,tx);
2980 fjy1 = _mm_add_ps(fjy1,ty);
2981 fjz1 = _mm_add_ps(fjz1,tz);
2985 /**************************
2986 * CALCULATE INTERACTIONS *
2987 **************************/
2989 if (gmx_mm_any_lt(rsq22,rcutoff2))
2992 r22 = _mm_mul_ps(rsq22,rinv22);
2993 r22 = _mm_andnot_ps(dummy_mask,r22);
2995 /* EWALD ELECTROSTATICS */
2997 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2998 ewrt = _mm_mul_ps(r22,ewtabscale);
2999 ewitab = _mm_cvttps_epi32(ewrt);
3000 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3001 ewitab = _mm_slli_epi32(ewitab,2);
3002 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3003 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3004 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3005 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3006 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3007 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3008 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3009 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3010 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3012 d = _mm_sub_ps(r22,rswitch);
3013 d = _mm_max_ps(d,_mm_setzero_ps());
3014 d2 = _mm_mul_ps(d,d);
3015 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3017 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3019 /* Evaluate switch function */
3020 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3021 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3022 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3026 fscal = _mm_and_ps(fscal,cutoff_mask);
3028 fscal = _mm_andnot_ps(dummy_mask,fscal);
3030 /* Calculate temporary vectorial force */
3031 tx = _mm_mul_ps(fscal,dx22);
3032 ty = _mm_mul_ps(fscal,dy22);
3033 tz = _mm_mul_ps(fscal,dz22);
3035 /* Update vectorial force */
3036 fix2 = _mm_add_ps(fix2,tx);
3037 fiy2 = _mm_add_ps(fiy2,ty);
3038 fiz2 = _mm_add_ps(fiz2,tz);
3040 fjx2 = _mm_add_ps(fjx2,tx);
3041 fjy2 = _mm_add_ps(fjy2,ty);
3042 fjz2 = _mm_add_ps(fjz2,tz);
3046 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3047 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3048 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3049 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3051 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3052 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3054 /* Inner loop uses 567 flops */
3057 /* End of innermost loop */
3059 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3060 f+i_coord_offset,fshift+i_shift_offset);
3062 /* Increment number of inner iterations */
3063 inneriter += j_index_end - j_index_start;
3065 /* Outer loop uses 18 flops */
3068 /* Increment number of outer iterations */
3071 /* Update outer/inner flops */
3073 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*567);