2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: None
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_sse2_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
109 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
110 real rswitch_scalar,d_scalar;
111 __m128 dummy_mask,cutoff_mask;
112 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
113 __m128 one = _mm_set1_ps(1.0);
114 __m128 two = _mm_set1_ps(2.0);
120 jindex = nlist->jindex;
122 shiftidx = nlist->shift;
124 shiftvec = fr->shift_vec[0];
125 fshift = fr->fshift[0];
126 facel = _mm_set1_ps(fr->epsfac);
127 charge = mdatoms->chargeA;
129 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
130 ewtab = fr->ic->tabq_coul_FDV0;
131 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
132 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
134 /* Setup water-specific parameters */
135 inr = nlist->iinr[0];
136 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
137 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
138 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
140 jq0 = _mm_set1_ps(charge[inr+0]);
141 jq1 = _mm_set1_ps(charge[inr+1]);
142 jq2 = _mm_set1_ps(charge[inr+2]);
143 qq00 = _mm_mul_ps(iq0,jq0);
144 qq01 = _mm_mul_ps(iq0,jq1);
145 qq02 = _mm_mul_ps(iq0,jq2);
146 qq10 = _mm_mul_ps(iq1,jq0);
147 qq11 = _mm_mul_ps(iq1,jq1);
148 qq12 = _mm_mul_ps(iq1,jq2);
149 qq20 = _mm_mul_ps(iq2,jq0);
150 qq21 = _mm_mul_ps(iq2,jq1);
151 qq22 = _mm_mul_ps(iq2,jq2);
153 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
154 rcutoff_scalar = fr->rcoulomb;
155 rcutoff = _mm_set1_ps(rcutoff_scalar);
156 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
158 rswitch_scalar = fr->rcoulomb_switch;
159 rswitch = _mm_set1_ps(rswitch_scalar);
160 /* Setup switch parameters */
161 d_scalar = rcutoff_scalar-rswitch_scalar;
162 d = _mm_set1_ps(d_scalar);
163 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
164 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
165 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
166 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
167 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
168 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
170 /* Avoid stupid compiler warnings */
171 jnrA = jnrB = jnrC = jnrD = 0;
180 for(iidx=0;iidx<4*DIM;iidx++)
185 /* Start outer loop over neighborlists */
186 for(iidx=0; iidx<nri; iidx++)
188 /* Load shift vector for this list */
189 i_shift_offset = DIM*shiftidx[iidx];
191 /* Load limits for loop over neighbors */
192 j_index_start = jindex[iidx];
193 j_index_end = jindex[iidx+1];
195 /* Get outer coordinate index */
197 i_coord_offset = DIM*inr;
199 /* Load i particle coords and add shift vector */
200 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
201 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
203 fix0 = _mm_setzero_ps();
204 fiy0 = _mm_setzero_ps();
205 fiz0 = _mm_setzero_ps();
206 fix1 = _mm_setzero_ps();
207 fiy1 = _mm_setzero_ps();
208 fiz1 = _mm_setzero_ps();
209 fix2 = _mm_setzero_ps();
210 fiy2 = _mm_setzero_ps();
211 fiz2 = _mm_setzero_ps();
213 /* Reset potential sums */
214 velecsum = _mm_setzero_ps();
216 /* Start inner kernel loop */
217 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
220 /* Get j neighbor index, and coordinate index */
225 j_coord_offsetA = DIM*jnrA;
226 j_coord_offsetB = DIM*jnrB;
227 j_coord_offsetC = DIM*jnrC;
228 j_coord_offsetD = DIM*jnrD;
230 /* load j atom coordinates */
231 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
232 x+j_coord_offsetC,x+j_coord_offsetD,
233 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
235 /* Calculate displacement vector */
236 dx00 = _mm_sub_ps(ix0,jx0);
237 dy00 = _mm_sub_ps(iy0,jy0);
238 dz00 = _mm_sub_ps(iz0,jz0);
239 dx01 = _mm_sub_ps(ix0,jx1);
240 dy01 = _mm_sub_ps(iy0,jy1);
241 dz01 = _mm_sub_ps(iz0,jz1);
242 dx02 = _mm_sub_ps(ix0,jx2);
243 dy02 = _mm_sub_ps(iy0,jy2);
244 dz02 = _mm_sub_ps(iz0,jz2);
245 dx10 = _mm_sub_ps(ix1,jx0);
246 dy10 = _mm_sub_ps(iy1,jy0);
247 dz10 = _mm_sub_ps(iz1,jz0);
248 dx11 = _mm_sub_ps(ix1,jx1);
249 dy11 = _mm_sub_ps(iy1,jy1);
250 dz11 = _mm_sub_ps(iz1,jz1);
251 dx12 = _mm_sub_ps(ix1,jx2);
252 dy12 = _mm_sub_ps(iy1,jy2);
253 dz12 = _mm_sub_ps(iz1,jz2);
254 dx20 = _mm_sub_ps(ix2,jx0);
255 dy20 = _mm_sub_ps(iy2,jy0);
256 dz20 = _mm_sub_ps(iz2,jz0);
257 dx21 = _mm_sub_ps(ix2,jx1);
258 dy21 = _mm_sub_ps(iy2,jy1);
259 dz21 = _mm_sub_ps(iz2,jz1);
260 dx22 = _mm_sub_ps(ix2,jx2);
261 dy22 = _mm_sub_ps(iy2,jy2);
262 dz22 = _mm_sub_ps(iz2,jz2);
264 /* Calculate squared distance and things based on it */
265 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
266 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
267 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
268 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
269 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
270 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
271 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
272 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
273 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
275 rinv00 = gmx_mm_invsqrt_ps(rsq00);
276 rinv01 = gmx_mm_invsqrt_ps(rsq01);
277 rinv02 = gmx_mm_invsqrt_ps(rsq02);
278 rinv10 = gmx_mm_invsqrt_ps(rsq10);
279 rinv11 = gmx_mm_invsqrt_ps(rsq11);
280 rinv12 = gmx_mm_invsqrt_ps(rsq12);
281 rinv20 = gmx_mm_invsqrt_ps(rsq20);
282 rinv21 = gmx_mm_invsqrt_ps(rsq21);
283 rinv22 = gmx_mm_invsqrt_ps(rsq22);
285 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
286 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
287 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
288 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
289 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
290 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
291 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
292 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
293 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
295 fjx0 = _mm_setzero_ps();
296 fjy0 = _mm_setzero_ps();
297 fjz0 = _mm_setzero_ps();
298 fjx1 = _mm_setzero_ps();
299 fjy1 = _mm_setzero_ps();
300 fjz1 = _mm_setzero_ps();
301 fjx2 = _mm_setzero_ps();
302 fjy2 = _mm_setzero_ps();
303 fjz2 = _mm_setzero_ps();
305 /**************************
306 * CALCULATE INTERACTIONS *
307 **************************/
309 if (gmx_mm_any_lt(rsq00,rcutoff2))
312 r00 = _mm_mul_ps(rsq00,rinv00);
314 /* EWALD ELECTROSTATICS */
316 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
317 ewrt = _mm_mul_ps(r00,ewtabscale);
318 ewitab = _mm_cvttps_epi32(ewrt);
319 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
320 ewitab = _mm_slli_epi32(ewitab,2);
321 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
322 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
323 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
324 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
325 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
326 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
327 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
328 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
329 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
331 d = _mm_sub_ps(r00,rswitch);
332 d = _mm_max_ps(d,_mm_setzero_ps());
333 d2 = _mm_mul_ps(d,d);
334 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
336 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
338 /* Evaluate switch function */
339 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
340 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
341 velec = _mm_mul_ps(velec,sw);
342 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
344 /* Update potential sum for this i atom from the interaction with this j atom. */
345 velec = _mm_and_ps(velec,cutoff_mask);
346 velecsum = _mm_add_ps(velecsum,velec);
350 fscal = _mm_and_ps(fscal,cutoff_mask);
352 /* Calculate temporary vectorial force */
353 tx = _mm_mul_ps(fscal,dx00);
354 ty = _mm_mul_ps(fscal,dy00);
355 tz = _mm_mul_ps(fscal,dz00);
357 /* Update vectorial force */
358 fix0 = _mm_add_ps(fix0,tx);
359 fiy0 = _mm_add_ps(fiy0,ty);
360 fiz0 = _mm_add_ps(fiz0,tz);
362 fjx0 = _mm_add_ps(fjx0,tx);
363 fjy0 = _mm_add_ps(fjy0,ty);
364 fjz0 = _mm_add_ps(fjz0,tz);
368 /**************************
369 * CALCULATE INTERACTIONS *
370 **************************/
372 if (gmx_mm_any_lt(rsq01,rcutoff2))
375 r01 = _mm_mul_ps(rsq01,rinv01);
377 /* EWALD ELECTROSTATICS */
379 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
380 ewrt = _mm_mul_ps(r01,ewtabscale);
381 ewitab = _mm_cvttps_epi32(ewrt);
382 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
383 ewitab = _mm_slli_epi32(ewitab,2);
384 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
385 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
386 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
387 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
388 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
389 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
390 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
391 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
392 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
394 d = _mm_sub_ps(r01,rswitch);
395 d = _mm_max_ps(d,_mm_setzero_ps());
396 d2 = _mm_mul_ps(d,d);
397 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
399 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
401 /* Evaluate switch function */
402 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
403 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
404 velec = _mm_mul_ps(velec,sw);
405 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
407 /* Update potential sum for this i atom from the interaction with this j atom. */
408 velec = _mm_and_ps(velec,cutoff_mask);
409 velecsum = _mm_add_ps(velecsum,velec);
413 fscal = _mm_and_ps(fscal,cutoff_mask);
415 /* Calculate temporary vectorial force */
416 tx = _mm_mul_ps(fscal,dx01);
417 ty = _mm_mul_ps(fscal,dy01);
418 tz = _mm_mul_ps(fscal,dz01);
420 /* Update vectorial force */
421 fix0 = _mm_add_ps(fix0,tx);
422 fiy0 = _mm_add_ps(fiy0,ty);
423 fiz0 = _mm_add_ps(fiz0,tz);
425 fjx1 = _mm_add_ps(fjx1,tx);
426 fjy1 = _mm_add_ps(fjy1,ty);
427 fjz1 = _mm_add_ps(fjz1,tz);
431 /**************************
432 * CALCULATE INTERACTIONS *
433 **************************/
435 if (gmx_mm_any_lt(rsq02,rcutoff2))
438 r02 = _mm_mul_ps(rsq02,rinv02);
440 /* EWALD ELECTROSTATICS */
442 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
443 ewrt = _mm_mul_ps(r02,ewtabscale);
444 ewitab = _mm_cvttps_epi32(ewrt);
445 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
446 ewitab = _mm_slli_epi32(ewitab,2);
447 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
448 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
449 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
450 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
451 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
452 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
453 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
454 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
455 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
457 d = _mm_sub_ps(r02,rswitch);
458 d = _mm_max_ps(d,_mm_setzero_ps());
459 d2 = _mm_mul_ps(d,d);
460 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
462 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
464 /* Evaluate switch function */
465 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
466 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
467 velec = _mm_mul_ps(velec,sw);
468 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velec = _mm_and_ps(velec,cutoff_mask);
472 velecsum = _mm_add_ps(velecsum,velec);
476 fscal = _mm_and_ps(fscal,cutoff_mask);
478 /* Calculate temporary vectorial force */
479 tx = _mm_mul_ps(fscal,dx02);
480 ty = _mm_mul_ps(fscal,dy02);
481 tz = _mm_mul_ps(fscal,dz02);
483 /* Update vectorial force */
484 fix0 = _mm_add_ps(fix0,tx);
485 fiy0 = _mm_add_ps(fiy0,ty);
486 fiz0 = _mm_add_ps(fiz0,tz);
488 fjx2 = _mm_add_ps(fjx2,tx);
489 fjy2 = _mm_add_ps(fjy2,ty);
490 fjz2 = _mm_add_ps(fjz2,tz);
494 /**************************
495 * CALCULATE INTERACTIONS *
496 **************************/
498 if (gmx_mm_any_lt(rsq10,rcutoff2))
501 r10 = _mm_mul_ps(rsq10,rinv10);
503 /* EWALD ELECTROSTATICS */
505 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
506 ewrt = _mm_mul_ps(r10,ewtabscale);
507 ewitab = _mm_cvttps_epi32(ewrt);
508 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
509 ewitab = _mm_slli_epi32(ewitab,2);
510 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
511 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
512 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
513 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
514 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
515 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
516 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
517 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
518 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
520 d = _mm_sub_ps(r10,rswitch);
521 d = _mm_max_ps(d,_mm_setzero_ps());
522 d2 = _mm_mul_ps(d,d);
523 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
525 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
527 /* Evaluate switch function */
528 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
529 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
530 velec = _mm_mul_ps(velec,sw);
531 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
533 /* Update potential sum for this i atom from the interaction with this j atom. */
534 velec = _mm_and_ps(velec,cutoff_mask);
535 velecsum = _mm_add_ps(velecsum,velec);
539 fscal = _mm_and_ps(fscal,cutoff_mask);
541 /* Calculate temporary vectorial force */
542 tx = _mm_mul_ps(fscal,dx10);
543 ty = _mm_mul_ps(fscal,dy10);
544 tz = _mm_mul_ps(fscal,dz10);
546 /* Update vectorial force */
547 fix1 = _mm_add_ps(fix1,tx);
548 fiy1 = _mm_add_ps(fiy1,ty);
549 fiz1 = _mm_add_ps(fiz1,tz);
551 fjx0 = _mm_add_ps(fjx0,tx);
552 fjy0 = _mm_add_ps(fjy0,ty);
553 fjz0 = _mm_add_ps(fjz0,tz);
557 /**************************
558 * CALCULATE INTERACTIONS *
559 **************************/
561 if (gmx_mm_any_lt(rsq11,rcutoff2))
564 r11 = _mm_mul_ps(rsq11,rinv11);
566 /* EWALD ELECTROSTATICS */
568 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
569 ewrt = _mm_mul_ps(r11,ewtabscale);
570 ewitab = _mm_cvttps_epi32(ewrt);
571 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
572 ewitab = _mm_slli_epi32(ewitab,2);
573 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
574 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
575 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
576 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
577 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
578 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
579 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
580 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
581 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
583 d = _mm_sub_ps(r11,rswitch);
584 d = _mm_max_ps(d,_mm_setzero_ps());
585 d2 = _mm_mul_ps(d,d);
586 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
588 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
590 /* Evaluate switch function */
591 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
592 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
593 velec = _mm_mul_ps(velec,sw);
594 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
596 /* Update potential sum for this i atom from the interaction with this j atom. */
597 velec = _mm_and_ps(velec,cutoff_mask);
598 velecsum = _mm_add_ps(velecsum,velec);
602 fscal = _mm_and_ps(fscal,cutoff_mask);
604 /* Calculate temporary vectorial force */
605 tx = _mm_mul_ps(fscal,dx11);
606 ty = _mm_mul_ps(fscal,dy11);
607 tz = _mm_mul_ps(fscal,dz11);
609 /* Update vectorial force */
610 fix1 = _mm_add_ps(fix1,tx);
611 fiy1 = _mm_add_ps(fiy1,ty);
612 fiz1 = _mm_add_ps(fiz1,tz);
614 fjx1 = _mm_add_ps(fjx1,tx);
615 fjy1 = _mm_add_ps(fjy1,ty);
616 fjz1 = _mm_add_ps(fjz1,tz);
620 /**************************
621 * CALCULATE INTERACTIONS *
622 **************************/
624 if (gmx_mm_any_lt(rsq12,rcutoff2))
627 r12 = _mm_mul_ps(rsq12,rinv12);
629 /* EWALD ELECTROSTATICS */
631 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
632 ewrt = _mm_mul_ps(r12,ewtabscale);
633 ewitab = _mm_cvttps_epi32(ewrt);
634 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
635 ewitab = _mm_slli_epi32(ewitab,2);
636 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
637 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
638 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
639 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
640 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
641 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
642 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
643 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
644 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
646 d = _mm_sub_ps(r12,rswitch);
647 d = _mm_max_ps(d,_mm_setzero_ps());
648 d2 = _mm_mul_ps(d,d);
649 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
651 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
653 /* Evaluate switch function */
654 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
655 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
656 velec = _mm_mul_ps(velec,sw);
657 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
659 /* Update potential sum for this i atom from the interaction with this j atom. */
660 velec = _mm_and_ps(velec,cutoff_mask);
661 velecsum = _mm_add_ps(velecsum,velec);
665 fscal = _mm_and_ps(fscal,cutoff_mask);
667 /* Calculate temporary vectorial force */
668 tx = _mm_mul_ps(fscal,dx12);
669 ty = _mm_mul_ps(fscal,dy12);
670 tz = _mm_mul_ps(fscal,dz12);
672 /* Update vectorial force */
673 fix1 = _mm_add_ps(fix1,tx);
674 fiy1 = _mm_add_ps(fiy1,ty);
675 fiz1 = _mm_add_ps(fiz1,tz);
677 fjx2 = _mm_add_ps(fjx2,tx);
678 fjy2 = _mm_add_ps(fjy2,ty);
679 fjz2 = _mm_add_ps(fjz2,tz);
683 /**************************
684 * CALCULATE INTERACTIONS *
685 **************************/
687 if (gmx_mm_any_lt(rsq20,rcutoff2))
690 r20 = _mm_mul_ps(rsq20,rinv20);
692 /* EWALD ELECTROSTATICS */
694 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
695 ewrt = _mm_mul_ps(r20,ewtabscale);
696 ewitab = _mm_cvttps_epi32(ewrt);
697 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
698 ewitab = _mm_slli_epi32(ewitab,2);
699 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
700 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
701 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
702 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
703 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
704 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
705 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
706 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
707 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
709 d = _mm_sub_ps(r20,rswitch);
710 d = _mm_max_ps(d,_mm_setzero_ps());
711 d2 = _mm_mul_ps(d,d);
712 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
714 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
716 /* Evaluate switch function */
717 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
718 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
719 velec = _mm_mul_ps(velec,sw);
720 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
722 /* Update potential sum for this i atom from the interaction with this j atom. */
723 velec = _mm_and_ps(velec,cutoff_mask);
724 velecsum = _mm_add_ps(velecsum,velec);
728 fscal = _mm_and_ps(fscal,cutoff_mask);
730 /* Calculate temporary vectorial force */
731 tx = _mm_mul_ps(fscal,dx20);
732 ty = _mm_mul_ps(fscal,dy20);
733 tz = _mm_mul_ps(fscal,dz20);
735 /* Update vectorial force */
736 fix2 = _mm_add_ps(fix2,tx);
737 fiy2 = _mm_add_ps(fiy2,ty);
738 fiz2 = _mm_add_ps(fiz2,tz);
740 fjx0 = _mm_add_ps(fjx0,tx);
741 fjy0 = _mm_add_ps(fjy0,ty);
742 fjz0 = _mm_add_ps(fjz0,tz);
746 /**************************
747 * CALCULATE INTERACTIONS *
748 **************************/
750 if (gmx_mm_any_lt(rsq21,rcutoff2))
753 r21 = _mm_mul_ps(rsq21,rinv21);
755 /* EWALD ELECTROSTATICS */
757 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
758 ewrt = _mm_mul_ps(r21,ewtabscale);
759 ewitab = _mm_cvttps_epi32(ewrt);
760 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
761 ewitab = _mm_slli_epi32(ewitab,2);
762 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
763 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
764 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
765 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
766 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
767 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
768 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
769 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
770 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
772 d = _mm_sub_ps(r21,rswitch);
773 d = _mm_max_ps(d,_mm_setzero_ps());
774 d2 = _mm_mul_ps(d,d);
775 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
777 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
779 /* Evaluate switch function */
780 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
781 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
782 velec = _mm_mul_ps(velec,sw);
783 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
785 /* Update potential sum for this i atom from the interaction with this j atom. */
786 velec = _mm_and_ps(velec,cutoff_mask);
787 velecsum = _mm_add_ps(velecsum,velec);
791 fscal = _mm_and_ps(fscal,cutoff_mask);
793 /* Calculate temporary vectorial force */
794 tx = _mm_mul_ps(fscal,dx21);
795 ty = _mm_mul_ps(fscal,dy21);
796 tz = _mm_mul_ps(fscal,dz21);
798 /* Update vectorial force */
799 fix2 = _mm_add_ps(fix2,tx);
800 fiy2 = _mm_add_ps(fiy2,ty);
801 fiz2 = _mm_add_ps(fiz2,tz);
803 fjx1 = _mm_add_ps(fjx1,tx);
804 fjy1 = _mm_add_ps(fjy1,ty);
805 fjz1 = _mm_add_ps(fjz1,tz);
809 /**************************
810 * CALCULATE INTERACTIONS *
811 **************************/
813 if (gmx_mm_any_lt(rsq22,rcutoff2))
816 r22 = _mm_mul_ps(rsq22,rinv22);
818 /* EWALD ELECTROSTATICS */
820 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
821 ewrt = _mm_mul_ps(r22,ewtabscale);
822 ewitab = _mm_cvttps_epi32(ewrt);
823 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
824 ewitab = _mm_slli_epi32(ewitab,2);
825 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
826 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
827 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
828 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
829 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
830 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
831 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
832 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
833 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
835 d = _mm_sub_ps(r22,rswitch);
836 d = _mm_max_ps(d,_mm_setzero_ps());
837 d2 = _mm_mul_ps(d,d);
838 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
840 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
842 /* Evaluate switch function */
843 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
844 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
845 velec = _mm_mul_ps(velec,sw);
846 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 velec = _mm_and_ps(velec,cutoff_mask);
850 velecsum = _mm_add_ps(velecsum,velec);
854 fscal = _mm_and_ps(fscal,cutoff_mask);
856 /* Calculate temporary vectorial force */
857 tx = _mm_mul_ps(fscal,dx22);
858 ty = _mm_mul_ps(fscal,dy22);
859 tz = _mm_mul_ps(fscal,dz22);
861 /* Update vectorial force */
862 fix2 = _mm_add_ps(fix2,tx);
863 fiy2 = _mm_add_ps(fiy2,ty);
864 fiz2 = _mm_add_ps(fiz2,tz);
866 fjx2 = _mm_add_ps(fjx2,tx);
867 fjy2 = _mm_add_ps(fjy2,ty);
868 fjz2 = _mm_add_ps(fjz2,tz);
872 fjptrA = f+j_coord_offsetA;
873 fjptrB = f+j_coord_offsetB;
874 fjptrC = f+j_coord_offsetC;
875 fjptrD = f+j_coord_offsetD;
877 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
878 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
880 /* Inner loop uses 585 flops */
886 /* Get j neighbor index, and coordinate index */
887 jnrlistA = jjnr[jidx];
888 jnrlistB = jjnr[jidx+1];
889 jnrlistC = jjnr[jidx+2];
890 jnrlistD = jjnr[jidx+3];
891 /* Sign of each element will be negative for non-real atoms.
892 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
893 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
895 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
896 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
897 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
898 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
899 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
900 j_coord_offsetA = DIM*jnrA;
901 j_coord_offsetB = DIM*jnrB;
902 j_coord_offsetC = DIM*jnrC;
903 j_coord_offsetD = DIM*jnrD;
905 /* load j atom coordinates */
906 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
907 x+j_coord_offsetC,x+j_coord_offsetD,
908 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
910 /* Calculate displacement vector */
911 dx00 = _mm_sub_ps(ix0,jx0);
912 dy00 = _mm_sub_ps(iy0,jy0);
913 dz00 = _mm_sub_ps(iz0,jz0);
914 dx01 = _mm_sub_ps(ix0,jx1);
915 dy01 = _mm_sub_ps(iy0,jy1);
916 dz01 = _mm_sub_ps(iz0,jz1);
917 dx02 = _mm_sub_ps(ix0,jx2);
918 dy02 = _mm_sub_ps(iy0,jy2);
919 dz02 = _mm_sub_ps(iz0,jz2);
920 dx10 = _mm_sub_ps(ix1,jx0);
921 dy10 = _mm_sub_ps(iy1,jy0);
922 dz10 = _mm_sub_ps(iz1,jz0);
923 dx11 = _mm_sub_ps(ix1,jx1);
924 dy11 = _mm_sub_ps(iy1,jy1);
925 dz11 = _mm_sub_ps(iz1,jz1);
926 dx12 = _mm_sub_ps(ix1,jx2);
927 dy12 = _mm_sub_ps(iy1,jy2);
928 dz12 = _mm_sub_ps(iz1,jz2);
929 dx20 = _mm_sub_ps(ix2,jx0);
930 dy20 = _mm_sub_ps(iy2,jy0);
931 dz20 = _mm_sub_ps(iz2,jz0);
932 dx21 = _mm_sub_ps(ix2,jx1);
933 dy21 = _mm_sub_ps(iy2,jy1);
934 dz21 = _mm_sub_ps(iz2,jz1);
935 dx22 = _mm_sub_ps(ix2,jx2);
936 dy22 = _mm_sub_ps(iy2,jy2);
937 dz22 = _mm_sub_ps(iz2,jz2);
939 /* Calculate squared distance and things based on it */
940 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
941 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
942 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
943 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
944 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
945 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
946 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
947 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
948 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
950 rinv00 = gmx_mm_invsqrt_ps(rsq00);
951 rinv01 = gmx_mm_invsqrt_ps(rsq01);
952 rinv02 = gmx_mm_invsqrt_ps(rsq02);
953 rinv10 = gmx_mm_invsqrt_ps(rsq10);
954 rinv11 = gmx_mm_invsqrt_ps(rsq11);
955 rinv12 = gmx_mm_invsqrt_ps(rsq12);
956 rinv20 = gmx_mm_invsqrt_ps(rsq20);
957 rinv21 = gmx_mm_invsqrt_ps(rsq21);
958 rinv22 = gmx_mm_invsqrt_ps(rsq22);
960 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
961 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
962 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
963 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
964 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
965 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
966 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
967 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
968 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
970 fjx0 = _mm_setzero_ps();
971 fjy0 = _mm_setzero_ps();
972 fjz0 = _mm_setzero_ps();
973 fjx1 = _mm_setzero_ps();
974 fjy1 = _mm_setzero_ps();
975 fjz1 = _mm_setzero_ps();
976 fjx2 = _mm_setzero_ps();
977 fjy2 = _mm_setzero_ps();
978 fjz2 = _mm_setzero_ps();
980 /**************************
981 * CALCULATE INTERACTIONS *
982 **************************/
984 if (gmx_mm_any_lt(rsq00,rcutoff2))
987 r00 = _mm_mul_ps(rsq00,rinv00);
988 r00 = _mm_andnot_ps(dummy_mask,r00);
990 /* EWALD ELECTROSTATICS */
992 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
993 ewrt = _mm_mul_ps(r00,ewtabscale);
994 ewitab = _mm_cvttps_epi32(ewrt);
995 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
996 ewitab = _mm_slli_epi32(ewitab,2);
997 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
998 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
999 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1000 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1001 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1002 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1003 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1004 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1005 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1007 d = _mm_sub_ps(r00,rswitch);
1008 d = _mm_max_ps(d,_mm_setzero_ps());
1009 d2 = _mm_mul_ps(d,d);
1010 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1012 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1014 /* Evaluate switch function */
1015 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1016 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1017 velec = _mm_mul_ps(velec,sw);
1018 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1020 /* Update potential sum for this i atom from the interaction with this j atom. */
1021 velec = _mm_and_ps(velec,cutoff_mask);
1022 velec = _mm_andnot_ps(dummy_mask,velec);
1023 velecsum = _mm_add_ps(velecsum,velec);
1027 fscal = _mm_and_ps(fscal,cutoff_mask);
1029 fscal = _mm_andnot_ps(dummy_mask,fscal);
1031 /* Calculate temporary vectorial force */
1032 tx = _mm_mul_ps(fscal,dx00);
1033 ty = _mm_mul_ps(fscal,dy00);
1034 tz = _mm_mul_ps(fscal,dz00);
1036 /* Update vectorial force */
1037 fix0 = _mm_add_ps(fix0,tx);
1038 fiy0 = _mm_add_ps(fiy0,ty);
1039 fiz0 = _mm_add_ps(fiz0,tz);
1041 fjx0 = _mm_add_ps(fjx0,tx);
1042 fjy0 = _mm_add_ps(fjy0,ty);
1043 fjz0 = _mm_add_ps(fjz0,tz);
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 if (gmx_mm_any_lt(rsq01,rcutoff2))
1054 r01 = _mm_mul_ps(rsq01,rinv01);
1055 r01 = _mm_andnot_ps(dummy_mask,r01);
1057 /* EWALD ELECTROSTATICS */
1059 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1060 ewrt = _mm_mul_ps(r01,ewtabscale);
1061 ewitab = _mm_cvttps_epi32(ewrt);
1062 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1063 ewitab = _mm_slli_epi32(ewitab,2);
1064 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1065 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1066 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1067 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1068 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1069 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1070 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1071 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1072 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1074 d = _mm_sub_ps(r01,rswitch);
1075 d = _mm_max_ps(d,_mm_setzero_ps());
1076 d2 = _mm_mul_ps(d,d);
1077 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1079 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1081 /* Evaluate switch function */
1082 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1083 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1084 velec = _mm_mul_ps(velec,sw);
1085 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1087 /* Update potential sum for this i atom from the interaction with this j atom. */
1088 velec = _mm_and_ps(velec,cutoff_mask);
1089 velec = _mm_andnot_ps(dummy_mask,velec);
1090 velecsum = _mm_add_ps(velecsum,velec);
1094 fscal = _mm_and_ps(fscal,cutoff_mask);
1096 fscal = _mm_andnot_ps(dummy_mask,fscal);
1098 /* Calculate temporary vectorial force */
1099 tx = _mm_mul_ps(fscal,dx01);
1100 ty = _mm_mul_ps(fscal,dy01);
1101 tz = _mm_mul_ps(fscal,dz01);
1103 /* Update vectorial force */
1104 fix0 = _mm_add_ps(fix0,tx);
1105 fiy0 = _mm_add_ps(fiy0,ty);
1106 fiz0 = _mm_add_ps(fiz0,tz);
1108 fjx1 = _mm_add_ps(fjx1,tx);
1109 fjy1 = _mm_add_ps(fjy1,ty);
1110 fjz1 = _mm_add_ps(fjz1,tz);
1114 /**************************
1115 * CALCULATE INTERACTIONS *
1116 **************************/
1118 if (gmx_mm_any_lt(rsq02,rcutoff2))
1121 r02 = _mm_mul_ps(rsq02,rinv02);
1122 r02 = _mm_andnot_ps(dummy_mask,r02);
1124 /* EWALD ELECTROSTATICS */
1126 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1127 ewrt = _mm_mul_ps(r02,ewtabscale);
1128 ewitab = _mm_cvttps_epi32(ewrt);
1129 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1130 ewitab = _mm_slli_epi32(ewitab,2);
1131 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1132 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1133 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1134 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1135 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1136 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1137 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1138 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1139 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1141 d = _mm_sub_ps(r02,rswitch);
1142 d = _mm_max_ps(d,_mm_setzero_ps());
1143 d2 = _mm_mul_ps(d,d);
1144 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1146 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1148 /* Evaluate switch function */
1149 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1150 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1151 velec = _mm_mul_ps(velec,sw);
1152 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1154 /* Update potential sum for this i atom from the interaction with this j atom. */
1155 velec = _mm_and_ps(velec,cutoff_mask);
1156 velec = _mm_andnot_ps(dummy_mask,velec);
1157 velecsum = _mm_add_ps(velecsum,velec);
1161 fscal = _mm_and_ps(fscal,cutoff_mask);
1163 fscal = _mm_andnot_ps(dummy_mask,fscal);
1165 /* Calculate temporary vectorial force */
1166 tx = _mm_mul_ps(fscal,dx02);
1167 ty = _mm_mul_ps(fscal,dy02);
1168 tz = _mm_mul_ps(fscal,dz02);
1170 /* Update vectorial force */
1171 fix0 = _mm_add_ps(fix0,tx);
1172 fiy0 = _mm_add_ps(fiy0,ty);
1173 fiz0 = _mm_add_ps(fiz0,tz);
1175 fjx2 = _mm_add_ps(fjx2,tx);
1176 fjy2 = _mm_add_ps(fjy2,ty);
1177 fjz2 = _mm_add_ps(fjz2,tz);
1181 /**************************
1182 * CALCULATE INTERACTIONS *
1183 **************************/
1185 if (gmx_mm_any_lt(rsq10,rcutoff2))
1188 r10 = _mm_mul_ps(rsq10,rinv10);
1189 r10 = _mm_andnot_ps(dummy_mask,r10);
1191 /* EWALD ELECTROSTATICS */
1193 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1194 ewrt = _mm_mul_ps(r10,ewtabscale);
1195 ewitab = _mm_cvttps_epi32(ewrt);
1196 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1197 ewitab = _mm_slli_epi32(ewitab,2);
1198 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1199 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1200 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1201 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1202 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1203 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1204 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1205 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1206 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1208 d = _mm_sub_ps(r10,rswitch);
1209 d = _mm_max_ps(d,_mm_setzero_ps());
1210 d2 = _mm_mul_ps(d,d);
1211 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1213 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1215 /* Evaluate switch function */
1216 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1217 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1218 velec = _mm_mul_ps(velec,sw);
1219 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1221 /* Update potential sum for this i atom from the interaction with this j atom. */
1222 velec = _mm_and_ps(velec,cutoff_mask);
1223 velec = _mm_andnot_ps(dummy_mask,velec);
1224 velecsum = _mm_add_ps(velecsum,velec);
1228 fscal = _mm_and_ps(fscal,cutoff_mask);
1230 fscal = _mm_andnot_ps(dummy_mask,fscal);
1232 /* Calculate temporary vectorial force */
1233 tx = _mm_mul_ps(fscal,dx10);
1234 ty = _mm_mul_ps(fscal,dy10);
1235 tz = _mm_mul_ps(fscal,dz10);
1237 /* Update vectorial force */
1238 fix1 = _mm_add_ps(fix1,tx);
1239 fiy1 = _mm_add_ps(fiy1,ty);
1240 fiz1 = _mm_add_ps(fiz1,tz);
1242 fjx0 = _mm_add_ps(fjx0,tx);
1243 fjy0 = _mm_add_ps(fjy0,ty);
1244 fjz0 = _mm_add_ps(fjz0,tz);
1248 /**************************
1249 * CALCULATE INTERACTIONS *
1250 **************************/
1252 if (gmx_mm_any_lt(rsq11,rcutoff2))
1255 r11 = _mm_mul_ps(rsq11,rinv11);
1256 r11 = _mm_andnot_ps(dummy_mask,r11);
1258 /* EWALD ELECTROSTATICS */
1260 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1261 ewrt = _mm_mul_ps(r11,ewtabscale);
1262 ewitab = _mm_cvttps_epi32(ewrt);
1263 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1264 ewitab = _mm_slli_epi32(ewitab,2);
1265 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1266 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1267 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1268 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1269 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1270 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1271 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1272 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1273 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1275 d = _mm_sub_ps(r11,rswitch);
1276 d = _mm_max_ps(d,_mm_setzero_ps());
1277 d2 = _mm_mul_ps(d,d);
1278 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1280 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1282 /* Evaluate switch function */
1283 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1284 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1285 velec = _mm_mul_ps(velec,sw);
1286 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1288 /* Update potential sum for this i atom from the interaction with this j atom. */
1289 velec = _mm_and_ps(velec,cutoff_mask);
1290 velec = _mm_andnot_ps(dummy_mask,velec);
1291 velecsum = _mm_add_ps(velecsum,velec);
1295 fscal = _mm_and_ps(fscal,cutoff_mask);
1297 fscal = _mm_andnot_ps(dummy_mask,fscal);
1299 /* Calculate temporary vectorial force */
1300 tx = _mm_mul_ps(fscal,dx11);
1301 ty = _mm_mul_ps(fscal,dy11);
1302 tz = _mm_mul_ps(fscal,dz11);
1304 /* Update vectorial force */
1305 fix1 = _mm_add_ps(fix1,tx);
1306 fiy1 = _mm_add_ps(fiy1,ty);
1307 fiz1 = _mm_add_ps(fiz1,tz);
1309 fjx1 = _mm_add_ps(fjx1,tx);
1310 fjy1 = _mm_add_ps(fjy1,ty);
1311 fjz1 = _mm_add_ps(fjz1,tz);
1315 /**************************
1316 * CALCULATE INTERACTIONS *
1317 **************************/
1319 if (gmx_mm_any_lt(rsq12,rcutoff2))
1322 r12 = _mm_mul_ps(rsq12,rinv12);
1323 r12 = _mm_andnot_ps(dummy_mask,r12);
1325 /* EWALD ELECTROSTATICS */
1327 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1328 ewrt = _mm_mul_ps(r12,ewtabscale);
1329 ewitab = _mm_cvttps_epi32(ewrt);
1330 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1331 ewitab = _mm_slli_epi32(ewitab,2);
1332 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1333 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1334 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1335 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1336 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1337 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1338 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1339 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1340 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1342 d = _mm_sub_ps(r12,rswitch);
1343 d = _mm_max_ps(d,_mm_setzero_ps());
1344 d2 = _mm_mul_ps(d,d);
1345 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1347 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1349 /* Evaluate switch function */
1350 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1351 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1352 velec = _mm_mul_ps(velec,sw);
1353 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1355 /* Update potential sum for this i atom from the interaction with this j atom. */
1356 velec = _mm_and_ps(velec,cutoff_mask);
1357 velec = _mm_andnot_ps(dummy_mask,velec);
1358 velecsum = _mm_add_ps(velecsum,velec);
1362 fscal = _mm_and_ps(fscal,cutoff_mask);
1364 fscal = _mm_andnot_ps(dummy_mask,fscal);
1366 /* Calculate temporary vectorial force */
1367 tx = _mm_mul_ps(fscal,dx12);
1368 ty = _mm_mul_ps(fscal,dy12);
1369 tz = _mm_mul_ps(fscal,dz12);
1371 /* Update vectorial force */
1372 fix1 = _mm_add_ps(fix1,tx);
1373 fiy1 = _mm_add_ps(fiy1,ty);
1374 fiz1 = _mm_add_ps(fiz1,tz);
1376 fjx2 = _mm_add_ps(fjx2,tx);
1377 fjy2 = _mm_add_ps(fjy2,ty);
1378 fjz2 = _mm_add_ps(fjz2,tz);
1382 /**************************
1383 * CALCULATE INTERACTIONS *
1384 **************************/
1386 if (gmx_mm_any_lt(rsq20,rcutoff2))
1389 r20 = _mm_mul_ps(rsq20,rinv20);
1390 r20 = _mm_andnot_ps(dummy_mask,r20);
1392 /* EWALD ELECTROSTATICS */
1394 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1395 ewrt = _mm_mul_ps(r20,ewtabscale);
1396 ewitab = _mm_cvttps_epi32(ewrt);
1397 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1398 ewitab = _mm_slli_epi32(ewitab,2);
1399 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1400 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1401 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1402 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1403 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1404 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1405 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1406 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1407 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1409 d = _mm_sub_ps(r20,rswitch);
1410 d = _mm_max_ps(d,_mm_setzero_ps());
1411 d2 = _mm_mul_ps(d,d);
1412 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1414 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1416 /* Evaluate switch function */
1417 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1418 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1419 velec = _mm_mul_ps(velec,sw);
1420 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1422 /* Update potential sum for this i atom from the interaction with this j atom. */
1423 velec = _mm_and_ps(velec,cutoff_mask);
1424 velec = _mm_andnot_ps(dummy_mask,velec);
1425 velecsum = _mm_add_ps(velecsum,velec);
1429 fscal = _mm_and_ps(fscal,cutoff_mask);
1431 fscal = _mm_andnot_ps(dummy_mask,fscal);
1433 /* Calculate temporary vectorial force */
1434 tx = _mm_mul_ps(fscal,dx20);
1435 ty = _mm_mul_ps(fscal,dy20);
1436 tz = _mm_mul_ps(fscal,dz20);
1438 /* Update vectorial force */
1439 fix2 = _mm_add_ps(fix2,tx);
1440 fiy2 = _mm_add_ps(fiy2,ty);
1441 fiz2 = _mm_add_ps(fiz2,tz);
1443 fjx0 = _mm_add_ps(fjx0,tx);
1444 fjy0 = _mm_add_ps(fjy0,ty);
1445 fjz0 = _mm_add_ps(fjz0,tz);
1449 /**************************
1450 * CALCULATE INTERACTIONS *
1451 **************************/
1453 if (gmx_mm_any_lt(rsq21,rcutoff2))
1456 r21 = _mm_mul_ps(rsq21,rinv21);
1457 r21 = _mm_andnot_ps(dummy_mask,r21);
1459 /* EWALD ELECTROSTATICS */
1461 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1462 ewrt = _mm_mul_ps(r21,ewtabscale);
1463 ewitab = _mm_cvttps_epi32(ewrt);
1464 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1465 ewitab = _mm_slli_epi32(ewitab,2);
1466 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1467 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1468 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1469 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1470 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1471 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1472 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1473 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1474 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1476 d = _mm_sub_ps(r21,rswitch);
1477 d = _mm_max_ps(d,_mm_setzero_ps());
1478 d2 = _mm_mul_ps(d,d);
1479 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1481 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1483 /* Evaluate switch function */
1484 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1485 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1486 velec = _mm_mul_ps(velec,sw);
1487 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1489 /* Update potential sum for this i atom from the interaction with this j atom. */
1490 velec = _mm_and_ps(velec,cutoff_mask);
1491 velec = _mm_andnot_ps(dummy_mask,velec);
1492 velecsum = _mm_add_ps(velecsum,velec);
1496 fscal = _mm_and_ps(fscal,cutoff_mask);
1498 fscal = _mm_andnot_ps(dummy_mask,fscal);
1500 /* Calculate temporary vectorial force */
1501 tx = _mm_mul_ps(fscal,dx21);
1502 ty = _mm_mul_ps(fscal,dy21);
1503 tz = _mm_mul_ps(fscal,dz21);
1505 /* Update vectorial force */
1506 fix2 = _mm_add_ps(fix2,tx);
1507 fiy2 = _mm_add_ps(fiy2,ty);
1508 fiz2 = _mm_add_ps(fiz2,tz);
1510 fjx1 = _mm_add_ps(fjx1,tx);
1511 fjy1 = _mm_add_ps(fjy1,ty);
1512 fjz1 = _mm_add_ps(fjz1,tz);
1516 /**************************
1517 * CALCULATE INTERACTIONS *
1518 **************************/
1520 if (gmx_mm_any_lt(rsq22,rcutoff2))
1523 r22 = _mm_mul_ps(rsq22,rinv22);
1524 r22 = _mm_andnot_ps(dummy_mask,r22);
1526 /* EWALD ELECTROSTATICS */
1528 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1529 ewrt = _mm_mul_ps(r22,ewtabscale);
1530 ewitab = _mm_cvttps_epi32(ewrt);
1531 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1532 ewitab = _mm_slli_epi32(ewitab,2);
1533 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1534 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1535 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1536 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1537 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1538 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1539 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1540 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1541 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1543 d = _mm_sub_ps(r22,rswitch);
1544 d = _mm_max_ps(d,_mm_setzero_ps());
1545 d2 = _mm_mul_ps(d,d);
1546 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1548 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1550 /* Evaluate switch function */
1551 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1552 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1553 velec = _mm_mul_ps(velec,sw);
1554 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1556 /* Update potential sum for this i atom from the interaction with this j atom. */
1557 velec = _mm_and_ps(velec,cutoff_mask);
1558 velec = _mm_andnot_ps(dummy_mask,velec);
1559 velecsum = _mm_add_ps(velecsum,velec);
1563 fscal = _mm_and_ps(fscal,cutoff_mask);
1565 fscal = _mm_andnot_ps(dummy_mask,fscal);
1567 /* Calculate temporary vectorial force */
1568 tx = _mm_mul_ps(fscal,dx22);
1569 ty = _mm_mul_ps(fscal,dy22);
1570 tz = _mm_mul_ps(fscal,dz22);
1572 /* Update vectorial force */
1573 fix2 = _mm_add_ps(fix2,tx);
1574 fiy2 = _mm_add_ps(fiy2,ty);
1575 fiz2 = _mm_add_ps(fiz2,tz);
1577 fjx2 = _mm_add_ps(fjx2,tx);
1578 fjy2 = _mm_add_ps(fjy2,ty);
1579 fjz2 = _mm_add_ps(fjz2,tz);
1583 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1584 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1585 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1586 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1588 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1589 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1591 /* Inner loop uses 594 flops */
1594 /* End of innermost loop */
1596 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1597 f+i_coord_offset,fshift+i_shift_offset);
1600 /* Update potential energies */
1601 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1603 /* Increment number of inner iterations */
1604 inneriter += j_index_end - j_index_start;
1606 /* Outer loop uses 19 flops */
1609 /* Increment number of outer iterations */
1612 /* Update outer/inner flops */
1614 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*594);
1617 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
1618 * Electrostatics interaction: Ewald
1619 * VdW interaction: None
1620 * Geometry: Water3-Water3
1621 * Calculate force/pot: Force
1624 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_sse2_single
1625 (t_nblist * gmx_restrict nlist,
1626 rvec * gmx_restrict xx,
1627 rvec * gmx_restrict ff,
1628 t_forcerec * gmx_restrict fr,
1629 t_mdatoms * gmx_restrict mdatoms,
1630 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1631 t_nrnb * gmx_restrict nrnb)
1633 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1634 * just 0 for non-waters.
1635 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1636 * jnr indices corresponding to data put in the four positions in the SIMD register.
1638 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1639 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1640 int jnrA,jnrB,jnrC,jnrD;
1641 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1642 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1643 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1644 real rcutoff_scalar;
1645 real *shiftvec,*fshift,*x,*f;
1646 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1647 real scratch[4*DIM];
1648 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1650 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1652 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1654 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1655 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1656 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1657 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1658 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1659 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1660 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1661 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1662 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1663 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1664 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1665 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1666 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1667 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1668 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1669 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1670 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1673 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1675 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1676 real rswitch_scalar,d_scalar;
1677 __m128 dummy_mask,cutoff_mask;
1678 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1679 __m128 one = _mm_set1_ps(1.0);
1680 __m128 two = _mm_set1_ps(2.0);
1686 jindex = nlist->jindex;
1688 shiftidx = nlist->shift;
1690 shiftvec = fr->shift_vec[0];
1691 fshift = fr->fshift[0];
1692 facel = _mm_set1_ps(fr->epsfac);
1693 charge = mdatoms->chargeA;
1695 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1696 ewtab = fr->ic->tabq_coul_FDV0;
1697 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1698 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1700 /* Setup water-specific parameters */
1701 inr = nlist->iinr[0];
1702 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1703 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1704 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1706 jq0 = _mm_set1_ps(charge[inr+0]);
1707 jq1 = _mm_set1_ps(charge[inr+1]);
1708 jq2 = _mm_set1_ps(charge[inr+2]);
1709 qq00 = _mm_mul_ps(iq0,jq0);
1710 qq01 = _mm_mul_ps(iq0,jq1);
1711 qq02 = _mm_mul_ps(iq0,jq2);
1712 qq10 = _mm_mul_ps(iq1,jq0);
1713 qq11 = _mm_mul_ps(iq1,jq1);
1714 qq12 = _mm_mul_ps(iq1,jq2);
1715 qq20 = _mm_mul_ps(iq2,jq0);
1716 qq21 = _mm_mul_ps(iq2,jq1);
1717 qq22 = _mm_mul_ps(iq2,jq2);
1719 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1720 rcutoff_scalar = fr->rcoulomb;
1721 rcutoff = _mm_set1_ps(rcutoff_scalar);
1722 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1724 rswitch_scalar = fr->rcoulomb_switch;
1725 rswitch = _mm_set1_ps(rswitch_scalar);
1726 /* Setup switch parameters */
1727 d_scalar = rcutoff_scalar-rswitch_scalar;
1728 d = _mm_set1_ps(d_scalar);
1729 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1730 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1731 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1732 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1733 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1734 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1736 /* Avoid stupid compiler warnings */
1737 jnrA = jnrB = jnrC = jnrD = 0;
1738 j_coord_offsetA = 0;
1739 j_coord_offsetB = 0;
1740 j_coord_offsetC = 0;
1741 j_coord_offsetD = 0;
1746 for(iidx=0;iidx<4*DIM;iidx++)
1748 scratch[iidx] = 0.0;
1751 /* Start outer loop over neighborlists */
1752 for(iidx=0; iidx<nri; iidx++)
1754 /* Load shift vector for this list */
1755 i_shift_offset = DIM*shiftidx[iidx];
1757 /* Load limits for loop over neighbors */
1758 j_index_start = jindex[iidx];
1759 j_index_end = jindex[iidx+1];
1761 /* Get outer coordinate index */
1763 i_coord_offset = DIM*inr;
1765 /* Load i particle coords and add shift vector */
1766 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1767 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1769 fix0 = _mm_setzero_ps();
1770 fiy0 = _mm_setzero_ps();
1771 fiz0 = _mm_setzero_ps();
1772 fix1 = _mm_setzero_ps();
1773 fiy1 = _mm_setzero_ps();
1774 fiz1 = _mm_setzero_ps();
1775 fix2 = _mm_setzero_ps();
1776 fiy2 = _mm_setzero_ps();
1777 fiz2 = _mm_setzero_ps();
1779 /* Start inner kernel loop */
1780 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1783 /* Get j neighbor index, and coordinate index */
1785 jnrB = jjnr[jidx+1];
1786 jnrC = jjnr[jidx+2];
1787 jnrD = jjnr[jidx+3];
1788 j_coord_offsetA = DIM*jnrA;
1789 j_coord_offsetB = DIM*jnrB;
1790 j_coord_offsetC = DIM*jnrC;
1791 j_coord_offsetD = DIM*jnrD;
1793 /* load j atom coordinates */
1794 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1795 x+j_coord_offsetC,x+j_coord_offsetD,
1796 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1798 /* Calculate displacement vector */
1799 dx00 = _mm_sub_ps(ix0,jx0);
1800 dy00 = _mm_sub_ps(iy0,jy0);
1801 dz00 = _mm_sub_ps(iz0,jz0);
1802 dx01 = _mm_sub_ps(ix0,jx1);
1803 dy01 = _mm_sub_ps(iy0,jy1);
1804 dz01 = _mm_sub_ps(iz0,jz1);
1805 dx02 = _mm_sub_ps(ix0,jx2);
1806 dy02 = _mm_sub_ps(iy0,jy2);
1807 dz02 = _mm_sub_ps(iz0,jz2);
1808 dx10 = _mm_sub_ps(ix1,jx0);
1809 dy10 = _mm_sub_ps(iy1,jy0);
1810 dz10 = _mm_sub_ps(iz1,jz0);
1811 dx11 = _mm_sub_ps(ix1,jx1);
1812 dy11 = _mm_sub_ps(iy1,jy1);
1813 dz11 = _mm_sub_ps(iz1,jz1);
1814 dx12 = _mm_sub_ps(ix1,jx2);
1815 dy12 = _mm_sub_ps(iy1,jy2);
1816 dz12 = _mm_sub_ps(iz1,jz2);
1817 dx20 = _mm_sub_ps(ix2,jx0);
1818 dy20 = _mm_sub_ps(iy2,jy0);
1819 dz20 = _mm_sub_ps(iz2,jz0);
1820 dx21 = _mm_sub_ps(ix2,jx1);
1821 dy21 = _mm_sub_ps(iy2,jy1);
1822 dz21 = _mm_sub_ps(iz2,jz1);
1823 dx22 = _mm_sub_ps(ix2,jx2);
1824 dy22 = _mm_sub_ps(iy2,jy2);
1825 dz22 = _mm_sub_ps(iz2,jz2);
1827 /* Calculate squared distance and things based on it */
1828 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1829 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1830 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1831 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1832 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1833 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1834 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1835 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1836 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1838 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1839 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1840 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1841 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1842 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1843 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1844 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1845 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1846 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1848 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1849 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1850 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1851 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1852 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1853 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1854 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1855 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1856 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1858 fjx0 = _mm_setzero_ps();
1859 fjy0 = _mm_setzero_ps();
1860 fjz0 = _mm_setzero_ps();
1861 fjx1 = _mm_setzero_ps();
1862 fjy1 = _mm_setzero_ps();
1863 fjz1 = _mm_setzero_ps();
1864 fjx2 = _mm_setzero_ps();
1865 fjy2 = _mm_setzero_ps();
1866 fjz2 = _mm_setzero_ps();
1868 /**************************
1869 * CALCULATE INTERACTIONS *
1870 **************************/
1872 if (gmx_mm_any_lt(rsq00,rcutoff2))
1875 r00 = _mm_mul_ps(rsq00,rinv00);
1877 /* EWALD ELECTROSTATICS */
1879 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1880 ewrt = _mm_mul_ps(r00,ewtabscale);
1881 ewitab = _mm_cvttps_epi32(ewrt);
1882 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1883 ewitab = _mm_slli_epi32(ewitab,2);
1884 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1885 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1886 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1887 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1888 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1889 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1890 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1891 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1892 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1894 d = _mm_sub_ps(r00,rswitch);
1895 d = _mm_max_ps(d,_mm_setzero_ps());
1896 d2 = _mm_mul_ps(d,d);
1897 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1899 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1901 /* Evaluate switch function */
1902 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1903 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1904 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1908 fscal = _mm_and_ps(fscal,cutoff_mask);
1910 /* Calculate temporary vectorial force */
1911 tx = _mm_mul_ps(fscal,dx00);
1912 ty = _mm_mul_ps(fscal,dy00);
1913 tz = _mm_mul_ps(fscal,dz00);
1915 /* Update vectorial force */
1916 fix0 = _mm_add_ps(fix0,tx);
1917 fiy0 = _mm_add_ps(fiy0,ty);
1918 fiz0 = _mm_add_ps(fiz0,tz);
1920 fjx0 = _mm_add_ps(fjx0,tx);
1921 fjy0 = _mm_add_ps(fjy0,ty);
1922 fjz0 = _mm_add_ps(fjz0,tz);
1926 /**************************
1927 * CALCULATE INTERACTIONS *
1928 **************************/
1930 if (gmx_mm_any_lt(rsq01,rcutoff2))
1933 r01 = _mm_mul_ps(rsq01,rinv01);
1935 /* EWALD ELECTROSTATICS */
1937 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1938 ewrt = _mm_mul_ps(r01,ewtabscale);
1939 ewitab = _mm_cvttps_epi32(ewrt);
1940 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1941 ewitab = _mm_slli_epi32(ewitab,2);
1942 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1943 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1944 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1945 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1946 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1947 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1948 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1949 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1950 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1952 d = _mm_sub_ps(r01,rswitch);
1953 d = _mm_max_ps(d,_mm_setzero_ps());
1954 d2 = _mm_mul_ps(d,d);
1955 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1957 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1959 /* Evaluate switch function */
1960 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1961 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1962 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1966 fscal = _mm_and_ps(fscal,cutoff_mask);
1968 /* Calculate temporary vectorial force */
1969 tx = _mm_mul_ps(fscal,dx01);
1970 ty = _mm_mul_ps(fscal,dy01);
1971 tz = _mm_mul_ps(fscal,dz01);
1973 /* Update vectorial force */
1974 fix0 = _mm_add_ps(fix0,tx);
1975 fiy0 = _mm_add_ps(fiy0,ty);
1976 fiz0 = _mm_add_ps(fiz0,tz);
1978 fjx1 = _mm_add_ps(fjx1,tx);
1979 fjy1 = _mm_add_ps(fjy1,ty);
1980 fjz1 = _mm_add_ps(fjz1,tz);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 if (gmx_mm_any_lt(rsq02,rcutoff2))
1991 r02 = _mm_mul_ps(rsq02,rinv02);
1993 /* EWALD ELECTROSTATICS */
1995 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1996 ewrt = _mm_mul_ps(r02,ewtabscale);
1997 ewitab = _mm_cvttps_epi32(ewrt);
1998 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1999 ewitab = _mm_slli_epi32(ewitab,2);
2000 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2001 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2002 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2003 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2004 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2005 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2006 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2007 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2008 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2010 d = _mm_sub_ps(r02,rswitch);
2011 d = _mm_max_ps(d,_mm_setzero_ps());
2012 d2 = _mm_mul_ps(d,d);
2013 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2015 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2017 /* Evaluate switch function */
2018 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2019 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2020 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2024 fscal = _mm_and_ps(fscal,cutoff_mask);
2026 /* Calculate temporary vectorial force */
2027 tx = _mm_mul_ps(fscal,dx02);
2028 ty = _mm_mul_ps(fscal,dy02);
2029 tz = _mm_mul_ps(fscal,dz02);
2031 /* Update vectorial force */
2032 fix0 = _mm_add_ps(fix0,tx);
2033 fiy0 = _mm_add_ps(fiy0,ty);
2034 fiz0 = _mm_add_ps(fiz0,tz);
2036 fjx2 = _mm_add_ps(fjx2,tx);
2037 fjy2 = _mm_add_ps(fjy2,ty);
2038 fjz2 = _mm_add_ps(fjz2,tz);
2042 /**************************
2043 * CALCULATE INTERACTIONS *
2044 **************************/
2046 if (gmx_mm_any_lt(rsq10,rcutoff2))
2049 r10 = _mm_mul_ps(rsq10,rinv10);
2051 /* EWALD ELECTROSTATICS */
2053 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2054 ewrt = _mm_mul_ps(r10,ewtabscale);
2055 ewitab = _mm_cvttps_epi32(ewrt);
2056 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2057 ewitab = _mm_slli_epi32(ewitab,2);
2058 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2059 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2060 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2061 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2062 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2063 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2064 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2065 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2066 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2068 d = _mm_sub_ps(r10,rswitch);
2069 d = _mm_max_ps(d,_mm_setzero_ps());
2070 d2 = _mm_mul_ps(d,d);
2071 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2073 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2075 /* Evaluate switch function */
2076 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2077 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2078 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2082 fscal = _mm_and_ps(fscal,cutoff_mask);
2084 /* Calculate temporary vectorial force */
2085 tx = _mm_mul_ps(fscal,dx10);
2086 ty = _mm_mul_ps(fscal,dy10);
2087 tz = _mm_mul_ps(fscal,dz10);
2089 /* Update vectorial force */
2090 fix1 = _mm_add_ps(fix1,tx);
2091 fiy1 = _mm_add_ps(fiy1,ty);
2092 fiz1 = _mm_add_ps(fiz1,tz);
2094 fjx0 = _mm_add_ps(fjx0,tx);
2095 fjy0 = _mm_add_ps(fjy0,ty);
2096 fjz0 = _mm_add_ps(fjz0,tz);
2100 /**************************
2101 * CALCULATE INTERACTIONS *
2102 **************************/
2104 if (gmx_mm_any_lt(rsq11,rcutoff2))
2107 r11 = _mm_mul_ps(rsq11,rinv11);
2109 /* EWALD ELECTROSTATICS */
2111 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2112 ewrt = _mm_mul_ps(r11,ewtabscale);
2113 ewitab = _mm_cvttps_epi32(ewrt);
2114 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2115 ewitab = _mm_slli_epi32(ewitab,2);
2116 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2117 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2118 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2119 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2120 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2121 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2122 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2123 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2124 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2126 d = _mm_sub_ps(r11,rswitch);
2127 d = _mm_max_ps(d,_mm_setzero_ps());
2128 d2 = _mm_mul_ps(d,d);
2129 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2131 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2133 /* Evaluate switch function */
2134 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2135 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2136 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2140 fscal = _mm_and_ps(fscal,cutoff_mask);
2142 /* Calculate temporary vectorial force */
2143 tx = _mm_mul_ps(fscal,dx11);
2144 ty = _mm_mul_ps(fscal,dy11);
2145 tz = _mm_mul_ps(fscal,dz11);
2147 /* Update vectorial force */
2148 fix1 = _mm_add_ps(fix1,tx);
2149 fiy1 = _mm_add_ps(fiy1,ty);
2150 fiz1 = _mm_add_ps(fiz1,tz);
2152 fjx1 = _mm_add_ps(fjx1,tx);
2153 fjy1 = _mm_add_ps(fjy1,ty);
2154 fjz1 = _mm_add_ps(fjz1,tz);
2158 /**************************
2159 * CALCULATE INTERACTIONS *
2160 **************************/
2162 if (gmx_mm_any_lt(rsq12,rcutoff2))
2165 r12 = _mm_mul_ps(rsq12,rinv12);
2167 /* EWALD ELECTROSTATICS */
2169 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2170 ewrt = _mm_mul_ps(r12,ewtabscale);
2171 ewitab = _mm_cvttps_epi32(ewrt);
2172 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2173 ewitab = _mm_slli_epi32(ewitab,2);
2174 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2175 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2176 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2177 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2178 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2179 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2180 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2181 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2182 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2184 d = _mm_sub_ps(r12,rswitch);
2185 d = _mm_max_ps(d,_mm_setzero_ps());
2186 d2 = _mm_mul_ps(d,d);
2187 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2189 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2191 /* Evaluate switch function */
2192 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2193 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2194 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2198 fscal = _mm_and_ps(fscal,cutoff_mask);
2200 /* Calculate temporary vectorial force */
2201 tx = _mm_mul_ps(fscal,dx12);
2202 ty = _mm_mul_ps(fscal,dy12);
2203 tz = _mm_mul_ps(fscal,dz12);
2205 /* Update vectorial force */
2206 fix1 = _mm_add_ps(fix1,tx);
2207 fiy1 = _mm_add_ps(fiy1,ty);
2208 fiz1 = _mm_add_ps(fiz1,tz);
2210 fjx2 = _mm_add_ps(fjx2,tx);
2211 fjy2 = _mm_add_ps(fjy2,ty);
2212 fjz2 = _mm_add_ps(fjz2,tz);
2216 /**************************
2217 * CALCULATE INTERACTIONS *
2218 **************************/
2220 if (gmx_mm_any_lt(rsq20,rcutoff2))
2223 r20 = _mm_mul_ps(rsq20,rinv20);
2225 /* EWALD ELECTROSTATICS */
2227 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2228 ewrt = _mm_mul_ps(r20,ewtabscale);
2229 ewitab = _mm_cvttps_epi32(ewrt);
2230 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2231 ewitab = _mm_slli_epi32(ewitab,2);
2232 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2233 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2234 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2235 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2236 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2237 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2238 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2239 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2240 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2242 d = _mm_sub_ps(r20,rswitch);
2243 d = _mm_max_ps(d,_mm_setzero_ps());
2244 d2 = _mm_mul_ps(d,d);
2245 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2247 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2249 /* Evaluate switch function */
2250 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2251 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2252 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2256 fscal = _mm_and_ps(fscal,cutoff_mask);
2258 /* Calculate temporary vectorial force */
2259 tx = _mm_mul_ps(fscal,dx20);
2260 ty = _mm_mul_ps(fscal,dy20);
2261 tz = _mm_mul_ps(fscal,dz20);
2263 /* Update vectorial force */
2264 fix2 = _mm_add_ps(fix2,tx);
2265 fiy2 = _mm_add_ps(fiy2,ty);
2266 fiz2 = _mm_add_ps(fiz2,tz);
2268 fjx0 = _mm_add_ps(fjx0,tx);
2269 fjy0 = _mm_add_ps(fjy0,ty);
2270 fjz0 = _mm_add_ps(fjz0,tz);
2274 /**************************
2275 * CALCULATE INTERACTIONS *
2276 **************************/
2278 if (gmx_mm_any_lt(rsq21,rcutoff2))
2281 r21 = _mm_mul_ps(rsq21,rinv21);
2283 /* EWALD ELECTROSTATICS */
2285 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2286 ewrt = _mm_mul_ps(r21,ewtabscale);
2287 ewitab = _mm_cvttps_epi32(ewrt);
2288 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2289 ewitab = _mm_slli_epi32(ewitab,2);
2290 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2291 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2292 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2293 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2294 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2295 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2296 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2297 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2298 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2300 d = _mm_sub_ps(r21,rswitch);
2301 d = _mm_max_ps(d,_mm_setzero_ps());
2302 d2 = _mm_mul_ps(d,d);
2303 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2305 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2307 /* Evaluate switch function */
2308 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2309 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2310 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2314 fscal = _mm_and_ps(fscal,cutoff_mask);
2316 /* Calculate temporary vectorial force */
2317 tx = _mm_mul_ps(fscal,dx21);
2318 ty = _mm_mul_ps(fscal,dy21);
2319 tz = _mm_mul_ps(fscal,dz21);
2321 /* Update vectorial force */
2322 fix2 = _mm_add_ps(fix2,tx);
2323 fiy2 = _mm_add_ps(fiy2,ty);
2324 fiz2 = _mm_add_ps(fiz2,tz);
2326 fjx1 = _mm_add_ps(fjx1,tx);
2327 fjy1 = _mm_add_ps(fjy1,ty);
2328 fjz1 = _mm_add_ps(fjz1,tz);
2332 /**************************
2333 * CALCULATE INTERACTIONS *
2334 **************************/
2336 if (gmx_mm_any_lt(rsq22,rcutoff2))
2339 r22 = _mm_mul_ps(rsq22,rinv22);
2341 /* EWALD ELECTROSTATICS */
2343 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2344 ewrt = _mm_mul_ps(r22,ewtabscale);
2345 ewitab = _mm_cvttps_epi32(ewrt);
2346 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2347 ewitab = _mm_slli_epi32(ewitab,2);
2348 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2349 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2350 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2351 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2352 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2353 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2354 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2355 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2356 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2358 d = _mm_sub_ps(r22,rswitch);
2359 d = _mm_max_ps(d,_mm_setzero_ps());
2360 d2 = _mm_mul_ps(d,d);
2361 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2363 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2365 /* Evaluate switch function */
2366 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2367 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2368 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2372 fscal = _mm_and_ps(fscal,cutoff_mask);
2374 /* Calculate temporary vectorial force */
2375 tx = _mm_mul_ps(fscal,dx22);
2376 ty = _mm_mul_ps(fscal,dy22);
2377 tz = _mm_mul_ps(fscal,dz22);
2379 /* Update vectorial force */
2380 fix2 = _mm_add_ps(fix2,tx);
2381 fiy2 = _mm_add_ps(fiy2,ty);
2382 fiz2 = _mm_add_ps(fiz2,tz);
2384 fjx2 = _mm_add_ps(fjx2,tx);
2385 fjy2 = _mm_add_ps(fjy2,ty);
2386 fjz2 = _mm_add_ps(fjz2,tz);
2390 fjptrA = f+j_coord_offsetA;
2391 fjptrB = f+j_coord_offsetB;
2392 fjptrC = f+j_coord_offsetC;
2393 fjptrD = f+j_coord_offsetD;
2395 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2396 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2398 /* Inner loop uses 558 flops */
2401 if(jidx<j_index_end)
2404 /* Get j neighbor index, and coordinate index */
2405 jnrlistA = jjnr[jidx];
2406 jnrlistB = jjnr[jidx+1];
2407 jnrlistC = jjnr[jidx+2];
2408 jnrlistD = jjnr[jidx+3];
2409 /* Sign of each element will be negative for non-real atoms.
2410 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2411 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2413 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2414 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2415 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2416 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2417 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2418 j_coord_offsetA = DIM*jnrA;
2419 j_coord_offsetB = DIM*jnrB;
2420 j_coord_offsetC = DIM*jnrC;
2421 j_coord_offsetD = DIM*jnrD;
2423 /* load j atom coordinates */
2424 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2425 x+j_coord_offsetC,x+j_coord_offsetD,
2426 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2428 /* Calculate displacement vector */
2429 dx00 = _mm_sub_ps(ix0,jx0);
2430 dy00 = _mm_sub_ps(iy0,jy0);
2431 dz00 = _mm_sub_ps(iz0,jz0);
2432 dx01 = _mm_sub_ps(ix0,jx1);
2433 dy01 = _mm_sub_ps(iy0,jy1);
2434 dz01 = _mm_sub_ps(iz0,jz1);
2435 dx02 = _mm_sub_ps(ix0,jx2);
2436 dy02 = _mm_sub_ps(iy0,jy2);
2437 dz02 = _mm_sub_ps(iz0,jz2);
2438 dx10 = _mm_sub_ps(ix1,jx0);
2439 dy10 = _mm_sub_ps(iy1,jy0);
2440 dz10 = _mm_sub_ps(iz1,jz0);
2441 dx11 = _mm_sub_ps(ix1,jx1);
2442 dy11 = _mm_sub_ps(iy1,jy1);
2443 dz11 = _mm_sub_ps(iz1,jz1);
2444 dx12 = _mm_sub_ps(ix1,jx2);
2445 dy12 = _mm_sub_ps(iy1,jy2);
2446 dz12 = _mm_sub_ps(iz1,jz2);
2447 dx20 = _mm_sub_ps(ix2,jx0);
2448 dy20 = _mm_sub_ps(iy2,jy0);
2449 dz20 = _mm_sub_ps(iz2,jz0);
2450 dx21 = _mm_sub_ps(ix2,jx1);
2451 dy21 = _mm_sub_ps(iy2,jy1);
2452 dz21 = _mm_sub_ps(iz2,jz1);
2453 dx22 = _mm_sub_ps(ix2,jx2);
2454 dy22 = _mm_sub_ps(iy2,jy2);
2455 dz22 = _mm_sub_ps(iz2,jz2);
2457 /* Calculate squared distance and things based on it */
2458 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2459 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2460 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2461 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2462 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2463 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2464 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2465 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2466 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2468 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2469 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2470 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2471 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2472 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2473 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2474 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2475 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2476 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2478 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2479 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2480 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2481 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2482 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2483 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2484 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2485 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2486 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2488 fjx0 = _mm_setzero_ps();
2489 fjy0 = _mm_setzero_ps();
2490 fjz0 = _mm_setzero_ps();
2491 fjx1 = _mm_setzero_ps();
2492 fjy1 = _mm_setzero_ps();
2493 fjz1 = _mm_setzero_ps();
2494 fjx2 = _mm_setzero_ps();
2495 fjy2 = _mm_setzero_ps();
2496 fjz2 = _mm_setzero_ps();
2498 /**************************
2499 * CALCULATE INTERACTIONS *
2500 **************************/
2502 if (gmx_mm_any_lt(rsq00,rcutoff2))
2505 r00 = _mm_mul_ps(rsq00,rinv00);
2506 r00 = _mm_andnot_ps(dummy_mask,r00);
2508 /* EWALD ELECTROSTATICS */
2510 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2511 ewrt = _mm_mul_ps(r00,ewtabscale);
2512 ewitab = _mm_cvttps_epi32(ewrt);
2513 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2514 ewitab = _mm_slli_epi32(ewitab,2);
2515 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2516 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2517 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2518 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2519 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2520 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2521 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2522 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
2523 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2525 d = _mm_sub_ps(r00,rswitch);
2526 d = _mm_max_ps(d,_mm_setzero_ps());
2527 d2 = _mm_mul_ps(d,d);
2528 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2530 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2532 /* Evaluate switch function */
2533 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2534 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2535 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2539 fscal = _mm_and_ps(fscal,cutoff_mask);
2541 fscal = _mm_andnot_ps(dummy_mask,fscal);
2543 /* Calculate temporary vectorial force */
2544 tx = _mm_mul_ps(fscal,dx00);
2545 ty = _mm_mul_ps(fscal,dy00);
2546 tz = _mm_mul_ps(fscal,dz00);
2548 /* Update vectorial force */
2549 fix0 = _mm_add_ps(fix0,tx);
2550 fiy0 = _mm_add_ps(fiy0,ty);
2551 fiz0 = _mm_add_ps(fiz0,tz);
2553 fjx0 = _mm_add_ps(fjx0,tx);
2554 fjy0 = _mm_add_ps(fjy0,ty);
2555 fjz0 = _mm_add_ps(fjz0,tz);
2559 /**************************
2560 * CALCULATE INTERACTIONS *
2561 **************************/
2563 if (gmx_mm_any_lt(rsq01,rcutoff2))
2566 r01 = _mm_mul_ps(rsq01,rinv01);
2567 r01 = _mm_andnot_ps(dummy_mask,r01);
2569 /* EWALD ELECTROSTATICS */
2571 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2572 ewrt = _mm_mul_ps(r01,ewtabscale);
2573 ewitab = _mm_cvttps_epi32(ewrt);
2574 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2575 ewitab = _mm_slli_epi32(ewitab,2);
2576 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2577 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2578 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2579 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2580 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2581 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2582 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2583 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2584 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2586 d = _mm_sub_ps(r01,rswitch);
2587 d = _mm_max_ps(d,_mm_setzero_ps());
2588 d2 = _mm_mul_ps(d,d);
2589 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2591 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2593 /* Evaluate switch function */
2594 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2595 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2596 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2600 fscal = _mm_and_ps(fscal,cutoff_mask);
2602 fscal = _mm_andnot_ps(dummy_mask,fscal);
2604 /* Calculate temporary vectorial force */
2605 tx = _mm_mul_ps(fscal,dx01);
2606 ty = _mm_mul_ps(fscal,dy01);
2607 tz = _mm_mul_ps(fscal,dz01);
2609 /* Update vectorial force */
2610 fix0 = _mm_add_ps(fix0,tx);
2611 fiy0 = _mm_add_ps(fiy0,ty);
2612 fiz0 = _mm_add_ps(fiz0,tz);
2614 fjx1 = _mm_add_ps(fjx1,tx);
2615 fjy1 = _mm_add_ps(fjy1,ty);
2616 fjz1 = _mm_add_ps(fjz1,tz);
2620 /**************************
2621 * CALCULATE INTERACTIONS *
2622 **************************/
2624 if (gmx_mm_any_lt(rsq02,rcutoff2))
2627 r02 = _mm_mul_ps(rsq02,rinv02);
2628 r02 = _mm_andnot_ps(dummy_mask,r02);
2630 /* EWALD ELECTROSTATICS */
2632 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2633 ewrt = _mm_mul_ps(r02,ewtabscale);
2634 ewitab = _mm_cvttps_epi32(ewrt);
2635 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2636 ewitab = _mm_slli_epi32(ewitab,2);
2637 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2638 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2639 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2640 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2641 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2642 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2643 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2644 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2645 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2647 d = _mm_sub_ps(r02,rswitch);
2648 d = _mm_max_ps(d,_mm_setzero_ps());
2649 d2 = _mm_mul_ps(d,d);
2650 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2652 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2654 /* Evaluate switch function */
2655 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2656 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2657 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2661 fscal = _mm_and_ps(fscal,cutoff_mask);
2663 fscal = _mm_andnot_ps(dummy_mask,fscal);
2665 /* Calculate temporary vectorial force */
2666 tx = _mm_mul_ps(fscal,dx02);
2667 ty = _mm_mul_ps(fscal,dy02);
2668 tz = _mm_mul_ps(fscal,dz02);
2670 /* Update vectorial force */
2671 fix0 = _mm_add_ps(fix0,tx);
2672 fiy0 = _mm_add_ps(fiy0,ty);
2673 fiz0 = _mm_add_ps(fiz0,tz);
2675 fjx2 = _mm_add_ps(fjx2,tx);
2676 fjy2 = _mm_add_ps(fjy2,ty);
2677 fjz2 = _mm_add_ps(fjz2,tz);
2681 /**************************
2682 * CALCULATE INTERACTIONS *
2683 **************************/
2685 if (gmx_mm_any_lt(rsq10,rcutoff2))
2688 r10 = _mm_mul_ps(rsq10,rinv10);
2689 r10 = _mm_andnot_ps(dummy_mask,r10);
2691 /* EWALD ELECTROSTATICS */
2693 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2694 ewrt = _mm_mul_ps(r10,ewtabscale);
2695 ewitab = _mm_cvttps_epi32(ewrt);
2696 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2697 ewitab = _mm_slli_epi32(ewitab,2);
2698 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2699 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2700 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2701 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2702 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2703 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2704 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2705 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2706 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2708 d = _mm_sub_ps(r10,rswitch);
2709 d = _mm_max_ps(d,_mm_setzero_ps());
2710 d2 = _mm_mul_ps(d,d);
2711 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2713 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2715 /* Evaluate switch function */
2716 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2717 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2718 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2722 fscal = _mm_and_ps(fscal,cutoff_mask);
2724 fscal = _mm_andnot_ps(dummy_mask,fscal);
2726 /* Calculate temporary vectorial force */
2727 tx = _mm_mul_ps(fscal,dx10);
2728 ty = _mm_mul_ps(fscal,dy10);
2729 tz = _mm_mul_ps(fscal,dz10);
2731 /* Update vectorial force */
2732 fix1 = _mm_add_ps(fix1,tx);
2733 fiy1 = _mm_add_ps(fiy1,ty);
2734 fiz1 = _mm_add_ps(fiz1,tz);
2736 fjx0 = _mm_add_ps(fjx0,tx);
2737 fjy0 = _mm_add_ps(fjy0,ty);
2738 fjz0 = _mm_add_ps(fjz0,tz);
2742 /**************************
2743 * CALCULATE INTERACTIONS *
2744 **************************/
2746 if (gmx_mm_any_lt(rsq11,rcutoff2))
2749 r11 = _mm_mul_ps(rsq11,rinv11);
2750 r11 = _mm_andnot_ps(dummy_mask,r11);
2752 /* EWALD ELECTROSTATICS */
2754 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2755 ewrt = _mm_mul_ps(r11,ewtabscale);
2756 ewitab = _mm_cvttps_epi32(ewrt);
2757 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2758 ewitab = _mm_slli_epi32(ewitab,2);
2759 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2760 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2761 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2762 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2763 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2764 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2765 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2766 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2767 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2769 d = _mm_sub_ps(r11,rswitch);
2770 d = _mm_max_ps(d,_mm_setzero_ps());
2771 d2 = _mm_mul_ps(d,d);
2772 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2774 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2776 /* Evaluate switch function */
2777 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2778 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2779 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2783 fscal = _mm_and_ps(fscal,cutoff_mask);
2785 fscal = _mm_andnot_ps(dummy_mask,fscal);
2787 /* Calculate temporary vectorial force */
2788 tx = _mm_mul_ps(fscal,dx11);
2789 ty = _mm_mul_ps(fscal,dy11);
2790 tz = _mm_mul_ps(fscal,dz11);
2792 /* Update vectorial force */
2793 fix1 = _mm_add_ps(fix1,tx);
2794 fiy1 = _mm_add_ps(fiy1,ty);
2795 fiz1 = _mm_add_ps(fiz1,tz);
2797 fjx1 = _mm_add_ps(fjx1,tx);
2798 fjy1 = _mm_add_ps(fjy1,ty);
2799 fjz1 = _mm_add_ps(fjz1,tz);
2803 /**************************
2804 * CALCULATE INTERACTIONS *
2805 **************************/
2807 if (gmx_mm_any_lt(rsq12,rcutoff2))
2810 r12 = _mm_mul_ps(rsq12,rinv12);
2811 r12 = _mm_andnot_ps(dummy_mask,r12);
2813 /* EWALD ELECTROSTATICS */
2815 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2816 ewrt = _mm_mul_ps(r12,ewtabscale);
2817 ewitab = _mm_cvttps_epi32(ewrt);
2818 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2819 ewitab = _mm_slli_epi32(ewitab,2);
2820 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2821 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2822 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2823 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2824 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2825 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2826 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2827 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2828 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2830 d = _mm_sub_ps(r12,rswitch);
2831 d = _mm_max_ps(d,_mm_setzero_ps());
2832 d2 = _mm_mul_ps(d,d);
2833 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2835 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2837 /* Evaluate switch function */
2838 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2839 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2840 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2844 fscal = _mm_and_ps(fscal,cutoff_mask);
2846 fscal = _mm_andnot_ps(dummy_mask,fscal);
2848 /* Calculate temporary vectorial force */
2849 tx = _mm_mul_ps(fscal,dx12);
2850 ty = _mm_mul_ps(fscal,dy12);
2851 tz = _mm_mul_ps(fscal,dz12);
2853 /* Update vectorial force */
2854 fix1 = _mm_add_ps(fix1,tx);
2855 fiy1 = _mm_add_ps(fiy1,ty);
2856 fiz1 = _mm_add_ps(fiz1,tz);
2858 fjx2 = _mm_add_ps(fjx2,tx);
2859 fjy2 = _mm_add_ps(fjy2,ty);
2860 fjz2 = _mm_add_ps(fjz2,tz);
2864 /**************************
2865 * CALCULATE INTERACTIONS *
2866 **************************/
2868 if (gmx_mm_any_lt(rsq20,rcutoff2))
2871 r20 = _mm_mul_ps(rsq20,rinv20);
2872 r20 = _mm_andnot_ps(dummy_mask,r20);
2874 /* EWALD ELECTROSTATICS */
2876 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2877 ewrt = _mm_mul_ps(r20,ewtabscale);
2878 ewitab = _mm_cvttps_epi32(ewrt);
2879 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2880 ewitab = _mm_slli_epi32(ewitab,2);
2881 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2882 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2883 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2884 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2885 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2886 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2887 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2888 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2889 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2891 d = _mm_sub_ps(r20,rswitch);
2892 d = _mm_max_ps(d,_mm_setzero_ps());
2893 d2 = _mm_mul_ps(d,d);
2894 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2896 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2898 /* Evaluate switch function */
2899 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2900 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2901 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2905 fscal = _mm_and_ps(fscal,cutoff_mask);
2907 fscal = _mm_andnot_ps(dummy_mask,fscal);
2909 /* Calculate temporary vectorial force */
2910 tx = _mm_mul_ps(fscal,dx20);
2911 ty = _mm_mul_ps(fscal,dy20);
2912 tz = _mm_mul_ps(fscal,dz20);
2914 /* Update vectorial force */
2915 fix2 = _mm_add_ps(fix2,tx);
2916 fiy2 = _mm_add_ps(fiy2,ty);
2917 fiz2 = _mm_add_ps(fiz2,tz);
2919 fjx0 = _mm_add_ps(fjx0,tx);
2920 fjy0 = _mm_add_ps(fjy0,ty);
2921 fjz0 = _mm_add_ps(fjz0,tz);
2925 /**************************
2926 * CALCULATE INTERACTIONS *
2927 **************************/
2929 if (gmx_mm_any_lt(rsq21,rcutoff2))
2932 r21 = _mm_mul_ps(rsq21,rinv21);
2933 r21 = _mm_andnot_ps(dummy_mask,r21);
2935 /* EWALD ELECTROSTATICS */
2937 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2938 ewrt = _mm_mul_ps(r21,ewtabscale);
2939 ewitab = _mm_cvttps_epi32(ewrt);
2940 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2941 ewitab = _mm_slli_epi32(ewitab,2);
2942 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2943 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2944 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2945 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2946 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2947 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2948 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2949 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2950 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2952 d = _mm_sub_ps(r21,rswitch);
2953 d = _mm_max_ps(d,_mm_setzero_ps());
2954 d2 = _mm_mul_ps(d,d);
2955 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2957 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2959 /* Evaluate switch function */
2960 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2961 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2962 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2966 fscal = _mm_and_ps(fscal,cutoff_mask);
2968 fscal = _mm_andnot_ps(dummy_mask,fscal);
2970 /* Calculate temporary vectorial force */
2971 tx = _mm_mul_ps(fscal,dx21);
2972 ty = _mm_mul_ps(fscal,dy21);
2973 tz = _mm_mul_ps(fscal,dz21);
2975 /* Update vectorial force */
2976 fix2 = _mm_add_ps(fix2,tx);
2977 fiy2 = _mm_add_ps(fiy2,ty);
2978 fiz2 = _mm_add_ps(fiz2,tz);
2980 fjx1 = _mm_add_ps(fjx1,tx);
2981 fjy1 = _mm_add_ps(fjy1,ty);
2982 fjz1 = _mm_add_ps(fjz1,tz);
2986 /**************************
2987 * CALCULATE INTERACTIONS *
2988 **************************/
2990 if (gmx_mm_any_lt(rsq22,rcutoff2))
2993 r22 = _mm_mul_ps(rsq22,rinv22);
2994 r22 = _mm_andnot_ps(dummy_mask,r22);
2996 /* EWALD ELECTROSTATICS */
2998 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2999 ewrt = _mm_mul_ps(r22,ewtabscale);
3000 ewitab = _mm_cvttps_epi32(ewrt);
3001 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3002 ewitab = _mm_slli_epi32(ewitab,2);
3003 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3004 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3005 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3006 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3007 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3008 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3009 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3010 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3011 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3013 d = _mm_sub_ps(r22,rswitch);
3014 d = _mm_max_ps(d,_mm_setzero_ps());
3015 d2 = _mm_mul_ps(d,d);
3016 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3018 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3020 /* Evaluate switch function */
3021 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3022 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3023 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3027 fscal = _mm_and_ps(fscal,cutoff_mask);
3029 fscal = _mm_andnot_ps(dummy_mask,fscal);
3031 /* Calculate temporary vectorial force */
3032 tx = _mm_mul_ps(fscal,dx22);
3033 ty = _mm_mul_ps(fscal,dy22);
3034 tz = _mm_mul_ps(fscal,dz22);
3036 /* Update vectorial force */
3037 fix2 = _mm_add_ps(fix2,tx);
3038 fiy2 = _mm_add_ps(fiy2,ty);
3039 fiz2 = _mm_add_ps(fiz2,tz);
3041 fjx2 = _mm_add_ps(fjx2,tx);
3042 fjy2 = _mm_add_ps(fjy2,ty);
3043 fjz2 = _mm_add_ps(fjz2,tz);
3047 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3048 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3049 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3050 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3052 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3053 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3055 /* Inner loop uses 567 flops */
3058 /* End of innermost loop */
3060 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3061 f+i_coord_offset,fshift+i_shift_offset);
3063 /* Increment number of inner iterations */
3064 inneriter += j_index_end - j_index_start;
3066 /* Outer loop uses 18 flops */
3069 /* Increment number of outer iterations */
3072 /* Update outer/inner flops */
3074 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*567);