2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
110 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
112 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
115 real rswitch_scalar,d_scalar;
116 __m128 dummy_mask,cutoff_mask;
117 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118 __m128 one = _mm_set1_ps(1.0);
119 __m128 two = _mm_set1_ps(2.0);
125 jindex = nlist->jindex;
127 shiftidx = nlist->shift;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = _mm_set1_ps(fr->ic->epsfac);
132 charge = mdatoms->chargeA;
133 nvdwtype = fr->ntype;
135 vdwtype = mdatoms->typeA;
137 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
138 ewtab = fr->ic->tabq_coul_FDV0;
139 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
140 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
142 /* Setup water-specific parameters */
143 inr = nlist->iinr[0];
144 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
145 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
146 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
147 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
149 jq0 = _mm_set1_ps(charge[inr+0]);
150 jq1 = _mm_set1_ps(charge[inr+1]);
151 jq2 = _mm_set1_ps(charge[inr+2]);
152 vdwjidx0A = 2*vdwtype[inr+0];
153 qq00 = _mm_mul_ps(iq0,jq0);
154 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
155 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
156 qq01 = _mm_mul_ps(iq0,jq1);
157 qq02 = _mm_mul_ps(iq0,jq2);
158 qq10 = _mm_mul_ps(iq1,jq0);
159 qq11 = _mm_mul_ps(iq1,jq1);
160 qq12 = _mm_mul_ps(iq1,jq2);
161 qq20 = _mm_mul_ps(iq2,jq0);
162 qq21 = _mm_mul_ps(iq2,jq1);
163 qq22 = _mm_mul_ps(iq2,jq2);
165 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
166 rcutoff_scalar = fr->ic->rcoulomb;
167 rcutoff = _mm_set1_ps(rcutoff_scalar);
168 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
170 rswitch_scalar = fr->ic->rcoulomb_switch;
171 rswitch = _mm_set1_ps(rswitch_scalar);
172 /* Setup switch parameters */
173 d_scalar = rcutoff_scalar-rswitch_scalar;
174 d = _mm_set1_ps(d_scalar);
175 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
176 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
177 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
178 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
179 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
182 /* Avoid stupid compiler warnings */
183 jnrA = jnrB = jnrC = jnrD = 0;
192 for(iidx=0;iidx<4*DIM;iidx++)
197 /* Start outer loop over neighborlists */
198 for(iidx=0; iidx<nri; iidx++)
200 /* Load shift vector for this list */
201 i_shift_offset = DIM*shiftidx[iidx];
203 /* Load limits for loop over neighbors */
204 j_index_start = jindex[iidx];
205 j_index_end = jindex[iidx+1];
207 /* Get outer coordinate index */
209 i_coord_offset = DIM*inr;
211 /* Load i particle coords and add shift vector */
212 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
213 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
215 fix0 = _mm_setzero_ps();
216 fiy0 = _mm_setzero_ps();
217 fiz0 = _mm_setzero_ps();
218 fix1 = _mm_setzero_ps();
219 fiy1 = _mm_setzero_ps();
220 fiz1 = _mm_setzero_ps();
221 fix2 = _mm_setzero_ps();
222 fiy2 = _mm_setzero_ps();
223 fiz2 = _mm_setzero_ps();
225 /* Reset potential sums */
226 velecsum = _mm_setzero_ps();
227 vvdwsum = _mm_setzero_ps();
229 /* Start inner kernel loop */
230 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
233 /* Get j neighbor index, and coordinate index */
238 j_coord_offsetA = DIM*jnrA;
239 j_coord_offsetB = DIM*jnrB;
240 j_coord_offsetC = DIM*jnrC;
241 j_coord_offsetD = DIM*jnrD;
243 /* load j atom coordinates */
244 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
245 x+j_coord_offsetC,x+j_coord_offsetD,
246 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
248 /* Calculate displacement vector */
249 dx00 = _mm_sub_ps(ix0,jx0);
250 dy00 = _mm_sub_ps(iy0,jy0);
251 dz00 = _mm_sub_ps(iz0,jz0);
252 dx01 = _mm_sub_ps(ix0,jx1);
253 dy01 = _mm_sub_ps(iy0,jy1);
254 dz01 = _mm_sub_ps(iz0,jz1);
255 dx02 = _mm_sub_ps(ix0,jx2);
256 dy02 = _mm_sub_ps(iy0,jy2);
257 dz02 = _mm_sub_ps(iz0,jz2);
258 dx10 = _mm_sub_ps(ix1,jx0);
259 dy10 = _mm_sub_ps(iy1,jy0);
260 dz10 = _mm_sub_ps(iz1,jz0);
261 dx11 = _mm_sub_ps(ix1,jx1);
262 dy11 = _mm_sub_ps(iy1,jy1);
263 dz11 = _mm_sub_ps(iz1,jz1);
264 dx12 = _mm_sub_ps(ix1,jx2);
265 dy12 = _mm_sub_ps(iy1,jy2);
266 dz12 = _mm_sub_ps(iz1,jz2);
267 dx20 = _mm_sub_ps(ix2,jx0);
268 dy20 = _mm_sub_ps(iy2,jy0);
269 dz20 = _mm_sub_ps(iz2,jz0);
270 dx21 = _mm_sub_ps(ix2,jx1);
271 dy21 = _mm_sub_ps(iy2,jy1);
272 dz21 = _mm_sub_ps(iz2,jz1);
273 dx22 = _mm_sub_ps(ix2,jx2);
274 dy22 = _mm_sub_ps(iy2,jy2);
275 dz22 = _mm_sub_ps(iz2,jz2);
277 /* Calculate squared distance and things based on it */
278 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
279 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
280 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
281 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
282 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
283 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
284 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
285 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
286 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
288 rinv00 = sse2_invsqrt_f(rsq00);
289 rinv01 = sse2_invsqrt_f(rsq01);
290 rinv02 = sse2_invsqrt_f(rsq02);
291 rinv10 = sse2_invsqrt_f(rsq10);
292 rinv11 = sse2_invsqrt_f(rsq11);
293 rinv12 = sse2_invsqrt_f(rsq12);
294 rinv20 = sse2_invsqrt_f(rsq20);
295 rinv21 = sse2_invsqrt_f(rsq21);
296 rinv22 = sse2_invsqrt_f(rsq22);
298 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
299 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
300 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
301 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
302 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
303 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
304 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
305 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
306 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
308 fjx0 = _mm_setzero_ps();
309 fjy0 = _mm_setzero_ps();
310 fjz0 = _mm_setzero_ps();
311 fjx1 = _mm_setzero_ps();
312 fjy1 = _mm_setzero_ps();
313 fjz1 = _mm_setzero_ps();
314 fjx2 = _mm_setzero_ps();
315 fjy2 = _mm_setzero_ps();
316 fjz2 = _mm_setzero_ps();
318 /**************************
319 * CALCULATE INTERACTIONS *
320 **************************/
322 if (gmx_mm_any_lt(rsq00,rcutoff2))
325 r00 = _mm_mul_ps(rsq00,rinv00);
327 /* EWALD ELECTROSTATICS */
329 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
330 ewrt = _mm_mul_ps(r00,ewtabscale);
331 ewitab = _mm_cvttps_epi32(ewrt);
332 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
333 ewitab = _mm_slli_epi32(ewitab,2);
334 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
335 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
336 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
337 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
338 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
339 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
340 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
341 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
342 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
344 /* LENNARD-JONES DISPERSION/REPULSION */
346 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
347 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
348 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
349 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
350 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
352 d = _mm_sub_ps(r00,rswitch);
353 d = _mm_max_ps(d,_mm_setzero_ps());
354 d2 = _mm_mul_ps(d,d);
355 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
357 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
359 /* Evaluate switch function */
360 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
361 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
362 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
363 velec = _mm_mul_ps(velec,sw);
364 vvdw = _mm_mul_ps(vvdw,sw);
365 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
367 /* Update potential sum for this i atom from the interaction with this j atom. */
368 velec = _mm_and_ps(velec,cutoff_mask);
369 velecsum = _mm_add_ps(velecsum,velec);
370 vvdw = _mm_and_ps(vvdw,cutoff_mask);
371 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
373 fscal = _mm_add_ps(felec,fvdw);
375 fscal = _mm_and_ps(fscal,cutoff_mask);
377 /* Calculate temporary vectorial force */
378 tx = _mm_mul_ps(fscal,dx00);
379 ty = _mm_mul_ps(fscal,dy00);
380 tz = _mm_mul_ps(fscal,dz00);
382 /* Update vectorial force */
383 fix0 = _mm_add_ps(fix0,tx);
384 fiy0 = _mm_add_ps(fiy0,ty);
385 fiz0 = _mm_add_ps(fiz0,tz);
387 fjx0 = _mm_add_ps(fjx0,tx);
388 fjy0 = _mm_add_ps(fjy0,ty);
389 fjz0 = _mm_add_ps(fjz0,tz);
393 /**************************
394 * CALCULATE INTERACTIONS *
395 **************************/
397 if (gmx_mm_any_lt(rsq01,rcutoff2))
400 r01 = _mm_mul_ps(rsq01,rinv01);
402 /* EWALD ELECTROSTATICS */
404 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
405 ewrt = _mm_mul_ps(r01,ewtabscale);
406 ewitab = _mm_cvttps_epi32(ewrt);
407 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
408 ewitab = _mm_slli_epi32(ewitab,2);
409 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
410 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
411 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
412 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
413 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
414 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
415 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
416 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
417 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
419 d = _mm_sub_ps(r01,rswitch);
420 d = _mm_max_ps(d,_mm_setzero_ps());
421 d2 = _mm_mul_ps(d,d);
422 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
424 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
426 /* Evaluate switch function */
427 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
428 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
429 velec = _mm_mul_ps(velec,sw);
430 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velec = _mm_and_ps(velec,cutoff_mask);
434 velecsum = _mm_add_ps(velecsum,velec);
438 fscal = _mm_and_ps(fscal,cutoff_mask);
440 /* Calculate temporary vectorial force */
441 tx = _mm_mul_ps(fscal,dx01);
442 ty = _mm_mul_ps(fscal,dy01);
443 tz = _mm_mul_ps(fscal,dz01);
445 /* Update vectorial force */
446 fix0 = _mm_add_ps(fix0,tx);
447 fiy0 = _mm_add_ps(fiy0,ty);
448 fiz0 = _mm_add_ps(fiz0,tz);
450 fjx1 = _mm_add_ps(fjx1,tx);
451 fjy1 = _mm_add_ps(fjy1,ty);
452 fjz1 = _mm_add_ps(fjz1,tz);
456 /**************************
457 * CALCULATE INTERACTIONS *
458 **************************/
460 if (gmx_mm_any_lt(rsq02,rcutoff2))
463 r02 = _mm_mul_ps(rsq02,rinv02);
465 /* EWALD ELECTROSTATICS */
467 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
468 ewrt = _mm_mul_ps(r02,ewtabscale);
469 ewitab = _mm_cvttps_epi32(ewrt);
470 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
471 ewitab = _mm_slli_epi32(ewitab,2);
472 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
473 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
474 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
475 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
476 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
477 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
478 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
479 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
480 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
482 d = _mm_sub_ps(r02,rswitch);
483 d = _mm_max_ps(d,_mm_setzero_ps());
484 d2 = _mm_mul_ps(d,d);
485 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
487 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
489 /* Evaluate switch function */
490 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
491 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
492 velec = _mm_mul_ps(velec,sw);
493 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
495 /* Update potential sum for this i atom from the interaction with this j atom. */
496 velec = _mm_and_ps(velec,cutoff_mask);
497 velecsum = _mm_add_ps(velecsum,velec);
501 fscal = _mm_and_ps(fscal,cutoff_mask);
503 /* Calculate temporary vectorial force */
504 tx = _mm_mul_ps(fscal,dx02);
505 ty = _mm_mul_ps(fscal,dy02);
506 tz = _mm_mul_ps(fscal,dz02);
508 /* Update vectorial force */
509 fix0 = _mm_add_ps(fix0,tx);
510 fiy0 = _mm_add_ps(fiy0,ty);
511 fiz0 = _mm_add_ps(fiz0,tz);
513 fjx2 = _mm_add_ps(fjx2,tx);
514 fjy2 = _mm_add_ps(fjy2,ty);
515 fjz2 = _mm_add_ps(fjz2,tz);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 if (gmx_mm_any_lt(rsq10,rcutoff2))
526 r10 = _mm_mul_ps(rsq10,rinv10);
528 /* EWALD ELECTROSTATICS */
530 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
531 ewrt = _mm_mul_ps(r10,ewtabscale);
532 ewitab = _mm_cvttps_epi32(ewrt);
533 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
534 ewitab = _mm_slli_epi32(ewitab,2);
535 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
536 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
537 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
538 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
539 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
540 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
541 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
542 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
543 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
545 d = _mm_sub_ps(r10,rswitch);
546 d = _mm_max_ps(d,_mm_setzero_ps());
547 d2 = _mm_mul_ps(d,d);
548 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
550 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
552 /* Evaluate switch function */
553 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
554 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
555 velec = _mm_mul_ps(velec,sw);
556 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
558 /* Update potential sum for this i atom from the interaction with this j atom. */
559 velec = _mm_and_ps(velec,cutoff_mask);
560 velecsum = _mm_add_ps(velecsum,velec);
564 fscal = _mm_and_ps(fscal,cutoff_mask);
566 /* Calculate temporary vectorial force */
567 tx = _mm_mul_ps(fscal,dx10);
568 ty = _mm_mul_ps(fscal,dy10);
569 tz = _mm_mul_ps(fscal,dz10);
571 /* Update vectorial force */
572 fix1 = _mm_add_ps(fix1,tx);
573 fiy1 = _mm_add_ps(fiy1,ty);
574 fiz1 = _mm_add_ps(fiz1,tz);
576 fjx0 = _mm_add_ps(fjx0,tx);
577 fjy0 = _mm_add_ps(fjy0,ty);
578 fjz0 = _mm_add_ps(fjz0,tz);
582 /**************************
583 * CALCULATE INTERACTIONS *
584 **************************/
586 if (gmx_mm_any_lt(rsq11,rcutoff2))
589 r11 = _mm_mul_ps(rsq11,rinv11);
591 /* EWALD ELECTROSTATICS */
593 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
594 ewrt = _mm_mul_ps(r11,ewtabscale);
595 ewitab = _mm_cvttps_epi32(ewrt);
596 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
597 ewitab = _mm_slli_epi32(ewitab,2);
598 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
599 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
600 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
601 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
602 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
603 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
604 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
605 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
606 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
608 d = _mm_sub_ps(r11,rswitch);
609 d = _mm_max_ps(d,_mm_setzero_ps());
610 d2 = _mm_mul_ps(d,d);
611 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
613 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
615 /* Evaluate switch function */
616 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
617 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
618 velec = _mm_mul_ps(velec,sw);
619 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
621 /* Update potential sum for this i atom from the interaction with this j atom. */
622 velec = _mm_and_ps(velec,cutoff_mask);
623 velecsum = _mm_add_ps(velecsum,velec);
627 fscal = _mm_and_ps(fscal,cutoff_mask);
629 /* Calculate temporary vectorial force */
630 tx = _mm_mul_ps(fscal,dx11);
631 ty = _mm_mul_ps(fscal,dy11);
632 tz = _mm_mul_ps(fscal,dz11);
634 /* Update vectorial force */
635 fix1 = _mm_add_ps(fix1,tx);
636 fiy1 = _mm_add_ps(fiy1,ty);
637 fiz1 = _mm_add_ps(fiz1,tz);
639 fjx1 = _mm_add_ps(fjx1,tx);
640 fjy1 = _mm_add_ps(fjy1,ty);
641 fjz1 = _mm_add_ps(fjz1,tz);
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
649 if (gmx_mm_any_lt(rsq12,rcutoff2))
652 r12 = _mm_mul_ps(rsq12,rinv12);
654 /* EWALD ELECTROSTATICS */
656 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
657 ewrt = _mm_mul_ps(r12,ewtabscale);
658 ewitab = _mm_cvttps_epi32(ewrt);
659 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
660 ewitab = _mm_slli_epi32(ewitab,2);
661 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
662 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
663 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
664 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
665 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
666 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
667 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
668 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
669 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
671 d = _mm_sub_ps(r12,rswitch);
672 d = _mm_max_ps(d,_mm_setzero_ps());
673 d2 = _mm_mul_ps(d,d);
674 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
676 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
678 /* Evaluate switch function */
679 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
680 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
681 velec = _mm_mul_ps(velec,sw);
682 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
684 /* Update potential sum for this i atom from the interaction with this j atom. */
685 velec = _mm_and_ps(velec,cutoff_mask);
686 velecsum = _mm_add_ps(velecsum,velec);
690 fscal = _mm_and_ps(fscal,cutoff_mask);
692 /* Calculate temporary vectorial force */
693 tx = _mm_mul_ps(fscal,dx12);
694 ty = _mm_mul_ps(fscal,dy12);
695 tz = _mm_mul_ps(fscal,dz12);
697 /* Update vectorial force */
698 fix1 = _mm_add_ps(fix1,tx);
699 fiy1 = _mm_add_ps(fiy1,ty);
700 fiz1 = _mm_add_ps(fiz1,tz);
702 fjx2 = _mm_add_ps(fjx2,tx);
703 fjy2 = _mm_add_ps(fjy2,ty);
704 fjz2 = _mm_add_ps(fjz2,tz);
708 /**************************
709 * CALCULATE INTERACTIONS *
710 **************************/
712 if (gmx_mm_any_lt(rsq20,rcutoff2))
715 r20 = _mm_mul_ps(rsq20,rinv20);
717 /* EWALD ELECTROSTATICS */
719 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
720 ewrt = _mm_mul_ps(r20,ewtabscale);
721 ewitab = _mm_cvttps_epi32(ewrt);
722 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
723 ewitab = _mm_slli_epi32(ewitab,2);
724 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
725 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
726 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
727 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
728 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
729 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
730 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
731 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
732 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
734 d = _mm_sub_ps(r20,rswitch);
735 d = _mm_max_ps(d,_mm_setzero_ps());
736 d2 = _mm_mul_ps(d,d);
737 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
739 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
741 /* Evaluate switch function */
742 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
743 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
744 velec = _mm_mul_ps(velec,sw);
745 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
747 /* Update potential sum for this i atom from the interaction with this j atom. */
748 velec = _mm_and_ps(velec,cutoff_mask);
749 velecsum = _mm_add_ps(velecsum,velec);
753 fscal = _mm_and_ps(fscal,cutoff_mask);
755 /* Calculate temporary vectorial force */
756 tx = _mm_mul_ps(fscal,dx20);
757 ty = _mm_mul_ps(fscal,dy20);
758 tz = _mm_mul_ps(fscal,dz20);
760 /* Update vectorial force */
761 fix2 = _mm_add_ps(fix2,tx);
762 fiy2 = _mm_add_ps(fiy2,ty);
763 fiz2 = _mm_add_ps(fiz2,tz);
765 fjx0 = _mm_add_ps(fjx0,tx);
766 fjy0 = _mm_add_ps(fjy0,ty);
767 fjz0 = _mm_add_ps(fjz0,tz);
771 /**************************
772 * CALCULATE INTERACTIONS *
773 **************************/
775 if (gmx_mm_any_lt(rsq21,rcutoff2))
778 r21 = _mm_mul_ps(rsq21,rinv21);
780 /* EWALD ELECTROSTATICS */
782 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
783 ewrt = _mm_mul_ps(r21,ewtabscale);
784 ewitab = _mm_cvttps_epi32(ewrt);
785 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
786 ewitab = _mm_slli_epi32(ewitab,2);
787 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
788 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
789 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
790 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
791 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
792 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
793 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
794 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
795 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
797 d = _mm_sub_ps(r21,rswitch);
798 d = _mm_max_ps(d,_mm_setzero_ps());
799 d2 = _mm_mul_ps(d,d);
800 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
802 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
804 /* Evaluate switch function */
805 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
806 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
807 velec = _mm_mul_ps(velec,sw);
808 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
810 /* Update potential sum for this i atom from the interaction with this j atom. */
811 velec = _mm_and_ps(velec,cutoff_mask);
812 velecsum = _mm_add_ps(velecsum,velec);
816 fscal = _mm_and_ps(fscal,cutoff_mask);
818 /* Calculate temporary vectorial force */
819 tx = _mm_mul_ps(fscal,dx21);
820 ty = _mm_mul_ps(fscal,dy21);
821 tz = _mm_mul_ps(fscal,dz21);
823 /* Update vectorial force */
824 fix2 = _mm_add_ps(fix2,tx);
825 fiy2 = _mm_add_ps(fiy2,ty);
826 fiz2 = _mm_add_ps(fiz2,tz);
828 fjx1 = _mm_add_ps(fjx1,tx);
829 fjy1 = _mm_add_ps(fjy1,ty);
830 fjz1 = _mm_add_ps(fjz1,tz);
834 /**************************
835 * CALCULATE INTERACTIONS *
836 **************************/
838 if (gmx_mm_any_lt(rsq22,rcutoff2))
841 r22 = _mm_mul_ps(rsq22,rinv22);
843 /* EWALD ELECTROSTATICS */
845 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
846 ewrt = _mm_mul_ps(r22,ewtabscale);
847 ewitab = _mm_cvttps_epi32(ewrt);
848 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
849 ewitab = _mm_slli_epi32(ewitab,2);
850 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
851 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
852 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
853 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
854 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
855 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
856 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
857 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
858 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
860 d = _mm_sub_ps(r22,rswitch);
861 d = _mm_max_ps(d,_mm_setzero_ps());
862 d2 = _mm_mul_ps(d,d);
863 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
865 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
867 /* Evaluate switch function */
868 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
869 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
870 velec = _mm_mul_ps(velec,sw);
871 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 velec = _mm_and_ps(velec,cutoff_mask);
875 velecsum = _mm_add_ps(velecsum,velec);
879 fscal = _mm_and_ps(fscal,cutoff_mask);
881 /* Calculate temporary vectorial force */
882 tx = _mm_mul_ps(fscal,dx22);
883 ty = _mm_mul_ps(fscal,dy22);
884 tz = _mm_mul_ps(fscal,dz22);
886 /* Update vectorial force */
887 fix2 = _mm_add_ps(fix2,tx);
888 fiy2 = _mm_add_ps(fiy2,ty);
889 fiz2 = _mm_add_ps(fiz2,tz);
891 fjx2 = _mm_add_ps(fjx2,tx);
892 fjy2 = _mm_add_ps(fjy2,ty);
893 fjz2 = _mm_add_ps(fjz2,tz);
897 fjptrA = f+j_coord_offsetA;
898 fjptrB = f+j_coord_offsetB;
899 fjptrC = f+j_coord_offsetC;
900 fjptrD = f+j_coord_offsetD;
902 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
903 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
905 /* Inner loop uses 603 flops */
911 /* Get j neighbor index, and coordinate index */
912 jnrlistA = jjnr[jidx];
913 jnrlistB = jjnr[jidx+1];
914 jnrlistC = jjnr[jidx+2];
915 jnrlistD = jjnr[jidx+3];
916 /* Sign of each element will be negative for non-real atoms.
917 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
918 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
920 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
921 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
922 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
923 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
924 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
925 j_coord_offsetA = DIM*jnrA;
926 j_coord_offsetB = DIM*jnrB;
927 j_coord_offsetC = DIM*jnrC;
928 j_coord_offsetD = DIM*jnrD;
930 /* load j atom coordinates */
931 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
932 x+j_coord_offsetC,x+j_coord_offsetD,
933 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
935 /* Calculate displacement vector */
936 dx00 = _mm_sub_ps(ix0,jx0);
937 dy00 = _mm_sub_ps(iy0,jy0);
938 dz00 = _mm_sub_ps(iz0,jz0);
939 dx01 = _mm_sub_ps(ix0,jx1);
940 dy01 = _mm_sub_ps(iy0,jy1);
941 dz01 = _mm_sub_ps(iz0,jz1);
942 dx02 = _mm_sub_ps(ix0,jx2);
943 dy02 = _mm_sub_ps(iy0,jy2);
944 dz02 = _mm_sub_ps(iz0,jz2);
945 dx10 = _mm_sub_ps(ix1,jx0);
946 dy10 = _mm_sub_ps(iy1,jy0);
947 dz10 = _mm_sub_ps(iz1,jz0);
948 dx11 = _mm_sub_ps(ix1,jx1);
949 dy11 = _mm_sub_ps(iy1,jy1);
950 dz11 = _mm_sub_ps(iz1,jz1);
951 dx12 = _mm_sub_ps(ix1,jx2);
952 dy12 = _mm_sub_ps(iy1,jy2);
953 dz12 = _mm_sub_ps(iz1,jz2);
954 dx20 = _mm_sub_ps(ix2,jx0);
955 dy20 = _mm_sub_ps(iy2,jy0);
956 dz20 = _mm_sub_ps(iz2,jz0);
957 dx21 = _mm_sub_ps(ix2,jx1);
958 dy21 = _mm_sub_ps(iy2,jy1);
959 dz21 = _mm_sub_ps(iz2,jz1);
960 dx22 = _mm_sub_ps(ix2,jx2);
961 dy22 = _mm_sub_ps(iy2,jy2);
962 dz22 = _mm_sub_ps(iz2,jz2);
964 /* Calculate squared distance and things based on it */
965 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
966 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
967 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
968 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
969 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
970 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
971 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
972 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
973 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
975 rinv00 = sse2_invsqrt_f(rsq00);
976 rinv01 = sse2_invsqrt_f(rsq01);
977 rinv02 = sse2_invsqrt_f(rsq02);
978 rinv10 = sse2_invsqrt_f(rsq10);
979 rinv11 = sse2_invsqrt_f(rsq11);
980 rinv12 = sse2_invsqrt_f(rsq12);
981 rinv20 = sse2_invsqrt_f(rsq20);
982 rinv21 = sse2_invsqrt_f(rsq21);
983 rinv22 = sse2_invsqrt_f(rsq22);
985 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
986 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
987 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
988 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
989 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
990 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
991 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
992 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
993 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
995 fjx0 = _mm_setzero_ps();
996 fjy0 = _mm_setzero_ps();
997 fjz0 = _mm_setzero_ps();
998 fjx1 = _mm_setzero_ps();
999 fjy1 = _mm_setzero_ps();
1000 fjz1 = _mm_setzero_ps();
1001 fjx2 = _mm_setzero_ps();
1002 fjy2 = _mm_setzero_ps();
1003 fjz2 = _mm_setzero_ps();
1005 /**************************
1006 * CALCULATE INTERACTIONS *
1007 **************************/
1009 if (gmx_mm_any_lt(rsq00,rcutoff2))
1012 r00 = _mm_mul_ps(rsq00,rinv00);
1013 r00 = _mm_andnot_ps(dummy_mask,r00);
1015 /* EWALD ELECTROSTATICS */
1017 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1018 ewrt = _mm_mul_ps(r00,ewtabscale);
1019 ewitab = _mm_cvttps_epi32(ewrt);
1020 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1021 ewitab = _mm_slli_epi32(ewitab,2);
1022 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1023 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1024 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1025 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1026 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1027 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1028 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1029 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1030 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1032 /* LENNARD-JONES DISPERSION/REPULSION */
1034 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1035 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1036 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1037 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1038 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1040 d = _mm_sub_ps(r00,rswitch);
1041 d = _mm_max_ps(d,_mm_setzero_ps());
1042 d2 = _mm_mul_ps(d,d);
1043 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1045 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1047 /* Evaluate switch function */
1048 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1049 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1050 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1051 velec = _mm_mul_ps(velec,sw);
1052 vvdw = _mm_mul_ps(vvdw,sw);
1053 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1055 /* Update potential sum for this i atom from the interaction with this j atom. */
1056 velec = _mm_and_ps(velec,cutoff_mask);
1057 velec = _mm_andnot_ps(dummy_mask,velec);
1058 velecsum = _mm_add_ps(velecsum,velec);
1059 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1060 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1061 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1063 fscal = _mm_add_ps(felec,fvdw);
1065 fscal = _mm_and_ps(fscal,cutoff_mask);
1067 fscal = _mm_andnot_ps(dummy_mask,fscal);
1069 /* Calculate temporary vectorial force */
1070 tx = _mm_mul_ps(fscal,dx00);
1071 ty = _mm_mul_ps(fscal,dy00);
1072 tz = _mm_mul_ps(fscal,dz00);
1074 /* Update vectorial force */
1075 fix0 = _mm_add_ps(fix0,tx);
1076 fiy0 = _mm_add_ps(fiy0,ty);
1077 fiz0 = _mm_add_ps(fiz0,tz);
1079 fjx0 = _mm_add_ps(fjx0,tx);
1080 fjy0 = _mm_add_ps(fjy0,ty);
1081 fjz0 = _mm_add_ps(fjz0,tz);
1085 /**************************
1086 * CALCULATE INTERACTIONS *
1087 **************************/
1089 if (gmx_mm_any_lt(rsq01,rcutoff2))
1092 r01 = _mm_mul_ps(rsq01,rinv01);
1093 r01 = _mm_andnot_ps(dummy_mask,r01);
1095 /* EWALD ELECTROSTATICS */
1097 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1098 ewrt = _mm_mul_ps(r01,ewtabscale);
1099 ewitab = _mm_cvttps_epi32(ewrt);
1100 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1101 ewitab = _mm_slli_epi32(ewitab,2);
1102 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1103 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1104 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1105 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1106 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1107 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1108 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1109 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1110 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1112 d = _mm_sub_ps(r01,rswitch);
1113 d = _mm_max_ps(d,_mm_setzero_ps());
1114 d2 = _mm_mul_ps(d,d);
1115 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1117 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1119 /* Evaluate switch function */
1120 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1121 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1122 velec = _mm_mul_ps(velec,sw);
1123 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1125 /* Update potential sum for this i atom from the interaction with this j atom. */
1126 velec = _mm_and_ps(velec,cutoff_mask);
1127 velec = _mm_andnot_ps(dummy_mask,velec);
1128 velecsum = _mm_add_ps(velecsum,velec);
1132 fscal = _mm_and_ps(fscal,cutoff_mask);
1134 fscal = _mm_andnot_ps(dummy_mask,fscal);
1136 /* Calculate temporary vectorial force */
1137 tx = _mm_mul_ps(fscal,dx01);
1138 ty = _mm_mul_ps(fscal,dy01);
1139 tz = _mm_mul_ps(fscal,dz01);
1141 /* Update vectorial force */
1142 fix0 = _mm_add_ps(fix0,tx);
1143 fiy0 = _mm_add_ps(fiy0,ty);
1144 fiz0 = _mm_add_ps(fiz0,tz);
1146 fjx1 = _mm_add_ps(fjx1,tx);
1147 fjy1 = _mm_add_ps(fjy1,ty);
1148 fjz1 = _mm_add_ps(fjz1,tz);
1152 /**************************
1153 * CALCULATE INTERACTIONS *
1154 **************************/
1156 if (gmx_mm_any_lt(rsq02,rcutoff2))
1159 r02 = _mm_mul_ps(rsq02,rinv02);
1160 r02 = _mm_andnot_ps(dummy_mask,r02);
1162 /* EWALD ELECTROSTATICS */
1164 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1165 ewrt = _mm_mul_ps(r02,ewtabscale);
1166 ewitab = _mm_cvttps_epi32(ewrt);
1167 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1168 ewitab = _mm_slli_epi32(ewitab,2);
1169 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1170 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1171 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1172 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1173 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1174 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1175 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1176 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1177 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1179 d = _mm_sub_ps(r02,rswitch);
1180 d = _mm_max_ps(d,_mm_setzero_ps());
1181 d2 = _mm_mul_ps(d,d);
1182 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1184 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1186 /* Evaluate switch function */
1187 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1188 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1189 velec = _mm_mul_ps(velec,sw);
1190 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1192 /* Update potential sum for this i atom from the interaction with this j atom. */
1193 velec = _mm_and_ps(velec,cutoff_mask);
1194 velec = _mm_andnot_ps(dummy_mask,velec);
1195 velecsum = _mm_add_ps(velecsum,velec);
1199 fscal = _mm_and_ps(fscal,cutoff_mask);
1201 fscal = _mm_andnot_ps(dummy_mask,fscal);
1203 /* Calculate temporary vectorial force */
1204 tx = _mm_mul_ps(fscal,dx02);
1205 ty = _mm_mul_ps(fscal,dy02);
1206 tz = _mm_mul_ps(fscal,dz02);
1208 /* Update vectorial force */
1209 fix0 = _mm_add_ps(fix0,tx);
1210 fiy0 = _mm_add_ps(fiy0,ty);
1211 fiz0 = _mm_add_ps(fiz0,tz);
1213 fjx2 = _mm_add_ps(fjx2,tx);
1214 fjy2 = _mm_add_ps(fjy2,ty);
1215 fjz2 = _mm_add_ps(fjz2,tz);
1219 /**************************
1220 * CALCULATE INTERACTIONS *
1221 **************************/
1223 if (gmx_mm_any_lt(rsq10,rcutoff2))
1226 r10 = _mm_mul_ps(rsq10,rinv10);
1227 r10 = _mm_andnot_ps(dummy_mask,r10);
1229 /* EWALD ELECTROSTATICS */
1231 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1232 ewrt = _mm_mul_ps(r10,ewtabscale);
1233 ewitab = _mm_cvttps_epi32(ewrt);
1234 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1235 ewitab = _mm_slli_epi32(ewitab,2);
1236 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1237 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1238 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1239 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1240 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1241 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1242 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1243 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1244 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1246 d = _mm_sub_ps(r10,rswitch);
1247 d = _mm_max_ps(d,_mm_setzero_ps());
1248 d2 = _mm_mul_ps(d,d);
1249 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1251 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1253 /* Evaluate switch function */
1254 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1255 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1256 velec = _mm_mul_ps(velec,sw);
1257 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1259 /* Update potential sum for this i atom from the interaction with this j atom. */
1260 velec = _mm_and_ps(velec,cutoff_mask);
1261 velec = _mm_andnot_ps(dummy_mask,velec);
1262 velecsum = _mm_add_ps(velecsum,velec);
1266 fscal = _mm_and_ps(fscal,cutoff_mask);
1268 fscal = _mm_andnot_ps(dummy_mask,fscal);
1270 /* Calculate temporary vectorial force */
1271 tx = _mm_mul_ps(fscal,dx10);
1272 ty = _mm_mul_ps(fscal,dy10);
1273 tz = _mm_mul_ps(fscal,dz10);
1275 /* Update vectorial force */
1276 fix1 = _mm_add_ps(fix1,tx);
1277 fiy1 = _mm_add_ps(fiy1,ty);
1278 fiz1 = _mm_add_ps(fiz1,tz);
1280 fjx0 = _mm_add_ps(fjx0,tx);
1281 fjy0 = _mm_add_ps(fjy0,ty);
1282 fjz0 = _mm_add_ps(fjz0,tz);
1286 /**************************
1287 * CALCULATE INTERACTIONS *
1288 **************************/
1290 if (gmx_mm_any_lt(rsq11,rcutoff2))
1293 r11 = _mm_mul_ps(rsq11,rinv11);
1294 r11 = _mm_andnot_ps(dummy_mask,r11);
1296 /* EWALD ELECTROSTATICS */
1298 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1299 ewrt = _mm_mul_ps(r11,ewtabscale);
1300 ewitab = _mm_cvttps_epi32(ewrt);
1301 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1302 ewitab = _mm_slli_epi32(ewitab,2);
1303 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1304 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1305 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1306 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1307 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1308 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1309 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1310 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1311 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1313 d = _mm_sub_ps(r11,rswitch);
1314 d = _mm_max_ps(d,_mm_setzero_ps());
1315 d2 = _mm_mul_ps(d,d);
1316 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1318 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1320 /* Evaluate switch function */
1321 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1322 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1323 velec = _mm_mul_ps(velec,sw);
1324 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1326 /* Update potential sum for this i atom from the interaction with this j atom. */
1327 velec = _mm_and_ps(velec,cutoff_mask);
1328 velec = _mm_andnot_ps(dummy_mask,velec);
1329 velecsum = _mm_add_ps(velecsum,velec);
1333 fscal = _mm_and_ps(fscal,cutoff_mask);
1335 fscal = _mm_andnot_ps(dummy_mask,fscal);
1337 /* Calculate temporary vectorial force */
1338 tx = _mm_mul_ps(fscal,dx11);
1339 ty = _mm_mul_ps(fscal,dy11);
1340 tz = _mm_mul_ps(fscal,dz11);
1342 /* Update vectorial force */
1343 fix1 = _mm_add_ps(fix1,tx);
1344 fiy1 = _mm_add_ps(fiy1,ty);
1345 fiz1 = _mm_add_ps(fiz1,tz);
1347 fjx1 = _mm_add_ps(fjx1,tx);
1348 fjy1 = _mm_add_ps(fjy1,ty);
1349 fjz1 = _mm_add_ps(fjz1,tz);
1353 /**************************
1354 * CALCULATE INTERACTIONS *
1355 **************************/
1357 if (gmx_mm_any_lt(rsq12,rcutoff2))
1360 r12 = _mm_mul_ps(rsq12,rinv12);
1361 r12 = _mm_andnot_ps(dummy_mask,r12);
1363 /* EWALD ELECTROSTATICS */
1365 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1366 ewrt = _mm_mul_ps(r12,ewtabscale);
1367 ewitab = _mm_cvttps_epi32(ewrt);
1368 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1369 ewitab = _mm_slli_epi32(ewitab,2);
1370 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1371 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1372 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1373 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1374 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1375 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1376 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1377 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1378 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1380 d = _mm_sub_ps(r12,rswitch);
1381 d = _mm_max_ps(d,_mm_setzero_ps());
1382 d2 = _mm_mul_ps(d,d);
1383 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1385 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1387 /* Evaluate switch function */
1388 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1389 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1390 velec = _mm_mul_ps(velec,sw);
1391 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1393 /* Update potential sum for this i atom from the interaction with this j atom. */
1394 velec = _mm_and_ps(velec,cutoff_mask);
1395 velec = _mm_andnot_ps(dummy_mask,velec);
1396 velecsum = _mm_add_ps(velecsum,velec);
1400 fscal = _mm_and_ps(fscal,cutoff_mask);
1402 fscal = _mm_andnot_ps(dummy_mask,fscal);
1404 /* Calculate temporary vectorial force */
1405 tx = _mm_mul_ps(fscal,dx12);
1406 ty = _mm_mul_ps(fscal,dy12);
1407 tz = _mm_mul_ps(fscal,dz12);
1409 /* Update vectorial force */
1410 fix1 = _mm_add_ps(fix1,tx);
1411 fiy1 = _mm_add_ps(fiy1,ty);
1412 fiz1 = _mm_add_ps(fiz1,tz);
1414 fjx2 = _mm_add_ps(fjx2,tx);
1415 fjy2 = _mm_add_ps(fjy2,ty);
1416 fjz2 = _mm_add_ps(fjz2,tz);
1420 /**************************
1421 * CALCULATE INTERACTIONS *
1422 **************************/
1424 if (gmx_mm_any_lt(rsq20,rcutoff2))
1427 r20 = _mm_mul_ps(rsq20,rinv20);
1428 r20 = _mm_andnot_ps(dummy_mask,r20);
1430 /* EWALD ELECTROSTATICS */
1432 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1433 ewrt = _mm_mul_ps(r20,ewtabscale);
1434 ewitab = _mm_cvttps_epi32(ewrt);
1435 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1436 ewitab = _mm_slli_epi32(ewitab,2);
1437 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1438 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1439 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1440 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1441 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1442 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1443 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1444 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1445 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1447 d = _mm_sub_ps(r20,rswitch);
1448 d = _mm_max_ps(d,_mm_setzero_ps());
1449 d2 = _mm_mul_ps(d,d);
1450 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1452 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1454 /* Evaluate switch function */
1455 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1456 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1457 velec = _mm_mul_ps(velec,sw);
1458 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1460 /* Update potential sum for this i atom from the interaction with this j atom. */
1461 velec = _mm_and_ps(velec,cutoff_mask);
1462 velec = _mm_andnot_ps(dummy_mask,velec);
1463 velecsum = _mm_add_ps(velecsum,velec);
1467 fscal = _mm_and_ps(fscal,cutoff_mask);
1469 fscal = _mm_andnot_ps(dummy_mask,fscal);
1471 /* Calculate temporary vectorial force */
1472 tx = _mm_mul_ps(fscal,dx20);
1473 ty = _mm_mul_ps(fscal,dy20);
1474 tz = _mm_mul_ps(fscal,dz20);
1476 /* Update vectorial force */
1477 fix2 = _mm_add_ps(fix2,tx);
1478 fiy2 = _mm_add_ps(fiy2,ty);
1479 fiz2 = _mm_add_ps(fiz2,tz);
1481 fjx0 = _mm_add_ps(fjx0,tx);
1482 fjy0 = _mm_add_ps(fjy0,ty);
1483 fjz0 = _mm_add_ps(fjz0,tz);
1487 /**************************
1488 * CALCULATE INTERACTIONS *
1489 **************************/
1491 if (gmx_mm_any_lt(rsq21,rcutoff2))
1494 r21 = _mm_mul_ps(rsq21,rinv21);
1495 r21 = _mm_andnot_ps(dummy_mask,r21);
1497 /* EWALD ELECTROSTATICS */
1499 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1500 ewrt = _mm_mul_ps(r21,ewtabscale);
1501 ewitab = _mm_cvttps_epi32(ewrt);
1502 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1503 ewitab = _mm_slli_epi32(ewitab,2);
1504 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1505 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1506 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1507 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1508 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1509 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1510 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1511 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1512 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1514 d = _mm_sub_ps(r21,rswitch);
1515 d = _mm_max_ps(d,_mm_setzero_ps());
1516 d2 = _mm_mul_ps(d,d);
1517 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1519 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1521 /* Evaluate switch function */
1522 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1523 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1524 velec = _mm_mul_ps(velec,sw);
1525 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1527 /* Update potential sum for this i atom from the interaction with this j atom. */
1528 velec = _mm_and_ps(velec,cutoff_mask);
1529 velec = _mm_andnot_ps(dummy_mask,velec);
1530 velecsum = _mm_add_ps(velecsum,velec);
1534 fscal = _mm_and_ps(fscal,cutoff_mask);
1536 fscal = _mm_andnot_ps(dummy_mask,fscal);
1538 /* Calculate temporary vectorial force */
1539 tx = _mm_mul_ps(fscal,dx21);
1540 ty = _mm_mul_ps(fscal,dy21);
1541 tz = _mm_mul_ps(fscal,dz21);
1543 /* Update vectorial force */
1544 fix2 = _mm_add_ps(fix2,tx);
1545 fiy2 = _mm_add_ps(fiy2,ty);
1546 fiz2 = _mm_add_ps(fiz2,tz);
1548 fjx1 = _mm_add_ps(fjx1,tx);
1549 fjy1 = _mm_add_ps(fjy1,ty);
1550 fjz1 = _mm_add_ps(fjz1,tz);
1554 /**************************
1555 * CALCULATE INTERACTIONS *
1556 **************************/
1558 if (gmx_mm_any_lt(rsq22,rcutoff2))
1561 r22 = _mm_mul_ps(rsq22,rinv22);
1562 r22 = _mm_andnot_ps(dummy_mask,r22);
1564 /* EWALD ELECTROSTATICS */
1566 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1567 ewrt = _mm_mul_ps(r22,ewtabscale);
1568 ewitab = _mm_cvttps_epi32(ewrt);
1569 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1570 ewitab = _mm_slli_epi32(ewitab,2);
1571 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1572 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1573 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1574 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1575 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1576 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1577 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1578 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1579 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1581 d = _mm_sub_ps(r22,rswitch);
1582 d = _mm_max_ps(d,_mm_setzero_ps());
1583 d2 = _mm_mul_ps(d,d);
1584 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1586 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1588 /* Evaluate switch function */
1589 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1590 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1591 velec = _mm_mul_ps(velec,sw);
1592 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1594 /* Update potential sum for this i atom from the interaction with this j atom. */
1595 velec = _mm_and_ps(velec,cutoff_mask);
1596 velec = _mm_andnot_ps(dummy_mask,velec);
1597 velecsum = _mm_add_ps(velecsum,velec);
1601 fscal = _mm_and_ps(fscal,cutoff_mask);
1603 fscal = _mm_andnot_ps(dummy_mask,fscal);
1605 /* Calculate temporary vectorial force */
1606 tx = _mm_mul_ps(fscal,dx22);
1607 ty = _mm_mul_ps(fscal,dy22);
1608 tz = _mm_mul_ps(fscal,dz22);
1610 /* Update vectorial force */
1611 fix2 = _mm_add_ps(fix2,tx);
1612 fiy2 = _mm_add_ps(fiy2,ty);
1613 fiz2 = _mm_add_ps(fiz2,tz);
1615 fjx2 = _mm_add_ps(fjx2,tx);
1616 fjy2 = _mm_add_ps(fjy2,ty);
1617 fjz2 = _mm_add_ps(fjz2,tz);
1621 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1622 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1623 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1624 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1626 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1627 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1629 /* Inner loop uses 612 flops */
1632 /* End of innermost loop */
1634 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1635 f+i_coord_offset,fshift+i_shift_offset);
1638 /* Update potential energies */
1639 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1640 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1642 /* Increment number of inner iterations */
1643 inneriter += j_index_end - j_index_start;
1645 /* Outer loop uses 20 flops */
1648 /* Increment number of outer iterations */
1651 /* Update outer/inner flops */
1653 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*612);
1656 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
1657 * Electrostatics interaction: Ewald
1658 * VdW interaction: LennardJones
1659 * Geometry: Water3-Water3
1660 * Calculate force/pot: Force
1663 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
1664 (t_nblist * gmx_restrict nlist,
1665 rvec * gmx_restrict xx,
1666 rvec * gmx_restrict ff,
1667 struct t_forcerec * gmx_restrict fr,
1668 t_mdatoms * gmx_restrict mdatoms,
1669 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1670 t_nrnb * gmx_restrict nrnb)
1672 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1673 * just 0 for non-waters.
1674 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1675 * jnr indices corresponding to data put in the four positions in the SIMD register.
1677 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1678 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1679 int jnrA,jnrB,jnrC,jnrD;
1680 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1681 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1682 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1683 real rcutoff_scalar;
1684 real *shiftvec,*fshift,*x,*f;
1685 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1686 real scratch[4*DIM];
1687 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1689 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1691 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1693 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1694 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1695 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1696 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1697 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1698 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1699 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1700 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1701 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1702 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1703 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1704 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1705 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1706 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1707 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1708 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1709 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1712 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1715 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1716 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1718 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1720 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1721 real rswitch_scalar,d_scalar;
1722 __m128 dummy_mask,cutoff_mask;
1723 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1724 __m128 one = _mm_set1_ps(1.0);
1725 __m128 two = _mm_set1_ps(2.0);
1731 jindex = nlist->jindex;
1733 shiftidx = nlist->shift;
1735 shiftvec = fr->shift_vec[0];
1736 fshift = fr->fshift[0];
1737 facel = _mm_set1_ps(fr->ic->epsfac);
1738 charge = mdatoms->chargeA;
1739 nvdwtype = fr->ntype;
1740 vdwparam = fr->nbfp;
1741 vdwtype = mdatoms->typeA;
1743 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1744 ewtab = fr->ic->tabq_coul_FDV0;
1745 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1746 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1748 /* Setup water-specific parameters */
1749 inr = nlist->iinr[0];
1750 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1751 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1752 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1753 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1755 jq0 = _mm_set1_ps(charge[inr+0]);
1756 jq1 = _mm_set1_ps(charge[inr+1]);
1757 jq2 = _mm_set1_ps(charge[inr+2]);
1758 vdwjidx0A = 2*vdwtype[inr+0];
1759 qq00 = _mm_mul_ps(iq0,jq0);
1760 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1761 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1762 qq01 = _mm_mul_ps(iq0,jq1);
1763 qq02 = _mm_mul_ps(iq0,jq2);
1764 qq10 = _mm_mul_ps(iq1,jq0);
1765 qq11 = _mm_mul_ps(iq1,jq1);
1766 qq12 = _mm_mul_ps(iq1,jq2);
1767 qq20 = _mm_mul_ps(iq2,jq0);
1768 qq21 = _mm_mul_ps(iq2,jq1);
1769 qq22 = _mm_mul_ps(iq2,jq2);
1771 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1772 rcutoff_scalar = fr->ic->rcoulomb;
1773 rcutoff = _mm_set1_ps(rcutoff_scalar);
1774 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1776 rswitch_scalar = fr->ic->rcoulomb_switch;
1777 rswitch = _mm_set1_ps(rswitch_scalar);
1778 /* Setup switch parameters */
1779 d_scalar = rcutoff_scalar-rswitch_scalar;
1780 d = _mm_set1_ps(d_scalar);
1781 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1782 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1783 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1784 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1785 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1786 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1788 /* Avoid stupid compiler warnings */
1789 jnrA = jnrB = jnrC = jnrD = 0;
1790 j_coord_offsetA = 0;
1791 j_coord_offsetB = 0;
1792 j_coord_offsetC = 0;
1793 j_coord_offsetD = 0;
1798 for(iidx=0;iidx<4*DIM;iidx++)
1800 scratch[iidx] = 0.0;
1803 /* Start outer loop over neighborlists */
1804 for(iidx=0; iidx<nri; iidx++)
1806 /* Load shift vector for this list */
1807 i_shift_offset = DIM*shiftidx[iidx];
1809 /* Load limits for loop over neighbors */
1810 j_index_start = jindex[iidx];
1811 j_index_end = jindex[iidx+1];
1813 /* Get outer coordinate index */
1815 i_coord_offset = DIM*inr;
1817 /* Load i particle coords and add shift vector */
1818 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1819 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1821 fix0 = _mm_setzero_ps();
1822 fiy0 = _mm_setzero_ps();
1823 fiz0 = _mm_setzero_ps();
1824 fix1 = _mm_setzero_ps();
1825 fiy1 = _mm_setzero_ps();
1826 fiz1 = _mm_setzero_ps();
1827 fix2 = _mm_setzero_ps();
1828 fiy2 = _mm_setzero_ps();
1829 fiz2 = _mm_setzero_ps();
1831 /* Start inner kernel loop */
1832 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1835 /* Get j neighbor index, and coordinate index */
1837 jnrB = jjnr[jidx+1];
1838 jnrC = jjnr[jidx+2];
1839 jnrD = jjnr[jidx+3];
1840 j_coord_offsetA = DIM*jnrA;
1841 j_coord_offsetB = DIM*jnrB;
1842 j_coord_offsetC = DIM*jnrC;
1843 j_coord_offsetD = DIM*jnrD;
1845 /* load j atom coordinates */
1846 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1847 x+j_coord_offsetC,x+j_coord_offsetD,
1848 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1850 /* Calculate displacement vector */
1851 dx00 = _mm_sub_ps(ix0,jx0);
1852 dy00 = _mm_sub_ps(iy0,jy0);
1853 dz00 = _mm_sub_ps(iz0,jz0);
1854 dx01 = _mm_sub_ps(ix0,jx1);
1855 dy01 = _mm_sub_ps(iy0,jy1);
1856 dz01 = _mm_sub_ps(iz0,jz1);
1857 dx02 = _mm_sub_ps(ix0,jx2);
1858 dy02 = _mm_sub_ps(iy0,jy2);
1859 dz02 = _mm_sub_ps(iz0,jz2);
1860 dx10 = _mm_sub_ps(ix1,jx0);
1861 dy10 = _mm_sub_ps(iy1,jy0);
1862 dz10 = _mm_sub_ps(iz1,jz0);
1863 dx11 = _mm_sub_ps(ix1,jx1);
1864 dy11 = _mm_sub_ps(iy1,jy1);
1865 dz11 = _mm_sub_ps(iz1,jz1);
1866 dx12 = _mm_sub_ps(ix1,jx2);
1867 dy12 = _mm_sub_ps(iy1,jy2);
1868 dz12 = _mm_sub_ps(iz1,jz2);
1869 dx20 = _mm_sub_ps(ix2,jx0);
1870 dy20 = _mm_sub_ps(iy2,jy0);
1871 dz20 = _mm_sub_ps(iz2,jz0);
1872 dx21 = _mm_sub_ps(ix2,jx1);
1873 dy21 = _mm_sub_ps(iy2,jy1);
1874 dz21 = _mm_sub_ps(iz2,jz1);
1875 dx22 = _mm_sub_ps(ix2,jx2);
1876 dy22 = _mm_sub_ps(iy2,jy2);
1877 dz22 = _mm_sub_ps(iz2,jz2);
1879 /* Calculate squared distance and things based on it */
1880 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1881 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1882 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1883 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1884 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1885 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1886 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1887 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1888 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1890 rinv00 = sse2_invsqrt_f(rsq00);
1891 rinv01 = sse2_invsqrt_f(rsq01);
1892 rinv02 = sse2_invsqrt_f(rsq02);
1893 rinv10 = sse2_invsqrt_f(rsq10);
1894 rinv11 = sse2_invsqrt_f(rsq11);
1895 rinv12 = sse2_invsqrt_f(rsq12);
1896 rinv20 = sse2_invsqrt_f(rsq20);
1897 rinv21 = sse2_invsqrt_f(rsq21);
1898 rinv22 = sse2_invsqrt_f(rsq22);
1900 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1901 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1902 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1903 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1904 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1905 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1906 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1907 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1908 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1910 fjx0 = _mm_setzero_ps();
1911 fjy0 = _mm_setzero_ps();
1912 fjz0 = _mm_setzero_ps();
1913 fjx1 = _mm_setzero_ps();
1914 fjy1 = _mm_setzero_ps();
1915 fjz1 = _mm_setzero_ps();
1916 fjx2 = _mm_setzero_ps();
1917 fjy2 = _mm_setzero_ps();
1918 fjz2 = _mm_setzero_ps();
1920 /**************************
1921 * CALCULATE INTERACTIONS *
1922 **************************/
1924 if (gmx_mm_any_lt(rsq00,rcutoff2))
1927 r00 = _mm_mul_ps(rsq00,rinv00);
1929 /* EWALD ELECTROSTATICS */
1931 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1932 ewrt = _mm_mul_ps(r00,ewtabscale);
1933 ewitab = _mm_cvttps_epi32(ewrt);
1934 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1935 ewitab = _mm_slli_epi32(ewitab,2);
1936 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1937 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1938 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1939 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1940 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1941 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1942 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1943 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1944 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1946 /* LENNARD-JONES DISPERSION/REPULSION */
1948 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1949 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1950 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1951 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1952 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1954 d = _mm_sub_ps(r00,rswitch);
1955 d = _mm_max_ps(d,_mm_setzero_ps());
1956 d2 = _mm_mul_ps(d,d);
1957 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1959 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1961 /* Evaluate switch function */
1962 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1963 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1964 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1965 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1967 fscal = _mm_add_ps(felec,fvdw);
1969 fscal = _mm_and_ps(fscal,cutoff_mask);
1971 /* Calculate temporary vectorial force */
1972 tx = _mm_mul_ps(fscal,dx00);
1973 ty = _mm_mul_ps(fscal,dy00);
1974 tz = _mm_mul_ps(fscal,dz00);
1976 /* Update vectorial force */
1977 fix0 = _mm_add_ps(fix0,tx);
1978 fiy0 = _mm_add_ps(fiy0,ty);
1979 fiz0 = _mm_add_ps(fiz0,tz);
1981 fjx0 = _mm_add_ps(fjx0,tx);
1982 fjy0 = _mm_add_ps(fjy0,ty);
1983 fjz0 = _mm_add_ps(fjz0,tz);
1987 /**************************
1988 * CALCULATE INTERACTIONS *
1989 **************************/
1991 if (gmx_mm_any_lt(rsq01,rcutoff2))
1994 r01 = _mm_mul_ps(rsq01,rinv01);
1996 /* EWALD ELECTROSTATICS */
1998 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1999 ewrt = _mm_mul_ps(r01,ewtabscale);
2000 ewitab = _mm_cvttps_epi32(ewrt);
2001 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2002 ewitab = _mm_slli_epi32(ewitab,2);
2003 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2004 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2005 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2006 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2007 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2008 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2009 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2010 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2011 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2013 d = _mm_sub_ps(r01,rswitch);
2014 d = _mm_max_ps(d,_mm_setzero_ps());
2015 d2 = _mm_mul_ps(d,d);
2016 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2018 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2020 /* Evaluate switch function */
2021 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2022 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2023 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2027 fscal = _mm_and_ps(fscal,cutoff_mask);
2029 /* Calculate temporary vectorial force */
2030 tx = _mm_mul_ps(fscal,dx01);
2031 ty = _mm_mul_ps(fscal,dy01);
2032 tz = _mm_mul_ps(fscal,dz01);
2034 /* Update vectorial force */
2035 fix0 = _mm_add_ps(fix0,tx);
2036 fiy0 = _mm_add_ps(fiy0,ty);
2037 fiz0 = _mm_add_ps(fiz0,tz);
2039 fjx1 = _mm_add_ps(fjx1,tx);
2040 fjy1 = _mm_add_ps(fjy1,ty);
2041 fjz1 = _mm_add_ps(fjz1,tz);
2045 /**************************
2046 * CALCULATE INTERACTIONS *
2047 **************************/
2049 if (gmx_mm_any_lt(rsq02,rcutoff2))
2052 r02 = _mm_mul_ps(rsq02,rinv02);
2054 /* EWALD ELECTROSTATICS */
2056 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2057 ewrt = _mm_mul_ps(r02,ewtabscale);
2058 ewitab = _mm_cvttps_epi32(ewrt);
2059 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2060 ewitab = _mm_slli_epi32(ewitab,2);
2061 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2062 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2063 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2064 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2065 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2066 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2067 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2068 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2069 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2071 d = _mm_sub_ps(r02,rswitch);
2072 d = _mm_max_ps(d,_mm_setzero_ps());
2073 d2 = _mm_mul_ps(d,d);
2074 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2076 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2078 /* Evaluate switch function */
2079 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2080 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2081 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2085 fscal = _mm_and_ps(fscal,cutoff_mask);
2087 /* Calculate temporary vectorial force */
2088 tx = _mm_mul_ps(fscal,dx02);
2089 ty = _mm_mul_ps(fscal,dy02);
2090 tz = _mm_mul_ps(fscal,dz02);
2092 /* Update vectorial force */
2093 fix0 = _mm_add_ps(fix0,tx);
2094 fiy0 = _mm_add_ps(fiy0,ty);
2095 fiz0 = _mm_add_ps(fiz0,tz);
2097 fjx2 = _mm_add_ps(fjx2,tx);
2098 fjy2 = _mm_add_ps(fjy2,ty);
2099 fjz2 = _mm_add_ps(fjz2,tz);
2103 /**************************
2104 * CALCULATE INTERACTIONS *
2105 **************************/
2107 if (gmx_mm_any_lt(rsq10,rcutoff2))
2110 r10 = _mm_mul_ps(rsq10,rinv10);
2112 /* EWALD ELECTROSTATICS */
2114 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2115 ewrt = _mm_mul_ps(r10,ewtabscale);
2116 ewitab = _mm_cvttps_epi32(ewrt);
2117 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2118 ewitab = _mm_slli_epi32(ewitab,2);
2119 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2120 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2121 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2122 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2123 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2124 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2125 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2126 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2127 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2129 d = _mm_sub_ps(r10,rswitch);
2130 d = _mm_max_ps(d,_mm_setzero_ps());
2131 d2 = _mm_mul_ps(d,d);
2132 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2134 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2136 /* Evaluate switch function */
2137 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2138 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2139 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2143 fscal = _mm_and_ps(fscal,cutoff_mask);
2145 /* Calculate temporary vectorial force */
2146 tx = _mm_mul_ps(fscal,dx10);
2147 ty = _mm_mul_ps(fscal,dy10);
2148 tz = _mm_mul_ps(fscal,dz10);
2150 /* Update vectorial force */
2151 fix1 = _mm_add_ps(fix1,tx);
2152 fiy1 = _mm_add_ps(fiy1,ty);
2153 fiz1 = _mm_add_ps(fiz1,tz);
2155 fjx0 = _mm_add_ps(fjx0,tx);
2156 fjy0 = _mm_add_ps(fjy0,ty);
2157 fjz0 = _mm_add_ps(fjz0,tz);
2161 /**************************
2162 * CALCULATE INTERACTIONS *
2163 **************************/
2165 if (gmx_mm_any_lt(rsq11,rcutoff2))
2168 r11 = _mm_mul_ps(rsq11,rinv11);
2170 /* EWALD ELECTROSTATICS */
2172 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2173 ewrt = _mm_mul_ps(r11,ewtabscale);
2174 ewitab = _mm_cvttps_epi32(ewrt);
2175 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2176 ewitab = _mm_slli_epi32(ewitab,2);
2177 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2178 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2179 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2180 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2181 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2182 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2183 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2184 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2185 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2187 d = _mm_sub_ps(r11,rswitch);
2188 d = _mm_max_ps(d,_mm_setzero_ps());
2189 d2 = _mm_mul_ps(d,d);
2190 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2192 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2194 /* Evaluate switch function */
2195 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2196 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2197 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2201 fscal = _mm_and_ps(fscal,cutoff_mask);
2203 /* Calculate temporary vectorial force */
2204 tx = _mm_mul_ps(fscal,dx11);
2205 ty = _mm_mul_ps(fscal,dy11);
2206 tz = _mm_mul_ps(fscal,dz11);
2208 /* Update vectorial force */
2209 fix1 = _mm_add_ps(fix1,tx);
2210 fiy1 = _mm_add_ps(fiy1,ty);
2211 fiz1 = _mm_add_ps(fiz1,tz);
2213 fjx1 = _mm_add_ps(fjx1,tx);
2214 fjy1 = _mm_add_ps(fjy1,ty);
2215 fjz1 = _mm_add_ps(fjz1,tz);
2219 /**************************
2220 * CALCULATE INTERACTIONS *
2221 **************************/
2223 if (gmx_mm_any_lt(rsq12,rcutoff2))
2226 r12 = _mm_mul_ps(rsq12,rinv12);
2228 /* EWALD ELECTROSTATICS */
2230 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2231 ewrt = _mm_mul_ps(r12,ewtabscale);
2232 ewitab = _mm_cvttps_epi32(ewrt);
2233 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2234 ewitab = _mm_slli_epi32(ewitab,2);
2235 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2236 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2237 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2238 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2239 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2240 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2241 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2242 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2243 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2245 d = _mm_sub_ps(r12,rswitch);
2246 d = _mm_max_ps(d,_mm_setzero_ps());
2247 d2 = _mm_mul_ps(d,d);
2248 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2250 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2252 /* Evaluate switch function */
2253 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2254 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2255 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2259 fscal = _mm_and_ps(fscal,cutoff_mask);
2261 /* Calculate temporary vectorial force */
2262 tx = _mm_mul_ps(fscal,dx12);
2263 ty = _mm_mul_ps(fscal,dy12);
2264 tz = _mm_mul_ps(fscal,dz12);
2266 /* Update vectorial force */
2267 fix1 = _mm_add_ps(fix1,tx);
2268 fiy1 = _mm_add_ps(fiy1,ty);
2269 fiz1 = _mm_add_ps(fiz1,tz);
2271 fjx2 = _mm_add_ps(fjx2,tx);
2272 fjy2 = _mm_add_ps(fjy2,ty);
2273 fjz2 = _mm_add_ps(fjz2,tz);
2277 /**************************
2278 * CALCULATE INTERACTIONS *
2279 **************************/
2281 if (gmx_mm_any_lt(rsq20,rcutoff2))
2284 r20 = _mm_mul_ps(rsq20,rinv20);
2286 /* EWALD ELECTROSTATICS */
2288 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2289 ewrt = _mm_mul_ps(r20,ewtabscale);
2290 ewitab = _mm_cvttps_epi32(ewrt);
2291 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2292 ewitab = _mm_slli_epi32(ewitab,2);
2293 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2294 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2295 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2296 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2297 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2298 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2299 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2300 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2301 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2303 d = _mm_sub_ps(r20,rswitch);
2304 d = _mm_max_ps(d,_mm_setzero_ps());
2305 d2 = _mm_mul_ps(d,d);
2306 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2308 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2310 /* Evaluate switch function */
2311 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2312 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2313 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2317 fscal = _mm_and_ps(fscal,cutoff_mask);
2319 /* Calculate temporary vectorial force */
2320 tx = _mm_mul_ps(fscal,dx20);
2321 ty = _mm_mul_ps(fscal,dy20);
2322 tz = _mm_mul_ps(fscal,dz20);
2324 /* Update vectorial force */
2325 fix2 = _mm_add_ps(fix2,tx);
2326 fiy2 = _mm_add_ps(fiy2,ty);
2327 fiz2 = _mm_add_ps(fiz2,tz);
2329 fjx0 = _mm_add_ps(fjx0,tx);
2330 fjy0 = _mm_add_ps(fjy0,ty);
2331 fjz0 = _mm_add_ps(fjz0,tz);
2335 /**************************
2336 * CALCULATE INTERACTIONS *
2337 **************************/
2339 if (gmx_mm_any_lt(rsq21,rcutoff2))
2342 r21 = _mm_mul_ps(rsq21,rinv21);
2344 /* EWALD ELECTROSTATICS */
2346 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2347 ewrt = _mm_mul_ps(r21,ewtabscale);
2348 ewitab = _mm_cvttps_epi32(ewrt);
2349 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2350 ewitab = _mm_slli_epi32(ewitab,2);
2351 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2352 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2353 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2354 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2355 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2356 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2357 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2358 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2359 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2361 d = _mm_sub_ps(r21,rswitch);
2362 d = _mm_max_ps(d,_mm_setzero_ps());
2363 d2 = _mm_mul_ps(d,d);
2364 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2366 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2368 /* Evaluate switch function */
2369 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2370 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2371 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2375 fscal = _mm_and_ps(fscal,cutoff_mask);
2377 /* Calculate temporary vectorial force */
2378 tx = _mm_mul_ps(fscal,dx21);
2379 ty = _mm_mul_ps(fscal,dy21);
2380 tz = _mm_mul_ps(fscal,dz21);
2382 /* Update vectorial force */
2383 fix2 = _mm_add_ps(fix2,tx);
2384 fiy2 = _mm_add_ps(fiy2,ty);
2385 fiz2 = _mm_add_ps(fiz2,tz);
2387 fjx1 = _mm_add_ps(fjx1,tx);
2388 fjy1 = _mm_add_ps(fjy1,ty);
2389 fjz1 = _mm_add_ps(fjz1,tz);
2393 /**************************
2394 * CALCULATE INTERACTIONS *
2395 **************************/
2397 if (gmx_mm_any_lt(rsq22,rcutoff2))
2400 r22 = _mm_mul_ps(rsq22,rinv22);
2402 /* EWALD ELECTROSTATICS */
2404 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2405 ewrt = _mm_mul_ps(r22,ewtabscale);
2406 ewitab = _mm_cvttps_epi32(ewrt);
2407 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2408 ewitab = _mm_slli_epi32(ewitab,2);
2409 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2410 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2411 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2412 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2413 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2414 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2415 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2416 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2417 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2419 d = _mm_sub_ps(r22,rswitch);
2420 d = _mm_max_ps(d,_mm_setzero_ps());
2421 d2 = _mm_mul_ps(d,d);
2422 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2424 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2426 /* Evaluate switch function */
2427 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2428 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2429 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2433 fscal = _mm_and_ps(fscal,cutoff_mask);
2435 /* Calculate temporary vectorial force */
2436 tx = _mm_mul_ps(fscal,dx22);
2437 ty = _mm_mul_ps(fscal,dy22);
2438 tz = _mm_mul_ps(fscal,dz22);
2440 /* Update vectorial force */
2441 fix2 = _mm_add_ps(fix2,tx);
2442 fiy2 = _mm_add_ps(fiy2,ty);
2443 fiz2 = _mm_add_ps(fiz2,tz);
2445 fjx2 = _mm_add_ps(fjx2,tx);
2446 fjy2 = _mm_add_ps(fjy2,ty);
2447 fjz2 = _mm_add_ps(fjz2,tz);
2451 fjptrA = f+j_coord_offsetA;
2452 fjptrB = f+j_coord_offsetB;
2453 fjptrC = f+j_coord_offsetC;
2454 fjptrD = f+j_coord_offsetD;
2456 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2457 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2459 /* Inner loop uses 573 flops */
2462 if(jidx<j_index_end)
2465 /* Get j neighbor index, and coordinate index */
2466 jnrlistA = jjnr[jidx];
2467 jnrlistB = jjnr[jidx+1];
2468 jnrlistC = jjnr[jidx+2];
2469 jnrlistD = jjnr[jidx+3];
2470 /* Sign of each element will be negative for non-real atoms.
2471 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2472 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2474 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2475 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2476 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2477 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2478 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2479 j_coord_offsetA = DIM*jnrA;
2480 j_coord_offsetB = DIM*jnrB;
2481 j_coord_offsetC = DIM*jnrC;
2482 j_coord_offsetD = DIM*jnrD;
2484 /* load j atom coordinates */
2485 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2486 x+j_coord_offsetC,x+j_coord_offsetD,
2487 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2489 /* Calculate displacement vector */
2490 dx00 = _mm_sub_ps(ix0,jx0);
2491 dy00 = _mm_sub_ps(iy0,jy0);
2492 dz00 = _mm_sub_ps(iz0,jz0);
2493 dx01 = _mm_sub_ps(ix0,jx1);
2494 dy01 = _mm_sub_ps(iy0,jy1);
2495 dz01 = _mm_sub_ps(iz0,jz1);
2496 dx02 = _mm_sub_ps(ix0,jx2);
2497 dy02 = _mm_sub_ps(iy0,jy2);
2498 dz02 = _mm_sub_ps(iz0,jz2);
2499 dx10 = _mm_sub_ps(ix1,jx0);
2500 dy10 = _mm_sub_ps(iy1,jy0);
2501 dz10 = _mm_sub_ps(iz1,jz0);
2502 dx11 = _mm_sub_ps(ix1,jx1);
2503 dy11 = _mm_sub_ps(iy1,jy1);
2504 dz11 = _mm_sub_ps(iz1,jz1);
2505 dx12 = _mm_sub_ps(ix1,jx2);
2506 dy12 = _mm_sub_ps(iy1,jy2);
2507 dz12 = _mm_sub_ps(iz1,jz2);
2508 dx20 = _mm_sub_ps(ix2,jx0);
2509 dy20 = _mm_sub_ps(iy2,jy0);
2510 dz20 = _mm_sub_ps(iz2,jz0);
2511 dx21 = _mm_sub_ps(ix2,jx1);
2512 dy21 = _mm_sub_ps(iy2,jy1);
2513 dz21 = _mm_sub_ps(iz2,jz1);
2514 dx22 = _mm_sub_ps(ix2,jx2);
2515 dy22 = _mm_sub_ps(iy2,jy2);
2516 dz22 = _mm_sub_ps(iz2,jz2);
2518 /* Calculate squared distance and things based on it */
2519 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2520 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2521 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2522 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2523 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2524 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2525 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2526 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2527 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2529 rinv00 = sse2_invsqrt_f(rsq00);
2530 rinv01 = sse2_invsqrt_f(rsq01);
2531 rinv02 = sse2_invsqrt_f(rsq02);
2532 rinv10 = sse2_invsqrt_f(rsq10);
2533 rinv11 = sse2_invsqrt_f(rsq11);
2534 rinv12 = sse2_invsqrt_f(rsq12);
2535 rinv20 = sse2_invsqrt_f(rsq20);
2536 rinv21 = sse2_invsqrt_f(rsq21);
2537 rinv22 = sse2_invsqrt_f(rsq22);
2539 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2540 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2541 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2542 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2543 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2544 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2545 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2546 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2547 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2549 fjx0 = _mm_setzero_ps();
2550 fjy0 = _mm_setzero_ps();
2551 fjz0 = _mm_setzero_ps();
2552 fjx1 = _mm_setzero_ps();
2553 fjy1 = _mm_setzero_ps();
2554 fjz1 = _mm_setzero_ps();
2555 fjx2 = _mm_setzero_ps();
2556 fjy2 = _mm_setzero_ps();
2557 fjz2 = _mm_setzero_ps();
2559 /**************************
2560 * CALCULATE INTERACTIONS *
2561 **************************/
2563 if (gmx_mm_any_lt(rsq00,rcutoff2))
2566 r00 = _mm_mul_ps(rsq00,rinv00);
2567 r00 = _mm_andnot_ps(dummy_mask,r00);
2569 /* EWALD ELECTROSTATICS */
2571 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2572 ewrt = _mm_mul_ps(r00,ewtabscale);
2573 ewitab = _mm_cvttps_epi32(ewrt);
2574 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2575 ewitab = _mm_slli_epi32(ewitab,2);
2576 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2577 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2578 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2579 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2580 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2581 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2582 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2583 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
2584 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2586 /* LENNARD-JONES DISPERSION/REPULSION */
2588 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2589 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
2590 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2591 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2592 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2594 d = _mm_sub_ps(r00,rswitch);
2595 d = _mm_max_ps(d,_mm_setzero_ps());
2596 d2 = _mm_mul_ps(d,d);
2597 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2599 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2601 /* Evaluate switch function */
2602 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2603 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2604 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2605 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2607 fscal = _mm_add_ps(felec,fvdw);
2609 fscal = _mm_and_ps(fscal,cutoff_mask);
2611 fscal = _mm_andnot_ps(dummy_mask,fscal);
2613 /* Calculate temporary vectorial force */
2614 tx = _mm_mul_ps(fscal,dx00);
2615 ty = _mm_mul_ps(fscal,dy00);
2616 tz = _mm_mul_ps(fscal,dz00);
2618 /* Update vectorial force */
2619 fix0 = _mm_add_ps(fix0,tx);
2620 fiy0 = _mm_add_ps(fiy0,ty);
2621 fiz0 = _mm_add_ps(fiz0,tz);
2623 fjx0 = _mm_add_ps(fjx0,tx);
2624 fjy0 = _mm_add_ps(fjy0,ty);
2625 fjz0 = _mm_add_ps(fjz0,tz);
2629 /**************************
2630 * CALCULATE INTERACTIONS *
2631 **************************/
2633 if (gmx_mm_any_lt(rsq01,rcutoff2))
2636 r01 = _mm_mul_ps(rsq01,rinv01);
2637 r01 = _mm_andnot_ps(dummy_mask,r01);
2639 /* EWALD ELECTROSTATICS */
2641 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2642 ewrt = _mm_mul_ps(r01,ewtabscale);
2643 ewitab = _mm_cvttps_epi32(ewrt);
2644 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2645 ewitab = _mm_slli_epi32(ewitab,2);
2646 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2647 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2648 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2649 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2650 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2651 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2652 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2653 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2654 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2656 d = _mm_sub_ps(r01,rswitch);
2657 d = _mm_max_ps(d,_mm_setzero_ps());
2658 d2 = _mm_mul_ps(d,d);
2659 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2661 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2663 /* Evaluate switch function */
2664 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2665 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2666 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2670 fscal = _mm_and_ps(fscal,cutoff_mask);
2672 fscal = _mm_andnot_ps(dummy_mask,fscal);
2674 /* Calculate temporary vectorial force */
2675 tx = _mm_mul_ps(fscal,dx01);
2676 ty = _mm_mul_ps(fscal,dy01);
2677 tz = _mm_mul_ps(fscal,dz01);
2679 /* Update vectorial force */
2680 fix0 = _mm_add_ps(fix0,tx);
2681 fiy0 = _mm_add_ps(fiy0,ty);
2682 fiz0 = _mm_add_ps(fiz0,tz);
2684 fjx1 = _mm_add_ps(fjx1,tx);
2685 fjy1 = _mm_add_ps(fjy1,ty);
2686 fjz1 = _mm_add_ps(fjz1,tz);
2690 /**************************
2691 * CALCULATE INTERACTIONS *
2692 **************************/
2694 if (gmx_mm_any_lt(rsq02,rcutoff2))
2697 r02 = _mm_mul_ps(rsq02,rinv02);
2698 r02 = _mm_andnot_ps(dummy_mask,r02);
2700 /* EWALD ELECTROSTATICS */
2702 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2703 ewrt = _mm_mul_ps(r02,ewtabscale);
2704 ewitab = _mm_cvttps_epi32(ewrt);
2705 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2706 ewitab = _mm_slli_epi32(ewitab,2);
2707 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2708 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2709 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2710 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2711 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2712 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2713 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2714 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2715 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2717 d = _mm_sub_ps(r02,rswitch);
2718 d = _mm_max_ps(d,_mm_setzero_ps());
2719 d2 = _mm_mul_ps(d,d);
2720 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2722 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2724 /* Evaluate switch function */
2725 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2726 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2727 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2731 fscal = _mm_and_ps(fscal,cutoff_mask);
2733 fscal = _mm_andnot_ps(dummy_mask,fscal);
2735 /* Calculate temporary vectorial force */
2736 tx = _mm_mul_ps(fscal,dx02);
2737 ty = _mm_mul_ps(fscal,dy02);
2738 tz = _mm_mul_ps(fscal,dz02);
2740 /* Update vectorial force */
2741 fix0 = _mm_add_ps(fix0,tx);
2742 fiy0 = _mm_add_ps(fiy0,ty);
2743 fiz0 = _mm_add_ps(fiz0,tz);
2745 fjx2 = _mm_add_ps(fjx2,tx);
2746 fjy2 = _mm_add_ps(fjy2,ty);
2747 fjz2 = _mm_add_ps(fjz2,tz);
2751 /**************************
2752 * CALCULATE INTERACTIONS *
2753 **************************/
2755 if (gmx_mm_any_lt(rsq10,rcutoff2))
2758 r10 = _mm_mul_ps(rsq10,rinv10);
2759 r10 = _mm_andnot_ps(dummy_mask,r10);
2761 /* EWALD ELECTROSTATICS */
2763 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2764 ewrt = _mm_mul_ps(r10,ewtabscale);
2765 ewitab = _mm_cvttps_epi32(ewrt);
2766 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2767 ewitab = _mm_slli_epi32(ewitab,2);
2768 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2769 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2770 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2771 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2772 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2773 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2774 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2775 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2776 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2778 d = _mm_sub_ps(r10,rswitch);
2779 d = _mm_max_ps(d,_mm_setzero_ps());
2780 d2 = _mm_mul_ps(d,d);
2781 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2783 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2785 /* Evaluate switch function */
2786 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2787 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2788 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2792 fscal = _mm_and_ps(fscal,cutoff_mask);
2794 fscal = _mm_andnot_ps(dummy_mask,fscal);
2796 /* Calculate temporary vectorial force */
2797 tx = _mm_mul_ps(fscal,dx10);
2798 ty = _mm_mul_ps(fscal,dy10);
2799 tz = _mm_mul_ps(fscal,dz10);
2801 /* Update vectorial force */
2802 fix1 = _mm_add_ps(fix1,tx);
2803 fiy1 = _mm_add_ps(fiy1,ty);
2804 fiz1 = _mm_add_ps(fiz1,tz);
2806 fjx0 = _mm_add_ps(fjx0,tx);
2807 fjy0 = _mm_add_ps(fjy0,ty);
2808 fjz0 = _mm_add_ps(fjz0,tz);
2812 /**************************
2813 * CALCULATE INTERACTIONS *
2814 **************************/
2816 if (gmx_mm_any_lt(rsq11,rcutoff2))
2819 r11 = _mm_mul_ps(rsq11,rinv11);
2820 r11 = _mm_andnot_ps(dummy_mask,r11);
2822 /* EWALD ELECTROSTATICS */
2824 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2825 ewrt = _mm_mul_ps(r11,ewtabscale);
2826 ewitab = _mm_cvttps_epi32(ewrt);
2827 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2828 ewitab = _mm_slli_epi32(ewitab,2);
2829 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2830 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2831 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2832 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2833 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2834 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2835 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2836 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2837 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2839 d = _mm_sub_ps(r11,rswitch);
2840 d = _mm_max_ps(d,_mm_setzero_ps());
2841 d2 = _mm_mul_ps(d,d);
2842 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2844 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2846 /* Evaluate switch function */
2847 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2848 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2849 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2853 fscal = _mm_and_ps(fscal,cutoff_mask);
2855 fscal = _mm_andnot_ps(dummy_mask,fscal);
2857 /* Calculate temporary vectorial force */
2858 tx = _mm_mul_ps(fscal,dx11);
2859 ty = _mm_mul_ps(fscal,dy11);
2860 tz = _mm_mul_ps(fscal,dz11);
2862 /* Update vectorial force */
2863 fix1 = _mm_add_ps(fix1,tx);
2864 fiy1 = _mm_add_ps(fiy1,ty);
2865 fiz1 = _mm_add_ps(fiz1,tz);
2867 fjx1 = _mm_add_ps(fjx1,tx);
2868 fjy1 = _mm_add_ps(fjy1,ty);
2869 fjz1 = _mm_add_ps(fjz1,tz);
2873 /**************************
2874 * CALCULATE INTERACTIONS *
2875 **************************/
2877 if (gmx_mm_any_lt(rsq12,rcutoff2))
2880 r12 = _mm_mul_ps(rsq12,rinv12);
2881 r12 = _mm_andnot_ps(dummy_mask,r12);
2883 /* EWALD ELECTROSTATICS */
2885 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2886 ewrt = _mm_mul_ps(r12,ewtabscale);
2887 ewitab = _mm_cvttps_epi32(ewrt);
2888 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2889 ewitab = _mm_slli_epi32(ewitab,2);
2890 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2891 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2892 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2893 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2894 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2895 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2896 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2897 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2898 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2900 d = _mm_sub_ps(r12,rswitch);
2901 d = _mm_max_ps(d,_mm_setzero_ps());
2902 d2 = _mm_mul_ps(d,d);
2903 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2905 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2907 /* Evaluate switch function */
2908 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2909 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2910 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2914 fscal = _mm_and_ps(fscal,cutoff_mask);
2916 fscal = _mm_andnot_ps(dummy_mask,fscal);
2918 /* Calculate temporary vectorial force */
2919 tx = _mm_mul_ps(fscal,dx12);
2920 ty = _mm_mul_ps(fscal,dy12);
2921 tz = _mm_mul_ps(fscal,dz12);
2923 /* Update vectorial force */
2924 fix1 = _mm_add_ps(fix1,tx);
2925 fiy1 = _mm_add_ps(fiy1,ty);
2926 fiz1 = _mm_add_ps(fiz1,tz);
2928 fjx2 = _mm_add_ps(fjx2,tx);
2929 fjy2 = _mm_add_ps(fjy2,ty);
2930 fjz2 = _mm_add_ps(fjz2,tz);
2934 /**************************
2935 * CALCULATE INTERACTIONS *
2936 **************************/
2938 if (gmx_mm_any_lt(rsq20,rcutoff2))
2941 r20 = _mm_mul_ps(rsq20,rinv20);
2942 r20 = _mm_andnot_ps(dummy_mask,r20);
2944 /* EWALD ELECTROSTATICS */
2946 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2947 ewrt = _mm_mul_ps(r20,ewtabscale);
2948 ewitab = _mm_cvttps_epi32(ewrt);
2949 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2950 ewitab = _mm_slli_epi32(ewitab,2);
2951 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2952 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2953 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2954 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2955 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2956 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2957 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2958 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2959 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2961 d = _mm_sub_ps(r20,rswitch);
2962 d = _mm_max_ps(d,_mm_setzero_ps());
2963 d2 = _mm_mul_ps(d,d);
2964 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2966 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2968 /* Evaluate switch function */
2969 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2970 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2971 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2975 fscal = _mm_and_ps(fscal,cutoff_mask);
2977 fscal = _mm_andnot_ps(dummy_mask,fscal);
2979 /* Calculate temporary vectorial force */
2980 tx = _mm_mul_ps(fscal,dx20);
2981 ty = _mm_mul_ps(fscal,dy20);
2982 tz = _mm_mul_ps(fscal,dz20);
2984 /* Update vectorial force */
2985 fix2 = _mm_add_ps(fix2,tx);
2986 fiy2 = _mm_add_ps(fiy2,ty);
2987 fiz2 = _mm_add_ps(fiz2,tz);
2989 fjx0 = _mm_add_ps(fjx0,tx);
2990 fjy0 = _mm_add_ps(fjy0,ty);
2991 fjz0 = _mm_add_ps(fjz0,tz);
2995 /**************************
2996 * CALCULATE INTERACTIONS *
2997 **************************/
2999 if (gmx_mm_any_lt(rsq21,rcutoff2))
3002 r21 = _mm_mul_ps(rsq21,rinv21);
3003 r21 = _mm_andnot_ps(dummy_mask,r21);
3005 /* EWALD ELECTROSTATICS */
3007 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3008 ewrt = _mm_mul_ps(r21,ewtabscale);
3009 ewitab = _mm_cvttps_epi32(ewrt);
3010 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3011 ewitab = _mm_slli_epi32(ewitab,2);
3012 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3013 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3014 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3015 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3016 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3017 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3018 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3019 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3020 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3022 d = _mm_sub_ps(r21,rswitch);
3023 d = _mm_max_ps(d,_mm_setzero_ps());
3024 d2 = _mm_mul_ps(d,d);
3025 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3027 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3029 /* Evaluate switch function */
3030 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3031 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3032 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
3036 fscal = _mm_and_ps(fscal,cutoff_mask);
3038 fscal = _mm_andnot_ps(dummy_mask,fscal);
3040 /* Calculate temporary vectorial force */
3041 tx = _mm_mul_ps(fscal,dx21);
3042 ty = _mm_mul_ps(fscal,dy21);
3043 tz = _mm_mul_ps(fscal,dz21);
3045 /* Update vectorial force */
3046 fix2 = _mm_add_ps(fix2,tx);
3047 fiy2 = _mm_add_ps(fiy2,ty);
3048 fiz2 = _mm_add_ps(fiz2,tz);
3050 fjx1 = _mm_add_ps(fjx1,tx);
3051 fjy1 = _mm_add_ps(fjy1,ty);
3052 fjz1 = _mm_add_ps(fjz1,tz);
3056 /**************************
3057 * CALCULATE INTERACTIONS *
3058 **************************/
3060 if (gmx_mm_any_lt(rsq22,rcutoff2))
3063 r22 = _mm_mul_ps(rsq22,rinv22);
3064 r22 = _mm_andnot_ps(dummy_mask,r22);
3066 /* EWALD ELECTROSTATICS */
3068 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3069 ewrt = _mm_mul_ps(r22,ewtabscale);
3070 ewitab = _mm_cvttps_epi32(ewrt);
3071 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3072 ewitab = _mm_slli_epi32(ewitab,2);
3073 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3074 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3075 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3076 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3077 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3078 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3079 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3080 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3081 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3083 d = _mm_sub_ps(r22,rswitch);
3084 d = _mm_max_ps(d,_mm_setzero_ps());
3085 d2 = _mm_mul_ps(d,d);
3086 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3088 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3090 /* Evaluate switch function */
3091 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3092 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3093 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
3097 fscal = _mm_and_ps(fscal,cutoff_mask);
3099 fscal = _mm_andnot_ps(dummy_mask,fscal);
3101 /* Calculate temporary vectorial force */
3102 tx = _mm_mul_ps(fscal,dx22);
3103 ty = _mm_mul_ps(fscal,dy22);
3104 tz = _mm_mul_ps(fscal,dz22);
3106 /* Update vectorial force */
3107 fix2 = _mm_add_ps(fix2,tx);
3108 fiy2 = _mm_add_ps(fiy2,ty);
3109 fiz2 = _mm_add_ps(fiz2,tz);
3111 fjx2 = _mm_add_ps(fjx2,tx);
3112 fjy2 = _mm_add_ps(fjy2,ty);
3113 fjz2 = _mm_add_ps(fjz2,tz);
3117 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3118 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3119 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3120 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3122 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3123 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3125 /* Inner loop uses 582 flops */
3128 /* End of innermost loop */
3130 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3131 f+i_coord_offset,fshift+i_shift_offset);
3133 /* Increment number of inner iterations */
3134 inneriter += j_index_end - j_index_start;
3136 /* Outer loop uses 18 flops */
3139 /* Increment number of outer iterations */
3142 /* Update outer/inner flops */
3144 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*582);