2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
111 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
113 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
115 __m128 dummy_mask,cutoff_mask;
116 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
117 __m128 one = _mm_set1_ps(1.0);
118 __m128 two = _mm_set1_ps(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm_set1_ps(fr->epsfac);
131 charge = mdatoms->chargeA;
132 nvdwtype = fr->ntype;
134 vdwtype = mdatoms->typeA;
136 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
137 ewtab = fr->ic->tabq_coul_FDV0;
138 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
139 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
144 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
145 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
146 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
148 jq0 = _mm_set1_ps(charge[inr+0]);
149 jq1 = _mm_set1_ps(charge[inr+1]);
150 jq2 = _mm_set1_ps(charge[inr+2]);
151 vdwjidx0A = 2*vdwtype[inr+0];
152 qq00 = _mm_mul_ps(iq0,jq0);
153 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
154 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
155 qq01 = _mm_mul_ps(iq0,jq1);
156 qq02 = _mm_mul_ps(iq0,jq2);
157 qq10 = _mm_mul_ps(iq1,jq0);
158 qq11 = _mm_mul_ps(iq1,jq1);
159 qq12 = _mm_mul_ps(iq1,jq2);
160 qq20 = _mm_mul_ps(iq2,jq0);
161 qq21 = _mm_mul_ps(iq2,jq1);
162 qq22 = _mm_mul_ps(iq2,jq2);
164 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
165 rcutoff_scalar = fr->rcoulomb;
166 rcutoff = _mm_set1_ps(rcutoff_scalar);
167 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
169 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
170 rvdw = _mm_set1_ps(fr->rvdw);
172 /* Avoid stupid compiler warnings */
173 jnrA = jnrB = jnrC = jnrD = 0;
182 for(iidx=0;iidx<4*DIM;iidx++)
187 /* Start outer loop over neighborlists */
188 for(iidx=0; iidx<nri; iidx++)
190 /* Load shift vector for this list */
191 i_shift_offset = DIM*shiftidx[iidx];
193 /* Load limits for loop over neighbors */
194 j_index_start = jindex[iidx];
195 j_index_end = jindex[iidx+1];
197 /* Get outer coordinate index */
199 i_coord_offset = DIM*inr;
201 /* Load i particle coords and add shift vector */
202 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
203 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
205 fix0 = _mm_setzero_ps();
206 fiy0 = _mm_setzero_ps();
207 fiz0 = _mm_setzero_ps();
208 fix1 = _mm_setzero_ps();
209 fiy1 = _mm_setzero_ps();
210 fiz1 = _mm_setzero_ps();
211 fix2 = _mm_setzero_ps();
212 fiy2 = _mm_setzero_ps();
213 fiz2 = _mm_setzero_ps();
215 /* Reset potential sums */
216 velecsum = _mm_setzero_ps();
217 vvdwsum = _mm_setzero_ps();
219 /* Start inner kernel loop */
220 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
223 /* Get j neighbor index, and coordinate index */
228 j_coord_offsetA = DIM*jnrA;
229 j_coord_offsetB = DIM*jnrB;
230 j_coord_offsetC = DIM*jnrC;
231 j_coord_offsetD = DIM*jnrD;
233 /* load j atom coordinates */
234 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
235 x+j_coord_offsetC,x+j_coord_offsetD,
236 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
238 /* Calculate displacement vector */
239 dx00 = _mm_sub_ps(ix0,jx0);
240 dy00 = _mm_sub_ps(iy0,jy0);
241 dz00 = _mm_sub_ps(iz0,jz0);
242 dx01 = _mm_sub_ps(ix0,jx1);
243 dy01 = _mm_sub_ps(iy0,jy1);
244 dz01 = _mm_sub_ps(iz0,jz1);
245 dx02 = _mm_sub_ps(ix0,jx2);
246 dy02 = _mm_sub_ps(iy0,jy2);
247 dz02 = _mm_sub_ps(iz0,jz2);
248 dx10 = _mm_sub_ps(ix1,jx0);
249 dy10 = _mm_sub_ps(iy1,jy0);
250 dz10 = _mm_sub_ps(iz1,jz0);
251 dx11 = _mm_sub_ps(ix1,jx1);
252 dy11 = _mm_sub_ps(iy1,jy1);
253 dz11 = _mm_sub_ps(iz1,jz1);
254 dx12 = _mm_sub_ps(ix1,jx2);
255 dy12 = _mm_sub_ps(iy1,jy2);
256 dz12 = _mm_sub_ps(iz1,jz2);
257 dx20 = _mm_sub_ps(ix2,jx0);
258 dy20 = _mm_sub_ps(iy2,jy0);
259 dz20 = _mm_sub_ps(iz2,jz0);
260 dx21 = _mm_sub_ps(ix2,jx1);
261 dy21 = _mm_sub_ps(iy2,jy1);
262 dz21 = _mm_sub_ps(iz2,jz1);
263 dx22 = _mm_sub_ps(ix2,jx2);
264 dy22 = _mm_sub_ps(iy2,jy2);
265 dz22 = _mm_sub_ps(iz2,jz2);
267 /* Calculate squared distance and things based on it */
268 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
269 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
270 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
271 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
272 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
273 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
274 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
275 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
276 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
278 rinv00 = gmx_mm_invsqrt_ps(rsq00);
279 rinv01 = gmx_mm_invsqrt_ps(rsq01);
280 rinv02 = gmx_mm_invsqrt_ps(rsq02);
281 rinv10 = gmx_mm_invsqrt_ps(rsq10);
282 rinv11 = gmx_mm_invsqrt_ps(rsq11);
283 rinv12 = gmx_mm_invsqrt_ps(rsq12);
284 rinv20 = gmx_mm_invsqrt_ps(rsq20);
285 rinv21 = gmx_mm_invsqrt_ps(rsq21);
286 rinv22 = gmx_mm_invsqrt_ps(rsq22);
288 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
289 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
290 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
291 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
292 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
293 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
294 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
295 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
296 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
298 fjx0 = _mm_setzero_ps();
299 fjy0 = _mm_setzero_ps();
300 fjz0 = _mm_setzero_ps();
301 fjx1 = _mm_setzero_ps();
302 fjy1 = _mm_setzero_ps();
303 fjz1 = _mm_setzero_ps();
304 fjx2 = _mm_setzero_ps();
305 fjy2 = _mm_setzero_ps();
306 fjz2 = _mm_setzero_ps();
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 if (gmx_mm_any_lt(rsq00,rcutoff2))
315 r00 = _mm_mul_ps(rsq00,rinv00);
317 /* EWALD ELECTROSTATICS */
319 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
320 ewrt = _mm_mul_ps(r00,ewtabscale);
321 ewitab = _mm_cvttps_epi32(ewrt);
322 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
323 ewitab = _mm_slli_epi32(ewitab,2);
324 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
325 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
326 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
327 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
328 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
329 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
330 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
331 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
332 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
334 /* LENNARD-JONES DISPERSION/REPULSION */
336 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
337 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
338 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
339 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
340 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
341 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
343 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
345 /* Update potential sum for this i atom from the interaction with this j atom. */
346 velec = _mm_and_ps(velec,cutoff_mask);
347 velecsum = _mm_add_ps(velecsum,velec);
348 vvdw = _mm_and_ps(vvdw,cutoff_mask);
349 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
351 fscal = _mm_add_ps(felec,fvdw);
353 fscal = _mm_and_ps(fscal,cutoff_mask);
355 /* Calculate temporary vectorial force */
356 tx = _mm_mul_ps(fscal,dx00);
357 ty = _mm_mul_ps(fscal,dy00);
358 tz = _mm_mul_ps(fscal,dz00);
360 /* Update vectorial force */
361 fix0 = _mm_add_ps(fix0,tx);
362 fiy0 = _mm_add_ps(fiy0,ty);
363 fiz0 = _mm_add_ps(fiz0,tz);
365 fjx0 = _mm_add_ps(fjx0,tx);
366 fjy0 = _mm_add_ps(fjy0,ty);
367 fjz0 = _mm_add_ps(fjz0,tz);
371 /**************************
372 * CALCULATE INTERACTIONS *
373 **************************/
375 if (gmx_mm_any_lt(rsq01,rcutoff2))
378 r01 = _mm_mul_ps(rsq01,rinv01);
380 /* EWALD ELECTROSTATICS */
382 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
383 ewrt = _mm_mul_ps(r01,ewtabscale);
384 ewitab = _mm_cvttps_epi32(ewrt);
385 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
386 ewitab = _mm_slli_epi32(ewitab,2);
387 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
388 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
389 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
390 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
391 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
392 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
393 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
394 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
395 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
397 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velec = _mm_and_ps(velec,cutoff_mask);
401 velecsum = _mm_add_ps(velecsum,velec);
405 fscal = _mm_and_ps(fscal,cutoff_mask);
407 /* Calculate temporary vectorial force */
408 tx = _mm_mul_ps(fscal,dx01);
409 ty = _mm_mul_ps(fscal,dy01);
410 tz = _mm_mul_ps(fscal,dz01);
412 /* Update vectorial force */
413 fix0 = _mm_add_ps(fix0,tx);
414 fiy0 = _mm_add_ps(fiy0,ty);
415 fiz0 = _mm_add_ps(fiz0,tz);
417 fjx1 = _mm_add_ps(fjx1,tx);
418 fjy1 = _mm_add_ps(fjy1,ty);
419 fjz1 = _mm_add_ps(fjz1,tz);
423 /**************************
424 * CALCULATE INTERACTIONS *
425 **************************/
427 if (gmx_mm_any_lt(rsq02,rcutoff2))
430 r02 = _mm_mul_ps(rsq02,rinv02);
432 /* EWALD ELECTROSTATICS */
434 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
435 ewrt = _mm_mul_ps(r02,ewtabscale);
436 ewitab = _mm_cvttps_epi32(ewrt);
437 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
438 ewitab = _mm_slli_epi32(ewitab,2);
439 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
440 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
441 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
442 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
443 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
444 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
445 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
446 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
447 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
449 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
451 /* Update potential sum for this i atom from the interaction with this j atom. */
452 velec = _mm_and_ps(velec,cutoff_mask);
453 velecsum = _mm_add_ps(velecsum,velec);
457 fscal = _mm_and_ps(fscal,cutoff_mask);
459 /* Calculate temporary vectorial force */
460 tx = _mm_mul_ps(fscal,dx02);
461 ty = _mm_mul_ps(fscal,dy02);
462 tz = _mm_mul_ps(fscal,dz02);
464 /* Update vectorial force */
465 fix0 = _mm_add_ps(fix0,tx);
466 fiy0 = _mm_add_ps(fiy0,ty);
467 fiz0 = _mm_add_ps(fiz0,tz);
469 fjx2 = _mm_add_ps(fjx2,tx);
470 fjy2 = _mm_add_ps(fjy2,ty);
471 fjz2 = _mm_add_ps(fjz2,tz);
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
479 if (gmx_mm_any_lt(rsq10,rcutoff2))
482 r10 = _mm_mul_ps(rsq10,rinv10);
484 /* EWALD ELECTROSTATICS */
486 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
487 ewrt = _mm_mul_ps(r10,ewtabscale);
488 ewitab = _mm_cvttps_epi32(ewrt);
489 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
490 ewitab = _mm_slli_epi32(ewitab,2);
491 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
492 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
493 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
494 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
495 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
496 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
497 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
498 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
499 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
501 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
503 /* Update potential sum for this i atom from the interaction with this j atom. */
504 velec = _mm_and_ps(velec,cutoff_mask);
505 velecsum = _mm_add_ps(velecsum,velec);
509 fscal = _mm_and_ps(fscal,cutoff_mask);
511 /* Calculate temporary vectorial force */
512 tx = _mm_mul_ps(fscal,dx10);
513 ty = _mm_mul_ps(fscal,dy10);
514 tz = _mm_mul_ps(fscal,dz10);
516 /* Update vectorial force */
517 fix1 = _mm_add_ps(fix1,tx);
518 fiy1 = _mm_add_ps(fiy1,ty);
519 fiz1 = _mm_add_ps(fiz1,tz);
521 fjx0 = _mm_add_ps(fjx0,tx);
522 fjy0 = _mm_add_ps(fjy0,ty);
523 fjz0 = _mm_add_ps(fjz0,tz);
527 /**************************
528 * CALCULATE INTERACTIONS *
529 **************************/
531 if (gmx_mm_any_lt(rsq11,rcutoff2))
534 r11 = _mm_mul_ps(rsq11,rinv11);
536 /* EWALD ELECTROSTATICS */
538 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
539 ewrt = _mm_mul_ps(r11,ewtabscale);
540 ewitab = _mm_cvttps_epi32(ewrt);
541 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
542 ewitab = _mm_slli_epi32(ewitab,2);
543 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
544 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
545 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
546 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
547 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
548 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
549 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
550 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
551 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
553 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
555 /* Update potential sum for this i atom from the interaction with this j atom. */
556 velec = _mm_and_ps(velec,cutoff_mask);
557 velecsum = _mm_add_ps(velecsum,velec);
561 fscal = _mm_and_ps(fscal,cutoff_mask);
563 /* Calculate temporary vectorial force */
564 tx = _mm_mul_ps(fscal,dx11);
565 ty = _mm_mul_ps(fscal,dy11);
566 tz = _mm_mul_ps(fscal,dz11);
568 /* Update vectorial force */
569 fix1 = _mm_add_ps(fix1,tx);
570 fiy1 = _mm_add_ps(fiy1,ty);
571 fiz1 = _mm_add_ps(fiz1,tz);
573 fjx1 = _mm_add_ps(fjx1,tx);
574 fjy1 = _mm_add_ps(fjy1,ty);
575 fjz1 = _mm_add_ps(fjz1,tz);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 if (gmx_mm_any_lt(rsq12,rcutoff2))
586 r12 = _mm_mul_ps(rsq12,rinv12);
588 /* EWALD ELECTROSTATICS */
590 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
591 ewrt = _mm_mul_ps(r12,ewtabscale);
592 ewitab = _mm_cvttps_epi32(ewrt);
593 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
594 ewitab = _mm_slli_epi32(ewitab,2);
595 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
596 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
597 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
598 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
599 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
600 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
601 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
602 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
603 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
605 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
607 /* Update potential sum for this i atom from the interaction with this j atom. */
608 velec = _mm_and_ps(velec,cutoff_mask);
609 velecsum = _mm_add_ps(velecsum,velec);
613 fscal = _mm_and_ps(fscal,cutoff_mask);
615 /* Calculate temporary vectorial force */
616 tx = _mm_mul_ps(fscal,dx12);
617 ty = _mm_mul_ps(fscal,dy12);
618 tz = _mm_mul_ps(fscal,dz12);
620 /* Update vectorial force */
621 fix1 = _mm_add_ps(fix1,tx);
622 fiy1 = _mm_add_ps(fiy1,ty);
623 fiz1 = _mm_add_ps(fiz1,tz);
625 fjx2 = _mm_add_ps(fjx2,tx);
626 fjy2 = _mm_add_ps(fjy2,ty);
627 fjz2 = _mm_add_ps(fjz2,tz);
631 /**************************
632 * CALCULATE INTERACTIONS *
633 **************************/
635 if (gmx_mm_any_lt(rsq20,rcutoff2))
638 r20 = _mm_mul_ps(rsq20,rinv20);
640 /* EWALD ELECTROSTATICS */
642 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
643 ewrt = _mm_mul_ps(r20,ewtabscale);
644 ewitab = _mm_cvttps_epi32(ewrt);
645 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
646 ewitab = _mm_slli_epi32(ewitab,2);
647 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
648 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
649 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
650 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
651 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
652 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
653 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
654 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
655 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
657 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
659 /* Update potential sum for this i atom from the interaction with this j atom. */
660 velec = _mm_and_ps(velec,cutoff_mask);
661 velecsum = _mm_add_ps(velecsum,velec);
665 fscal = _mm_and_ps(fscal,cutoff_mask);
667 /* Calculate temporary vectorial force */
668 tx = _mm_mul_ps(fscal,dx20);
669 ty = _mm_mul_ps(fscal,dy20);
670 tz = _mm_mul_ps(fscal,dz20);
672 /* Update vectorial force */
673 fix2 = _mm_add_ps(fix2,tx);
674 fiy2 = _mm_add_ps(fiy2,ty);
675 fiz2 = _mm_add_ps(fiz2,tz);
677 fjx0 = _mm_add_ps(fjx0,tx);
678 fjy0 = _mm_add_ps(fjy0,ty);
679 fjz0 = _mm_add_ps(fjz0,tz);
683 /**************************
684 * CALCULATE INTERACTIONS *
685 **************************/
687 if (gmx_mm_any_lt(rsq21,rcutoff2))
690 r21 = _mm_mul_ps(rsq21,rinv21);
692 /* EWALD ELECTROSTATICS */
694 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
695 ewrt = _mm_mul_ps(r21,ewtabscale);
696 ewitab = _mm_cvttps_epi32(ewrt);
697 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
698 ewitab = _mm_slli_epi32(ewitab,2);
699 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
700 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
701 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
702 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
703 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
704 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
705 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
706 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
707 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
709 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
711 /* Update potential sum for this i atom from the interaction with this j atom. */
712 velec = _mm_and_ps(velec,cutoff_mask);
713 velecsum = _mm_add_ps(velecsum,velec);
717 fscal = _mm_and_ps(fscal,cutoff_mask);
719 /* Calculate temporary vectorial force */
720 tx = _mm_mul_ps(fscal,dx21);
721 ty = _mm_mul_ps(fscal,dy21);
722 tz = _mm_mul_ps(fscal,dz21);
724 /* Update vectorial force */
725 fix2 = _mm_add_ps(fix2,tx);
726 fiy2 = _mm_add_ps(fiy2,ty);
727 fiz2 = _mm_add_ps(fiz2,tz);
729 fjx1 = _mm_add_ps(fjx1,tx);
730 fjy1 = _mm_add_ps(fjy1,ty);
731 fjz1 = _mm_add_ps(fjz1,tz);
735 /**************************
736 * CALCULATE INTERACTIONS *
737 **************************/
739 if (gmx_mm_any_lt(rsq22,rcutoff2))
742 r22 = _mm_mul_ps(rsq22,rinv22);
744 /* EWALD ELECTROSTATICS */
746 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
747 ewrt = _mm_mul_ps(r22,ewtabscale);
748 ewitab = _mm_cvttps_epi32(ewrt);
749 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
750 ewitab = _mm_slli_epi32(ewitab,2);
751 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
752 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
753 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
754 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
755 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
756 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
757 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
758 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
759 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
761 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
763 /* Update potential sum for this i atom from the interaction with this j atom. */
764 velec = _mm_and_ps(velec,cutoff_mask);
765 velecsum = _mm_add_ps(velecsum,velec);
769 fscal = _mm_and_ps(fscal,cutoff_mask);
771 /* Calculate temporary vectorial force */
772 tx = _mm_mul_ps(fscal,dx22);
773 ty = _mm_mul_ps(fscal,dy22);
774 tz = _mm_mul_ps(fscal,dz22);
776 /* Update vectorial force */
777 fix2 = _mm_add_ps(fix2,tx);
778 fiy2 = _mm_add_ps(fiy2,ty);
779 fiz2 = _mm_add_ps(fiz2,tz);
781 fjx2 = _mm_add_ps(fjx2,tx);
782 fjy2 = _mm_add_ps(fjy2,ty);
783 fjz2 = _mm_add_ps(fjz2,tz);
787 fjptrA = f+j_coord_offsetA;
788 fjptrB = f+j_coord_offsetB;
789 fjptrC = f+j_coord_offsetC;
790 fjptrD = f+j_coord_offsetD;
792 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
793 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
795 /* Inner loop uses 432 flops */
801 /* Get j neighbor index, and coordinate index */
802 jnrlistA = jjnr[jidx];
803 jnrlistB = jjnr[jidx+1];
804 jnrlistC = jjnr[jidx+2];
805 jnrlistD = jjnr[jidx+3];
806 /* Sign of each element will be negative for non-real atoms.
807 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
808 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
810 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
811 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
812 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
813 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
814 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
815 j_coord_offsetA = DIM*jnrA;
816 j_coord_offsetB = DIM*jnrB;
817 j_coord_offsetC = DIM*jnrC;
818 j_coord_offsetD = DIM*jnrD;
820 /* load j atom coordinates */
821 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
822 x+j_coord_offsetC,x+j_coord_offsetD,
823 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
825 /* Calculate displacement vector */
826 dx00 = _mm_sub_ps(ix0,jx0);
827 dy00 = _mm_sub_ps(iy0,jy0);
828 dz00 = _mm_sub_ps(iz0,jz0);
829 dx01 = _mm_sub_ps(ix0,jx1);
830 dy01 = _mm_sub_ps(iy0,jy1);
831 dz01 = _mm_sub_ps(iz0,jz1);
832 dx02 = _mm_sub_ps(ix0,jx2);
833 dy02 = _mm_sub_ps(iy0,jy2);
834 dz02 = _mm_sub_ps(iz0,jz2);
835 dx10 = _mm_sub_ps(ix1,jx0);
836 dy10 = _mm_sub_ps(iy1,jy0);
837 dz10 = _mm_sub_ps(iz1,jz0);
838 dx11 = _mm_sub_ps(ix1,jx1);
839 dy11 = _mm_sub_ps(iy1,jy1);
840 dz11 = _mm_sub_ps(iz1,jz1);
841 dx12 = _mm_sub_ps(ix1,jx2);
842 dy12 = _mm_sub_ps(iy1,jy2);
843 dz12 = _mm_sub_ps(iz1,jz2);
844 dx20 = _mm_sub_ps(ix2,jx0);
845 dy20 = _mm_sub_ps(iy2,jy0);
846 dz20 = _mm_sub_ps(iz2,jz0);
847 dx21 = _mm_sub_ps(ix2,jx1);
848 dy21 = _mm_sub_ps(iy2,jy1);
849 dz21 = _mm_sub_ps(iz2,jz1);
850 dx22 = _mm_sub_ps(ix2,jx2);
851 dy22 = _mm_sub_ps(iy2,jy2);
852 dz22 = _mm_sub_ps(iz2,jz2);
854 /* Calculate squared distance and things based on it */
855 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
856 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
857 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
858 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
859 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
860 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
861 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
862 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
863 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
865 rinv00 = gmx_mm_invsqrt_ps(rsq00);
866 rinv01 = gmx_mm_invsqrt_ps(rsq01);
867 rinv02 = gmx_mm_invsqrt_ps(rsq02);
868 rinv10 = gmx_mm_invsqrt_ps(rsq10);
869 rinv11 = gmx_mm_invsqrt_ps(rsq11);
870 rinv12 = gmx_mm_invsqrt_ps(rsq12);
871 rinv20 = gmx_mm_invsqrt_ps(rsq20);
872 rinv21 = gmx_mm_invsqrt_ps(rsq21);
873 rinv22 = gmx_mm_invsqrt_ps(rsq22);
875 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
876 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
877 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
878 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
879 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
880 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
881 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
882 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
883 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
885 fjx0 = _mm_setzero_ps();
886 fjy0 = _mm_setzero_ps();
887 fjz0 = _mm_setzero_ps();
888 fjx1 = _mm_setzero_ps();
889 fjy1 = _mm_setzero_ps();
890 fjz1 = _mm_setzero_ps();
891 fjx2 = _mm_setzero_ps();
892 fjy2 = _mm_setzero_ps();
893 fjz2 = _mm_setzero_ps();
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 if (gmx_mm_any_lt(rsq00,rcutoff2))
902 r00 = _mm_mul_ps(rsq00,rinv00);
903 r00 = _mm_andnot_ps(dummy_mask,r00);
905 /* EWALD ELECTROSTATICS */
907 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
908 ewrt = _mm_mul_ps(r00,ewtabscale);
909 ewitab = _mm_cvttps_epi32(ewrt);
910 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
911 ewitab = _mm_slli_epi32(ewitab,2);
912 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
913 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
914 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
915 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
916 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
917 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
918 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
919 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
920 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
922 /* LENNARD-JONES DISPERSION/REPULSION */
924 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
925 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
926 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
927 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
928 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
929 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
931 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
933 /* Update potential sum for this i atom from the interaction with this j atom. */
934 velec = _mm_and_ps(velec,cutoff_mask);
935 velec = _mm_andnot_ps(dummy_mask,velec);
936 velecsum = _mm_add_ps(velecsum,velec);
937 vvdw = _mm_and_ps(vvdw,cutoff_mask);
938 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
939 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
941 fscal = _mm_add_ps(felec,fvdw);
943 fscal = _mm_and_ps(fscal,cutoff_mask);
945 fscal = _mm_andnot_ps(dummy_mask,fscal);
947 /* Calculate temporary vectorial force */
948 tx = _mm_mul_ps(fscal,dx00);
949 ty = _mm_mul_ps(fscal,dy00);
950 tz = _mm_mul_ps(fscal,dz00);
952 /* Update vectorial force */
953 fix0 = _mm_add_ps(fix0,tx);
954 fiy0 = _mm_add_ps(fiy0,ty);
955 fiz0 = _mm_add_ps(fiz0,tz);
957 fjx0 = _mm_add_ps(fjx0,tx);
958 fjy0 = _mm_add_ps(fjy0,ty);
959 fjz0 = _mm_add_ps(fjz0,tz);
963 /**************************
964 * CALCULATE INTERACTIONS *
965 **************************/
967 if (gmx_mm_any_lt(rsq01,rcutoff2))
970 r01 = _mm_mul_ps(rsq01,rinv01);
971 r01 = _mm_andnot_ps(dummy_mask,r01);
973 /* EWALD ELECTROSTATICS */
975 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
976 ewrt = _mm_mul_ps(r01,ewtabscale);
977 ewitab = _mm_cvttps_epi32(ewrt);
978 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
979 ewitab = _mm_slli_epi32(ewitab,2);
980 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
981 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
982 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
983 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
984 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
985 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
986 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
987 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
988 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
990 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
992 /* Update potential sum for this i atom from the interaction with this j atom. */
993 velec = _mm_and_ps(velec,cutoff_mask);
994 velec = _mm_andnot_ps(dummy_mask,velec);
995 velecsum = _mm_add_ps(velecsum,velec);
999 fscal = _mm_and_ps(fscal,cutoff_mask);
1001 fscal = _mm_andnot_ps(dummy_mask,fscal);
1003 /* Calculate temporary vectorial force */
1004 tx = _mm_mul_ps(fscal,dx01);
1005 ty = _mm_mul_ps(fscal,dy01);
1006 tz = _mm_mul_ps(fscal,dz01);
1008 /* Update vectorial force */
1009 fix0 = _mm_add_ps(fix0,tx);
1010 fiy0 = _mm_add_ps(fiy0,ty);
1011 fiz0 = _mm_add_ps(fiz0,tz);
1013 fjx1 = _mm_add_ps(fjx1,tx);
1014 fjy1 = _mm_add_ps(fjy1,ty);
1015 fjz1 = _mm_add_ps(fjz1,tz);
1019 /**************************
1020 * CALCULATE INTERACTIONS *
1021 **************************/
1023 if (gmx_mm_any_lt(rsq02,rcutoff2))
1026 r02 = _mm_mul_ps(rsq02,rinv02);
1027 r02 = _mm_andnot_ps(dummy_mask,r02);
1029 /* EWALD ELECTROSTATICS */
1031 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1032 ewrt = _mm_mul_ps(r02,ewtabscale);
1033 ewitab = _mm_cvttps_epi32(ewrt);
1034 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1035 ewitab = _mm_slli_epi32(ewitab,2);
1036 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1037 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1038 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1039 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1040 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1041 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1042 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1043 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
1044 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1046 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1048 /* Update potential sum for this i atom from the interaction with this j atom. */
1049 velec = _mm_and_ps(velec,cutoff_mask);
1050 velec = _mm_andnot_ps(dummy_mask,velec);
1051 velecsum = _mm_add_ps(velecsum,velec);
1055 fscal = _mm_and_ps(fscal,cutoff_mask);
1057 fscal = _mm_andnot_ps(dummy_mask,fscal);
1059 /* Calculate temporary vectorial force */
1060 tx = _mm_mul_ps(fscal,dx02);
1061 ty = _mm_mul_ps(fscal,dy02);
1062 tz = _mm_mul_ps(fscal,dz02);
1064 /* Update vectorial force */
1065 fix0 = _mm_add_ps(fix0,tx);
1066 fiy0 = _mm_add_ps(fiy0,ty);
1067 fiz0 = _mm_add_ps(fiz0,tz);
1069 fjx2 = _mm_add_ps(fjx2,tx);
1070 fjy2 = _mm_add_ps(fjy2,ty);
1071 fjz2 = _mm_add_ps(fjz2,tz);
1075 /**************************
1076 * CALCULATE INTERACTIONS *
1077 **************************/
1079 if (gmx_mm_any_lt(rsq10,rcutoff2))
1082 r10 = _mm_mul_ps(rsq10,rinv10);
1083 r10 = _mm_andnot_ps(dummy_mask,r10);
1085 /* EWALD ELECTROSTATICS */
1087 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1088 ewrt = _mm_mul_ps(r10,ewtabscale);
1089 ewitab = _mm_cvttps_epi32(ewrt);
1090 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1091 ewitab = _mm_slli_epi32(ewitab,2);
1092 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1093 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1094 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1095 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1096 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1097 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1098 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1099 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1100 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1102 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1104 /* Update potential sum for this i atom from the interaction with this j atom. */
1105 velec = _mm_and_ps(velec,cutoff_mask);
1106 velec = _mm_andnot_ps(dummy_mask,velec);
1107 velecsum = _mm_add_ps(velecsum,velec);
1111 fscal = _mm_and_ps(fscal,cutoff_mask);
1113 fscal = _mm_andnot_ps(dummy_mask,fscal);
1115 /* Calculate temporary vectorial force */
1116 tx = _mm_mul_ps(fscal,dx10);
1117 ty = _mm_mul_ps(fscal,dy10);
1118 tz = _mm_mul_ps(fscal,dz10);
1120 /* Update vectorial force */
1121 fix1 = _mm_add_ps(fix1,tx);
1122 fiy1 = _mm_add_ps(fiy1,ty);
1123 fiz1 = _mm_add_ps(fiz1,tz);
1125 fjx0 = _mm_add_ps(fjx0,tx);
1126 fjy0 = _mm_add_ps(fjy0,ty);
1127 fjz0 = _mm_add_ps(fjz0,tz);
1131 /**************************
1132 * CALCULATE INTERACTIONS *
1133 **************************/
1135 if (gmx_mm_any_lt(rsq11,rcutoff2))
1138 r11 = _mm_mul_ps(rsq11,rinv11);
1139 r11 = _mm_andnot_ps(dummy_mask,r11);
1141 /* EWALD ELECTROSTATICS */
1143 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1144 ewrt = _mm_mul_ps(r11,ewtabscale);
1145 ewitab = _mm_cvttps_epi32(ewrt);
1146 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1147 ewitab = _mm_slli_epi32(ewitab,2);
1148 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1149 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1150 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1151 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1152 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1153 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1154 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1155 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1156 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1158 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1160 /* Update potential sum for this i atom from the interaction with this j atom. */
1161 velec = _mm_and_ps(velec,cutoff_mask);
1162 velec = _mm_andnot_ps(dummy_mask,velec);
1163 velecsum = _mm_add_ps(velecsum,velec);
1167 fscal = _mm_and_ps(fscal,cutoff_mask);
1169 fscal = _mm_andnot_ps(dummy_mask,fscal);
1171 /* Calculate temporary vectorial force */
1172 tx = _mm_mul_ps(fscal,dx11);
1173 ty = _mm_mul_ps(fscal,dy11);
1174 tz = _mm_mul_ps(fscal,dz11);
1176 /* Update vectorial force */
1177 fix1 = _mm_add_ps(fix1,tx);
1178 fiy1 = _mm_add_ps(fiy1,ty);
1179 fiz1 = _mm_add_ps(fiz1,tz);
1181 fjx1 = _mm_add_ps(fjx1,tx);
1182 fjy1 = _mm_add_ps(fjy1,ty);
1183 fjz1 = _mm_add_ps(fjz1,tz);
1187 /**************************
1188 * CALCULATE INTERACTIONS *
1189 **************************/
1191 if (gmx_mm_any_lt(rsq12,rcutoff2))
1194 r12 = _mm_mul_ps(rsq12,rinv12);
1195 r12 = _mm_andnot_ps(dummy_mask,r12);
1197 /* EWALD ELECTROSTATICS */
1199 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1200 ewrt = _mm_mul_ps(r12,ewtabscale);
1201 ewitab = _mm_cvttps_epi32(ewrt);
1202 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1203 ewitab = _mm_slli_epi32(ewitab,2);
1204 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1205 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1206 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1207 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1208 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1209 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1210 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1211 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1212 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1214 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1216 /* Update potential sum for this i atom from the interaction with this j atom. */
1217 velec = _mm_and_ps(velec,cutoff_mask);
1218 velec = _mm_andnot_ps(dummy_mask,velec);
1219 velecsum = _mm_add_ps(velecsum,velec);
1223 fscal = _mm_and_ps(fscal,cutoff_mask);
1225 fscal = _mm_andnot_ps(dummy_mask,fscal);
1227 /* Calculate temporary vectorial force */
1228 tx = _mm_mul_ps(fscal,dx12);
1229 ty = _mm_mul_ps(fscal,dy12);
1230 tz = _mm_mul_ps(fscal,dz12);
1232 /* Update vectorial force */
1233 fix1 = _mm_add_ps(fix1,tx);
1234 fiy1 = _mm_add_ps(fiy1,ty);
1235 fiz1 = _mm_add_ps(fiz1,tz);
1237 fjx2 = _mm_add_ps(fjx2,tx);
1238 fjy2 = _mm_add_ps(fjy2,ty);
1239 fjz2 = _mm_add_ps(fjz2,tz);
1243 /**************************
1244 * CALCULATE INTERACTIONS *
1245 **************************/
1247 if (gmx_mm_any_lt(rsq20,rcutoff2))
1250 r20 = _mm_mul_ps(rsq20,rinv20);
1251 r20 = _mm_andnot_ps(dummy_mask,r20);
1253 /* EWALD ELECTROSTATICS */
1255 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1256 ewrt = _mm_mul_ps(r20,ewtabscale);
1257 ewitab = _mm_cvttps_epi32(ewrt);
1258 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1259 ewitab = _mm_slli_epi32(ewitab,2);
1260 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1261 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1262 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1263 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1264 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1265 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1266 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1267 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1268 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1270 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1272 /* Update potential sum for this i atom from the interaction with this j atom. */
1273 velec = _mm_and_ps(velec,cutoff_mask);
1274 velec = _mm_andnot_ps(dummy_mask,velec);
1275 velecsum = _mm_add_ps(velecsum,velec);
1279 fscal = _mm_and_ps(fscal,cutoff_mask);
1281 fscal = _mm_andnot_ps(dummy_mask,fscal);
1283 /* Calculate temporary vectorial force */
1284 tx = _mm_mul_ps(fscal,dx20);
1285 ty = _mm_mul_ps(fscal,dy20);
1286 tz = _mm_mul_ps(fscal,dz20);
1288 /* Update vectorial force */
1289 fix2 = _mm_add_ps(fix2,tx);
1290 fiy2 = _mm_add_ps(fiy2,ty);
1291 fiz2 = _mm_add_ps(fiz2,tz);
1293 fjx0 = _mm_add_ps(fjx0,tx);
1294 fjy0 = _mm_add_ps(fjy0,ty);
1295 fjz0 = _mm_add_ps(fjz0,tz);
1299 /**************************
1300 * CALCULATE INTERACTIONS *
1301 **************************/
1303 if (gmx_mm_any_lt(rsq21,rcutoff2))
1306 r21 = _mm_mul_ps(rsq21,rinv21);
1307 r21 = _mm_andnot_ps(dummy_mask,r21);
1309 /* EWALD ELECTROSTATICS */
1311 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1312 ewrt = _mm_mul_ps(r21,ewtabscale);
1313 ewitab = _mm_cvttps_epi32(ewrt);
1314 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1315 ewitab = _mm_slli_epi32(ewitab,2);
1316 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1317 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1318 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1319 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1320 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1321 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1322 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1323 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1324 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1326 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1328 /* Update potential sum for this i atom from the interaction with this j atom. */
1329 velec = _mm_and_ps(velec,cutoff_mask);
1330 velec = _mm_andnot_ps(dummy_mask,velec);
1331 velecsum = _mm_add_ps(velecsum,velec);
1335 fscal = _mm_and_ps(fscal,cutoff_mask);
1337 fscal = _mm_andnot_ps(dummy_mask,fscal);
1339 /* Calculate temporary vectorial force */
1340 tx = _mm_mul_ps(fscal,dx21);
1341 ty = _mm_mul_ps(fscal,dy21);
1342 tz = _mm_mul_ps(fscal,dz21);
1344 /* Update vectorial force */
1345 fix2 = _mm_add_ps(fix2,tx);
1346 fiy2 = _mm_add_ps(fiy2,ty);
1347 fiz2 = _mm_add_ps(fiz2,tz);
1349 fjx1 = _mm_add_ps(fjx1,tx);
1350 fjy1 = _mm_add_ps(fjy1,ty);
1351 fjz1 = _mm_add_ps(fjz1,tz);
1355 /**************************
1356 * CALCULATE INTERACTIONS *
1357 **************************/
1359 if (gmx_mm_any_lt(rsq22,rcutoff2))
1362 r22 = _mm_mul_ps(rsq22,rinv22);
1363 r22 = _mm_andnot_ps(dummy_mask,r22);
1365 /* EWALD ELECTROSTATICS */
1367 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1368 ewrt = _mm_mul_ps(r22,ewtabscale);
1369 ewitab = _mm_cvttps_epi32(ewrt);
1370 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1371 ewitab = _mm_slli_epi32(ewitab,2);
1372 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1373 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1374 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1375 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1376 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1377 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1378 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1379 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1380 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1382 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1384 /* Update potential sum for this i atom from the interaction with this j atom. */
1385 velec = _mm_and_ps(velec,cutoff_mask);
1386 velec = _mm_andnot_ps(dummy_mask,velec);
1387 velecsum = _mm_add_ps(velecsum,velec);
1391 fscal = _mm_and_ps(fscal,cutoff_mask);
1393 fscal = _mm_andnot_ps(dummy_mask,fscal);
1395 /* Calculate temporary vectorial force */
1396 tx = _mm_mul_ps(fscal,dx22);
1397 ty = _mm_mul_ps(fscal,dy22);
1398 tz = _mm_mul_ps(fscal,dz22);
1400 /* Update vectorial force */
1401 fix2 = _mm_add_ps(fix2,tx);
1402 fiy2 = _mm_add_ps(fiy2,ty);
1403 fiz2 = _mm_add_ps(fiz2,tz);
1405 fjx2 = _mm_add_ps(fjx2,tx);
1406 fjy2 = _mm_add_ps(fjy2,ty);
1407 fjz2 = _mm_add_ps(fjz2,tz);
1411 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1412 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1413 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1414 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1416 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1417 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1419 /* Inner loop uses 441 flops */
1422 /* End of innermost loop */
1424 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1425 f+i_coord_offset,fshift+i_shift_offset);
1428 /* Update potential energies */
1429 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1430 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1432 /* Increment number of inner iterations */
1433 inneriter += j_index_end - j_index_start;
1435 /* Outer loop uses 20 flops */
1438 /* Increment number of outer iterations */
1441 /* Update outer/inner flops */
1443 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
1446 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single
1447 * Electrostatics interaction: Ewald
1448 * VdW interaction: LennardJones
1449 * Geometry: Water3-Water3
1450 * Calculate force/pot: Force
1453 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single
1454 (t_nblist * gmx_restrict nlist,
1455 rvec * gmx_restrict xx,
1456 rvec * gmx_restrict ff,
1457 t_forcerec * gmx_restrict fr,
1458 t_mdatoms * gmx_restrict mdatoms,
1459 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1460 t_nrnb * gmx_restrict nrnb)
1462 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1463 * just 0 for non-waters.
1464 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1465 * jnr indices corresponding to data put in the four positions in the SIMD register.
1467 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1468 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1469 int jnrA,jnrB,jnrC,jnrD;
1470 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1471 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1472 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1473 real rcutoff_scalar;
1474 real *shiftvec,*fshift,*x,*f;
1475 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1476 real scratch[4*DIM];
1477 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1479 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1481 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1483 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1484 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1485 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1486 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1487 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1488 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1489 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1490 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1491 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1492 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1493 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1494 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1495 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1496 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1497 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1498 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1499 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1502 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1505 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1506 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1508 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1510 __m128 dummy_mask,cutoff_mask;
1511 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1512 __m128 one = _mm_set1_ps(1.0);
1513 __m128 two = _mm_set1_ps(2.0);
1519 jindex = nlist->jindex;
1521 shiftidx = nlist->shift;
1523 shiftvec = fr->shift_vec[0];
1524 fshift = fr->fshift[0];
1525 facel = _mm_set1_ps(fr->epsfac);
1526 charge = mdatoms->chargeA;
1527 nvdwtype = fr->ntype;
1528 vdwparam = fr->nbfp;
1529 vdwtype = mdatoms->typeA;
1531 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1532 ewtab = fr->ic->tabq_coul_F;
1533 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1534 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1536 /* Setup water-specific parameters */
1537 inr = nlist->iinr[0];
1538 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1539 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1540 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1541 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1543 jq0 = _mm_set1_ps(charge[inr+0]);
1544 jq1 = _mm_set1_ps(charge[inr+1]);
1545 jq2 = _mm_set1_ps(charge[inr+2]);
1546 vdwjidx0A = 2*vdwtype[inr+0];
1547 qq00 = _mm_mul_ps(iq0,jq0);
1548 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1549 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1550 qq01 = _mm_mul_ps(iq0,jq1);
1551 qq02 = _mm_mul_ps(iq0,jq2);
1552 qq10 = _mm_mul_ps(iq1,jq0);
1553 qq11 = _mm_mul_ps(iq1,jq1);
1554 qq12 = _mm_mul_ps(iq1,jq2);
1555 qq20 = _mm_mul_ps(iq2,jq0);
1556 qq21 = _mm_mul_ps(iq2,jq1);
1557 qq22 = _mm_mul_ps(iq2,jq2);
1559 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1560 rcutoff_scalar = fr->rcoulomb;
1561 rcutoff = _mm_set1_ps(rcutoff_scalar);
1562 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1564 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1565 rvdw = _mm_set1_ps(fr->rvdw);
1567 /* Avoid stupid compiler warnings */
1568 jnrA = jnrB = jnrC = jnrD = 0;
1569 j_coord_offsetA = 0;
1570 j_coord_offsetB = 0;
1571 j_coord_offsetC = 0;
1572 j_coord_offsetD = 0;
1577 for(iidx=0;iidx<4*DIM;iidx++)
1579 scratch[iidx] = 0.0;
1582 /* Start outer loop over neighborlists */
1583 for(iidx=0; iidx<nri; iidx++)
1585 /* Load shift vector for this list */
1586 i_shift_offset = DIM*shiftidx[iidx];
1588 /* Load limits for loop over neighbors */
1589 j_index_start = jindex[iidx];
1590 j_index_end = jindex[iidx+1];
1592 /* Get outer coordinate index */
1594 i_coord_offset = DIM*inr;
1596 /* Load i particle coords and add shift vector */
1597 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1598 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1600 fix0 = _mm_setzero_ps();
1601 fiy0 = _mm_setzero_ps();
1602 fiz0 = _mm_setzero_ps();
1603 fix1 = _mm_setzero_ps();
1604 fiy1 = _mm_setzero_ps();
1605 fiz1 = _mm_setzero_ps();
1606 fix2 = _mm_setzero_ps();
1607 fiy2 = _mm_setzero_ps();
1608 fiz2 = _mm_setzero_ps();
1610 /* Start inner kernel loop */
1611 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1614 /* Get j neighbor index, and coordinate index */
1616 jnrB = jjnr[jidx+1];
1617 jnrC = jjnr[jidx+2];
1618 jnrD = jjnr[jidx+3];
1619 j_coord_offsetA = DIM*jnrA;
1620 j_coord_offsetB = DIM*jnrB;
1621 j_coord_offsetC = DIM*jnrC;
1622 j_coord_offsetD = DIM*jnrD;
1624 /* load j atom coordinates */
1625 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1626 x+j_coord_offsetC,x+j_coord_offsetD,
1627 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1629 /* Calculate displacement vector */
1630 dx00 = _mm_sub_ps(ix0,jx0);
1631 dy00 = _mm_sub_ps(iy0,jy0);
1632 dz00 = _mm_sub_ps(iz0,jz0);
1633 dx01 = _mm_sub_ps(ix0,jx1);
1634 dy01 = _mm_sub_ps(iy0,jy1);
1635 dz01 = _mm_sub_ps(iz0,jz1);
1636 dx02 = _mm_sub_ps(ix0,jx2);
1637 dy02 = _mm_sub_ps(iy0,jy2);
1638 dz02 = _mm_sub_ps(iz0,jz2);
1639 dx10 = _mm_sub_ps(ix1,jx0);
1640 dy10 = _mm_sub_ps(iy1,jy0);
1641 dz10 = _mm_sub_ps(iz1,jz0);
1642 dx11 = _mm_sub_ps(ix1,jx1);
1643 dy11 = _mm_sub_ps(iy1,jy1);
1644 dz11 = _mm_sub_ps(iz1,jz1);
1645 dx12 = _mm_sub_ps(ix1,jx2);
1646 dy12 = _mm_sub_ps(iy1,jy2);
1647 dz12 = _mm_sub_ps(iz1,jz2);
1648 dx20 = _mm_sub_ps(ix2,jx0);
1649 dy20 = _mm_sub_ps(iy2,jy0);
1650 dz20 = _mm_sub_ps(iz2,jz0);
1651 dx21 = _mm_sub_ps(ix2,jx1);
1652 dy21 = _mm_sub_ps(iy2,jy1);
1653 dz21 = _mm_sub_ps(iz2,jz1);
1654 dx22 = _mm_sub_ps(ix2,jx2);
1655 dy22 = _mm_sub_ps(iy2,jy2);
1656 dz22 = _mm_sub_ps(iz2,jz2);
1658 /* Calculate squared distance and things based on it */
1659 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1660 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1661 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1662 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1663 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1664 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1665 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1666 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1667 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1669 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1670 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1671 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1672 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1673 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1674 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1675 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1676 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1677 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1679 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1680 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1681 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1682 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1683 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1684 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1685 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1686 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1687 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1689 fjx0 = _mm_setzero_ps();
1690 fjy0 = _mm_setzero_ps();
1691 fjz0 = _mm_setzero_ps();
1692 fjx1 = _mm_setzero_ps();
1693 fjy1 = _mm_setzero_ps();
1694 fjz1 = _mm_setzero_ps();
1695 fjx2 = _mm_setzero_ps();
1696 fjy2 = _mm_setzero_ps();
1697 fjz2 = _mm_setzero_ps();
1699 /**************************
1700 * CALCULATE INTERACTIONS *
1701 **************************/
1703 if (gmx_mm_any_lt(rsq00,rcutoff2))
1706 r00 = _mm_mul_ps(rsq00,rinv00);
1708 /* EWALD ELECTROSTATICS */
1710 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1711 ewrt = _mm_mul_ps(r00,ewtabscale);
1712 ewitab = _mm_cvttps_epi32(ewrt);
1713 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1714 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1715 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1717 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1718 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1720 /* LENNARD-JONES DISPERSION/REPULSION */
1722 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1723 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1725 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1727 fscal = _mm_add_ps(felec,fvdw);
1729 fscal = _mm_and_ps(fscal,cutoff_mask);
1731 /* Calculate temporary vectorial force */
1732 tx = _mm_mul_ps(fscal,dx00);
1733 ty = _mm_mul_ps(fscal,dy00);
1734 tz = _mm_mul_ps(fscal,dz00);
1736 /* Update vectorial force */
1737 fix0 = _mm_add_ps(fix0,tx);
1738 fiy0 = _mm_add_ps(fiy0,ty);
1739 fiz0 = _mm_add_ps(fiz0,tz);
1741 fjx0 = _mm_add_ps(fjx0,tx);
1742 fjy0 = _mm_add_ps(fjy0,ty);
1743 fjz0 = _mm_add_ps(fjz0,tz);
1747 /**************************
1748 * CALCULATE INTERACTIONS *
1749 **************************/
1751 if (gmx_mm_any_lt(rsq01,rcutoff2))
1754 r01 = _mm_mul_ps(rsq01,rinv01);
1756 /* EWALD ELECTROSTATICS */
1758 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1759 ewrt = _mm_mul_ps(r01,ewtabscale);
1760 ewitab = _mm_cvttps_epi32(ewrt);
1761 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1762 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1763 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1765 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1766 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1768 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1772 fscal = _mm_and_ps(fscal,cutoff_mask);
1774 /* Calculate temporary vectorial force */
1775 tx = _mm_mul_ps(fscal,dx01);
1776 ty = _mm_mul_ps(fscal,dy01);
1777 tz = _mm_mul_ps(fscal,dz01);
1779 /* Update vectorial force */
1780 fix0 = _mm_add_ps(fix0,tx);
1781 fiy0 = _mm_add_ps(fiy0,ty);
1782 fiz0 = _mm_add_ps(fiz0,tz);
1784 fjx1 = _mm_add_ps(fjx1,tx);
1785 fjy1 = _mm_add_ps(fjy1,ty);
1786 fjz1 = _mm_add_ps(fjz1,tz);
1790 /**************************
1791 * CALCULATE INTERACTIONS *
1792 **************************/
1794 if (gmx_mm_any_lt(rsq02,rcutoff2))
1797 r02 = _mm_mul_ps(rsq02,rinv02);
1799 /* EWALD ELECTROSTATICS */
1801 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1802 ewrt = _mm_mul_ps(r02,ewtabscale);
1803 ewitab = _mm_cvttps_epi32(ewrt);
1804 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1805 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1806 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1808 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1809 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1811 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1815 fscal = _mm_and_ps(fscal,cutoff_mask);
1817 /* Calculate temporary vectorial force */
1818 tx = _mm_mul_ps(fscal,dx02);
1819 ty = _mm_mul_ps(fscal,dy02);
1820 tz = _mm_mul_ps(fscal,dz02);
1822 /* Update vectorial force */
1823 fix0 = _mm_add_ps(fix0,tx);
1824 fiy0 = _mm_add_ps(fiy0,ty);
1825 fiz0 = _mm_add_ps(fiz0,tz);
1827 fjx2 = _mm_add_ps(fjx2,tx);
1828 fjy2 = _mm_add_ps(fjy2,ty);
1829 fjz2 = _mm_add_ps(fjz2,tz);
1833 /**************************
1834 * CALCULATE INTERACTIONS *
1835 **************************/
1837 if (gmx_mm_any_lt(rsq10,rcutoff2))
1840 r10 = _mm_mul_ps(rsq10,rinv10);
1842 /* EWALD ELECTROSTATICS */
1844 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1845 ewrt = _mm_mul_ps(r10,ewtabscale);
1846 ewitab = _mm_cvttps_epi32(ewrt);
1847 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1848 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1849 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1851 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1852 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1854 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1858 fscal = _mm_and_ps(fscal,cutoff_mask);
1860 /* Calculate temporary vectorial force */
1861 tx = _mm_mul_ps(fscal,dx10);
1862 ty = _mm_mul_ps(fscal,dy10);
1863 tz = _mm_mul_ps(fscal,dz10);
1865 /* Update vectorial force */
1866 fix1 = _mm_add_ps(fix1,tx);
1867 fiy1 = _mm_add_ps(fiy1,ty);
1868 fiz1 = _mm_add_ps(fiz1,tz);
1870 fjx0 = _mm_add_ps(fjx0,tx);
1871 fjy0 = _mm_add_ps(fjy0,ty);
1872 fjz0 = _mm_add_ps(fjz0,tz);
1876 /**************************
1877 * CALCULATE INTERACTIONS *
1878 **************************/
1880 if (gmx_mm_any_lt(rsq11,rcutoff2))
1883 r11 = _mm_mul_ps(rsq11,rinv11);
1885 /* EWALD ELECTROSTATICS */
1887 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1888 ewrt = _mm_mul_ps(r11,ewtabscale);
1889 ewitab = _mm_cvttps_epi32(ewrt);
1890 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1891 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1892 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1894 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1895 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1897 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1901 fscal = _mm_and_ps(fscal,cutoff_mask);
1903 /* Calculate temporary vectorial force */
1904 tx = _mm_mul_ps(fscal,dx11);
1905 ty = _mm_mul_ps(fscal,dy11);
1906 tz = _mm_mul_ps(fscal,dz11);
1908 /* Update vectorial force */
1909 fix1 = _mm_add_ps(fix1,tx);
1910 fiy1 = _mm_add_ps(fiy1,ty);
1911 fiz1 = _mm_add_ps(fiz1,tz);
1913 fjx1 = _mm_add_ps(fjx1,tx);
1914 fjy1 = _mm_add_ps(fjy1,ty);
1915 fjz1 = _mm_add_ps(fjz1,tz);
1919 /**************************
1920 * CALCULATE INTERACTIONS *
1921 **************************/
1923 if (gmx_mm_any_lt(rsq12,rcutoff2))
1926 r12 = _mm_mul_ps(rsq12,rinv12);
1928 /* EWALD ELECTROSTATICS */
1930 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1931 ewrt = _mm_mul_ps(r12,ewtabscale);
1932 ewitab = _mm_cvttps_epi32(ewrt);
1933 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1934 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1935 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1937 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1938 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1940 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1944 fscal = _mm_and_ps(fscal,cutoff_mask);
1946 /* Calculate temporary vectorial force */
1947 tx = _mm_mul_ps(fscal,dx12);
1948 ty = _mm_mul_ps(fscal,dy12);
1949 tz = _mm_mul_ps(fscal,dz12);
1951 /* Update vectorial force */
1952 fix1 = _mm_add_ps(fix1,tx);
1953 fiy1 = _mm_add_ps(fiy1,ty);
1954 fiz1 = _mm_add_ps(fiz1,tz);
1956 fjx2 = _mm_add_ps(fjx2,tx);
1957 fjy2 = _mm_add_ps(fjy2,ty);
1958 fjz2 = _mm_add_ps(fjz2,tz);
1962 /**************************
1963 * CALCULATE INTERACTIONS *
1964 **************************/
1966 if (gmx_mm_any_lt(rsq20,rcutoff2))
1969 r20 = _mm_mul_ps(rsq20,rinv20);
1971 /* EWALD ELECTROSTATICS */
1973 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1974 ewrt = _mm_mul_ps(r20,ewtabscale);
1975 ewitab = _mm_cvttps_epi32(ewrt);
1976 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1977 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1978 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1980 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1981 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1983 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1987 fscal = _mm_and_ps(fscal,cutoff_mask);
1989 /* Calculate temporary vectorial force */
1990 tx = _mm_mul_ps(fscal,dx20);
1991 ty = _mm_mul_ps(fscal,dy20);
1992 tz = _mm_mul_ps(fscal,dz20);
1994 /* Update vectorial force */
1995 fix2 = _mm_add_ps(fix2,tx);
1996 fiy2 = _mm_add_ps(fiy2,ty);
1997 fiz2 = _mm_add_ps(fiz2,tz);
1999 fjx0 = _mm_add_ps(fjx0,tx);
2000 fjy0 = _mm_add_ps(fjy0,ty);
2001 fjz0 = _mm_add_ps(fjz0,tz);
2005 /**************************
2006 * CALCULATE INTERACTIONS *
2007 **************************/
2009 if (gmx_mm_any_lt(rsq21,rcutoff2))
2012 r21 = _mm_mul_ps(rsq21,rinv21);
2014 /* EWALD ELECTROSTATICS */
2016 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2017 ewrt = _mm_mul_ps(r21,ewtabscale);
2018 ewitab = _mm_cvttps_epi32(ewrt);
2019 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2020 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2021 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2023 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2024 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2026 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2030 fscal = _mm_and_ps(fscal,cutoff_mask);
2032 /* Calculate temporary vectorial force */
2033 tx = _mm_mul_ps(fscal,dx21);
2034 ty = _mm_mul_ps(fscal,dy21);
2035 tz = _mm_mul_ps(fscal,dz21);
2037 /* Update vectorial force */
2038 fix2 = _mm_add_ps(fix2,tx);
2039 fiy2 = _mm_add_ps(fiy2,ty);
2040 fiz2 = _mm_add_ps(fiz2,tz);
2042 fjx1 = _mm_add_ps(fjx1,tx);
2043 fjy1 = _mm_add_ps(fjy1,ty);
2044 fjz1 = _mm_add_ps(fjz1,tz);
2048 /**************************
2049 * CALCULATE INTERACTIONS *
2050 **************************/
2052 if (gmx_mm_any_lt(rsq22,rcutoff2))
2055 r22 = _mm_mul_ps(rsq22,rinv22);
2057 /* EWALD ELECTROSTATICS */
2059 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2060 ewrt = _mm_mul_ps(r22,ewtabscale);
2061 ewitab = _mm_cvttps_epi32(ewrt);
2062 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2063 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2064 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2066 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2067 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2069 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2073 fscal = _mm_and_ps(fscal,cutoff_mask);
2075 /* Calculate temporary vectorial force */
2076 tx = _mm_mul_ps(fscal,dx22);
2077 ty = _mm_mul_ps(fscal,dy22);
2078 tz = _mm_mul_ps(fscal,dz22);
2080 /* Update vectorial force */
2081 fix2 = _mm_add_ps(fix2,tx);
2082 fiy2 = _mm_add_ps(fiy2,ty);
2083 fiz2 = _mm_add_ps(fiz2,tz);
2085 fjx2 = _mm_add_ps(fjx2,tx);
2086 fjy2 = _mm_add_ps(fjy2,ty);
2087 fjz2 = _mm_add_ps(fjz2,tz);
2091 fjptrA = f+j_coord_offsetA;
2092 fjptrB = f+j_coord_offsetB;
2093 fjptrC = f+j_coord_offsetC;
2094 fjptrD = f+j_coord_offsetD;
2096 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2097 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2099 /* Inner loop uses 358 flops */
2102 if(jidx<j_index_end)
2105 /* Get j neighbor index, and coordinate index */
2106 jnrlistA = jjnr[jidx];
2107 jnrlistB = jjnr[jidx+1];
2108 jnrlistC = jjnr[jidx+2];
2109 jnrlistD = jjnr[jidx+3];
2110 /* Sign of each element will be negative for non-real atoms.
2111 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2112 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2114 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2115 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2116 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2117 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2118 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2119 j_coord_offsetA = DIM*jnrA;
2120 j_coord_offsetB = DIM*jnrB;
2121 j_coord_offsetC = DIM*jnrC;
2122 j_coord_offsetD = DIM*jnrD;
2124 /* load j atom coordinates */
2125 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2126 x+j_coord_offsetC,x+j_coord_offsetD,
2127 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2129 /* Calculate displacement vector */
2130 dx00 = _mm_sub_ps(ix0,jx0);
2131 dy00 = _mm_sub_ps(iy0,jy0);
2132 dz00 = _mm_sub_ps(iz0,jz0);
2133 dx01 = _mm_sub_ps(ix0,jx1);
2134 dy01 = _mm_sub_ps(iy0,jy1);
2135 dz01 = _mm_sub_ps(iz0,jz1);
2136 dx02 = _mm_sub_ps(ix0,jx2);
2137 dy02 = _mm_sub_ps(iy0,jy2);
2138 dz02 = _mm_sub_ps(iz0,jz2);
2139 dx10 = _mm_sub_ps(ix1,jx0);
2140 dy10 = _mm_sub_ps(iy1,jy0);
2141 dz10 = _mm_sub_ps(iz1,jz0);
2142 dx11 = _mm_sub_ps(ix1,jx1);
2143 dy11 = _mm_sub_ps(iy1,jy1);
2144 dz11 = _mm_sub_ps(iz1,jz1);
2145 dx12 = _mm_sub_ps(ix1,jx2);
2146 dy12 = _mm_sub_ps(iy1,jy2);
2147 dz12 = _mm_sub_ps(iz1,jz2);
2148 dx20 = _mm_sub_ps(ix2,jx0);
2149 dy20 = _mm_sub_ps(iy2,jy0);
2150 dz20 = _mm_sub_ps(iz2,jz0);
2151 dx21 = _mm_sub_ps(ix2,jx1);
2152 dy21 = _mm_sub_ps(iy2,jy1);
2153 dz21 = _mm_sub_ps(iz2,jz1);
2154 dx22 = _mm_sub_ps(ix2,jx2);
2155 dy22 = _mm_sub_ps(iy2,jy2);
2156 dz22 = _mm_sub_ps(iz2,jz2);
2158 /* Calculate squared distance and things based on it */
2159 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2160 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2161 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2162 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2163 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2164 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2165 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2166 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2167 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2169 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2170 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2171 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2172 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2173 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2174 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2175 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2176 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2177 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2179 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2180 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2181 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2182 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2183 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2184 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2185 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2186 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2187 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2189 fjx0 = _mm_setzero_ps();
2190 fjy0 = _mm_setzero_ps();
2191 fjz0 = _mm_setzero_ps();
2192 fjx1 = _mm_setzero_ps();
2193 fjy1 = _mm_setzero_ps();
2194 fjz1 = _mm_setzero_ps();
2195 fjx2 = _mm_setzero_ps();
2196 fjy2 = _mm_setzero_ps();
2197 fjz2 = _mm_setzero_ps();
2199 /**************************
2200 * CALCULATE INTERACTIONS *
2201 **************************/
2203 if (gmx_mm_any_lt(rsq00,rcutoff2))
2206 r00 = _mm_mul_ps(rsq00,rinv00);
2207 r00 = _mm_andnot_ps(dummy_mask,r00);
2209 /* EWALD ELECTROSTATICS */
2211 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2212 ewrt = _mm_mul_ps(r00,ewtabscale);
2213 ewitab = _mm_cvttps_epi32(ewrt);
2214 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2215 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2216 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2218 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2219 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2221 /* LENNARD-JONES DISPERSION/REPULSION */
2223 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2224 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2226 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2228 fscal = _mm_add_ps(felec,fvdw);
2230 fscal = _mm_and_ps(fscal,cutoff_mask);
2232 fscal = _mm_andnot_ps(dummy_mask,fscal);
2234 /* Calculate temporary vectorial force */
2235 tx = _mm_mul_ps(fscal,dx00);
2236 ty = _mm_mul_ps(fscal,dy00);
2237 tz = _mm_mul_ps(fscal,dz00);
2239 /* Update vectorial force */
2240 fix0 = _mm_add_ps(fix0,tx);
2241 fiy0 = _mm_add_ps(fiy0,ty);
2242 fiz0 = _mm_add_ps(fiz0,tz);
2244 fjx0 = _mm_add_ps(fjx0,tx);
2245 fjy0 = _mm_add_ps(fjy0,ty);
2246 fjz0 = _mm_add_ps(fjz0,tz);
2250 /**************************
2251 * CALCULATE INTERACTIONS *
2252 **************************/
2254 if (gmx_mm_any_lt(rsq01,rcutoff2))
2257 r01 = _mm_mul_ps(rsq01,rinv01);
2258 r01 = _mm_andnot_ps(dummy_mask,r01);
2260 /* EWALD ELECTROSTATICS */
2262 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2263 ewrt = _mm_mul_ps(r01,ewtabscale);
2264 ewitab = _mm_cvttps_epi32(ewrt);
2265 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2266 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2267 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2269 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2270 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2272 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2276 fscal = _mm_and_ps(fscal,cutoff_mask);
2278 fscal = _mm_andnot_ps(dummy_mask,fscal);
2280 /* Calculate temporary vectorial force */
2281 tx = _mm_mul_ps(fscal,dx01);
2282 ty = _mm_mul_ps(fscal,dy01);
2283 tz = _mm_mul_ps(fscal,dz01);
2285 /* Update vectorial force */
2286 fix0 = _mm_add_ps(fix0,tx);
2287 fiy0 = _mm_add_ps(fiy0,ty);
2288 fiz0 = _mm_add_ps(fiz0,tz);
2290 fjx1 = _mm_add_ps(fjx1,tx);
2291 fjy1 = _mm_add_ps(fjy1,ty);
2292 fjz1 = _mm_add_ps(fjz1,tz);
2296 /**************************
2297 * CALCULATE INTERACTIONS *
2298 **************************/
2300 if (gmx_mm_any_lt(rsq02,rcutoff2))
2303 r02 = _mm_mul_ps(rsq02,rinv02);
2304 r02 = _mm_andnot_ps(dummy_mask,r02);
2306 /* EWALD ELECTROSTATICS */
2308 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2309 ewrt = _mm_mul_ps(r02,ewtabscale);
2310 ewitab = _mm_cvttps_epi32(ewrt);
2311 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2312 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2313 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2315 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2316 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2318 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2322 fscal = _mm_and_ps(fscal,cutoff_mask);
2324 fscal = _mm_andnot_ps(dummy_mask,fscal);
2326 /* Calculate temporary vectorial force */
2327 tx = _mm_mul_ps(fscal,dx02);
2328 ty = _mm_mul_ps(fscal,dy02);
2329 tz = _mm_mul_ps(fscal,dz02);
2331 /* Update vectorial force */
2332 fix0 = _mm_add_ps(fix0,tx);
2333 fiy0 = _mm_add_ps(fiy0,ty);
2334 fiz0 = _mm_add_ps(fiz0,tz);
2336 fjx2 = _mm_add_ps(fjx2,tx);
2337 fjy2 = _mm_add_ps(fjy2,ty);
2338 fjz2 = _mm_add_ps(fjz2,tz);
2342 /**************************
2343 * CALCULATE INTERACTIONS *
2344 **************************/
2346 if (gmx_mm_any_lt(rsq10,rcutoff2))
2349 r10 = _mm_mul_ps(rsq10,rinv10);
2350 r10 = _mm_andnot_ps(dummy_mask,r10);
2352 /* EWALD ELECTROSTATICS */
2354 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2355 ewrt = _mm_mul_ps(r10,ewtabscale);
2356 ewitab = _mm_cvttps_epi32(ewrt);
2357 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2358 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2359 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2361 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2362 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2364 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2368 fscal = _mm_and_ps(fscal,cutoff_mask);
2370 fscal = _mm_andnot_ps(dummy_mask,fscal);
2372 /* Calculate temporary vectorial force */
2373 tx = _mm_mul_ps(fscal,dx10);
2374 ty = _mm_mul_ps(fscal,dy10);
2375 tz = _mm_mul_ps(fscal,dz10);
2377 /* Update vectorial force */
2378 fix1 = _mm_add_ps(fix1,tx);
2379 fiy1 = _mm_add_ps(fiy1,ty);
2380 fiz1 = _mm_add_ps(fiz1,tz);
2382 fjx0 = _mm_add_ps(fjx0,tx);
2383 fjy0 = _mm_add_ps(fjy0,ty);
2384 fjz0 = _mm_add_ps(fjz0,tz);
2388 /**************************
2389 * CALCULATE INTERACTIONS *
2390 **************************/
2392 if (gmx_mm_any_lt(rsq11,rcutoff2))
2395 r11 = _mm_mul_ps(rsq11,rinv11);
2396 r11 = _mm_andnot_ps(dummy_mask,r11);
2398 /* EWALD ELECTROSTATICS */
2400 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2401 ewrt = _mm_mul_ps(r11,ewtabscale);
2402 ewitab = _mm_cvttps_epi32(ewrt);
2403 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2404 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2405 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2407 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2408 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2410 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2414 fscal = _mm_and_ps(fscal,cutoff_mask);
2416 fscal = _mm_andnot_ps(dummy_mask,fscal);
2418 /* Calculate temporary vectorial force */
2419 tx = _mm_mul_ps(fscal,dx11);
2420 ty = _mm_mul_ps(fscal,dy11);
2421 tz = _mm_mul_ps(fscal,dz11);
2423 /* Update vectorial force */
2424 fix1 = _mm_add_ps(fix1,tx);
2425 fiy1 = _mm_add_ps(fiy1,ty);
2426 fiz1 = _mm_add_ps(fiz1,tz);
2428 fjx1 = _mm_add_ps(fjx1,tx);
2429 fjy1 = _mm_add_ps(fjy1,ty);
2430 fjz1 = _mm_add_ps(fjz1,tz);
2434 /**************************
2435 * CALCULATE INTERACTIONS *
2436 **************************/
2438 if (gmx_mm_any_lt(rsq12,rcutoff2))
2441 r12 = _mm_mul_ps(rsq12,rinv12);
2442 r12 = _mm_andnot_ps(dummy_mask,r12);
2444 /* EWALD ELECTROSTATICS */
2446 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2447 ewrt = _mm_mul_ps(r12,ewtabscale);
2448 ewitab = _mm_cvttps_epi32(ewrt);
2449 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2450 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2451 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2453 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2454 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2456 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2460 fscal = _mm_and_ps(fscal,cutoff_mask);
2462 fscal = _mm_andnot_ps(dummy_mask,fscal);
2464 /* Calculate temporary vectorial force */
2465 tx = _mm_mul_ps(fscal,dx12);
2466 ty = _mm_mul_ps(fscal,dy12);
2467 tz = _mm_mul_ps(fscal,dz12);
2469 /* Update vectorial force */
2470 fix1 = _mm_add_ps(fix1,tx);
2471 fiy1 = _mm_add_ps(fiy1,ty);
2472 fiz1 = _mm_add_ps(fiz1,tz);
2474 fjx2 = _mm_add_ps(fjx2,tx);
2475 fjy2 = _mm_add_ps(fjy2,ty);
2476 fjz2 = _mm_add_ps(fjz2,tz);
2480 /**************************
2481 * CALCULATE INTERACTIONS *
2482 **************************/
2484 if (gmx_mm_any_lt(rsq20,rcutoff2))
2487 r20 = _mm_mul_ps(rsq20,rinv20);
2488 r20 = _mm_andnot_ps(dummy_mask,r20);
2490 /* EWALD ELECTROSTATICS */
2492 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2493 ewrt = _mm_mul_ps(r20,ewtabscale);
2494 ewitab = _mm_cvttps_epi32(ewrt);
2495 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2496 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2497 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2499 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2500 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2502 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2506 fscal = _mm_and_ps(fscal,cutoff_mask);
2508 fscal = _mm_andnot_ps(dummy_mask,fscal);
2510 /* Calculate temporary vectorial force */
2511 tx = _mm_mul_ps(fscal,dx20);
2512 ty = _mm_mul_ps(fscal,dy20);
2513 tz = _mm_mul_ps(fscal,dz20);
2515 /* Update vectorial force */
2516 fix2 = _mm_add_ps(fix2,tx);
2517 fiy2 = _mm_add_ps(fiy2,ty);
2518 fiz2 = _mm_add_ps(fiz2,tz);
2520 fjx0 = _mm_add_ps(fjx0,tx);
2521 fjy0 = _mm_add_ps(fjy0,ty);
2522 fjz0 = _mm_add_ps(fjz0,tz);
2526 /**************************
2527 * CALCULATE INTERACTIONS *
2528 **************************/
2530 if (gmx_mm_any_lt(rsq21,rcutoff2))
2533 r21 = _mm_mul_ps(rsq21,rinv21);
2534 r21 = _mm_andnot_ps(dummy_mask,r21);
2536 /* EWALD ELECTROSTATICS */
2538 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2539 ewrt = _mm_mul_ps(r21,ewtabscale);
2540 ewitab = _mm_cvttps_epi32(ewrt);
2541 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2542 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2543 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2545 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2546 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2548 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2552 fscal = _mm_and_ps(fscal,cutoff_mask);
2554 fscal = _mm_andnot_ps(dummy_mask,fscal);
2556 /* Calculate temporary vectorial force */
2557 tx = _mm_mul_ps(fscal,dx21);
2558 ty = _mm_mul_ps(fscal,dy21);
2559 tz = _mm_mul_ps(fscal,dz21);
2561 /* Update vectorial force */
2562 fix2 = _mm_add_ps(fix2,tx);
2563 fiy2 = _mm_add_ps(fiy2,ty);
2564 fiz2 = _mm_add_ps(fiz2,tz);
2566 fjx1 = _mm_add_ps(fjx1,tx);
2567 fjy1 = _mm_add_ps(fjy1,ty);
2568 fjz1 = _mm_add_ps(fjz1,tz);
2572 /**************************
2573 * CALCULATE INTERACTIONS *
2574 **************************/
2576 if (gmx_mm_any_lt(rsq22,rcutoff2))
2579 r22 = _mm_mul_ps(rsq22,rinv22);
2580 r22 = _mm_andnot_ps(dummy_mask,r22);
2582 /* EWALD ELECTROSTATICS */
2584 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2585 ewrt = _mm_mul_ps(r22,ewtabscale);
2586 ewitab = _mm_cvttps_epi32(ewrt);
2587 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2588 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2589 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2591 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2592 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2594 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2598 fscal = _mm_and_ps(fscal,cutoff_mask);
2600 fscal = _mm_andnot_ps(dummy_mask,fscal);
2602 /* Calculate temporary vectorial force */
2603 tx = _mm_mul_ps(fscal,dx22);
2604 ty = _mm_mul_ps(fscal,dy22);
2605 tz = _mm_mul_ps(fscal,dz22);
2607 /* Update vectorial force */
2608 fix2 = _mm_add_ps(fix2,tx);
2609 fiy2 = _mm_add_ps(fiy2,ty);
2610 fiz2 = _mm_add_ps(fiz2,tz);
2612 fjx2 = _mm_add_ps(fjx2,tx);
2613 fjy2 = _mm_add_ps(fjy2,ty);
2614 fjz2 = _mm_add_ps(fjz2,tz);
2618 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2619 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2620 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2621 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2623 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2624 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2626 /* Inner loop uses 367 flops */
2629 /* End of innermost loop */
2631 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2632 f+i_coord_offset,fshift+i_shift_offset);
2634 /* Increment number of inner iterations */
2635 inneriter += j_index_end - j_index_start;
2637 /* Outer loop uses 18 flops */
2640 /* Increment number of outer iterations */
2643 /* Update outer/inner flops */
2645 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);