2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_sse2_single.h"
50 #include "kernelutil_x86_sse2_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: LennardJones
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
109 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
113 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
115 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
117 __m128 dummy_mask,cutoff_mask;
118 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
119 __m128 one = _mm_set1_ps(1.0);
120 __m128 two = _mm_set1_ps(2.0);
126 jindex = nlist->jindex;
128 shiftidx = nlist->shift;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm_set1_ps(fr->epsfac);
133 charge = mdatoms->chargeA;
134 nvdwtype = fr->ntype;
136 vdwtype = mdatoms->typeA;
138 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
139 ewtab = fr->ic->tabq_coul_FDV0;
140 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
141 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
143 /* Setup water-specific parameters */
144 inr = nlist->iinr[0];
145 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
146 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
147 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
148 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
150 jq0 = _mm_set1_ps(charge[inr+0]);
151 jq1 = _mm_set1_ps(charge[inr+1]);
152 jq2 = _mm_set1_ps(charge[inr+2]);
153 vdwjidx0A = 2*vdwtype[inr+0];
154 qq00 = _mm_mul_ps(iq0,jq0);
155 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
156 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
157 qq01 = _mm_mul_ps(iq0,jq1);
158 qq02 = _mm_mul_ps(iq0,jq2);
159 qq10 = _mm_mul_ps(iq1,jq0);
160 qq11 = _mm_mul_ps(iq1,jq1);
161 qq12 = _mm_mul_ps(iq1,jq2);
162 qq20 = _mm_mul_ps(iq2,jq0);
163 qq21 = _mm_mul_ps(iq2,jq1);
164 qq22 = _mm_mul_ps(iq2,jq2);
166 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
167 rcutoff_scalar = fr->rcoulomb;
168 rcutoff = _mm_set1_ps(rcutoff_scalar);
169 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
171 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
172 rvdw = _mm_set1_ps(fr->rvdw);
174 /* Avoid stupid compiler warnings */
175 jnrA = jnrB = jnrC = jnrD = 0;
184 for(iidx=0;iidx<4*DIM;iidx++)
189 /* Start outer loop over neighborlists */
190 for(iidx=0; iidx<nri; iidx++)
192 /* Load shift vector for this list */
193 i_shift_offset = DIM*shiftidx[iidx];
195 /* Load limits for loop over neighbors */
196 j_index_start = jindex[iidx];
197 j_index_end = jindex[iidx+1];
199 /* Get outer coordinate index */
201 i_coord_offset = DIM*inr;
203 /* Load i particle coords and add shift vector */
204 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
205 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
207 fix0 = _mm_setzero_ps();
208 fiy0 = _mm_setzero_ps();
209 fiz0 = _mm_setzero_ps();
210 fix1 = _mm_setzero_ps();
211 fiy1 = _mm_setzero_ps();
212 fiz1 = _mm_setzero_ps();
213 fix2 = _mm_setzero_ps();
214 fiy2 = _mm_setzero_ps();
215 fiz2 = _mm_setzero_ps();
217 /* Reset potential sums */
218 velecsum = _mm_setzero_ps();
219 vvdwsum = _mm_setzero_ps();
221 /* Start inner kernel loop */
222 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
225 /* Get j neighbor index, and coordinate index */
230 j_coord_offsetA = DIM*jnrA;
231 j_coord_offsetB = DIM*jnrB;
232 j_coord_offsetC = DIM*jnrC;
233 j_coord_offsetD = DIM*jnrD;
235 /* load j atom coordinates */
236 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
237 x+j_coord_offsetC,x+j_coord_offsetD,
238 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
240 /* Calculate displacement vector */
241 dx00 = _mm_sub_ps(ix0,jx0);
242 dy00 = _mm_sub_ps(iy0,jy0);
243 dz00 = _mm_sub_ps(iz0,jz0);
244 dx01 = _mm_sub_ps(ix0,jx1);
245 dy01 = _mm_sub_ps(iy0,jy1);
246 dz01 = _mm_sub_ps(iz0,jz1);
247 dx02 = _mm_sub_ps(ix0,jx2);
248 dy02 = _mm_sub_ps(iy0,jy2);
249 dz02 = _mm_sub_ps(iz0,jz2);
250 dx10 = _mm_sub_ps(ix1,jx0);
251 dy10 = _mm_sub_ps(iy1,jy0);
252 dz10 = _mm_sub_ps(iz1,jz0);
253 dx11 = _mm_sub_ps(ix1,jx1);
254 dy11 = _mm_sub_ps(iy1,jy1);
255 dz11 = _mm_sub_ps(iz1,jz1);
256 dx12 = _mm_sub_ps(ix1,jx2);
257 dy12 = _mm_sub_ps(iy1,jy2);
258 dz12 = _mm_sub_ps(iz1,jz2);
259 dx20 = _mm_sub_ps(ix2,jx0);
260 dy20 = _mm_sub_ps(iy2,jy0);
261 dz20 = _mm_sub_ps(iz2,jz0);
262 dx21 = _mm_sub_ps(ix2,jx1);
263 dy21 = _mm_sub_ps(iy2,jy1);
264 dz21 = _mm_sub_ps(iz2,jz1);
265 dx22 = _mm_sub_ps(ix2,jx2);
266 dy22 = _mm_sub_ps(iy2,jy2);
267 dz22 = _mm_sub_ps(iz2,jz2);
269 /* Calculate squared distance and things based on it */
270 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
271 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
272 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
273 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
274 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
275 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
276 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
277 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
278 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
280 rinv00 = gmx_mm_invsqrt_ps(rsq00);
281 rinv01 = gmx_mm_invsqrt_ps(rsq01);
282 rinv02 = gmx_mm_invsqrt_ps(rsq02);
283 rinv10 = gmx_mm_invsqrt_ps(rsq10);
284 rinv11 = gmx_mm_invsqrt_ps(rsq11);
285 rinv12 = gmx_mm_invsqrt_ps(rsq12);
286 rinv20 = gmx_mm_invsqrt_ps(rsq20);
287 rinv21 = gmx_mm_invsqrt_ps(rsq21);
288 rinv22 = gmx_mm_invsqrt_ps(rsq22);
290 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
291 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
292 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
293 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
294 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
295 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
296 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
297 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
298 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
300 fjx0 = _mm_setzero_ps();
301 fjy0 = _mm_setzero_ps();
302 fjz0 = _mm_setzero_ps();
303 fjx1 = _mm_setzero_ps();
304 fjy1 = _mm_setzero_ps();
305 fjz1 = _mm_setzero_ps();
306 fjx2 = _mm_setzero_ps();
307 fjy2 = _mm_setzero_ps();
308 fjz2 = _mm_setzero_ps();
310 /**************************
311 * CALCULATE INTERACTIONS *
312 **************************/
314 if (gmx_mm_any_lt(rsq00,rcutoff2))
317 r00 = _mm_mul_ps(rsq00,rinv00);
319 /* EWALD ELECTROSTATICS */
321 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
322 ewrt = _mm_mul_ps(r00,ewtabscale);
323 ewitab = _mm_cvttps_epi32(ewrt);
324 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
325 ewitab = _mm_slli_epi32(ewitab,2);
326 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
327 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
328 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
329 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
330 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
331 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
332 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
333 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
334 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
336 /* LENNARD-JONES DISPERSION/REPULSION */
338 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
339 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
340 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
341 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
342 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
343 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
345 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
347 /* Update potential sum for this i atom from the interaction with this j atom. */
348 velec = _mm_and_ps(velec,cutoff_mask);
349 velecsum = _mm_add_ps(velecsum,velec);
350 vvdw = _mm_and_ps(vvdw,cutoff_mask);
351 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
353 fscal = _mm_add_ps(felec,fvdw);
355 fscal = _mm_and_ps(fscal,cutoff_mask);
357 /* Calculate temporary vectorial force */
358 tx = _mm_mul_ps(fscal,dx00);
359 ty = _mm_mul_ps(fscal,dy00);
360 tz = _mm_mul_ps(fscal,dz00);
362 /* Update vectorial force */
363 fix0 = _mm_add_ps(fix0,tx);
364 fiy0 = _mm_add_ps(fiy0,ty);
365 fiz0 = _mm_add_ps(fiz0,tz);
367 fjx0 = _mm_add_ps(fjx0,tx);
368 fjy0 = _mm_add_ps(fjy0,ty);
369 fjz0 = _mm_add_ps(fjz0,tz);
373 /**************************
374 * CALCULATE INTERACTIONS *
375 **************************/
377 if (gmx_mm_any_lt(rsq01,rcutoff2))
380 r01 = _mm_mul_ps(rsq01,rinv01);
382 /* EWALD ELECTROSTATICS */
384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
385 ewrt = _mm_mul_ps(r01,ewtabscale);
386 ewitab = _mm_cvttps_epi32(ewrt);
387 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
388 ewitab = _mm_slli_epi32(ewitab,2);
389 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
390 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
391 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
392 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
393 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
394 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
395 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
396 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
397 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
399 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
401 /* Update potential sum for this i atom from the interaction with this j atom. */
402 velec = _mm_and_ps(velec,cutoff_mask);
403 velecsum = _mm_add_ps(velecsum,velec);
407 fscal = _mm_and_ps(fscal,cutoff_mask);
409 /* Calculate temporary vectorial force */
410 tx = _mm_mul_ps(fscal,dx01);
411 ty = _mm_mul_ps(fscal,dy01);
412 tz = _mm_mul_ps(fscal,dz01);
414 /* Update vectorial force */
415 fix0 = _mm_add_ps(fix0,tx);
416 fiy0 = _mm_add_ps(fiy0,ty);
417 fiz0 = _mm_add_ps(fiz0,tz);
419 fjx1 = _mm_add_ps(fjx1,tx);
420 fjy1 = _mm_add_ps(fjy1,ty);
421 fjz1 = _mm_add_ps(fjz1,tz);
425 /**************************
426 * CALCULATE INTERACTIONS *
427 **************************/
429 if (gmx_mm_any_lt(rsq02,rcutoff2))
432 r02 = _mm_mul_ps(rsq02,rinv02);
434 /* EWALD ELECTROSTATICS */
436 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
437 ewrt = _mm_mul_ps(r02,ewtabscale);
438 ewitab = _mm_cvttps_epi32(ewrt);
439 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
440 ewitab = _mm_slli_epi32(ewitab,2);
441 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
442 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
443 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
444 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
445 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
446 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
447 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
448 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
449 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
451 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
453 /* Update potential sum for this i atom from the interaction with this j atom. */
454 velec = _mm_and_ps(velec,cutoff_mask);
455 velecsum = _mm_add_ps(velecsum,velec);
459 fscal = _mm_and_ps(fscal,cutoff_mask);
461 /* Calculate temporary vectorial force */
462 tx = _mm_mul_ps(fscal,dx02);
463 ty = _mm_mul_ps(fscal,dy02);
464 tz = _mm_mul_ps(fscal,dz02);
466 /* Update vectorial force */
467 fix0 = _mm_add_ps(fix0,tx);
468 fiy0 = _mm_add_ps(fiy0,ty);
469 fiz0 = _mm_add_ps(fiz0,tz);
471 fjx2 = _mm_add_ps(fjx2,tx);
472 fjy2 = _mm_add_ps(fjy2,ty);
473 fjz2 = _mm_add_ps(fjz2,tz);
477 /**************************
478 * CALCULATE INTERACTIONS *
479 **************************/
481 if (gmx_mm_any_lt(rsq10,rcutoff2))
484 r10 = _mm_mul_ps(rsq10,rinv10);
486 /* EWALD ELECTROSTATICS */
488 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
489 ewrt = _mm_mul_ps(r10,ewtabscale);
490 ewitab = _mm_cvttps_epi32(ewrt);
491 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
492 ewitab = _mm_slli_epi32(ewitab,2);
493 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
494 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
495 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
496 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
497 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
498 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
499 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
500 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
501 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
503 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
505 /* Update potential sum for this i atom from the interaction with this j atom. */
506 velec = _mm_and_ps(velec,cutoff_mask);
507 velecsum = _mm_add_ps(velecsum,velec);
511 fscal = _mm_and_ps(fscal,cutoff_mask);
513 /* Calculate temporary vectorial force */
514 tx = _mm_mul_ps(fscal,dx10);
515 ty = _mm_mul_ps(fscal,dy10);
516 tz = _mm_mul_ps(fscal,dz10);
518 /* Update vectorial force */
519 fix1 = _mm_add_ps(fix1,tx);
520 fiy1 = _mm_add_ps(fiy1,ty);
521 fiz1 = _mm_add_ps(fiz1,tz);
523 fjx0 = _mm_add_ps(fjx0,tx);
524 fjy0 = _mm_add_ps(fjy0,ty);
525 fjz0 = _mm_add_ps(fjz0,tz);
529 /**************************
530 * CALCULATE INTERACTIONS *
531 **************************/
533 if (gmx_mm_any_lt(rsq11,rcutoff2))
536 r11 = _mm_mul_ps(rsq11,rinv11);
538 /* EWALD ELECTROSTATICS */
540 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
541 ewrt = _mm_mul_ps(r11,ewtabscale);
542 ewitab = _mm_cvttps_epi32(ewrt);
543 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
544 ewitab = _mm_slli_epi32(ewitab,2);
545 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
546 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
547 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
548 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
549 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
550 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
551 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
552 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
553 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
555 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
557 /* Update potential sum for this i atom from the interaction with this j atom. */
558 velec = _mm_and_ps(velec,cutoff_mask);
559 velecsum = _mm_add_ps(velecsum,velec);
563 fscal = _mm_and_ps(fscal,cutoff_mask);
565 /* Calculate temporary vectorial force */
566 tx = _mm_mul_ps(fscal,dx11);
567 ty = _mm_mul_ps(fscal,dy11);
568 tz = _mm_mul_ps(fscal,dz11);
570 /* Update vectorial force */
571 fix1 = _mm_add_ps(fix1,tx);
572 fiy1 = _mm_add_ps(fiy1,ty);
573 fiz1 = _mm_add_ps(fiz1,tz);
575 fjx1 = _mm_add_ps(fjx1,tx);
576 fjy1 = _mm_add_ps(fjy1,ty);
577 fjz1 = _mm_add_ps(fjz1,tz);
581 /**************************
582 * CALCULATE INTERACTIONS *
583 **************************/
585 if (gmx_mm_any_lt(rsq12,rcutoff2))
588 r12 = _mm_mul_ps(rsq12,rinv12);
590 /* EWALD ELECTROSTATICS */
592 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
593 ewrt = _mm_mul_ps(r12,ewtabscale);
594 ewitab = _mm_cvttps_epi32(ewrt);
595 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
596 ewitab = _mm_slli_epi32(ewitab,2);
597 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
598 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
599 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
600 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
601 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
602 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
603 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
604 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
605 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
607 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velec = _mm_and_ps(velec,cutoff_mask);
611 velecsum = _mm_add_ps(velecsum,velec);
615 fscal = _mm_and_ps(fscal,cutoff_mask);
617 /* Calculate temporary vectorial force */
618 tx = _mm_mul_ps(fscal,dx12);
619 ty = _mm_mul_ps(fscal,dy12);
620 tz = _mm_mul_ps(fscal,dz12);
622 /* Update vectorial force */
623 fix1 = _mm_add_ps(fix1,tx);
624 fiy1 = _mm_add_ps(fiy1,ty);
625 fiz1 = _mm_add_ps(fiz1,tz);
627 fjx2 = _mm_add_ps(fjx2,tx);
628 fjy2 = _mm_add_ps(fjy2,ty);
629 fjz2 = _mm_add_ps(fjz2,tz);
633 /**************************
634 * CALCULATE INTERACTIONS *
635 **************************/
637 if (gmx_mm_any_lt(rsq20,rcutoff2))
640 r20 = _mm_mul_ps(rsq20,rinv20);
642 /* EWALD ELECTROSTATICS */
644 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
645 ewrt = _mm_mul_ps(r20,ewtabscale);
646 ewitab = _mm_cvttps_epi32(ewrt);
647 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
648 ewitab = _mm_slli_epi32(ewitab,2);
649 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
650 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
651 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
652 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
653 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
654 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
655 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
656 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
657 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
659 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
661 /* Update potential sum for this i atom from the interaction with this j atom. */
662 velec = _mm_and_ps(velec,cutoff_mask);
663 velecsum = _mm_add_ps(velecsum,velec);
667 fscal = _mm_and_ps(fscal,cutoff_mask);
669 /* Calculate temporary vectorial force */
670 tx = _mm_mul_ps(fscal,dx20);
671 ty = _mm_mul_ps(fscal,dy20);
672 tz = _mm_mul_ps(fscal,dz20);
674 /* Update vectorial force */
675 fix2 = _mm_add_ps(fix2,tx);
676 fiy2 = _mm_add_ps(fiy2,ty);
677 fiz2 = _mm_add_ps(fiz2,tz);
679 fjx0 = _mm_add_ps(fjx0,tx);
680 fjy0 = _mm_add_ps(fjy0,ty);
681 fjz0 = _mm_add_ps(fjz0,tz);
685 /**************************
686 * CALCULATE INTERACTIONS *
687 **************************/
689 if (gmx_mm_any_lt(rsq21,rcutoff2))
692 r21 = _mm_mul_ps(rsq21,rinv21);
694 /* EWALD ELECTROSTATICS */
696 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
697 ewrt = _mm_mul_ps(r21,ewtabscale);
698 ewitab = _mm_cvttps_epi32(ewrt);
699 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
700 ewitab = _mm_slli_epi32(ewitab,2);
701 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
702 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
703 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
704 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
705 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
706 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
707 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
708 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
709 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
711 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
713 /* Update potential sum for this i atom from the interaction with this j atom. */
714 velec = _mm_and_ps(velec,cutoff_mask);
715 velecsum = _mm_add_ps(velecsum,velec);
719 fscal = _mm_and_ps(fscal,cutoff_mask);
721 /* Calculate temporary vectorial force */
722 tx = _mm_mul_ps(fscal,dx21);
723 ty = _mm_mul_ps(fscal,dy21);
724 tz = _mm_mul_ps(fscal,dz21);
726 /* Update vectorial force */
727 fix2 = _mm_add_ps(fix2,tx);
728 fiy2 = _mm_add_ps(fiy2,ty);
729 fiz2 = _mm_add_ps(fiz2,tz);
731 fjx1 = _mm_add_ps(fjx1,tx);
732 fjy1 = _mm_add_ps(fjy1,ty);
733 fjz1 = _mm_add_ps(fjz1,tz);
737 /**************************
738 * CALCULATE INTERACTIONS *
739 **************************/
741 if (gmx_mm_any_lt(rsq22,rcutoff2))
744 r22 = _mm_mul_ps(rsq22,rinv22);
746 /* EWALD ELECTROSTATICS */
748 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
749 ewrt = _mm_mul_ps(r22,ewtabscale);
750 ewitab = _mm_cvttps_epi32(ewrt);
751 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
752 ewitab = _mm_slli_epi32(ewitab,2);
753 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
754 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
755 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
756 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
757 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
758 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
759 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
760 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
761 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
763 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
765 /* Update potential sum for this i atom from the interaction with this j atom. */
766 velec = _mm_and_ps(velec,cutoff_mask);
767 velecsum = _mm_add_ps(velecsum,velec);
771 fscal = _mm_and_ps(fscal,cutoff_mask);
773 /* Calculate temporary vectorial force */
774 tx = _mm_mul_ps(fscal,dx22);
775 ty = _mm_mul_ps(fscal,dy22);
776 tz = _mm_mul_ps(fscal,dz22);
778 /* Update vectorial force */
779 fix2 = _mm_add_ps(fix2,tx);
780 fiy2 = _mm_add_ps(fiy2,ty);
781 fiz2 = _mm_add_ps(fiz2,tz);
783 fjx2 = _mm_add_ps(fjx2,tx);
784 fjy2 = _mm_add_ps(fjy2,ty);
785 fjz2 = _mm_add_ps(fjz2,tz);
789 fjptrA = f+j_coord_offsetA;
790 fjptrB = f+j_coord_offsetB;
791 fjptrC = f+j_coord_offsetC;
792 fjptrD = f+j_coord_offsetD;
794 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
795 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
797 /* Inner loop uses 432 flops */
803 /* Get j neighbor index, and coordinate index */
804 jnrlistA = jjnr[jidx];
805 jnrlistB = jjnr[jidx+1];
806 jnrlistC = jjnr[jidx+2];
807 jnrlistD = jjnr[jidx+3];
808 /* Sign of each element will be negative for non-real atoms.
809 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
810 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
812 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
813 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
814 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
815 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
816 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
817 j_coord_offsetA = DIM*jnrA;
818 j_coord_offsetB = DIM*jnrB;
819 j_coord_offsetC = DIM*jnrC;
820 j_coord_offsetD = DIM*jnrD;
822 /* load j atom coordinates */
823 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
824 x+j_coord_offsetC,x+j_coord_offsetD,
825 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
827 /* Calculate displacement vector */
828 dx00 = _mm_sub_ps(ix0,jx0);
829 dy00 = _mm_sub_ps(iy0,jy0);
830 dz00 = _mm_sub_ps(iz0,jz0);
831 dx01 = _mm_sub_ps(ix0,jx1);
832 dy01 = _mm_sub_ps(iy0,jy1);
833 dz01 = _mm_sub_ps(iz0,jz1);
834 dx02 = _mm_sub_ps(ix0,jx2);
835 dy02 = _mm_sub_ps(iy0,jy2);
836 dz02 = _mm_sub_ps(iz0,jz2);
837 dx10 = _mm_sub_ps(ix1,jx0);
838 dy10 = _mm_sub_ps(iy1,jy0);
839 dz10 = _mm_sub_ps(iz1,jz0);
840 dx11 = _mm_sub_ps(ix1,jx1);
841 dy11 = _mm_sub_ps(iy1,jy1);
842 dz11 = _mm_sub_ps(iz1,jz1);
843 dx12 = _mm_sub_ps(ix1,jx2);
844 dy12 = _mm_sub_ps(iy1,jy2);
845 dz12 = _mm_sub_ps(iz1,jz2);
846 dx20 = _mm_sub_ps(ix2,jx0);
847 dy20 = _mm_sub_ps(iy2,jy0);
848 dz20 = _mm_sub_ps(iz2,jz0);
849 dx21 = _mm_sub_ps(ix2,jx1);
850 dy21 = _mm_sub_ps(iy2,jy1);
851 dz21 = _mm_sub_ps(iz2,jz1);
852 dx22 = _mm_sub_ps(ix2,jx2);
853 dy22 = _mm_sub_ps(iy2,jy2);
854 dz22 = _mm_sub_ps(iz2,jz2);
856 /* Calculate squared distance and things based on it */
857 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
858 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
859 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
860 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
861 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
862 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
863 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
864 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
865 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
867 rinv00 = gmx_mm_invsqrt_ps(rsq00);
868 rinv01 = gmx_mm_invsqrt_ps(rsq01);
869 rinv02 = gmx_mm_invsqrt_ps(rsq02);
870 rinv10 = gmx_mm_invsqrt_ps(rsq10);
871 rinv11 = gmx_mm_invsqrt_ps(rsq11);
872 rinv12 = gmx_mm_invsqrt_ps(rsq12);
873 rinv20 = gmx_mm_invsqrt_ps(rsq20);
874 rinv21 = gmx_mm_invsqrt_ps(rsq21);
875 rinv22 = gmx_mm_invsqrt_ps(rsq22);
877 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
878 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
879 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
880 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
881 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
882 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
883 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
884 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
885 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
887 fjx0 = _mm_setzero_ps();
888 fjy0 = _mm_setzero_ps();
889 fjz0 = _mm_setzero_ps();
890 fjx1 = _mm_setzero_ps();
891 fjy1 = _mm_setzero_ps();
892 fjz1 = _mm_setzero_ps();
893 fjx2 = _mm_setzero_ps();
894 fjy2 = _mm_setzero_ps();
895 fjz2 = _mm_setzero_ps();
897 /**************************
898 * CALCULATE INTERACTIONS *
899 **************************/
901 if (gmx_mm_any_lt(rsq00,rcutoff2))
904 r00 = _mm_mul_ps(rsq00,rinv00);
905 r00 = _mm_andnot_ps(dummy_mask,r00);
907 /* EWALD ELECTROSTATICS */
909 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
910 ewrt = _mm_mul_ps(r00,ewtabscale);
911 ewitab = _mm_cvttps_epi32(ewrt);
912 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
913 ewitab = _mm_slli_epi32(ewitab,2);
914 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
915 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
916 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
917 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
918 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
919 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
920 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
921 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
922 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
924 /* LENNARD-JONES DISPERSION/REPULSION */
926 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
927 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
928 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
929 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
930 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
931 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
933 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
935 /* Update potential sum for this i atom from the interaction with this j atom. */
936 velec = _mm_and_ps(velec,cutoff_mask);
937 velec = _mm_andnot_ps(dummy_mask,velec);
938 velecsum = _mm_add_ps(velecsum,velec);
939 vvdw = _mm_and_ps(vvdw,cutoff_mask);
940 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
941 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
943 fscal = _mm_add_ps(felec,fvdw);
945 fscal = _mm_and_ps(fscal,cutoff_mask);
947 fscal = _mm_andnot_ps(dummy_mask,fscal);
949 /* Calculate temporary vectorial force */
950 tx = _mm_mul_ps(fscal,dx00);
951 ty = _mm_mul_ps(fscal,dy00);
952 tz = _mm_mul_ps(fscal,dz00);
954 /* Update vectorial force */
955 fix0 = _mm_add_ps(fix0,tx);
956 fiy0 = _mm_add_ps(fiy0,ty);
957 fiz0 = _mm_add_ps(fiz0,tz);
959 fjx0 = _mm_add_ps(fjx0,tx);
960 fjy0 = _mm_add_ps(fjy0,ty);
961 fjz0 = _mm_add_ps(fjz0,tz);
965 /**************************
966 * CALCULATE INTERACTIONS *
967 **************************/
969 if (gmx_mm_any_lt(rsq01,rcutoff2))
972 r01 = _mm_mul_ps(rsq01,rinv01);
973 r01 = _mm_andnot_ps(dummy_mask,r01);
975 /* EWALD ELECTROSTATICS */
977 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
978 ewrt = _mm_mul_ps(r01,ewtabscale);
979 ewitab = _mm_cvttps_epi32(ewrt);
980 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
981 ewitab = _mm_slli_epi32(ewitab,2);
982 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
983 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
984 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
985 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
986 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
987 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
988 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
989 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
990 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
992 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
994 /* Update potential sum for this i atom from the interaction with this j atom. */
995 velec = _mm_and_ps(velec,cutoff_mask);
996 velec = _mm_andnot_ps(dummy_mask,velec);
997 velecsum = _mm_add_ps(velecsum,velec);
1001 fscal = _mm_and_ps(fscal,cutoff_mask);
1003 fscal = _mm_andnot_ps(dummy_mask,fscal);
1005 /* Calculate temporary vectorial force */
1006 tx = _mm_mul_ps(fscal,dx01);
1007 ty = _mm_mul_ps(fscal,dy01);
1008 tz = _mm_mul_ps(fscal,dz01);
1010 /* Update vectorial force */
1011 fix0 = _mm_add_ps(fix0,tx);
1012 fiy0 = _mm_add_ps(fiy0,ty);
1013 fiz0 = _mm_add_ps(fiz0,tz);
1015 fjx1 = _mm_add_ps(fjx1,tx);
1016 fjy1 = _mm_add_ps(fjy1,ty);
1017 fjz1 = _mm_add_ps(fjz1,tz);
1021 /**************************
1022 * CALCULATE INTERACTIONS *
1023 **************************/
1025 if (gmx_mm_any_lt(rsq02,rcutoff2))
1028 r02 = _mm_mul_ps(rsq02,rinv02);
1029 r02 = _mm_andnot_ps(dummy_mask,r02);
1031 /* EWALD ELECTROSTATICS */
1033 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1034 ewrt = _mm_mul_ps(r02,ewtabscale);
1035 ewitab = _mm_cvttps_epi32(ewrt);
1036 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1037 ewitab = _mm_slli_epi32(ewitab,2);
1038 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1039 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1040 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1041 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1042 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1043 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1044 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1045 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
1046 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1048 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1050 /* Update potential sum for this i atom from the interaction with this j atom. */
1051 velec = _mm_and_ps(velec,cutoff_mask);
1052 velec = _mm_andnot_ps(dummy_mask,velec);
1053 velecsum = _mm_add_ps(velecsum,velec);
1057 fscal = _mm_and_ps(fscal,cutoff_mask);
1059 fscal = _mm_andnot_ps(dummy_mask,fscal);
1061 /* Calculate temporary vectorial force */
1062 tx = _mm_mul_ps(fscal,dx02);
1063 ty = _mm_mul_ps(fscal,dy02);
1064 tz = _mm_mul_ps(fscal,dz02);
1066 /* Update vectorial force */
1067 fix0 = _mm_add_ps(fix0,tx);
1068 fiy0 = _mm_add_ps(fiy0,ty);
1069 fiz0 = _mm_add_ps(fiz0,tz);
1071 fjx2 = _mm_add_ps(fjx2,tx);
1072 fjy2 = _mm_add_ps(fjy2,ty);
1073 fjz2 = _mm_add_ps(fjz2,tz);
1077 /**************************
1078 * CALCULATE INTERACTIONS *
1079 **************************/
1081 if (gmx_mm_any_lt(rsq10,rcutoff2))
1084 r10 = _mm_mul_ps(rsq10,rinv10);
1085 r10 = _mm_andnot_ps(dummy_mask,r10);
1087 /* EWALD ELECTROSTATICS */
1089 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1090 ewrt = _mm_mul_ps(r10,ewtabscale);
1091 ewitab = _mm_cvttps_epi32(ewrt);
1092 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1093 ewitab = _mm_slli_epi32(ewitab,2);
1094 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1095 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1096 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1097 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1098 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1099 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1100 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1101 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1102 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1104 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1106 /* Update potential sum for this i atom from the interaction with this j atom. */
1107 velec = _mm_and_ps(velec,cutoff_mask);
1108 velec = _mm_andnot_ps(dummy_mask,velec);
1109 velecsum = _mm_add_ps(velecsum,velec);
1113 fscal = _mm_and_ps(fscal,cutoff_mask);
1115 fscal = _mm_andnot_ps(dummy_mask,fscal);
1117 /* Calculate temporary vectorial force */
1118 tx = _mm_mul_ps(fscal,dx10);
1119 ty = _mm_mul_ps(fscal,dy10);
1120 tz = _mm_mul_ps(fscal,dz10);
1122 /* Update vectorial force */
1123 fix1 = _mm_add_ps(fix1,tx);
1124 fiy1 = _mm_add_ps(fiy1,ty);
1125 fiz1 = _mm_add_ps(fiz1,tz);
1127 fjx0 = _mm_add_ps(fjx0,tx);
1128 fjy0 = _mm_add_ps(fjy0,ty);
1129 fjz0 = _mm_add_ps(fjz0,tz);
1133 /**************************
1134 * CALCULATE INTERACTIONS *
1135 **************************/
1137 if (gmx_mm_any_lt(rsq11,rcutoff2))
1140 r11 = _mm_mul_ps(rsq11,rinv11);
1141 r11 = _mm_andnot_ps(dummy_mask,r11);
1143 /* EWALD ELECTROSTATICS */
1145 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1146 ewrt = _mm_mul_ps(r11,ewtabscale);
1147 ewitab = _mm_cvttps_epi32(ewrt);
1148 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1149 ewitab = _mm_slli_epi32(ewitab,2);
1150 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1151 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1152 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1153 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1154 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1155 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1156 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1157 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1158 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1160 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1162 /* Update potential sum for this i atom from the interaction with this j atom. */
1163 velec = _mm_and_ps(velec,cutoff_mask);
1164 velec = _mm_andnot_ps(dummy_mask,velec);
1165 velecsum = _mm_add_ps(velecsum,velec);
1169 fscal = _mm_and_ps(fscal,cutoff_mask);
1171 fscal = _mm_andnot_ps(dummy_mask,fscal);
1173 /* Calculate temporary vectorial force */
1174 tx = _mm_mul_ps(fscal,dx11);
1175 ty = _mm_mul_ps(fscal,dy11);
1176 tz = _mm_mul_ps(fscal,dz11);
1178 /* Update vectorial force */
1179 fix1 = _mm_add_ps(fix1,tx);
1180 fiy1 = _mm_add_ps(fiy1,ty);
1181 fiz1 = _mm_add_ps(fiz1,tz);
1183 fjx1 = _mm_add_ps(fjx1,tx);
1184 fjy1 = _mm_add_ps(fjy1,ty);
1185 fjz1 = _mm_add_ps(fjz1,tz);
1189 /**************************
1190 * CALCULATE INTERACTIONS *
1191 **************************/
1193 if (gmx_mm_any_lt(rsq12,rcutoff2))
1196 r12 = _mm_mul_ps(rsq12,rinv12);
1197 r12 = _mm_andnot_ps(dummy_mask,r12);
1199 /* EWALD ELECTROSTATICS */
1201 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1202 ewrt = _mm_mul_ps(r12,ewtabscale);
1203 ewitab = _mm_cvttps_epi32(ewrt);
1204 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1205 ewitab = _mm_slli_epi32(ewitab,2);
1206 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1207 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1208 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1209 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1210 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1211 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1212 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1213 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1214 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1216 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1218 /* Update potential sum for this i atom from the interaction with this j atom. */
1219 velec = _mm_and_ps(velec,cutoff_mask);
1220 velec = _mm_andnot_ps(dummy_mask,velec);
1221 velecsum = _mm_add_ps(velecsum,velec);
1225 fscal = _mm_and_ps(fscal,cutoff_mask);
1227 fscal = _mm_andnot_ps(dummy_mask,fscal);
1229 /* Calculate temporary vectorial force */
1230 tx = _mm_mul_ps(fscal,dx12);
1231 ty = _mm_mul_ps(fscal,dy12);
1232 tz = _mm_mul_ps(fscal,dz12);
1234 /* Update vectorial force */
1235 fix1 = _mm_add_ps(fix1,tx);
1236 fiy1 = _mm_add_ps(fiy1,ty);
1237 fiz1 = _mm_add_ps(fiz1,tz);
1239 fjx2 = _mm_add_ps(fjx2,tx);
1240 fjy2 = _mm_add_ps(fjy2,ty);
1241 fjz2 = _mm_add_ps(fjz2,tz);
1245 /**************************
1246 * CALCULATE INTERACTIONS *
1247 **************************/
1249 if (gmx_mm_any_lt(rsq20,rcutoff2))
1252 r20 = _mm_mul_ps(rsq20,rinv20);
1253 r20 = _mm_andnot_ps(dummy_mask,r20);
1255 /* EWALD ELECTROSTATICS */
1257 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1258 ewrt = _mm_mul_ps(r20,ewtabscale);
1259 ewitab = _mm_cvttps_epi32(ewrt);
1260 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1261 ewitab = _mm_slli_epi32(ewitab,2);
1262 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1263 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1264 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1265 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1266 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1267 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1268 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1269 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1270 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1272 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1274 /* Update potential sum for this i atom from the interaction with this j atom. */
1275 velec = _mm_and_ps(velec,cutoff_mask);
1276 velec = _mm_andnot_ps(dummy_mask,velec);
1277 velecsum = _mm_add_ps(velecsum,velec);
1281 fscal = _mm_and_ps(fscal,cutoff_mask);
1283 fscal = _mm_andnot_ps(dummy_mask,fscal);
1285 /* Calculate temporary vectorial force */
1286 tx = _mm_mul_ps(fscal,dx20);
1287 ty = _mm_mul_ps(fscal,dy20);
1288 tz = _mm_mul_ps(fscal,dz20);
1290 /* Update vectorial force */
1291 fix2 = _mm_add_ps(fix2,tx);
1292 fiy2 = _mm_add_ps(fiy2,ty);
1293 fiz2 = _mm_add_ps(fiz2,tz);
1295 fjx0 = _mm_add_ps(fjx0,tx);
1296 fjy0 = _mm_add_ps(fjy0,ty);
1297 fjz0 = _mm_add_ps(fjz0,tz);
1301 /**************************
1302 * CALCULATE INTERACTIONS *
1303 **************************/
1305 if (gmx_mm_any_lt(rsq21,rcutoff2))
1308 r21 = _mm_mul_ps(rsq21,rinv21);
1309 r21 = _mm_andnot_ps(dummy_mask,r21);
1311 /* EWALD ELECTROSTATICS */
1313 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1314 ewrt = _mm_mul_ps(r21,ewtabscale);
1315 ewitab = _mm_cvttps_epi32(ewrt);
1316 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1317 ewitab = _mm_slli_epi32(ewitab,2);
1318 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1319 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1320 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1321 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1322 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1323 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1324 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1325 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1326 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1328 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1330 /* Update potential sum for this i atom from the interaction with this j atom. */
1331 velec = _mm_and_ps(velec,cutoff_mask);
1332 velec = _mm_andnot_ps(dummy_mask,velec);
1333 velecsum = _mm_add_ps(velecsum,velec);
1337 fscal = _mm_and_ps(fscal,cutoff_mask);
1339 fscal = _mm_andnot_ps(dummy_mask,fscal);
1341 /* Calculate temporary vectorial force */
1342 tx = _mm_mul_ps(fscal,dx21);
1343 ty = _mm_mul_ps(fscal,dy21);
1344 tz = _mm_mul_ps(fscal,dz21);
1346 /* Update vectorial force */
1347 fix2 = _mm_add_ps(fix2,tx);
1348 fiy2 = _mm_add_ps(fiy2,ty);
1349 fiz2 = _mm_add_ps(fiz2,tz);
1351 fjx1 = _mm_add_ps(fjx1,tx);
1352 fjy1 = _mm_add_ps(fjy1,ty);
1353 fjz1 = _mm_add_ps(fjz1,tz);
1357 /**************************
1358 * CALCULATE INTERACTIONS *
1359 **************************/
1361 if (gmx_mm_any_lt(rsq22,rcutoff2))
1364 r22 = _mm_mul_ps(rsq22,rinv22);
1365 r22 = _mm_andnot_ps(dummy_mask,r22);
1367 /* EWALD ELECTROSTATICS */
1369 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1370 ewrt = _mm_mul_ps(r22,ewtabscale);
1371 ewitab = _mm_cvttps_epi32(ewrt);
1372 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1373 ewitab = _mm_slli_epi32(ewitab,2);
1374 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1375 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1376 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1377 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1378 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1379 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1380 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1381 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1382 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1384 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1386 /* Update potential sum for this i atom from the interaction with this j atom. */
1387 velec = _mm_and_ps(velec,cutoff_mask);
1388 velec = _mm_andnot_ps(dummy_mask,velec);
1389 velecsum = _mm_add_ps(velecsum,velec);
1393 fscal = _mm_and_ps(fscal,cutoff_mask);
1395 fscal = _mm_andnot_ps(dummy_mask,fscal);
1397 /* Calculate temporary vectorial force */
1398 tx = _mm_mul_ps(fscal,dx22);
1399 ty = _mm_mul_ps(fscal,dy22);
1400 tz = _mm_mul_ps(fscal,dz22);
1402 /* Update vectorial force */
1403 fix2 = _mm_add_ps(fix2,tx);
1404 fiy2 = _mm_add_ps(fiy2,ty);
1405 fiz2 = _mm_add_ps(fiz2,tz);
1407 fjx2 = _mm_add_ps(fjx2,tx);
1408 fjy2 = _mm_add_ps(fjy2,ty);
1409 fjz2 = _mm_add_ps(fjz2,tz);
1413 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1414 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1415 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1416 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1418 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1419 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1421 /* Inner loop uses 441 flops */
1424 /* End of innermost loop */
1426 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1427 f+i_coord_offset,fshift+i_shift_offset);
1430 /* Update potential energies */
1431 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1432 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1434 /* Increment number of inner iterations */
1435 inneriter += j_index_end - j_index_start;
1437 /* Outer loop uses 20 flops */
1440 /* Increment number of outer iterations */
1443 /* Update outer/inner flops */
1445 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
1448 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
1449 * Electrostatics interaction: Ewald
1450 * VdW interaction: LennardJones
1451 * Geometry: Water3-Water3
1452 * Calculate force/pot: Force
1455 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
1456 (t_nblist * gmx_restrict nlist,
1457 rvec * gmx_restrict xx,
1458 rvec * gmx_restrict ff,
1459 t_forcerec * gmx_restrict fr,
1460 t_mdatoms * gmx_restrict mdatoms,
1461 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1462 t_nrnb * gmx_restrict nrnb)
1464 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1465 * just 0 for non-waters.
1466 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1467 * jnr indices corresponding to data put in the four positions in the SIMD register.
1469 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1470 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1471 int jnrA,jnrB,jnrC,jnrD;
1472 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1473 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1474 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1475 real rcutoff_scalar;
1476 real *shiftvec,*fshift,*x,*f;
1477 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1478 real scratch[4*DIM];
1479 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1481 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1483 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1485 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1486 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1487 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1488 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1489 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1490 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1491 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1492 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1493 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1494 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1495 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1496 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1497 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1498 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1499 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1500 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1501 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1504 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1507 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1508 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1510 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1512 __m128 dummy_mask,cutoff_mask;
1513 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1514 __m128 one = _mm_set1_ps(1.0);
1515 __m128 two = _mm_set1_ps(2.0);
1521 jindex = nlist->jindex;
1523 shiftidx = nlist->shift;
1525 shiftvec = fr->shift_vec[0];
1526 fshift = fr->fshift[0];
1527 facel = _mm_set1_ps(fr->epsfac);
1528 charge = mdatoms->chargeA;
1529 nvdwtype = fr->ntype;
1530 vdwparam = fr->nbfp;
1531 vdwtype = mdatoms->typeA;
1533 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1534 ewtab = fr->ic->tabq_coul_F;
1535 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1536 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1538 /* Setup water-specific parameters */
1539 inr = nlist->iinr[0];
1540 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1541 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1542 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1543 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1545 jq0 = _mm_set1_ps(charge[inr+0]);
1546 jq1 = _mm_set1_ps(charge[inr+1]);
1547 jq2 = _mm_set1_ps(charge[inr+2]);
1548 vdwjidx0A = 2*vdwtype[inr+0];
1549 qq00 = _mm_mul_ps(iq0,jq0);
1550 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1551 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1552 qq01 = _mm_mul_ps(iq0,jq1);
1553 qq02 = _mm_mul_ps(iq0,jq2);
1554 qq10 = _mm_mul_ps(iq1,jq0);
1555 qq11 = _mm_mul_ps(iq1,jq1);
1556 qq12 = _mm_mul_ps(iq1,jq2);
1557 qq20 = _mm_mul_ps(iq2,jq0);
1558 qq21 = _mm_mul_ps(iq2,jq1);
1559 qq22 = _mm_mul_ps(iq2,jq2);
1561 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1562 rcutoff_scalar = fr->rcoulomb;
1563 rcutoff = _mm_set1_ps(rcutoff_scalar);
1564 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1566 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1567 rvdw = _mm_set1_ps(fr->rvdw);
1569 /* Avoid stupid compiler warnings */
1570 jnrA = jnrB = jnrC = jnrD = 0;
1571 j_coord_offsetA = 0;
1572 j_coord_offsetB = 0;
1573 j_coord_offsetC = 0;
1574 j_coord_offsetD = 0;
1579 for(iidx=0;iidx<4*DIM;iidx++)
1581 scratch[iidx] = 0.0;
1584 /* Start outer loop over neighborlists */
1585 for(iidx=0; iidx<nri; iidx++)
1587 /* Load shift vector for this list */
1588 i_shift_offset = DIM*shiftidx[iidx];
1590 /* Load limits for loop over neighbors */
1591 j_index_start = jindex[iidx];
1592 j_index_end = jindex[iidx+1];
1594 /* Get outer coordinate index */
1596 i_coord_offset = DIM*inr;
1598 /* Load i particle coords and add shift vector */
1599 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1600 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1602 fix0 = _mm_setzero_ps();
1603 fiy0 = _mm_setzero_ps();
1604 fiz0 = _mm_setzero_ps();
1605 fix1 = _mm_setzero_ps();
1606 fiy1 = _mm_setzero_ps();
1607 fiz1 = _mm_setzero_ps();
1608 fix2 = _mm_setzero_ps();
1609 fiy2 = _mm_setzero_ps();
1610 fiz2 = _mm_setzero_ps();
1612 /* Start inner kernel loop */
1613 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1616 /* Get j neighbor index, and coordinate index */
1618 jnrB = jjnr[jidx+1];
1619 jnrC = jjnr[jidx+2];
1620 jnrD = jjnr[jidx+3];
1621 j_coord_offsetA = DIM*jnrA;
1622 j_coord_offsetB = DIM*jnrB;
1623 j_coord_offsetC = DIM*jnrC;
1624 j_coord_offsetD = DIM*jnrD;
1626 /* load j atom coordinates */
1627 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1628 x+j_coord_offsetC,x+j_coord_offsetD,
1629 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1631 /* Calculate displacement vector */
1632 dx00 = _mm_sub_ps(ix0,jx0);
1633 dy00 = _mm_sub_ps(iy0,jy0);
1634 dz00 = _mm_sub_ps(iz0,jz0);
1635 dx01 = _mm_sub_ps(ix0,jx1);
1636 dy01 = _mm_sub_ps(iy0,jy1);
1637 dz01 = _mm_sub_ps(iz0,jz1);
1638 dx02 = _mm_sub_ps(ix0,jx2);
1639 dy02 = _mm_sub_ps(iy0,jy2);
1640 dz02 = _mm_sub_ps(iz0,jz2);
1641 dx10 = _mm_sub_ps(ix1,jx0);
1642 dy10 = _mm_sub_ps(iy1,jy0);
1643 dz10 = _mm_sub_ps(iz1,jz0);
1644 dx11 = _mm_sub_ps(ix1,jx1);
1645 dy11 = _mm_sub_ps(iy1,jy1);
1646 dz11 = _mm_sub_ps(iz1,jz1);
1647 dx12 = _mm_sub_ps(ix1,jx2);
1648 dy12 = _mm_sub_ps(iy1,jy2);
1649 dz12 = _mm_sub_ps(iz1,jz2);
1650 dx20 = _mm_sub_ps(ix2,jx0);
1651 dy20 = _mm_sub_ps(iy2,jy0);
1652 dz20 = _mm_sub_ps(iz2,jz0);
1653 dx21 = _mm_sub_ps(ix2,jx1);
1654 dy21 = _mm_sub_ps(iy2,jy1);
1655 dz21 = _mm_sub_ps(iz2,jz1);
1656 dx22 = _mm_sub_ps(ix2,jx2);
1657 dy22 = _mm_sub_ps(iy2,jy2);
1658 dz22 = _mm_sub_ps(iz2,jz2);
1660 /* Calculate squared distance and things based on it */
1661 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1662 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1663 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1664 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1665 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1666 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1667 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1668 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1669 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1671 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1672 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1673 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1674 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1675 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1676 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1677 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1678 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1679 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1681 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1682 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1683 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1684 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1685 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1686 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1687 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1688 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1689 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1691 fjx0 = _mm_setzero_ps();
1692 fjy0 = _mm_setzero_ps();
1693 fjz0 = _mm_setzero_ps();
1694 fjx1 = _mm_setzero_ps();
1695 fjy1 = _mm_setzero_ps();
1696 fjz1 = _mm_setzero_ps();
1697 fjx2 = _mm_setzero_ps();
1698 fjy2 = _mm_setzero_ps();
1699 fjz2 = _mm_setzero_ps();
1701 /**************************
1702 * CALCULATE INTERACTIONS *
1703 **************************/
1705 if (gmx_mm_any_lt(rsq00,rcutoff2))
1708 r00 = _mm_mul_ps(rsq00,rinv00);
1710 /* EWALD ELECTROSTATICS */
1712 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1713 ewrt = _mm_mul_ps(r00,ewtabscale);
1714 ewitab = _mm_cvttps_epi32(ewrt);
1715 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1716 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1717 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1719 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1720 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1722 /* LENNARD-JONES DISPERSION/REPULSION */
1724 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1725 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1727 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1729 fscal = _mm_add_ps(felec,fvdw);
1731 fscal = _mm_and_ps(fscal,cutoff_mask);
1733 /* Calculate temporary vectorial force */
1734 tx = _mm_mul_ps(fscal,dx00);
1735 ty = _mm_mul_ps(fscal,dy00);
1736 tz = _mm_mul_ps(fscal,dz00);
1738 /* Update vectorial force */
1739 fix0 = _mm_add_ps(fix0,tx);
1740 fiy0 = _mm_add_ps(fiy0,ty);
1741 fiz0 = _mm_add_ps(fiz0,tz);
1743 fjx0 = _mm_add_ps(fjx0,tx);
1744 fjy0 = _mm_add_ps(fjy0,ty);
1745 fjz0 = _mm_add_ps(fjz0,tz);
1749 /**************************
1750 * CALCULATE INTERACTIONS *
1751 **************************/
1753 if (gmx_mm_any_lt(rsq01,rcutoff2))
1756 r01 = _mm_mul_ps(rsq01,rinv01);
1758 /* EWALD ELECTROSTATICS */
1760 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1761 ewrt = _mm_mul_ps(r01,ewtabscale);
1762 ewitab = _mm_cvttps_epi32(ewrt);
1763 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1764 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1765 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1767 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1768 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1770 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1774 fscal = _mm_and_ps(fscal,cutoff_mask);
1776 /* Calculate temporary vectorial force */
1777 tx = _mm_mul_ps(fscal,dx01);
1778 ty = _mm_mul_ps(fscal,dy01);
1779 tz = _mm_mul_ps(fscal,dz01);
1781 /* Update vectorial force */
1782 fix0 = _mm_add_ps(fix0,tx);
1783 fiy0 = _mm_add_ps(fiy0,ty);
1784 fiz0 = _mm_add_ps(fiz0,tz);
1786 fjx1 = _mm_add_ps(fjx1,tx);
1787 fjy1 = _mm_add_ps(fjy1,ty);
1788 fjz1 = _mm_add_ps(fjz1,tz);
1792 /**************************
1793 * CALCULATE INTERACTIONS *
1794 **************************/
1796 if (gmx_mm_any_lt(rsq02,rcutoff2))
1799 r02 = _mm_mul_ps(rsq02,rinv02);
1801 /* EWALD ELECTROSTATICS */
1803 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1804 ewrt = _mm_mul_ps(r02,ewtabscale);
1805 ewitab = _mm_cvttps_epi32(ewrt);
1806 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1807 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1808 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1810 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1811 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1813 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1817 fscal = _mm_and_ps(fscal,cutoff_mask);
1819 /* Calculate temporary vectorial force */
1820 tx = _mm_mul_ps(fscal,dx02);
1821 ty = _mm_mul_ps(fscal,dy02);
1822 tz = _mm_mul_ps(fscal,dz02);
1824 /* Update vectorial force */
1825 fix0 = _mm_add_ps(fix0,tx);
1826 fiy0 = _mm_add_ps(fiy0,ty);
1827 fiz0 = _mm_add_ps(fiz0,tz);
1829 fjx2 = _mm_add_ps(fjx2,tx);
1830 fjy2 = _mm_add_ps(fjy2,ty);
1831 fjz2 = _mm_add_ps(fjz2,tz);
1835 /**************************
1836 * CALCULATE INTERACTIONS *
1837 **************************/
1839 if (gmx_mm_any_lt(rsq10,rcutoff2))
1842 r10 = _mm_mul_ps(rsq10,rinv10);
1844 /* EWALD ELECTROSTATICS */
1846 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1847 ewrt = _mm_mul_ps(r10,ewtabscale);
1848 ewitab = _mm_cvttps_epi32(ewrt);
1849 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1850 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1851 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1853 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1854 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1856 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1860 fscal = _mm_and_ps(fscal,cutoff_mask);
1862 /* Calculate temporary vectorial force */
1863 tx = _mm_mul_ps(fscal,dx10);
1864 ty = _mm_mul_ps(fscal,dy10);
1865 tz = _mm_mul_ps(fscal,dz10);
1867 /* Update vectorial force */
1868 fix1 = _mm_add_ps(fix1,tx);
1869 fiy1 = _mm_add_ps(fiy1,ty);
1870 fiz1 = _mm_add_ps(fiz1,tz);
1872 fjx0 = _mm_add_ps(fjx0,tx);
1873 fjy0 = _mm_add_ps(fjy0,ty);
1874 fjz0 = _mm_add_ps(fjz0,tz);
1878 /**************************
1879 * CALCULATE INTERACTIONS *
1880 **************************/
1882 if (gmx_mm_any_lt(rsq11,rcutoff2))
1885 r11 = _mm_mul_ps(rsq11,rinv11);
1887 /* EWALD ELECTROSTATICS */
1889 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1890 ewrt = _mm_mul_ps(r11,ewtabscale);
1891 ewitab = _mm_cvttps_epi32(ewrt);
1892 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1893 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1894 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1896 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1897 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1899 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1903 fscal = _mm_and_ps(fscal,cutoff_mask);
1905 /* Calculate temporary vectorial force */
1906 tx = _mm_mul_ps(fscal,dx11);
1907 ty = _mm_mul_ps(fscal,dy11);
1908 tz = _mm_mul_ps(fscal,dz11);
1910 /* Update vectorial force */
1911 fix1 = _mm_add_ps(fix1,tx);
1912 fiy1 = _mm_add_ps(fiy1,ty);
1913 fiz1 = _mm_add_ps(fiz1,tz);
1915 fjx1 = _mm_add_ps(fjx1,tx);
1916 fjy1 = _mm_add_ps(fjy1,ty);
1917 fjz1 = _mm_add_ps(fjz1,tz);
1921 /**************************
1922 * CALCULATE INTERACTIONS *
1923 **************************/
1925 if (gmx_mm_any_lt(rsq12,rcutoff2))
1928 r12 = _mm_mul_ps(rsq12,rinv12);
1930 /* EWALD ELECTROSTATICS */
1932 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1933 ewrt = _mm_mul_ps(r12,ewtabscale);
1934 ewitab = _mm_cvttps_epi32(ewrt);
1935 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1936 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1937 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1939 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1940 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1942 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1946 fscal = _mm_and_ps(fscal,cutoff_mask);
1948 /* Calculate temporary vectorial force */
1949 tx = _mm_mul_ps(fscal,dx12);
1950 ty = _mm_mul_ps(fscal,dy12);
1951 tz = _mm_mul_ps(fscal,dz12);
1953 /* Update vectorial force */
1954 fix1 = _mm_add_ps(fix1,tx);
1955 fiy1 = _mm_add_ps(fiy1,ty);
1956 fiz1 = _mm_add_ps(fiz1,tz);
1958 fjx2 = _mm_add_ps(fjx2,tx);
1959 fjy2 = _mm_add_ps(fjy2,ty);
1960 fjz2 = _mm_add_ps(fjz2,tz);
1964 /**************************
1965 * CALCULATE INTERACTIONS *
1966 **************************/
1968 if (gmx_mm_any_lt(rsq20,rcutoff2))
1971 r20 = _mm_mul_ps(rsq20,rinv20);
1973 /* EWALD ELECTROSTATICS */
1975 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1976 ewrt = _mm_mul_ps(r20,ewtabscale);
1977 ewitab = _mm_cvttps_epi32(ewrt);
1978 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1979 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1980 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1982 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1983 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1985 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1989 fscal = _mm_and_ps(fscal,cutoff_mask);
1991 /* Calculate temporary vectorial force */
1992 tx = _mm_mul_ps(fscal,dx20);
1993 ty = _mm_mul_ps(fscal,dy20);
1994 tz = _mm_mul_ps(fscal,dz20);
1996 /* Update vectorial force */
1997 fix2 = _mm_add_ps(fix2,tx);
1998 fiy2 = _mm_add_ps(fiy2,ty);
1999 fiz2 = _mm_add_ps(fiz2,tz);
2001 fjx0 = _mm_add_ps(fjx0,tx);
2002 fjy0 = _mm_add_ps(fjy0,ty);
2003 fjz0 = _mm_add_ps(fjz0,tz);
2007 /**************************
2008 * CALCULATE INTERACTIONS *
2009 **************************/
2011 if (gmx_mm_any_lt(rsq21,rcutoff2))
2014 r21 = _mm_mul_ps(rsq21,rinv21);
2016 /* EWALD ELECTROSTATICS */
2018 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2019 ewrt = _mm_mul_ps(r21,ewtabscale);
2020 ewitab = _mm_cvttps_epi32(ewrt);
2021 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2022 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2023 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2025 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2026 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2028 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2032 fscal = _mm_and_ps(fscal,cutoff_mask);
2034 /* Calculate temporary vectorial force */
2035 tx = _mm_mul_ps(fscal,dx21);
2036 ty = _mm_mul_ps(fscal,dy21);
2037 tz = _mm_mul_ps(fscal,dz21);
2039 /* Update vectorial force */
2040 fix2 = _mm_add_ps(fix2,tx);
2041 fiy2 = _mm_add_ps(fiy2,ty);
2042 fiz2 = _mm_add_ps(fiz2,tz);
2044 fjx1 = _mm_add_ps(fjx1,tx);
2045 fjy1 = _mm_add_ps(fjy1,ty);
2046 fjz1 = _mm_add_ps(fjz1,tz);
2050 /**************************
2051 * CALCULATE INTERACTIONS *
2052 **************************/
2054 if (gmx_mm_any_lt(rsq22,rcutoff2))
2057 r22 = _mm_mul_ps(rsq22,rinv22);
2059 /* EWALD ELECTROSTATICS */
2061 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2062 ewrt = _mm_mul_ps(r22,ewtabscale);
2063 ewitab = _mm_cvttps_epi32(ewrt);
2064 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2065 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2066 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2068 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2069 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2071 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2075 fscal = _mm_and_ps(fscal,cutoff_mask);
2077 /* Calculate temporary vectorial force */
2078 tx = _mm_mul_ps(fscal,dx22);
2079 ty = _mm_mul_ps(fscal,dy22);
2080 tz = _mm_mul_ps(fscal,dz22);
2082 /* Update vectorial force */
2083 fix2 = _mm_add_ps(fix2,tx);
2084 fiy2 = _mm_add_ps(fiy2,ty);
2085 fiz2 = _mm_add_ps(fiz2,tz);
2087 fjx2 = _mm_add_ps(fjx2,tx);
2088 fjy2 = _mm_add_ps(fjy2,ty);
2089 fjz2 = _mm_add_ps(fjz2,tz);
2093 fjptrA = f+j_coord_offsetA;
2094 fjptrB = f+j_coord_offsetB;
2095 fjptrC = f+j_coord_offsetC;
2096 fjptrD = f+j_coord_offsetD;
2098 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2099 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2101 /* Inner loop uses 358 flops */
2104 if(jidx<j_index_end)
2107 /* Get j neighbor index, and coordinate index */
2108 jnrlistA = jjnr[jidx];
2109 jnrlistB = jjnr[jidx+1];
2110 jnrlistC = jjnr[jidx+2];
2111 jnrlistD = jjnr[jidx+3];
2112 /* Sign of each element will be negative for non-real atoms.
2113 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2114 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2116 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2117 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2118 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2119 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2120 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2121 j_coord_offsetA = DIM*jnrA;
2122 j_coord_offsetB = DIM*jnrB;
2123 j_coord_offsetC = DIM*jnrC;
2124 j_coord_offsetD = DIM*jnrD;
2126 /* load j atom coordinates */
2127 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2128 x+j_coord_offsetC,x+j_coord_offsetD,
2129 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2131 /* Calculate displacement vector */
2132 dx00 = _mm_sub_ps(ix0,jx0);
2133 dy00 = _mm_sub_ps(iy0,jy0);
2134 dz00 = _mm_sub_ps(iz0,jz0);
2135 dx01 = _mm_sub_ps(ix0,jx1);
2136 dy01 = _mm_sub_ps(iy0,jy1);
2137 dz01 = _mm_sub_ps(iz0,jz1);
2138 dx02 = _mm_sub_ps(ix0,jx2);
2139 dy02 = _mm_sub_ps(iy0,jy2);
2140 dz02 = _mm_sub_ps(iz0,jz2);
2141 dx10 = _mm_sub_ps(ix1,jx0);
2142 dy10 = _mm_sub_ps(iy1,jy0);
2143 dz10 = _mm_sub_ps(iz1,jz0);
2144 dx11 = _mm_sub_ps(ix1,jx1);
2145 dy11 = _mm_sub_ps(iy1,jy1);
2146 dz11 = _mm_sub_ps(iz1,jz1);
2147 dx12 = _mm_sub_ps(ix1,jx2);
2148 dy12 = _mm_sub_ps(iy1,jy2);
2149 dz12 = _mm_sub_ps(iz1,jz2);
2150 dx20 = _mm_sub_ps(ix2,jx0);
2151 dy20 = _mm_sub_ps(iy2,jy0);
2152 dz20 = _mm_sub_ps(iz2,jz0);
2153 dx21 = _mm_sub_ps(ix2,jx1);
2154 dy21 = _mm_sub_ps(iy2,jy1);
2155 dz21 = _mm_sub_ps(iz2,jz1);
2156 dx22 = _mm_sub_ps(ix2,jx2);
2157 dy22 = _mm_sub_ps(iy2,jy2);
2158 dz22 = _mm_sub_ps(iz2,jz2);
2160 /* Calculate squared distance and things based on it */
2161 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2162 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2163 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2164 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2165 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2166 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2167 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2168 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2169 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2171 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2172 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2173 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2174 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2175 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2176 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2177 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2178 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2179 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2181 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2182 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2183 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2184 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2185 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2186 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2187 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2188 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2189 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2191 fjx0 = _mm_setzero_ps();
2192 fjy0 = _mm_setzero_ps();
2193 fjz0 = _mm_setzero_ps();
2194 fjx1 = _mm_setzero_ps();
2195 fjy1 = _mm_setzero_ps();
2196 fjz1 = _mm_setzero_ps();
2197 fjx2 = _mm_setzero_ps();
2198 fjy2 = _mm_setzero_ps();
2199 fjz2 = _mm_setzero_ps();
2201 /**************************
2202 * CALCULATE INTERACTIONS *
2203 **************************/
2205 if (gmx_mm_any_lt(rsq00,rcutoff2))
2208 r00 = _mm_mul_ps(rsq00,rinv00);
2209 r00 = _mm_andnot_ps(dummy_mask,r00);
2211 /* EWALD ELECTROSTATICS */
2213 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2214 ewrt = _mm_mul_ps(r00,ewtabscale);
2215 ewitab = _mm_cvttps_epi32(ewrt);
2216 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2217 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2218 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2220 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2221 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2223 /* LENNARD-JONES DISPERSION/REPULSION */
2225 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2226 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2228 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2230 fscal = _mm_add_ps(felec,fvdw);
2232 fscal = _mm_and_ps(fscal,cutoff_mask);
2234 fscal = _mm_andnot_ps(dummy_mask,fscal);
2236 /* Calculate temporary vectorial force */
2237 tx = _mm_mul_ps(fscal,dx00);
2238 ty = _mm_mul_ps(fscal,dy00);
2239 tz = _mm_mul_ps(fscal,dz00);
2241 /* Update vectorial force */
2242 fix0 = _mm_add_ps(fix0,tx);
2243 fiy0 = _mm_add_ps(fiy0,ty);
2244 fiz0 = _mm_add_ps(fiz0,tz);
2246 fjx0 = _mm_add_ps(fjx0,tx);
2247 fjy0 = _mm_add_ps(fjy0,ty);
2248 fjz0 = _mm_add_ps(fjz0,tz);
2252 /**************************
2253 * CALCULATE INTERACTIONS *
2254 **************************/
2256 if (gmx_mm_any_lt(rsq01,rcutoff2))
2259 r01 = _mm_mul_ps(rsq01,rinv01);
2260 r01 = _mm_andnot_ps(dummy_mask,r01);
2262 /* EWALD ELECTROSTATICS */
2264 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2265 ewrt = _mm_mul_ps(r01,ewtabscale);
2266 ewitab = _mm_cvttps_epi32(ewrt);
2267 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2268 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2269 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2271 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2272 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2274 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2278 fscal = _mm_and_ps(fscal,cutoff_mask);
2280 fscal = _mm_andnot_ps(dummy_mask,fscal);
2282 /* Calculate temporary vectorial force */
2283 tx = _mm_mul_ps(fscal,dx01);
2284 ty = _mm_mul_ps(fscal,dy01);
2285 tz = _mm_mul_ps(fscal,dz01);
2287 /* Update vectorial force */
2288 fix0 = _mm_add_ps(fix0,tx);
2289 fiy0 = _mm_add_ps(fiy0,ty);
2290 fiz0 = _mm_add_ps(fiz0,tz);
2292 fjx1 = _mm_add_ps(fjx1,tx);
2293 fjy1 = _mm_add_ps(fjy1,ty);
2294 fjz1 = _mm_add_ps(fjz1,tz);
2298 /**************************
2299 * CALCULATE INTERACTIONS *
2300 **************************/
2302 if (gmx_mm_any_lt(rsq02,rcutoff2))
2305 r02 = _mm_mul_ps(rsq02,rinv02);
2306 r02 = _mm_andnot_ps(dummy_mask,r02);
2308 /* EWALD ELECTROSTATICS */
2310 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2311 ewrt = _mm_mul_ps(r02,ewtabscale);
2312 ewitab = _mm_cvttps_epi32(ewrt);
2313 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2314 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2315 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2317 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2318 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2320 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2324 fscal = _mm_and_ps(fscal,cutoff_mask);
2326 fscal = _mm_andnot_ps(dummy_mask,fscal);
2328 /* Calculate temporary vectorial force */
2329 tx = _mm_mul_ps(fscal,dx02);
2330 ty = _mm_mul_ps(fscal,dy02);
2331 tz = _mm_mul_ps(fscal,dz02);
2333 /* Update vectorial force */
2334 fix0 = _mm_add_ps(fix0,tx);
2335 fiy0 = _mm_add_ps(fiy0,ty);
2336 fiz0 = _mm_add_ps(fiz0,tz);
2338 fjx2 = _mm_add_ps(fjx2,tx);
2339 fjy2 = _mm_add_ps(fjy2,ty);
2340 fjz2 = _mm_add_ps(fjz2,tz);
2344 /**************************
2345 * CALCULATE INTERACTIONS *
2346 **************************/
2348 if (gmx_mm_any_lt(rsq10,rcutoff2))
2351 r10 = _mm_mul_ps(rsq10,rinv10);
2352 r10 = _mm_andnot_ps(dummy_mask,r10);
2354 /* EWALD ELECTROSTATICS */
2356 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2357 ewrt = _mm_mul_ps(r10,ewtabscale);
2358 ewitab = _mm_cvttps_epi32(ewrt);
2359 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2360 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2361 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2363 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2364 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2366 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2370 fscal = _mm_and_ps(fscal,cutoff_mask);
2372 fscal = _mm_andnot_ps(dummy_mask,fscal);
2374 /* Calculate temporary vectorial force */
2375 tx = _mm_mul_ps(fscal,dx10);
2376 ty = _mm_mul_ps(fscal,dy10);
2377 tz = _mm_mul_ps(fscal,dz10);
2379 /* Update vectorial force */
2380 fix1 = _mm_add_ps(fix1,tx);
2381 fiy1 = _mm_add_ps(fiy1,ty);
2382 fiz1 = _mm_add_ps(fiz1,tz);
2384 fjx0 = _mm_add_ps(fjx0,tx);
2385 fjy0 = _mm_add_ps(fjy0,ty);
2386 fjz0 = _mm_add_ps(fjz0,tz);
2390 /**************************
2391 * CALCULATE INTERACTIONS *
2392 **************************/
2394 if (gmx_mm_any_lt(rsq11,rcutoff2))
2397 r11 = _mm_mul_ps(rsq11,rinv11);
2398 r11 = _mm_andnot_ps(dummy_mask,r11);
2400 /* EWALD ELECTROSTATICS */
2402 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2403 ewrt = _mm_mul_ps(r11,ewtabscale);
2404 ewitab = _mm_cvttps_epi32(ewrt);
2405 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2406 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2407 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2409 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2410 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2412 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2416 fscal = _mm_and_ps(fscal,cutoff_mask);
2418 fscal = _mm_andnot_ps(dummy_mask,fscal);
2420 /* Calculate temporary vectorial force */
2421 tx = _mm_mul_ps(fscal,dx11);
2422 ty = _mm_mul_ps(fscal,dy11);
2423 tz = _mm_mul_ps(fscal,dz11);
2425 /* Update vectorial force */
2426 fix1 = _mm_add_ps(fix1,tx);
2427 fiy1 = _mm_add_ps(fiy1,ty);
2428 fiz1 = _mm_add_ps(fiz1,tz);
2430 fjx1 = _mm_add_ps(fjx1,tx);
2431 fjy1 = _mm_add_ps(fjy1,ty);
2432 fjz1 = _mm_add_ps(fjz1,tz);
2436 /**************************
2437 * CALCULATE INTERACTIONS *
2438 **************************/
2440 if (gmx_mm_any_lt(rsq12,rcutoff2))
2443 r12 = _mm_mul_ps(rsq12,rinv12);
2444 r12 = _mm_andnot_ps(dummy_mask,r12);
2446 /* EWALD ELECTROSTATICS */
2448 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2449 ewrt = _mm_mul_ps(r12,ewtabscale);
2450 ewitab = _mm_cvttps_epi32(ewrt);
2451 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2452 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2453 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2455 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2456 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2458 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2462 fscal = _mm_and_ps(fscal,cutoff_mask);
2464 fscal = _mm_andnot_ps(dummy_mask,fscal);
2466 /* Calculate temporary vectorial force */
2467 tx = _mm_mul_ps(fscal,dx12);
2468 ty = _mm_mul_ps(fscal,dy12);
2469 tz = _mm_mul_ps(fscal,dz12);
2471 /* Update vectorial force */
2472 fix1 = _mm_add_ps(fix1,tx);
2473 fiy1 = _mm_add_ps(fiy1,ty);
2474 fiz1 = _mm_add_ps(fiz1,tz);
2476 fjx2 = _mm_add_ps(fjx2,tx);
2477 fjy2 = _mm_add_ps(fjy2,ty);
2478 fjz2 = _mm_add_ps(fjz2,tz);
2482 /**************************
2483 * CALCULATE INTERACTIONS *
2484 **************************/
2486 if (gmx_mm_any_lt(rsq20,rcutoff2))
2489 r20 = _mm_mul_ps(rsq20,rinv20);
2490 r20 = _mm_andnot_ps(dummy_mask,r20);
2492 /* EWALD ELECTROSTATICS */
2494 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2495 ewrt = _mm_mul_ps(r20,ewtabscale);
2496 ewitab = _mm_cvttps_epi32(ewrt);
2497 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2498 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2499 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2501 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2502 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2504 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2508 fscal = _mm_and_ps(fscal,cutoff_mask);
2510 fscal = _mm_andnot_ps(dummy_mask,fscal);
2512 /* Calculate temporary vectorial force */
2513 tx = _mm_mul_ps(fscal,dx20);
2514 ty = _mm_mul_ps(fscal,dy20);
2515 tz = _mm_mul_ps(fscal,dz20);
2517 /* Update vectorial force */
2518 fix2 = _mm_add_ps(fix2,tx);
2519 fiy2 = _mm_add_ps(fiy2,ty);
2520 fiz2 = _mm_add_ps(fiz2,tz);
2522 fjx0 = _mm_add_ps(fjx0,tx);
2523 fjy0 = _mm_add_ps(fjy0,ty);
2524 fjz0 = _mm_add_ps(fjz0,tz);
2528 /**************************
2529 * CALCULATE INTERACTIONS *
2530 **************************/
2532 if (gmx_mm_any_lt(rsq21,rcutoff2))
2535 r21 = _mm_mul_ps(rsq21,rinv21);
2536 r21 = _mm_andnot_ps(dummy_mask,r21);
2538 /* EWALD ELECTROSTATICS */
2540 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2541 ewrt = _mm_mul_ps(r21,ewtabscale);
2542 ewitab = _mm_cvttps_epi32(ewrt);
2543 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2544 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2545 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2547 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2548 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2550 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2554 fscal = _mm_and_ps(fscal,cutoff_mask);
2556 fscal = _mm_andnot_ps(dummy_mask,fscal);
2558 /* Calculate temporary vectorial force */
2559 tx = _mm_mul_ps(fscal,dx21);
2560 ty = _mm_mul_ps(fscal,dy21);
2561 tz = _mm_mul_ps(fscal,dz21);
2563 /* Update vectorial force */
2564 fix2 = _mm_add_ps(fix2,tx);
2565 fiy2 = _mm_add_ps(fiy2,ty);
2566 fiz2 = _mm_add_ps(fiz2,tz);
2568 fjx1 = _mm_add_ps(fjx1,tx);
2569 fjy1 = _mm_add_ps(fjy1,ty);
2570 fjz1 = _mm_add_ps(fjz1,tz);
2574 /**************************
2575 * CALCULATE INTERACTIONS *
2576 **************************/
2578 if (gmx_mm_any_lt(rsq22,rcutoff2))
2581 r22 = _mm_mul_ps(rsq22,rinv22);
2582 r22 = _mm_andnot_ps(dummy_mask,r22);
2584 /* EWALD ELECTROSTATICS */
2586 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2587 ewrt = _mm_mul_ps(r22,ewtabscale);
2588 ewitab = _mm_cvttps_epi32(ewrt);
2589 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2590 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2591 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2593 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2594 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2596 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2600 fscal = _mm_and_ps(fscal,cutoff_mask);
2602 fscal = _mm_andnot_ps(dummy_mask,fscal);
2604 /* Calculate temporary vectorial force */
2605 tx = _mm_mul_ps(fscal,dx22);
2606 ty = _mm_mul_ps(fscal,dy22);
2607 tz = _mm_mul_ps(fscal,dz22);
2609 /* Update vectorial force */
2610 fix2 = _mm_add_ps(fix2,tx);
2611 fiy2 = _mm_add_ps(fiy2,ty);
2612 fiz2 = _mm_add_ps(fiz2,tz);
2614 fjx2 = _mm_add_ps(fjx2,tx);
2615 fjy2 = _mm_add_ps(fjy2,ty);
2616 fjz2 = _mm_add_ps(fjz2,tz);
2620 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2621 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2622 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2623 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2625 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2626 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2628 /* Inner loop uses 367 flops */
2631 /* End of innermost loop */
2633 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2634 f+i_coord_offset,fshift+i_shift_offset);
2636 /* Increment number of inner iterations */
2637 inneriter += j_index_end - j_index_start;
2639 /* Outer loop uses 18 flops */
2642 /* Increment number of outer iterations */
2645 /* Update outer/inner flops */
2647 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);