2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
106 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
110 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
112 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114 __m128 dummy_mask,cutoff_mask;
115 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
116 __m128 one = _mm_set1_ps(1.0);
117 __m128 two = _mm_set1_ps(2.0);
123 jindex = nlist->jindex;
125 shiftidx = nlist->shift;
127 shiftvec = fr->shift_vec[0];
128 fshift = fr->fshift[0];
129 facel = _mm_set1_ps(fr->ic->epsfac);
130 charge = mdatoms->chargeA;
131 nvdwtype = fr->ntype;
133 vdwtype = mdatoms->typeA;
135 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
136 ewtab = fr->ic->tabq_coul_FDV0;
137 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
138 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
140 /* Setup water-specific parameters */
141 inr = nlist->iinr[0];
142 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
143 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
144 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
145 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
147 jq0 = _mm_set1_ps(charge[inr+0]);
148 jq1 = _mm_set1_ps(charge[inr+1]);
149 jq2 = _mm_set1_ps(charge[inr+2]);
150 vdwjidx0A = 2*vdwtype[inr+0];
151 qq00 = _mm_mul_ps(iq0,jq0);
152 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
153 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
154 qq01 = _mm_mul_ps(iq0,jq1);
155 qq02 = _mm_mul_ps(iq0,jq2);
156 qq10 = _mm_mul_ps(iq1,jq0);
157 qq11 = _mm_mul_ps(iq1,jq1);
158 qq12 = _mm_mul_ps(iq1,jq2);
159 qq20 = _mm_mul_ps(iq2,jq0);
160 qq21 = _mm_mul_ps(iq2,jq1);
161 qq22 = _mm_mul_ps(iq2,jq2);
163 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
164 rcutoff_scalar = fr->ic->rcoulomb;
165 rcutoff = _mm_set1_ps(rcutoff_scalar);
166 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
168 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
169 rvdw = _mm_set1_ps(fr->ic->rvdw);
171 /* Avoid stupid compiler warnings */
172 jnrA = jnrB = jnrC = jnrD = 0;
181 for(iidx=0;iidx<4*DIM;iidx++)
186 /* Start outer loop over neighborlists */
187 for(iidx=0; iidx<nri; iidx++)
189 /* Load shift vector for this list */
190 i_shift_offset = DIM*shiftidx[iidx];
192 /* Load limits for loop over neighbors */
193 j_index_start = jindex[iidx];
194 j_index_end = jindex[iidx+1];
196 /* Get outer coordinate index */
198 i_coord_offset = DIM*inr;
200 /* Load i particle coords and add shift vector */
201 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
202 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
204 fix0 = _mm_setzero_ps();
205 fiy0 = _mm_setzero_ps();
206 fiz0 = _mm_setzero_ps();
207 fix1 = _mm_setzero_ps();
208 fiy1 = _mm_setzero_ps();
209 fiz1 = _mm_setzero_ps();
210 fix2 = _mm_setzero_ps();
211 fiy2 = _mm_setzero_ps();
212 fiz2 = _mm_setzero_ps();
214 /* Reset potential sums */
215 velecsum = _mm_setzero_ps();
216 vvdwsum = _mm_setzero_ps();
218 /* Start inner kernel loop */
219 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
222 /* Get j neighbor index, and coordinate index */
227 j_coord_offsetA = DIM*jnrA;
228 j_coord_offsetB = DIM*jnrB;
229 j_coord_offsetC = DIM*jnrC;
230 j_coord_offsetD = DIM*jnrD;
232 /* load j atom coordinates */
233 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
234 x+j_coord_offsetC,x+j_coord_offsetD,
235 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
237 /* Calculate displacement vector */
238 dx00 = _mm_sub_ps(ix0,jx0);
239 dy00 = _mm_sub_ps(iy0,jy0);
240 dz00 = _mm_sub_ps(iz0,jz0);
241 dx01 = _mm_sub_ps(ix0,jx1);
242 dy01 = _mm_sub_ps(iy0,jy1);
243 dz01 = _mm_sub_ps(iz0,jz1);
244 dx02 = _mm_sub_ps(ix0,jx2);
245 dy02 = _mm_sub_ps(iy0,jy2);
246 dz02 = _mm_sub_ps(iz0,jz2);
247 dx10 = _mm_sub_ps(ix1,jx0);
248 dy10 = _mm_sub_ps(iy1,jy0);
249 dz10 = _mm_sub_ps(iz1,jz0);
250 dx11 = _mm_sub_ps(ix1,jx1);
251 dy11 = _mm_sub_ps(iy1,jy1);
252 dz11 = _mm_sub_ps(iz1,jz1);
253 dx12 = _mm_sub_ps(ix1,jx2);
254 dy12 = _mm_sub_ps(iy1,jy2);
255 dz12 = _mm_sub_ps(iz1,jz2);
256 dx20 = _mm_sub_ps(ix2,jx0);
257 dy20 = _mm_sub_ps(iy2,jy0);
258 dz20 = _mm_sub_ps(iz2,jz0);
259 dx21 = _mm_sub_ps(ix2,jx1);
260 dy21 = _mm_sub_ps(iy2,jy1);
261 dz21 = _mm_sub_ps(iz2,jz1);
262 dx22 = _mm_sub_ps(ix2,jx2);
263 dy22 = _mm_sub_ps(iy2,jy2);
264 dz22 = _mm_sub_ps(iz2,jz2);
266 /* Calculate squared distance and things based on it */
267 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
268 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
269 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
270 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
271 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
272 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
273 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
274 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
275 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
277 rinv00 = sse41_invsqrt_f(rsq00);
278 rinv01 = sse41_invsqrt_f(rsq01);
279 rinv02 = sse41_invsqrt_f(rsq02);
280 rinv10 = sse41_invsqrt_f(rsq10);
281 rinv11 = sse41_invsqrt_f(rsq11);
282 rinv12 = sse41_invsqrt_f(rsq12);
283 rinv20 = sse41_invsqrt_f(rsq20);
284 rinv21 = sse41_invsqrt_f(rsq21);
285 rinv22 = sse41_invsqrt_f(rsq22);
287 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
288 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
289 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
290 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
291 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
292 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
293 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
294 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
295 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
297 fjx0 = _mm_setzero_ps();
298 fjy0 = _mm_setzero_ps();
299 fjz0 = _mm_setzero_ps();
300 fjx1 = _mm_setzero_ps();
301 fjy1 = _mm_setzero_ps();
302 fjz1 = _mm_setzero_ps();
303 fjx2 = _mm_setzero_ps();
304 fjy2 = _mm_setzero_ps();
305 fjz2 = _mm_setzero_ps();
307 /**************************
308 * CALCULATE INTERACTIONS *
309 **************************/
311 if (gmx_mm_any_lt(rsq00,rcutoff2))
314 r00 = _mm_mul_ps(rsq00,rinv00);
316 /* EWALD ELECTROSTATICS */
318 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
319 ewrt = _mm_mul_ps(r00,ewtabscale);
320 ewitab = _mm_cvttps_epi32(ewrt);
321 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
322 ewitab = _mm_slli_epi32(ewitab,2);
323 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
324 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
325 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
326 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
327 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
328 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
329 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
330 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
331 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
333 /* LENNARD-JONES DISPERSION/REPULSION */
335 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
336 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
337 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
338 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
339 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
340 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
342 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
344 /* Update potential sum for this i atom from the interaction with this j atom. */
345 velec = _mm_and_ps(velec,cutoff_mask);
346 velecsum = _mm_add_ps(velecsum,velec);
347 vvdw = _mm_and_ps(vvdw,cutoff_mask);
348 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
350 fscal = _mm_add_ps(felec,fvdw);
352 fscal = _mm_and_ps(fscal,cutoff_mask);
354 /* Calculate temporary vectorial force */
355 tx = _mm_mul_ps(fscal,dx00);
356 ty = _mm_mul_ps(fscal,dy00);
357 tz = _mm_mul_ps(fscal,dz00);
359 /* Update vectorial force */
360 fix0 = _mm_add_ps(fix0,tx);
361 fiy0 = _mm_add_ps(fiy0,ty);
362 fiz0 = _mm_add_ps(fiz0,tz);
364 fjx0 = _mm_add_ps(fjx0,tx);
365 fjy0 = _mm_add_ps(fjy0,ty);
366 fjz0 = _mm_add_ps(fjz0,tz);
370 /**************************
371 * CALCULATE INTERACTIONS *
372 **************************/
374 if (gmx_mm_any_lt(rsq01,rcutoff2))
377 r01 = _mm_mul_ps(rsq01,rinv01);
379 /* EWALD ELECTROSTATICS */
381 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
382 ewrt = _mm_mul_ps(r01,ewtabscale);
383 ewitab = _mm_cvttps_epi32(ewrt);
384 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
385 ewitab = _mm_slli_epi32(ewitab,2);
386 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
387 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
388 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
389 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
390 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
391 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
392 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
393 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
394 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
396 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
398 /* Update potential sum for this i atom from the interaction with this j atom. */
399 velec = _mm_and_ps(velec,cutoff_mask);
400 velecsum = _mm_add_ps(velecsum,velec);
404 fscal = _mm_and_ps(fscal,cutoff_mask);
406 /* Calculate temporary vectorial force */
407 tx = _mm_mul_ps(fscal,dx01);
408 ty = _mm_mul_ps(fscal,dy01);
409 tz = _mm_mul_ps(fscal,dz01);
411 /* Update vectorial force */
412 fix0 = _mm_add_ps(fix0,tx);
413 fiy0 = _mm_add_ps(fiy0,ty);
414 fiz0 = _mm_add_ps(fiz0,tz);
416 fjx1 = _mm_add_ps(fjx1,tx);
417 fjy1 = _mm_add_ps(fjy1,ty);
418 fjz1 = _mm_add_ps(fjz1,tz);
422 /**************************
423 * CALCULATE INTERACTIONS *
424 **************************/
426 if (gmx_mm_any_lt(rsq02,rcutoff2))
429 r02 = _mm_mul_ps(rsq02,rinv02);
431 /* EWALD ELECTROSTATICS */
433 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
434 ewrt = _mm_mul_ps(r02,ewtabscale);
435 ewitab = _mm_cvttps_epi32(ewrt);
436 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
437 ewitab = _mm_slli_epi32(ewitab,2);
438 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
439 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
440 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
441 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
442 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
443 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
444 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
445 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
446 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
448 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
450 /* Update potential sum for this i atom from the interaction with this j atom. */
451 velec = _mm_and_ps(velec,cutoff_mask);
452 velecsum = _mm_add_ps(velecsum,velec);
456 fscal = _mm_and_ps(fscal,cutoff_mask);
458 /* Calculate temporary vectorial force */
459 tx = _mm_mul_ps(fscal,dx02);
460 ty = _mm_mul_ps(fscal,dy02);
461 tz = _mm_mul_ps(fscal,dz02);
463 /* Update vectorial force */
464 fix0 = _mm_add_ps(fix0,tx);
465 fiy0 = _mm_add_ps(fiy0,ty);
466 fiz0 = _mm_add_ps(fiz0,tz);
468 fjx2 = _mm_add_ps(fjx2,tx);
469 fjy2 = _mm_add_ps(fjy2,ty);
470 fjz2 = _mm_add_ps(fjz2,tz);
474 /**************************
475 * CALCULATE INTERACTIONS *
476 **************************/
478 if (gmx_mm_any_lt(rsq10,rcutoff2))
481 r10 = _mm_mul_ps(rsq10,rinv10);
483 /* EWALD ELECTROSTATICS */
485 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
486 ewrt = _mm_mul_ps(r10,ewtabscale);
487 ewitab = _mm_cvttps_epi32(ewrt);
488 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
489 ewitab = _mm_slli_epi32(ewitab,2);
490 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
491 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
492 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
493 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
494 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
495 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
496 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
497 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
498 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
500 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
502 /* Update potential sum for this i atom from the interaction with this j atom. */
503 velec = _mm_and_ps(velec,cutoff_mask);
504 velecsum = _mm_add_ps(velecsum,velec);
508 fscal = _mm_and_ps(fscal,cutoff_mask);
510 /* Calculate temporary vectorial force */
511 tx = _mm_mul_ps(fscal,dx10);
512 ty = _mm_mul_ps(fscal,dy10);
513 tz = _mm_mul_ps(fscal,dz10);
515 /* Update vectorial force */
516 fix1 = _mm_add_ps(fix1,tx);
517 fiy1 = _mm_add_ps(fiy1,ty);
518 fiz1 = _mm_add_ps(fiz1,tz);
520 fjx0 = _mm_add_ps(fjx0,tx);
521 fjy0 = _mm_add_ps(fjy0,ty);
522 fjz0 = _mm_add_ps(fjz0,tz);
526 /**************************
527 * CALCULATE INTERACTIONS *
528 **************************/
530 if (gmx_mm_any_lt(rsq11,rcutoff2))
533 r11 = _mm_mul_ps(rsq11,rinv11);
535 /* EWALD ELECTROSTATICS */
537 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
538 ewrt = _mm_mul_ps(r11,ewtabscale);
539 ewitab = _mm_cvttps_epi32(ewrt);
540 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
541 ewitab = _mm_slli_epi32(ewitab,2);
542 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
543 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
544 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
545 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
546 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
547 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
548 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
549 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
550 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
552 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
554 /* Update potential sum for this i atom from the interaction with this j atom. */
555 velec = _mm_and_ps(velec,cutoff_mask);
556 velecsum = _mm_add_ps(velecsum,velec);
560 fscal = _mm_and_ps(fscal,cutoff_mask);
562 /* Calculate temporary vectorial force */
563 tx = _mm_mul_ps(fscal,dx11);
564 ty = _mm_mul_ps(fscal,dy11);
565 tz = _mm_mul_ps(fscal,dz11);
567 /* Update vectorial force */
568 fix1 = _mm_add_ps(fix1,tx);
569 fiy1 = _mm_add_ps(fiy1,ty);
570 fiz1 = _mm_add_ps(fiz1,tz);
572 fjx1 = _mm_add_ps(fjx1,tx);
573 fjy1 = _mm_add_ps(fjy1,ty);
574 fjz1 = _mm_add_ps(fjz1,tz);
578 /**************************
579 * CALCULATE INTERACTIONS *
580 **************************/
582 if (gmx_mm_any_lt(rsq12,rcutoff2))
585 r12 = _mm_mul_ps(rsq12,rinv12);
587 /* EWALD ELECTROSTATICS */
589 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
590 ewrt = _mm_mul_ps(r12,ewtabscale);
591 ewitab = _mm_cvttps_epi32(ewrt);
592 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
593 ewitab = _mm_slli_epi32(ewitab,2);
594 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
595 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
596 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
597 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
598 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
599 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
600 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
601 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
602 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
604 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
606 /* Update potential sum for this i atom from the interaction with this j atom. */
607 velec = _mm_and_ps(velec,cutoff_mask);
608 velecsum = _mm_add_ps(velecsum,velec);
612 fscal = _mm_and_ps(fscal,cutoff_mask);
614 /* Calculate temporary vectorial force */
615 tx = _mm_mul_ps(fscal,dx12);
616 ty = _mm_mul_ps(fscal,dy12);
617 tz = _mm_mul_ps(fscal,dz12);
619 /* Update vectorial force */
620 fix1 = _mm_add_ps(fix1,tx);
621 fiy1 = _mm_add_ps(fiy1,ty);
622 fiz1 = _mm_add_ps(fiz1,tz);
624 fjx2 = _mm_add_ps(fjx2,tx);
625 fjy2 = _mm_add_ps(fjy2,ty);
626 fjz2 = _mm_add_ps(fjz2,tz);
630 /**************************
631 * CALCULATE INTERACTIONS *
632 **************************/
634 if (gmx_mm_any_lt(rsq20,rcutoff2))
637 r20 = _mm_mul_ps(rsq20,rinv20);
639 /* EWALD ELECTROSTATICS */
641 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
642 ewrt = _mm_mul_ps(r20,ewtabscale);
643 ewitab = _mm_cvttps_epi32(ewrt);
644 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
645 ewitab = _mm_slli_epi32(ewitab,2);
646 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
647 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
648 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
649 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
650 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
651 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
652 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
653 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
654 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
656 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
658 /* Update potential sum for this i atom from the interaction with this j atom. */
659 velec = _mm_and_ps(velec,cutoff_mask);
660 velecsum = _mm_add_ps(velecsum,velec);
664 fscal = _mm_and_ps(fscal,cutoff_mask);
666 /* Calculate temporary vectorial force */
667 tx = _mm_mul_ps(fscal,dx20);
668 ty = _mm_mul_ps(fscal,dy20);
669 tz = _mm_mul_ps(fscal,dz20);
671 /* Update vectorial force */
672 fix2 = _mm_add_ps(fix2,tx);
673 fiy2 = _mm_add_ps(fiy2,ty);
674 fiz2 = _mm_add_ps(fiz2,tz);
676 fjx0 = _mm_add_ps(fjx0,tx);
677 fjy0 = _mm_add_ps(fjy0,ty);
678 fjz0 = _mm_add_ps(fjz0,tz);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 if (gmx_mm_any_lt(rsq21,rcutoff2))
689 r21 = _mm_mul_ps(rsq21,rinv21);
691 /* EWALD ELECTROSTATICS */
693 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
694 ewrt = _mm_mul_ps(r21,ewtabscale);
695 ewitab = _mm_cvttps_epi32(ewrt);
696 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
697 ewitab = _mm_slli_epi32(ewitab,2);
698 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
699 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
700 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
701 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
702 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
703 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
704 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
705 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
706 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
708 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
710 /* Update potential sum for this i atom from the interaction with this j atom. */
711 velec = _mm_and_ps(velec,cutoff_mask);
712 velecsum = _mm_add_ps(velecsum,velec);
716 fscal = _mm_and_ps(fscal,cutoff_mask);
718 /* Calculate temporary vectorial force */
719 tx = _mm_mul_ps(fscal,dx21);
720 ty = _mm_mul_ps(fscal,dy21);
721 tz = _mm_mul_ps(fscal,dz21);
723 /* Update vectorial force */
724 fix2 = _mm_add_ps(fix2,tx);
725 fiy2 = _mm_add_ps(fiy2,ty);
726 fiz2 = _mm_add_ps(fiz2,tz);
728 fjx1 = _mm_add_ps(fjx1,tx);
729 fjy1 = _mm_add_ps(fjy1,ty);
730 fjz1 = _mm_add_ps(fjz1,tz);
734 /**************************
735 * CALCULATE INTERACTIONS *
736 **************************/
738 if (gmx_mm_any_lt(rsq22,rcutoff2))
741 r22 = _mm_mul_ps(rsq22,rinv22);
743 /* EWALD ELECTROSTATICS */
745 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
746 ewrt = _mm_mul_ps(r22,ewtabscale);
747 ewitab = _mm_cvttps_epi32(ewrt);
748 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
749 ewitab = _mm_slli_epi32(ewitab,2);
750 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
751 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
752 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
753 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
754 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
755 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
756 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
757 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
758 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
760 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
762 /* Update potential sum for this i atom from the interaction with this j atom. */
763 velec = _mm_and_ps(velec,cutoff_mask);
764 velecsum = _mm_add_ps(velecsum,velec);
768 fscal = _mm_and_ps(fscal,cutoff_mask);
770 /* Calculate temporary vectorial force */
771 tx = _mm_mul_ps(fscal,dx22);
772 ty = _mm_mul_ps(fscal,dy22);
773 tz = _mm_mul_ps(fscal,dz22);
775 /* Update vectorial force */
776 fix2 = _mm_add_ps(fix2,tx);
777 fiy2 = _mm_add_ps(fiy2,ty);
778 fiz2 = _mm_add_ps(fiz2,tz);
780 fjx2 = _mm_add_ps(fjx2,tx);
781 fjy2 = _mm_add_ps(fjy2,ty);
782 fjz2 = _mm_add_ps(fjz2,tz);
786 fjptrA = f+j_coord_offsetA;
787 fjptrB = f+j_coord_offsetB;
788 fjptrC = f+j_coord_offsetC;
789 fjptrD = f+j_coord_offsetD;
791 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
792 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
794 /* Inner loop uses 432 flops */
800 /* Get j neighbor index, and coordinate index */
801 jnrlistA = jjnr[jidx];
802 jnrlistB = jjnr[jidx+1];
803 jnrlistC = jjnr[jidx+2];
804 jnrlistD = jjnr[jidx+3];
805 /* Sign of each element will be negative for non-real atoms.
806 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
807 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
809 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
810 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
811 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
812 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
813 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
814 j_coord_offsetA = DIM*jnrA;
815 j_coord_offsetB = DIM*jnrB;
816 j_coord_offsetC = DIM*jnrC;
817 j_coord_offsetD = DIM*jnrD;
819 /* load j atom coordinates */
820 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
821 x+j_coord_offsetC,x+j_coord_offsetD,
822 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
824 /* Calculate displacement vector */
825 dx00 = _mm_sub_ps(ix0,jx0);
826 dy00 = _mm_sub_ps(iy0,jy0);
827 dz00 = _mm_sub_ps(iz0,jz0);
828 dx01 = _mm_sub_ps(ix0,jx1);
829 dy01 = _mm_sub_ps(iy0,jy1);
830 dz01 = _mm_sub_ps(iz0,jz1);
831 dx02 = _mm_sub_ps(ix0,jx2);
832 dy02 = _mm_sub_ps(iy0,jy2);
833 dz02 = _mm_sub_ps(iz0,jz2);
834 dx10 = _mm_sub_ps(ix1,jx0);
835 dy10 = _mm_sub_ps(iy1,jy0);
836 dz10 = _mm_sub_ps(iz1,jz0);
837 dx11 = _mm_sub_ps(ix1,jx1);
838 dy11 = _mm_sub_ps(iy1,jy1);
839 dz11 = _mm_sub_ps(iz1,jz1);
840 dx12 = _mm_sub_ps(ix1,jx2);
841 dy12 = _mm_sub_ps(iy1,jy2);
842 dz12 = _mm_sub_ps(iz1,jz2);
843 dx20 = _mm_sub_ps(ix2,jx0);
844 dy20 = _mm_sub_ps(iy2,jy0);
845 dz20 = _mm_sub_ps(iz2,jz0);
846 dx21 = _mm_sub_ps(ix2,jx1);
847 dy21 = _mm_sub_ps(iy2,jy1);
848 dz21 = _mm_sub_ps(iz2,jz1);
849 dx22 = _mm_sub_ps(ix2,jx2);
850 dy22 = _mm_sub_ps(iy2,jy2);
851 dz22 = _mm_sub_ps(iz2,jz2);
853 /* Calculate squared distance and things based on it */
854 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
855 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
856 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
857 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
858 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
859 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
860 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
861 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
862 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
864 rinv00 = sse41_invsqrt_f(rsq00);
865 rinv01 = sse41_invsqrt_f(rsq01);
866 rinv02 = sse41_invsqrt_f(rsq02);
867 rinv10 = sse41_invsqrt_f(rsq10);
868 rinv11 = sse41_invsqrt_f(rsq11);
869 rinv12 = sse41_invsqrt_f(rsq12);
870 rinv20 = sse41_invsqrt_f(rsq20);
871 rinv21 = sse41_invsqrt_f(rsq21);
872 rinv22 = sse41_invsqrt_f(rsq22);
874 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
875 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
876 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
877 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
878 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
879 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
880 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
881 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
882 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
884 fjx0 = _mm_setzero_ps();
885 fjy0 = _mm_setzero_ps();
886 fjz0 = _mm_setzero_ps();
887 fjx1 = _mm_setzero_ps();
888 fjy1 = _mm_setzero_ps();
889 fjz1 = _mm_setzero_ps();
890 fjx2 = _mm_setzero_ps();
891 fjy2 = _mm_setzero_ps();
892 fjz2 = _mm_setzero_ps();
894 /**************************
895 * CALCULATE INTERACTIONS *
896 **************************/
898 if (gmx_mm_any_lt(rsq00,rcutoff2))
901 r00 = _mm_mul_ps(rsq00,rinv00);
902 r00 = _mm_andnot_ps(dummy_mask,r00);
904 /* EWALD ELECTROSTATICS */
906 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
907 ewrt = _mm_mul_ps(r00,ewtabscale);
908 ewitab = _mm_cvttps_epi32(ewrt);
909 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
910 ewitab = _mm_slli_epi32(ewitab,2);
911 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
912 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
913 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
914 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
915 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
916 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
917 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
918 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
919 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
921 /* LENNARD-JONES DISPERSION/REPULSION */
923 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
924 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
925 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
926 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
927 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
928 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
930 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
932 /* Update potential sum for this i atom from the interaction with this j atom. */
933 velec = _mm_and_ps(velec,cutoff_mask);
934 velec = _mm_andnot_ps(dummy_mask,velec);
935 velecsum = _mm_add_ps(velecsum,velec);
936 vvdw = _mm_and_ps(vvdw,cutoff_mask);
937 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
938 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
940 fscal = _mm_add_ps(felec,fvdw);
942 fscal = _mm_and_ps(fscal,cutoff_mask);
944 fscal = _mm_andnot_ps(dummy_mask,fscal);
946 /* Calculate temporary vectorial force */
947 tx = _mm_mul_ps(fscal,dx00);
948 ty = _mm_mul_ps(fscal,dy00);
949 tz = _mm_mul_ps(fscal,dz00);
951 /* Update vectorial force */
952 fix0 = _mm_add_ps(fix0,tx);
953 fiy0 = _mm_add_ps(fiy0,ty);
954 fiz0 = _mm_add_ps(fiz0,tz);
956 fjx0 = _mm_add_ps(fjx0,tx);
957 fjy0 = _mm_add_ps(fjy0,ty);
958 fjz0 = _mm_add_ps(fjz0,tz);
962 /**************************
963 * CALCULATE INTERACTIONS *
964 **************************/
966 if (gmx_mm_any_lt(rsq01,rcutoff2))
969 r01 = _mm_mul_ps(rsq01,rinv01);
970 r01 = _mm_andnot_ps(dummy_mask,r01);
972 /* EWALD ELECTROSTATICS */
974 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
975 ewrt = _mm_mul_ps(r01,ewtabscale);
976 ewitab = _mm_cvttps_epi32(ewrt);
977 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
978 ewitab = _mm_slli_epi32(ewitab,2);
979 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
980 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
981 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
982 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
983 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
984 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
985 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
986 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
987 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
989 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
991 /* Update potential sum for this i atom from the interaction with this j atom. */
992 velec = _mm_and_ps(velec,cutoff_mask);
993 velec = _mm_andnot_ps(dummy_mask,velec);
994 velecsum = _mm_add_ps(velecsum,velec);
998 fscal = _mm_and_ps(fscal,cutoff_mask);
1000 fscal = _mm_andnot_ps(dummy_mask,fscal);
1002 /* Calculate temporary vectorial force */
1003 tx = _mm_mul_ps(fscal,dx01);
1004 ty = _mm_mul_ps(fscal,dy01);
1005 tz = _mm_mul_ps(fscal,dz01);
1007 /* Update vectorial force */
1008 fix0 = _mm_add_ps(fix0,tx);
1009 fiy0 = _mm_add_ps(fiy0,ty);
1010 fiz0 = _mm_add_ps(fiz0,tz);
1012 fjx1 = _mm_add_ps(fjx1,tx);
1013 fjy1 = _mm_add_ps(fjy1,ty);
1014 fjz1 = _mm_add_ps(fjz1,tz);
1018 /**************************
1019 * CALCULATE INTERACTIONS *
1020 **************************/
1022 if (gmx_mm_any_lt(rsq02,rcutoff2))
1025 r02 = _mm_mul_ps(rsq02,rinv02);
1026 r02 = _mm_andnot_ps(dummy_mask,r02);
1028 /* EWALD ELECTROSTATICS */
1030 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1031 ewrt = _mm_mul_ps(r02,ewtabscale);
1032 ewitab = _mm_cvttps_epi32(ewrt);
1033 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1034 ewitab = _mm_slli_epi32(ewitab,2);
1035 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1036 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1037 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1038 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1039 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1040 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1041 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1042 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
1043 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1045 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1047 /* Update potential sum for this i atom from the interaction with this j atom. */
1048 velec = _mm_and_ps(velec,cutoff_mask);
1049 velec = _mm_andnot_ps(dummy_mask,velec);
1050 velecsum = _mm_add_ps(velecsum,velec);
1054 fscal = _mm_and_ps(fscal,cutoff_mask);
1056 fscal = _mm_andnot_ps(dummy_mask,fscal);
1058 /* Calculate temporary vectorial force */
1059 tx = _mm_mul_ps(fscal,dx02);
1060 ty = _mm_mul_ps(fscal,dy02);
1061 tz = _mm_mul_ps(fscal,dz02);
1063 /* Update vectorial force */
1064 fix0 = _mm_add_ps(fix0,tx);
1065 fiy0 = _mm_add_ps(fiy0,ty);
1066 fiz0 = _mm_add_ps(fiz0,tz);
1068 fjx2 = _mm_add_ps(fjx2,tx);
1069 fjy2 = _mm_add_ps(fjy2,ty);
1070 fjz2 = _mm_add_ps(fjz2,tz);
1074 /**************************
1075 * CALCULATE INTERACTIONS *
1076 **************************/
1078 if (gmx_mm_any_lt(rsq10,rcutoff2))
1081 r10 = _mm_mul_ps(rsq10,rinv10);
1082 r10 = _mm_andnot_ps(dummy_mask,r10);
1084 /* EWALD ELECTROSTATICS */
1086 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1087 ewrt = _mm_mul_ps(r10,ewtabscale);
1088 ewitab = _mm_cvttps_epi32(ewrt);
1089 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1090 ewitab = _mm_slli_epi32(ewitab,2);
1091 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1092 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1093 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1094 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1095 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1096 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1097 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1098 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1099 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1101 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1103 /* Update potential sum for this i atom from the interaction with this j atom. */
1104 velec = _mm_and_ps(velec,cutoff_mask);
1105 velec = _mm_andnot_ps(dummy_mask,velec);
1106 velecsum = _mm_add_ps(velecsum,velec);
1110 fscal = _mm_and_ps(fscal,cutoff_mask);
1112 fscal = _mm_andnot_ps(dummy_mask,fscal);
1114 /* Calculate temporary vectorial force */
1115 tx = _mm_mul_ps(fscal,dx10);
1116 ty = _mm_mul_ps(fscal,dy10);
1117 tz = _mm_mul_ps(fscal,dz10);
1119 /* Update vectorial force */
1120 fix1 = _mm_add_ps(fix1,tx);
1121 fiy1 = _mm_add_ps(fiy1,ty);
1122 fiz1 = _mm_add_ps(fiz1,tz);
1124 fjx0 = _mm_add_ps(fjx0,tx);
1125 fjy0 = _mm_add_ps(fjy0,ty);
1126 fjz0 = _mm_add_ps(fjz0,tz);
1130 /**************************
1131 * CALCULATE INTERACTIONS *
1132 **************************/
1134 if (gmx_mm_any_lt(rsq11,rcutoff2))
1137 r11 = _mm_mul_ps(rsq11,rinv11);
1138 r11 = _mm_andnot_ps(dummy_mask,r11);
1140 /* EWALD ELECTROSTATICS */
1142 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1143 ewrt = _mm_mul_ps(r11,ewtabscale);
1144 ewitab = _mm_cvttps_epi32(ewrt);
1145 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1146 ewitab = _mm_slli_epi32(ewitab,2);
1147 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1148 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1149 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1150 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1151 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1152 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1153 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1154 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1155 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1157 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1159 /* Update potential sum for this i atom from the interaction with this j atom. */
1160 velec = _mm_and_ps(velec,cutoff_mask);
1161 velec = _mm_andnot_ps(dummy_mask,velec);
1162 velecsum = _mm_add_ps(velecsum,velec);
1166 fscal = _mm_and_ps(fscal,cutoff_mask);
1168 fscal = _mm_andnot_ps(dummy_mask,fscal);
1170 /* Calculate temporary vectorial force */
1171 tx = _mm_mul_ps(fscal,dx11);
1172 ty = _mm_mul_ps(fscal,dy11);
1173 tz = _mm_mul_ps(fscal,dz11);
1175 /* Update vectorial force */
1176 fix1 = _mm_add_ps(fix1,tx);
1177 fiy1 = _mm_add_ps(fiy1,ty);
1178 fiz1 = _mm_add_ps(fiz1,tz);
1180 fjx1 = _mm_add_ps(fjx1,tx);
1181 fjy1 = _mm_add_ps(fjy1,ty);
1182 fjz1 = _mm_add_ps(fjz1,tz);
1186 /**************************
1187 * CALCULATE INTERACTIONS *
1188 **************************/
1190 if (gmx_mm_any_lt(rsq12,rcutoff2))
1193 r12 = _mm_mul_ps(rsq12,rinv12);
1194 r12 = _mm_andnot_ps(dummy_mask,r12);
1196 /* EWALD ELECTROSTATICS */
1198 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1199 ewrt = _mm_mul_ps(r12,ewtabscale);
1200 ewitab = _mm_cvttps_epi32(ewrt);
1201 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1202 ewitab = _mm_slli_epi32(ewitab,2);
1203 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1204 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1205 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1206 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1207 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1208 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1209 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1210 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1211 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1213 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1215 /* Update potential sum for this i atom from the interaction with this j atom. */
1216 velec = _mm_and_ps(velec,cutoff_mask);
1217 velec = _mm_andnot_ps(dummy_mask,velec);
1218 velecsum = _mm_add_ps(velecsum,velec);
1222 fscal = _mm_and_ps(fscal,cutoff_mask);
1224 fscal = _mm_andnot_ps(dummy_mask,fscal);
1226 /* Calculate temporary vectorial force */
1227 tx = _mm_mul_ps(fscal,dx12);
1228 ty = _mm_mul_ps(fscal,dy12);
1229 tz = _mm_mul_ps(fscal,dz12);
1231 /* Update vectorial force */
1232 fix1 = _mm_add_ps(fix1,tx);
1233 fiy1 = _mm_add_ps(fiy1,ty);
1234 fiz1 = _mm_add_ps(fiz1,tz);
1236 fjx2 = _mm_add_ps(fjx2,tx);
1237 fjy2 = _mm_add_ps(fjy2,ty);
1238 fjz2 = _mm_add_ps(fjz2,tz);
1242 /**************************
1243 * CALCULATE INTERACTIONS *
1244 **************************/
1246 if (gmx_mm_any_lt(rsq20,rcutoff2))
1249 r20 = _mm_mul_ps(rsq20,rinv20);
1250 r20 = _mm_andnot_ps(dummy_mask,r20);
1252 /* EWALD ELECTROSTATICS */
1254 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1255 ewrt = _mm_mul_ps(r20,ewtabscale);
1256 ewitab = _mm_cvttps_epi32(ewrt);
1257 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1258 ewitab = _mm_slli_epi32(ewitab,2);
1259 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1260 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1261 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1262 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1263 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1264 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1265 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1266 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1267 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1269 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1271 /* Update potential sum for this i atom from the interaction with this j atom. */
1272 velec = _mm_and_ps(velec,cutoff_mask);
1273 velec = _mm_andnot_ps(dummy_mask,velec);
1274 velecsum = _mm_add_ps(velecsum,velec);
1278 fscal = _mm_and_ps(fscal,cutoff_mask);
1280 fscal = _mm_andnot_ps(dummy_mask,fscal);
1282 /* Calculate temporary vectorial force */
1283 tx = _mm_mul_ps(fscal,dx20);
1284 ty = _mm_mul_ps(fscal,dy20);
1285 tz = _mm_mul_ps(fscal,dz20);
1287 /* Update vectorial force */
1288 fix2 = _mm_add_ps(fix2,tx);
1289 fiy2 = _mm_add_ps(fiy2,ty);
1290 fiz2 = _mm_add_ps(fiz2,tz);
1292 fjx0 = _mm_add_ps(fjx0,tx);
1293 fjy0 = _mm_add_ps(fjy0,ty);
1294 fjz0 = _mm_add_ps(fjz0,tz);
1298 /**************************
1299 * CALCULATE INTERACTIONS *
1300 **************************/
1302 if (gmx_mm_any_lt(rsq21,rcutoff2))
1305 r21 = _mm_mul_ps(rsq21,rinv21);
1306 r21 = _mm_andnot_ps(dummy_mask,r21);
1308 /* EWALD ELECTROSTATICS */
1310 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1311 ewrt = _mm_mul_ps(r21,ewtabscale);
1312 ewitab = _mm_cvttps_epi32(ewrt);
1313 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1314 ewitab = _mm_slli_epi32(ewitab,2);
1315 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1316 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1317 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1318 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1319 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1320 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1321 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1322 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1323 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1325 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1327 /* Update potential sum for this i atom from the interaction with this j atom. */
1328 velec = _mm_and_ps(velec,cutoff_mask);
1329 velec = _mm_andnot_ps(dummy_mask,velec);
1330 velecsum = _mm_add_ps(velecsum,velec);
1334 fscal = _mm_and_ps(fscal,cutoff_mask);
1336 fscal = _mm_andnot_ps(dummy_mask,fscal);
1338 /* Calculate temporary vectorial force */
1339 tx = _mm_mul_ps(fscal,dx21);
1340 ty = _mm_mul_ps(fscal,dy21);
1341 tz = _mm_mul_ps(fscal,dz21);
1343 /* Update vectorial force */
1344 fix2 = _mm_add_ps(fix2,tx);
1345 fiy2 = _mm_add_ps(fiy2,ty);
1346 fiz2 = _mm_add_ps(fiz2,tz);
1348 fjx1 = _mm_add_ps(fjx1,tx);
1349 fjy1 = _mm_add_ps(fjy1,ty);
1350 fjz1 = _mm_add_ps(fjz1,tz);
1354 /**************************
1355 * CALCULATE INTERACTIONS *
1356 **************************/
1358 if (gmx_mm_any_lt(rsq22,rcutoff2))
1361 r22 = _mm_mul_ps(rsq22,rinv22);
1362 r22 = _mm_andnot_ps(dummy_mask,r22);
1364 /* EWALD ELECTROSTATICS */
1366 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1367 ewrt = _mm_mul_ps(r22,ewtabscale);
1368 ewitab = _mm_cvttps_epi32(ewrt);
1369 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1370 ewitab = _mm_slli_epi32(ewitab,2);
1371 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1372 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1373 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1374 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1375 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1376 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1377 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1378 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1379 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1381 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1383 /* Update potential sum for this i atom from the interaction with this j atom. */
1384 velec = _mm_and_ps(velec,cutoff_mask);
1385 velec = _mm_andnot_ps(dummy_mask,velec);
1386 velecsum = _mm_add_ps(velecsum,velec);
1390 fscal = _mm_and_ps(fscal,cutoff_mask);
1392 fscal = _mm_andnot_ps(dummy_mask,fscal);
1394 /* Calculate temporary vectorial force */
1395 tx = _mm_mul_ps(fscal,dx22);
1396 ty = _mm_mul_ps(fscal,dy22);
1397 tz = _mm_mul_ps(fscal,dz22);
1399 /* Update vectorial force */
1400 fix2 = _mm_add_ps(fix2,tx);
1401 fiy2 = _mm_add_ps(fiy2,ty);
1402 fiz2 = _mm_add_ps(fiz2,tz);
1404 fjx2 = _mm_add_ps(fjx2,tx);
1405 fjy2 = _mm_add_ps(fjy2,ty);
1406 fjz2 = _mm_add_ps(fjz2,tz);
1410 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1411 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1412 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1413 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1415 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1416 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1418 /* Inner loop uses 441 flops */
1421 /* End of innermost loop */
1423 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1424 f+i_coord_offset,fshift+i_shift_offset);
1427 /* Update potential energies */
1428 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1429 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1431 /* Increment number of inner iterations */
1432 inneriter += j_index_end - j_index_start;
1434 /* Outer loop uses 20 flops */
1437 /* Increment number of outer iterations */
1440 /* Update outer/inner flops */
1442 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
1445 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single
1446 * Electrostatics interaction: Ewald
1447 * VdW interaction: LennardJones
1448 * Geometry: Water3-Water3
1449 * Calculate force/pot: Force
1452 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_single
1453 (t_nblist * gmx_restrict nlist,
1454 rvec * gmx_restrict xx,
1455 rvec * gmx_restrict ff,
1456 struct t_forcerec * gmx_restrict fr,
1457 t_mdatoms * gmx_restrict mdatoms,
1458 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1459 t_nrnb * gmx_restrict nrnb)
1461 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1462 * just 0 for non-waters.
1463 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1464 * jnr indices corresponding to data put in the four positions in the SIMD register.
1466 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1467 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1468 int jnrA,jnrB,jnrC,jnrD;
1469 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1470 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1471 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1472 real rcutoff_scalar;
1473 real *shiftvec,*fshift,*x,*f;
1474 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1475 real scratch[4*DIM];
1476 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1478 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1480 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1482 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1483 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1484 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1485 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1486 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1487 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1488 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1489 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1490 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1491 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1492 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1493 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1494 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1495 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1496 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1497 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1498 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1501 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1504 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1505 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1507 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1509 __m128 dummy_mask,cutoff_mask;
1510 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1511 __m128 one = _mm_set1_ps(1.0);
1512 __m128 two = _mm_set1_ps(2.0);
1518 jindex = nlist->jindex;
1520 shiftidx = nlist->shift;
1522 shiftvec = fr->shift_vec[0];
1523 fshift = fr->fshift[0];
1524 facel = _mm_set1_ps(fr->ic->epsfac);
1525 charge = mdatoms->chargeA;
1526 nvdwtype = fr->ntype;
1527 vdwparam = fr->nbfp;
1528 vdwtype = mdatoms->typeA;
1530 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1531 ewtab = fr->ic->tabq_coul_F;
1532 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1533 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1535 /* Setup water-specific parameters */
1536 inr = nlist->iinr[0];
1537 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1538 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1539 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1540 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1542 jq0 = _mm_set1_ps(charge[inr+0]);
1543 jq1 = _mm_set1_ps(charge[inr+1]);
1544 jq2 = _mm_set1_ps(charge[inr+2]);
1545 vdwjidx0A = 2*vdwtype[inr+0];
1546 qq00 = _mm_mul_ps(iq0,jq0);
1547 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1548 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1549 qq01 = _mm_mul_ps(iq0,jq1);
1550 qq02 = _mm_mul_ps(iq0,jq2);
1551 qq10 = _mm_mul_ps(iq1,jq0);
1552 qq11 = _mm_mul_ps(iq1,jq1);
1553 qq12 = _mm_mul_ps(iq1,jq2);
1554 qq20 = _mm_mul_ps(iq2,jq0);
1555 qq21 = _mm_mul_ps(iq2,jq1);
1556 qq22 = _mm_mul_ps(iq2,jq2);
1558 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1559 rcutoff_scalar = fr->ic->rcoulomb;
1560 rcutoff = _mm_set1_ps(rcutoff_scalar);
1561 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1563 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1564 rvdw = _mm_set1_ps(fr->ic->rvdw);
1566 /* Avoid stupid compiler warnings */
1567 jnrA = jnrB = jnrC = jnrD = 0;
1568 j_coord_offsetA = 0;
1569 j_coord_offsetB = 0;
1570 j_coord_offsetC = 0;
1571 j_coord_offsetD = 0;
1576 for(iidx=0;iidx<4*DIM;iidx++)
1578 scratch[iidx] = 0.0;
1581 /* Start outer loop over neighborlists */
1582 for(iidx=0; iidx<nri; iidx++)
1584 /* Load shift vector for this list */
1585 i_shift_offset = DIM*shiftidx[iidx];
1587 /* Load limits for loop over neighbors */
1588 j_index_start = jindex[iidx];
1589 j_index_end = jindex[iidx+1];
1591 /* Get outer coordinate index */
1593 i_coord_offset = DIM*inr;
1595 /* Load i particle coords and add shift vector */
1596 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1597 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1599 fix0 = _mm_setzero_ps();
1600 fiy0 = _mm_setzero_ps();
1601 fiz0 = _mm_setzero_ps();
1602 fix1 = _mm_setzero_ps();
1603 fiy1 = _mm_setzero_ps();
1604 fiz1 = _mm_setzero_ps();
1605 fix2 = _mm_setzero_ps();
1606 fiy2 = _mm_setzero_ps();
1607 fiz2 = _mm_setzero_ps();
1609 /* Start inner kernel loop */
1610 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1613 /* Get j neighbor index, and coordinate index */
1615 jnrB = jjnr[jidx+1];
1616 jnrC = jjnr[jidx+2];
1617 jnrD = jjnr[jidx+3];
1618 j_coord_offsetA = DIM*jnrA;
1619 j_coord_offsetB = DIM*jnrB;
1620 j_coord_offsetC = DIM*jnrC;
1621 j_coord_offsetD = DIM*jnrD;
1623 /* load j atom coordinates */
1624 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1625 x+j_coord_offsetC,x+j_coord_offsetD,
1626 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1628 /* Calculate displacement vector */
1629 dx00 = _mm_sub_ps(ix0,jx0);
1630 dy00 = _mm_sub_ps(iy0,jy0);
1631 dz00 = _mm_sub_ps(iz0,jz0);
1632 dx01 = _mm_sub_ps(ix0,jx1);
1633 dy01 = _mm_sub_ps(iy0,jy1);
1634 dz01 = _mm_sub_ps(iz0,jz1);
1635 dx02 = _mm_sub_ps(ix0,jx2);
1636 dy02 = _mm_sub_ps(iy0,jy2);
1637 dz02 = _mm_sub_ps(iz0,jz2);
1638 dx10 = _mm_sub_ps(ix1,jx0);
1639 dy10 = _mm_sub_ps(iy1,jy0);
1640 dz10 = _mm_sub_ps(iz1,jz0);
1641 dx11 = _mm_sub_ps(ix1,jx1);
1642 dy11 = _mm_sub_ps(iy1,jy1);
1643 dz11 = _mm_sub_ps(iz1,jz1);
1644 dx12 = _mm_sub_ps(ix1,jx2);
1645 dy12 = _mm_sub_ps(iy1,jy2);
1646 dz12 = _mm_sub_ps(iz1,jz2);
1647 dx20 = _mm_sub_ps(ix2,jx0);
1648 dy20 = _mm_sub_ps(iy2,jy0);
1649 dz20 = _mm_sub_ps(iz2,jz0);
1650 dx21 = _mm_sub_ps(ix2,jx1);
1651 dy21 = _mm_sub_ps(iy2,jy1);
1652 dz21 = _mm_sub_ps(iz2,jz1);
1653 dx22 = _mm_sub_ps(ix2,jx2);
1654 dy22 = _mm_sub_ps(iy2,jy2);
1655 dz22 = _mm_sub_ps(iz2,jz2);
1657 /* Calculate squared distance and things based on it */
1658 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1659 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1660 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1661 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1662 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1663 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1664 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1665 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1666 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1668 rinv00 = sse41_invsqrt_f(rsq00);
1669 rinv01 = sse41_invsqrt_f(rsq01);
1670 rinv02 = sse41_invsqrt_f(rsq02);
1671 rinv10 = sse41_invsqrt_f(rsq10);
1672 rinv11 = sse41_invsqrt_f(rsq11);
1673 rinv12 = sse41_invsqrt_f(rsq12);
1674 rinv20 = sse41_invsqrt_f(rsq20);
1675 rinv21 = sse41_invsqrt_f(rsq21);
1676 rinv22 = sse41_invsqrt_f(rsq22);
1678 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1679 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1680 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1681 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1682 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1683 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1684 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1685 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1686 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1688 fjx0 = _mm_setzero_ps();
1689 fjy0 = _mm_setzero_ps();
1690 fjz0 = _mm_setzero_ps();
1691 fjx1 = _mm_setzero_ps();
1692 fjy1 = _mm_setzero_ps();
1693 fjz1 = _mm_setzero_ps();
1694 fjx2 = _mm_setzero_ps();
1695 fjy2 = _mm_setzero_ps();
1696 fjz2 = _mm_setzero_ps();
1698 /**************************
1699 * CALCULATE INTERACTIONS *
1700 **************************/
1702 if (gmx_mm_any_lt(rsq00,rcutoff2))
1705 r00 = _mm_mul_ps(rsq00,rinv00);
1707 /* EWALD ELECTROSTATICS */
1709 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1710 ewrt = _mm_mul_ps(r00,ewtabscale);
1711 ewitab = _mm_cvttps_epi32(ewrt);
1712 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1713 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1714 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1716 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1717 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1719 /* LENNARD-JONES DISPERSION/REPULSION */
1721 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1722 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1724 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1726 fscal = _mm_add_ps(felec,fvdw);
1728 fscal = _mm_and_ps(fscal,cutoff_mask);
1730 /* Calculate temporary vectorial force */
1731 tx = _mm_mul_ps(fscal,dx00);
1732 ty = _mm_mul_ps(fscal,dy00);
1733 tz = _mm_mul_ps(fscal,dz00);
1735 /* Update vectorial force */
1736 fix0 = _mm_add_ps(fix0,tx);
1737 fiy0 = _mm_add_ps(fiy0,ty);
1738 fiz0 = _mm_add_ps(fiz0,tz);
1740 fjx0 = _mm_add_ps(fjx0,tx);
1741 fjy0 = _mm_add_ps(fjy0,ty);
1742 fjz0 = _mm_add_ps(fjz0,tz);
1746 /**************************
1747 * CALCULATE INTERACTIONS *
1748 **************************/
1750 if (gmx_mm_any_lt(rsq01,rcutoff2))
1753 r01 = _mm_mul_ps(rsq01,rinv01);
1755 /* EWALD ELECTROSTATICS */
1757 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1758 ewrt = _mm_mul_ps(r01,ewtabscale);
1759 ewitab = _mm_cvttps_epi32(ewrt);
1760 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1761 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1762 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1764 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1765 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1767 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1771 fscal = _mm_and_ps(fscal,cutoff_mask);
1773 /* Calculate temporary vectorial force */
1774 tx = _mm_mul_ps(fscal,dx01);
1775 ty = _mm_mul_ps(fscal,dy01);
1776 tz = _mm_mul_ps(fscal,dz01);
1778 /* Update vectorial force */
1779 fix0 = _mm_add_ps(fix0,tx);
1780 fiy0 = _mm_add_ps(fiy0,ty);
1781 fiz0 = _mm_add_ps(fiz0,tz);
1783 fjx1 = _mm_add_ps(fjx1,tx);
1784 fjy1 = _mm_add_ps(fjy1,ty);
1785 fjz1 = _mm_add_ps(fjz1,tz);
1789 /**************************
1790 * CALCULATE INTERACTIONS *
1791 **************************/
1793 if (gmx_mm_any_lt(rsq02,rcutoff2))
1796 r02 = _mm_mul_ps(rsq02,rinv02);
1798 /* EWALD ELECTROSTATICS */
1800 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1801 ewrt = _mm_mul_ps(r02,ewtabscale);
1802 ewitab = _mm_cvttps_epi32(ewrt);
1803 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1804 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1805 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1807 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1808 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1810 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1814 fscal = _mm_and_ps(fscal,cutoff_mask);
1816 /* Calculate temporary vectorial force */
1817 tx = _mm_mul_ps(fscal,dx02);
1818 ty = _mm_mul_ps(fscal,dy02);
1819 tz = _mm_mul_ps(fscal,dz02);
1821 /* Update vectorial force */
1822 fix0 = _mm_add_ps(fix0,tx);
1823 fiy0 = _mm_add_ps(fiy0,ty);
1824 fiz0 = _mm_add_ps(fiz0,tz);
1826 fjx2 = _mm_add_ps(fjx2,tx);
1827 fjy2 = _mm_add_ps(fjy2,ty);
1828 fjz2 = _mm_add_ps(fjz2,tz);
1832 /**************************
1833 * CALCULATE INTERACTIONS *
1834 **************************/
1836 if (gmx_mm_any_lt(rsq10,rcutoff2))
1839 r10 = _mm_mul_ps(rsq10,rinv10);
1841 /* EWALD ELECTROSTATICS */
1843 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1844 ewrt = _mm_mul_ps(r10,ewtabscale);
1845 ewitab = _mm_cvttps_epi32(ewrt);
1846 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1847 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1848 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1850 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1851 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1853 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1857 fscal = _mm_and_ps(fscal,cutoff_mask);
1859 /* Calculate temporary vectorial force */
1860 tx = _mm_mul_ps(fscal,dx10);
1861 ty = _mm_mul_ps(fscal,dy10);
1862 tz = _mm_mul_ps(fscal,dz10);
1864 /* Update vectorial force */
1865 fix1 = _mm_add_ps(fix1,tx);
1866 fiy1 = _mm_add_ps(fiy1,ty);
1867 fiz1 = _mm_add_ps(fiz1,tz);
1869 fjx0 = _mm_add_ps(fjx0,tx);
1870 fjy0 = _mm_add_ps(fjy0,ty);
1871 fjz0 = _mm_add_ps(fjz0,tz);
1875 /**************************
1876 * CALCULATE INTERACTIONS *
1877 **************************/
1879 if (gmx_mm_any_lt(rsq11,rcutoff2))
1882 r11 = _mm_mul_ps(rsq11,rinv11);
1884 /* EWALD ELECTROSTATICS */
1886 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1887 ewrt = _mm_mul_ps(r11,ewtabscale);
1888 ewitab = _mm_cvttps_epi32(ewrt);
1889 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1890 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1891 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1893 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1894 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1896 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1900 fscal = _mm_and_ps(fscal,cutoff_mask);
1902 /* Calculate temporary vectorial force */
1903 tx = _mm_mul_ps(fscal,dx11);
1904 ty = _mm_mul_ps(fscal,dy11);
1905 tz = _mm_mul_ps(fscal,dz11);
1907 /* Update vectorial force */
1908 fix1 = _mm_add_ps(fix1,tx);
1909 fiy1 = _mm_add_ps(fiy1,ty);
1910 fiz1 = _mm_add_ps(fiz1,tz);
1912 fjx1 = _mm_add_ps(fjx1,tx);
1913 fjy1 = _mm_add_ps(fjy1,ty);
1914 fjz1 = _mm_add_ps(fjz1,tz);
1918 /**************************
1919 * CALCULATE INTERACTIONS *
1920 **************************/
1922 if (gmx_mm_any_lt(rsq12,rcutoff2))
1925 r12 = _mm_mul_ps(rsq12,rinv12);
1927 /* EWALD ELECTROSTATICS */
1929 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1930 ewrt = _mm_mul_ps(r12,ewtabscale);
1931 ewitab = _mm_cvttps_epi32(ewrt);
1932 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1933 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1934 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1936 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1937 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1939 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1943 fscal = _mm_and_ps(fscal,cutoff_mask);
1945 /* Calculate temporary vectorial force */
1946 tx = _mm_mul_ps(fscal,dx12);
1947 ty = _mm_mul_ps(fscal,dy12);
1948 tz = _mm_mul_ps(fscal,dz12);
1950 /* Update vectorial force */
1951 fix1 = _mm_add_ps(fix1,tx);
1952 fiy1 = _mm_add_ps(fiy1,ty);
1953 fiz1 = _mm_add_ps(fiz1,tz);
1955 fjx2 = _mm_add_ps(fjx2,tx);
1956 fjy2 = _mm_add_ps(fjy2,ty);
1957 fjz2 = _mm_add_ps(fjz2,tz);
1961 /**************************
1962 * CALCULATE INTERACTIONS *
1963 **************************/
1965 if (gmx_mm_any_lt(rsq20,rcutoff2))
1968 r20 = _mm_mul_ps(rsq20,rinv20);
1970 /* EWALD ELECTROSTATICS */
1972 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1973 ewrt = _mm_mul_ps(r20,ewtabscale);
1974 ewitab = _mm_cvttps_epi32(ewrt);
1975 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1976 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1977 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1979 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1980 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1982 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1986 fscal = _mm_and_ps(fscal,cutoff_mask);
1988 /* Calculate temporary vectorial force */
1989 tx = _mm_mul_ps(fscal,dx20);
1990 ty = _mm_mul_ps(fscal,dy20);
1991 tz = _mm_mul_ps(fscal,dz20);
1993 /* Update vectorial force */
1994 fix2 = _mm_add_ps(fix2,tx);
1995 fiy2 = _mm_add_ps(fiy2,ty);
1996 fiz2 = _mm_add_ps(fiz2,tz);
1998 fjx0 = _mm_add_ps(fjx0,tx);
1999 fjy0 = _mm_add_ps(fjy0,ty);
2000 fjz0 = _mm_add_ps(fjz0,tz);
2004 /**************************
2005 * CALCULATE INTERACTIONS *
2006 **************************/
2008 if (gmx_mm_any_lt(rsq21,rcutoff2))
2011 r21 = _mm_mul_ps(rsq21,rinv21);
2013 /* EWALD ELECTROSTATICS */
2015 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2016 ewrt = _mm_mul_ps(r21,ewtabscale);
2017 ewitab = _mm_cvttps_epi32(ewrt);
2018 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2019 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2020 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2022 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2023 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2025 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2029 fscal = _mm_and_ps(fscal,cutoff_mask);
2031 /* Calculate temporary vectorial force */
2032 tx = _mm_mul_ps(fscal,dx21);
2033 ty = _mm_mul_ps(fscal,dy21);
2034 tz = _mm_mul_ps(fscal,dz21);
2036 /* Update vectorial force */
2037 fix2 = _mm_add_ps(fix2,tx);
2038 fiy2 = _mm_add_ps(fiy2,ty);
2039 fiz2 = _mm_add_ps(fiz2,tz);
2041 fjx1 = _mm_add_ps(fjx1,tx);
2042 fjy1 = _mm_add_ps(fjy1,ty);
2043 fjz1 = _mm_add_ps(fjz1,tz);
2047 /**************************
2048 * CALCULATE INTERACTIONS *
2049 **************************/
2051 if (gmx_mm_any_lt(rsq22,rcutoff2))
2054 r22 = _mm_mul_ps(rsq22,rinv22);
2056 /* EWALD ELECTROSTATICS */
2058 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2059 ewrt = _mm_mul_ps(r22,ewtabscale);
2060 ewitab = _mm_cvttps_epi32(ewrt);
2061 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2062 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2063 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2065 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2066 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2068 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2072 fscal = _mm_and_ps(fscal,cutoff_mask);
2074 /* Calculate temporary vectorial force */
2075 tx = _mm_mul_ps(fscal,dx22);
2076 ty = _mm_mul_ps(fscal,dy22);
2077 tz = _mm_mul_ps(fscal,dz22);
2079 /* Update vectorial force */
2080 fix2 = _mm_add_ps(fix2,tx);
2081 fiy2 = _mm_add_ps(fiy2,ty);
2082 fiz2 = _mm_add_ps(fiz2,tz);
2084 fjx2 = _mm_add_ps(fjx2,tx);
2085 fjy2 = _mm_add_ps(fjy2,ty);
2086 fjz2 = _mm_add_ps(fjz2,tz);
2090 fjptrA = f+j_coord_offsetA;
2091 fjptrB = f+j_coord_offsetB;
2092 fjptrC = f+j_coord_offsetC;
2093 fjptrD = f+j_coord_offsetD;
2095 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2096 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2098 /* Inner loop uses 358 flops */
2101 if(jidx<j_index_end)
2104 /* Get j neighbor index, and coordinate index */
2105 jnrlistA = jjnr[jidx];
2106 jnrlistB = jjnr[jidx+1];
2107 jnrlistC = jjnr[jidx+2];
2108 jnrlistD = jjnr[jidx+3];
2109 /* Sign of each element will be negative for non-real atoms.
2110 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2111 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2113 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2114 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2115 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2116 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2117 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2118 j_coord_offsetA = DIM*jnrA;
2119 j_coord_offsetB = DIM*jnrB;
2120 j_coord_offsetC = DIM*jnrC;
2121 j_coord_offsetD = DIM*jnrD;
2123 /* load j atom coordinates */
2124 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2125 x+j_coord_offsetC,x+j_coord_offsetD,
2126 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2128 /* Calculate displacement vector */
2129 dx00 = _mm_sub_ps(ix0,jx0);
2130 dy00 = _mm_sub_ps(iy0,jy0);
2131 dz00 = _mm_sub_ps(iz0,jz0);
2132 dx01 = _mm_sub_ps(ix0,jx1);
2133 dy01 = _mm_sub_ps(iy0,jy1);
2134 dz01 = _mm_sub_ps(iz0,jz1);
2135 dx02 = _mm_sub_ps(ix0,jx2);
2136 dy02 = _mm_sub_ps(iy0,jy2);
2137 dz02 = _mm_sub_ps(iz0,jz2);
2138 dx10 = _mm_sub_ps(ix1,jx0);
2139 dy10 = _mm_sub_ps(iy1,jy0);
2140 dz10 = _mm_sub_ps(iz1,jz0);
2141 dx11 = _mm_sub_ps(ix1,jx1);
2142 dy11 = _mm_sub_ps(iy1,jy1);
2143 dz11 = _mm_sub_ps(iz1,jz1);
2144 dx12 = _mm_sub_ps(ix1,jx2);
2145 dy12 = _mm_sub_ps(iy1,jy2);
2146 dz12 = _mm_sub_ps(iz1,jz2);
2147 dx20 = _mm_sub_ps(ix2,jx0);
2148 dy20 = _mm_sub_ps(iy2,jy0);
2149 dz20 = _mm_sub_ps(iz2,jz0);
2150 dx21 = _mm_sub_ps(ix2,jx1);
2151 dy21 = _mm_sub_ps(iy2,jy1);
2152 dz21 = _mm_sub_ps(iz2,jz1);
2153 dx22 = _mm_sub_ps(ix2,jx2);
2154 dy22 = _mm_sub_ps(iy2,jy2);
2155 dz22 = _mm_sub_ps(iz2,jz2);
2157 /* Calculate squared distance and things based on it */
2158 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2159 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2160 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2161 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2162 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2163 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2164 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2165 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2166 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2168 rinv00 = sse41_invsqrt_f(rsq00);
2169 rinv01 = sse41_invsqrt_f(rsq01);
2170 rinv02 = sse41_invsqrt_f(rsq02);
2171 rinv10 = sse41_invsqrt_f(rsq10);
2172 rinv11 = sse41_invsqrt_f(rsq11);
2173 rinv12 = sse41_invsqrt_f(rsq12);
2174 rinv20 = sse41_invsqrt_f(rsq20);
2175 rinv21 = sse41_invsqrt_f(rsq21);
2176 rinv22 = sse41_invsqrt_f(rsq22);
2178 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2179 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2180 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2181 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2182 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2183 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2184 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2185 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2186 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2188 fjx0 = _mm_setzero_ps();
2189 fjy0 = _mm_setzero_ps();
2190 fjz0 = _mm_setzero_ps();
2191 fjx1 = _mm_setzero_ps();
2192 fjy1 = _mm_setzero_ps();
2193 fjz1 = _mm_setzero_ps();
2194 fjx2 = _mm_setzero_ps();
2195 fjy2 = _mm_setzero_ps();
2196 fjz2 = _mm_setzero_ps();
2198 /**************************
2199 * CALCULATE INTERACTIONS *
2200 **************************/
2202 if (gmx_mm_any_lt(rsq00,rcutoff2))
2205 r00 = _mm_mul_ps(rsq00,rinv00);
2206 r00 = _mm_andnot_ps(dummy_mask,r00);
2208 /* EWALD ELECTROSTATICS */
2210 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2211 ewrt = _mm_mul_ps(r00,ewtabscale);
2212 ewitab = _mm_cvttps_epi32(ewrt);
2213 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2214 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2215 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2217 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2218 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2220 /* LENNARD-JONES DISPERSION/REPULSION */
2222 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2223 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2225 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2227 fscal = _mm_add_ps(felec,fvdw);
2229 fscal = _mm_and_ps(fscal,cutoff_mask);
2231 fscal = _mm_andnot_ps(dummy_mask,fscal);
2233 /* Calculate temporary vectorial force */
2234 tx = _mm_mul_ps(fscal,dx00);
2235 ty = _mm_mul_ps(fscal,dy00);
2236 tz = _mm_mul_ps(fscal,dz00);
2238 /* Update vectorial force */
2239 fix0 = _mm_add_ps(fix0,tx);
2240 fiy0 = _mm_add_ps(fiy0,ty);
2241 fiz0 = _mm_add_ps(fiz0,tz);
2243 fjx0 = _mm_add_ps(fjx0,tx);
2244 fjy0 = _mm_add_ps(fjy0,ty);
2245 fjz0 = _mm_add_ps(fjz0,tz);
2249 /**************************
2250 * CALCULATE INTERACTIONS *
2251 **************************/
2253 if (gmx_mm_any_lt(rsq01,rcutoff2))
2256 r01 = _mm_mul_ps(rsq01,rinv01);
2257 r01 = _mm_andnot_ps(dummy_mask,r01);
2259 /* EWALD ELECTROSTATICS */
2261 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2262 ewrt = _mm_mul_ps(r01,ewtabscale);
2263 ewitab = _mm_cvttps_epi32(ewrt);
2264 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2265 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2266 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2268 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2269 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2271 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2275 fscal = _mm_and_ps(fscal,cutoff_mask);
2277 fscal = _mm_andnot_ps(dummy_mask,fscal);
2279 /* Calculate temporary vectorial force */
2280 tx = _mm_mul_ps(fscal,dx01);
2281 ty = _mm_mul_ps(fscal,dy01);
2282 tz = _mm_mul_ps(fscal,dz01);
2284 /* Update vectorial force */
2285 fix0 = _mm_add_ps(fix0,tx);
2286 fiy0 = _mm_add_ps(fiy0,ty);
2287 fiz0 = _mm_add_ps(fiz0,tz);
2289 fjx1 = _mm_add_ps(fjx1,tx);
2290 fjy1 = _mm_add_ps(fjy1,ty);
2291 fjz1 = _mm_add_ps(fjz1,tz);
2295 /**************************
2296 * CALCULATE INTERACTIONS *
2297 **************************/
2299 if (gmx_mm_any_lt(rsq02,rcutoff2))
2302 r02 = _mm_mul_ps(rsq02,rinv02);
2303 r02 = _mm_andnot_ps(dummy_mask,r02);
2305 /* EWALD ELECTROSTATICS */
2307 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2308 ewrt = _mm_mul_ps(r02,ewtabscale);
2309 ewitab = _mm_cvttps_epi32(ewrt);
2310 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2311 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2312 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2314 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2315 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2317 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2321 fscal = _mm_and_ps(fscal,cutoff_mask);
2323 fscal = _mm_andnot_ps(dummy_mask,fscal);
2325 /* Calculate temporary vectorial force */
2326 tx = _mm_mul_ps(fscal,dx02);
2327 ty = _mm_mul_ps(fscal,dy02);
2328 tz = _mm_mul_ps(fscal,dz02);
2330 /* Update vectorial force */
2331 fix0 = _mm_add_ps(fix0,tx);
2332 fiy0 = _mm_add_ps(fiy0,ty);
2333 fiz0 = _mm_add_ps(fiz0,tz);
2335 fjx2 = _mm_add_ps(fjx2,tx);
2336 fjy2 = _mm_add_ps(fjy2,ty);
2337 fjz2 = _mm_add_ps(fjz2,tz);
2341 /**************************
2342 * CALCULATE INTERACTIONS *
2343 **************************/
2345 if (gmx_mm_any_lt(rsq10,rcutoff2))
2348 r10 = _mm_mul_ps(rsq10,rinv10);
2349 r10 = _mm_andnot_ps(dummy_mask,r10);
2351 /* EWALD ELECTROSTATICS */
2353 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2354 ewrt = _mm_mul_ps(r10,ewtabscale);
2355 ewitab = _mm_cvttps_epi32(ewrt);
2356 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2357 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2358 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2360 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2361 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2363 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2367 fscal = _mm_and_ps(fscal,cutoff_mask);
2369 fscal = _mm_andnot_ps(dummy_mask,fscal);
2371 /* Calculate temporary vectorial force */
2372 tx = _mm_mul_ps(fscal,dx10);
2373 ty = _mm_mul_ps(fscal,dy10);
2374 tz = _mm_mul_ps(fscal,dz10);
2376 /* Update vectorial force */
2377 fix1 = _mm_add_ps(fix1,tx);
2378 fiy1 = _mm_add_ps(fiy1,ty);
2379 fiz1 = _mm_add_ps(fiz1,tz);
2381 fjx0 = _mm_add_ps(fjx0,tx);
2382 fjy0 = _mm_add_ps(fjy0,ty);
2383 fjz0 = _mm_add_ps(fjz0,tz);
2387 /**************************
2388 * CALCULATE INTERACTIONS *
2389 **************************/
2391 if (gmx_mm_any_lt(rsq11,rcutoff2))
2394 r11 = _mm_mul_ps(rsq11,rinv11);
2395 r11 = _mm_andnot_ps(dummy_mask,r11);
2397 /* EWALD ELECTROSTATICS */
2399 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2400 ewrt = _mm_mul_ps(r11,ewtabscale);
2401 ewitab = _mm_cvttps_epi32(ewrt);
2402 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2403 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2404 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2406 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2407 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2409 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2413 fscal = _mm_and_ps(fscal,cutoff_mask);
2415 fscal = _mm_andnot_ps(dummy_mask,fscal);
2417 /* Calculate temporary vectorial force */
2418 tx = _mm_mul_ps(fscal,dx11);
2419 ty = _mm_mul_ps(fscal,dy11);
2420 tz = _mm_mul_ps(fscal,dz11);
2422 /* Update vectorial force */
2423 fix1 = _mm_add_ps(fix1,tx);
2424 fiy1 = _mm_add_ps(fiy1,ty);
2425 fiz1 = _mm_add_ps(fiz1,tz);
2427 fjx1 = _mm_add_ps(fjx1,tx);
2428 fjy1 = _mm_add_ps(fjy1,ty);
2429 fjz1 = _mm_add_ps(fjz1,tz);
2433 /**************************
2434 * CALCULATE INTERACTIONS *
2435 **************************/
2437 if (gmx_mm_any_lt(rsq12,rcutoff2))
2440 r12 = _mm_mul_ps(rsq12,rinv12);
2441 r12 = _mm_andnot_ps(dummy_mask,r12);
2443 /* EWALD ELECTROSTATICS */
2445 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2446 ewrt = _mm_mul_ps(r12,ewtabscale);
2447 ewitab = _mm_cvttps_epi32(ewrt);
2448 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2449 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2450 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2452 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2453 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2455 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2459 fscal = _mm_and_ps(fscal,cutoff_mask);
2461 fscal = _mm_andnot_ps(dummy_mask,fscal);
2463 /* Calculate temporary vectorial force */
2464 tx = _mm_mul_ps(fscal,dx12);
2465 ty = _mm_mul_ps(fscal,dy12);
2466 tz = _mm_mul_ps(fscal,dz12);
2468 /* Update vectorial force */
2469 fix1 = _mm_add_ps(fix1,tx);
2470 fiy1 = _mm_add_ps(fiy1,ty);
2471 fiz1 = _mm_add_ps(fiz1,tz);
2473 fjx2 = _mm_add_ps(fjx2,tx);
2474 fjy2 = _mm_add_ps(fjy2,ty);
2475 fjz2 = _mm_add_ps(fjz2,tz);
2479 /**************************
2480 * CALCULATE INTERACTIONS *
2481 **************************/
2483 if (gmx_mm_any_lt(rsq20,rcutoff2))
2486 r20 = _mm_mul_ps(rsq20,rinv20);
2487 r20 = _mm_andnot_ps(dummy_mask,r20);
2489 /* EWALD ELECTROSTATICS */
2491 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2492 ewrt = _mm_mul_ps(r20,ewtabscale);
2493 ewitab = _mm_cvttps_epi32(ewrt);
2494 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2495 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2496 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2498 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2499 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2501 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2505 fscal = _mm_and_ps(fscal,cutoff_mask);
2507 fscal = _mm_andnot_ps(dummy_mask,fscal);
2509 /* Calculate temporary vectorial force */
2510 tx = _mm_mul_ps(fscal,dx20);
2511 ty = _mm_mul_ps(fscal,dy20);
2512 tz = _mm_mul_ps(fscal,dz20);
2514 /* Update vectorial force */
2515 fix2 = _mm_add_ps(fix2,tx);
2516 fiy2 = _mm_add_ps(fiy2,ty);
2517 fiz2 = _mm_add_ps(fiz2,tz);
2519 fjx0 = _mm_add_ps(fjx0,tx);
2520 fjy0 = _mm_add_ps(fjy0,ty);
2521 fjz0 = _mm_add_ps(fjz0,tz);
2525 /**************************
2526 * CALCULATE INTERACTIONS *
2527 **************************/
2529 if (gmx_mm_any_lt(rsq21,rcutoff2))
2532 r21 = _mm_mul_ps(rsq21,rinv21);
2533 r21 = _mm_andnot_ps(dummy_mask,r21);
2535 /* EWALD ELECTROSTATICS */
2537 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2538 ewrt = _mm_mul_ps(r21,ewtabscale);
2539 ewitab = _mm_cvttps_epi32(ewrt);
2540 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2541 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2542 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2544 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2545 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2547 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2551 fscal = _mm_and_ps(fscal,cutoff_mask);
2553 fscal = _mm_andnot_ps(dummy_mask,fscal);
2555 /* Calculate temporary vectorial force */
2556 tx = _mm_mul_ps(fscal,dx21);
2557 ty = _mm_mul_ps(fscal,dy21);
2558 tz = _mm_mul_ps(fscal,dz21);
2560 /* Update vectorial force */
2561 fix2 = _mm_add_ps(fix2,tx);
2562 fiy2 = _mm_add_ps(fiy2,ty);
2563 fiz2 = _mm_add_ps(fiz2,tz);
2565 fjx1 = _mm_add_ps(fjx1,tx);
2566 fjy1 = _mm_add_ps(fjy1,ty);
2567 fjz1 = _mm_add_ps(fjz1,tz);
2571 /**************************
2572 * CALCULATE INTERACTIONS *
2573 **************************/
2575 if (gmx_mm_any_lt(rsq22,rcutoff2))
2578 r22 = _mm_mul_ps(rsq22,rinv22);
2579 r22 = _mm_andnot_ps(dummy_mask,r22);
2581 /* EWALD ELECTROSTATICS */
2583 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2584 ewrt = _mm_mul_ps(r22,ewtabscale);
2585 ewitab = _mm_cvttps_epi32(ewrt);
2586 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2587 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2588 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2590 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2591 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2593 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2597 fscal = _mm_and_ps(fscal,cutoff_mask);
2599 fscal = _mm_andnot_ps(dummy_mask,fscal);
2601 /* Calculate temporary vectorial force */
2602 tx = _mm_mul_ps(fscal,dx22);
2603 ty = _mm_mul_ps(fscal,dy22);
2604 tz = _mm_mul_ps(fscal,dz22);
2606 /* Update vectorial force */
2607 fix2 = _mm_add_ps(fix2,tx);
2608 fiy2 = _mm_add_ps(fiy2,ty);
2609 fiz2 = _mm_add_ps(fiz2,tz);
2611 fjx2 = _mm_add_ps(fjx2,tx);
2612 fjy2 = _mm_add_ps(fjy2,ty);
2613 fjz2 = _mm_add_ps(fjz2,tz);
2617 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2618 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2619 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2620 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2622 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2623 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2625 /* Inner loop uses 367 flops */
2628 /* End of innermost loop */
2630 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2631 f+i_coord_offset,fshift+i_shift_offset);
2633 /* Increment number of inner iterations */
2634 inneriter += j_index_end - j_index_start;
2636 /* Outer loop uses 18 flops */
2639 /* Increment number of outer iterations */
2642 /* Update outer/inner flops */
2644 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);