2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LJEwald
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
97 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
111 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
114 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
115 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
126 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
128 __m128 one_half = _mm_set1_ps(0.5);
129 __m128 minus_one = _mm_set1_ps(-1.0);
131 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
133 __m128 dummy_mask,cutoff_mask;
134 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
135 __m128 one = _mm_set1_ps(1.0);
136 __m128 two = _mm_set1_ps(2.0);
142 jindex = nlist->jindex;
144 shiftidx = nlist->shift;
146 shiftvec = fr->shift_vec[0];
147 fshift = fr->fshift[0];
148 facel = _mm_set1_ps(fr->ic->epsfac);
149 charge = mdatoms->chargeA;
150 nvdwtype = fr->ntype;
152 vdwtype = mdatoms->typeA;
153 vdwgridparam = fr->ljpme_c6grid;
154 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
155 ewclj = _mm_set1_ps(fr->ic->ewaldcoeff_lj);
156 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
158 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
159 ewtab = fr->ic->tabq_coul_FDV0;
160 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
161 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
163 /* Setup water-specific parameters */
164 inr = nlist->iinr[0];
165 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
166 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
167 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
168 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
170 jq1 = _mm_set1_ps(charge[inr+1]);
171 jq2 = _mm_set1_ps(charge[inr+2]);
172 jq3 = _mm_set1_ps(charge[inr+3]);
173 vdwjidx0A = 2*vdwtype[inr+0];
174 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
175 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
176 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
177 qq11 = _mm_mul_ps(iq1,jq1);
178 qq12 = _mm_mul_ps(iq1,jq2);
179 qq13 = _mm_mul_ps(iq1,jq3);
180 qq21 = _mm_mul_ps(iq2,jq1);
181 qq22 = _mm_mul_ps(iq2,jq2);
182 qq23 = _mm_mul_ps(iq2,jq3);
183 qq31 = _mm_mul_ps(iq3,jq1);
184 qq32 = _mm_mul_ps(iq3,jq2);
185 qq33 = _mm_mul_ps(iq3,jq3);
187 /* Avoid stupid compiler warnings */
188 jnrA = jnrB = jnrC = jnrD = 0;
197 for(iidx=0;iidx<4*DIM;iidx++)
202 /* Start outer loop over neighborlists */
203 for(iidx=0; iidx<nri; iidx++)
205 /* Load shift vector for this list */
206 i_shift_offset = DIM*shiftidx[iidx];
208 /* Load limits for loop over neighbors */
209 j_index_start = jindex[iidx];
210 j_index_end = jindex[iidx+1];
212 /* Get outer coordinate index */
214 i_coord_offset = DIM*inr;
216 /* Load i particle coords and add shift vector */
217 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
218 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
220 fix0 = _mm_setzero_ps();
221 fiy0 = _mm_setzero_ps();
222 fiz0 = _mm_setzero_ps();
223 fix1 = _mm_setzero_ps();
224 fiy1 = _mm_setzero_ps();
225 fiz1 = _mm_setzero_ps();
226 fix2 = _mm_setzero_ps();
227 fiy2 = _mm_setzero_ps();
228 fiz2 = _mm_setzero_ps();
229 fix3 = _mm_setzero_ps();
230 fiy3 = _mm_setzero_ps();
231 fiz3 = _mm_setzero_ps();
233 /* Reset potential sums */
234 velecsum = _mm_setzero_ps();
235 vvdwsum = _mm_setzero_ps();
237 /* Start inner kernel loop */
238 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
241 /* Get j neighbor index, and coordinate index */
246 j_coord_offsetA = DIM*jnrA;
247 j_coord_offsetB = DIM*jnrB;
248 j_coord_offsetC = DIM*jnrC;
249 j_coord_offsetD = DIM*jnrD;
251 /* load j atom coordinates */
252 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
253 x+j_coord_offsetC,x+j_coord_offsetD,
254 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
255 &jy2,&jz2,&jx3,&jy3,&jz3);
257 /* Calculate displacement vector */
258 dx00 = _mm_sub_ps(ix0,jx0);
259 dy00 = _mm_sub_ps(iy0,jy0);
260 dz00 = _mm_sub_ps(iz0,jz0);
261 dx11 = _mm_sub_ps(ix1,jx1);
262 dy11 = _mm_sub_ps(iy1,jy1);
263 dz11 = _mm_sub_ps(iz1,jz1);
264 dx12 = _mm_sub_ps(ix1,jx2);
265 dy12 = _mm_sub_ps(iy1,jy2);
266 dz12 = _mm_sub_ps(iz1,jz2);
267 dx13 = _mm_sub_ps(ix1,jx3);
268 dy13 = _mm_sub_ps(iy1,jy3);
269 dz13 = _mm_sub_ps(iz1,jz3);
270 dx21 = _mm_sub_ps(ix2,jx1);
271 dy21 = _mm_sub_ps(iy2,jy1);
272 dz21 = _mm_sub_ps(iz2,jz1);
273 dx22 = _mm_sub_ps(ix2,jx2);
274 dy22 = _mm_sub_ps(iy2,jy2);
275 dz22 = _mm_sub_ps(iz2,jz2);
276 dx23 = _mm_sub_ps(ix2,jx3);
277 dy23 = _mm_sub_ps(iy2,jy3);
278 dz23 = _mm_sub_ps(iz2,jz3);
279 dx31 = _mm_sub_ps(ix3,jx1);
280 dy31 = _mm_sub_ps(iy3,jy1);
281 dz31 = _mm_sub_ps(iz3,jz1);
282 dx32 = _mm_sub_ps(ix3,jx2);
283 dy32 = _mm_sub_ps(iy3,jy2);
284 dz32 = _mm_sub_ps(iz3,jz2);
285 dx33 = _mm_sub_ps(ix3,jx3);
286 dy33 = _mm_sub_ps(iy3,jy3);
287 dz33 = _mm_sub_ps(iz3,jz3);
289 /* Calculate squared distance and things based on it */
290 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
291 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
292 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
293 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
294 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
295 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
296 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
297 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
298 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
299 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
301 rinv00 = sse41_invsqrt_f(rsq00);
302 rinv11 = sse41_invsqrt_f(rsq11);
303 rinv12 = sse41_invsqrt_f(rsq12);
304 rinv13 = sse41_invsqrt_f(rsq13);
305 rinv21 = sse41_invsqrt_f(rsq21);
306 rinv22 = sse41_invsqrt_f(rsq22);
307 rinv23 = sse41_invsqrt_f(rsq23);
308 rinv31 = sse41_invsqrt_f(rsq31);
309 rinv32 = sse41_invsqrt_f(rsq32);
310 rinv33 = sse41_invsqrt_f(rsq33);
312 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
313 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
314 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
315 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
316 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
317 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
318 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
319 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
320 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
321 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
323 fjx0 = _mm_setzero_ps();
324 fjy0 = _mm_setzero_ps();
325 fjz0 = _mm_setzero_ps();
326 fjx1 = _mm_setzero_ps();
327 fjy1 = _mm_setzero_ps();
328 fjz1 = _mm_setzero_ps();
329 fjx2 = _mm_setzero_ps();
330 fjy2 = _mm_setzero_ps();
331 fjz2 = _mm_setzero_ps();
332 fjx3 = _mm_setzero_ps();
333 fjy3 = _mm_setzero_ps();
334 fjz3 = _mm_setzero_ps();
336 /**************************
337 * CALCULATE INTERACTIONS *
338 **************************/
340 r00 = _mm_mul_ps(rsq00,rinv00);
342 /* Analytical LJ-PME */
343 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
344 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
345 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
346 exponent = sse41_exp_f(ewcljrsq);
347 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
348 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
349 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
350 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
351 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
352 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
353 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
354 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
356 /* Update potential sum for this i atom from the interaction with this j atom. */
357 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
361 /* Calculate temporary vectorial force */
362 tx = _mm_mul_ps(fscal,dx00);
363 ty = _mm_mul_ps(fscal,dy00);
364 tz = _mm_mul_ps(fscal,dz00);
366 /* Update vectorial force */
367 fix0 = _mm_add_ps(fix0,tx);
368 fiy0 = _mm_add_ps(fiy0,ty);
369 fiz0 = _mm_add_ps(fiz0,tz);
371 fjx0 = _mm_add_ps(fjx0,tx);
372 fjy0 = _mm_add_ps(fjy0,ty);
373 fjz0 = _mm_add_ps(fjz0,tz);
375 /**************************
376 * CALCULATE INTERACTIONS *
377 **************************/
379 r11 = _mm_mul_ps(rsq11,rinv11);
381 /* EWALD ELECTROSTATICS */
383 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
384 ewrt = _mm_mul_ps(r11,ewtabscale);
385 ewitab = _mm_cvttps_epi32(ewrt);
386 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
387 ewitab = _mm_slli_epi32(ewitab,2);
388 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
389 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
390 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
391 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
392 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
393 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
394 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
395 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
396 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
398 /* Update potential sum for this i atom from the interaction with this j atom. */
399 velecsum = _mm_add_ps(velecsum,velec);
403 /* Calculate temporary vectorial force */
404 tx = _mm_mul_ps(fscal,dx11);
405 ty = _mm_mul_ps(fscal,dy11);
406 tz = _mm_mul_ps(fscal,dz11);
408 /* Update vectorial force */
409 fix1 = _mm_add_ps(fix1,tx);
410 fiy1 = _mm_add_ps(fiy1,ty);
411 fiz1 = _mm_add_ps(fiz1,tz);
413 fjx1 = _mm_add_ps(fjx1,tx);
414 fjy1 = _mm_add_ps(fjy1,ty);
415 fjz1 = _mm_add_ps(fjz1,tz);
417 /**************************
418 * CALCULATE INTERACTIONS *
419 **************************/
421 r12 = _mm_mul_ps(rsq12,rinv12);
423 /* EWALD ELECTROSTATICS */
425 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
426 ewrt = _mm_mul_ps(r12,ewtabscale);
427 ewitab = _mm_cvttps_epi32(ewrt);
428 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
429 ewitab = _mm_slli_epi32(ewitab,2);
430 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
431 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
432 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
433 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
434 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
435 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
436 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
437 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
438 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
440 /* Update potential sum for this i atom from the interaction with this j atom. */
441 velecsum = _mm_add_ps(velecsum,velec);
445 /* Calculate temporary vectorial force */
446 tx = _mm_mul_ps(fscal,dx12);
447 ty = _mm_mul_ps(fscal,dy12);
448 tz = _mm_mul_ps(fscal,dz12);
450 /* Update vectorial force */
451 fix1 = _mm_add_ps(fix1,tx);
452 fiy1 = _mm_add_ps(fiy1,ty);
453 fiz1 = _mm_add_ps(fiz1,tz);
455 fjx2 = _mm_add_ps(fjx2,tx);
456 fjy2 = _mm_add_ps(fjy2,ty);
457 fjz2 = _mm_add_ps(fjz2,tz);
459 /**************************
460 * CALCULATE INTERACTIONS *
461 **************************/
463 r13 = _mm_mul_ps(rsq13,rinv13);
465 /* EWALD ELECTROSTATICS */
467 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
468 ewrt = _mm_mul_ps(r13,ewtabscale);
469 ewitab = _mm_cvttps_epi32(ewrt);
470 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
471 ewitab = _mm_slli_epi32(ewitab,2);
472 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
473 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
474 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
475 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
476 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
477 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
478 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
479 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
480 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
482 /* Update potential sum for this i atom from the interaction with this j atom. */
483 velecsum = _mm_add_ps(velecsum,velec);
487 /* Calculate temporary vectorial force */
488 tx = _mm_mul_ps(fscal,dx13);
489 ty = _mm_mul_ps(fscal,dy13);
490 tz = _mm_mul_ps(fscal,dz13);
492 /* Update vectorial force */
493 fix1 = _mm_add_ps(fix1,tx);
494 fiy1 = _mm_add_ps(fiy1,ty);
495 fiz1 = _mm_add_ps(fiz1,tz);
497 fjx3 = _mm_add_ps(fjx3,tx);
498 fjy3 = _mm_add_ps(fjy3,ty);
499 fjz3 = _mm_add_ps(fjz3,tz);
501 /**************************
502 * CALCULATE INTERACTIONS *
503 **************************/
505 r21 = _mm_mul_ps(rsq21,rinv21);
507 /* EWALD ELECTROSTATICS */
509 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
510 ewrt = _mm_mul_ps(r21,ewtabscale);
511 ewitab = _mm_cvttps_epi32(ewrt);
512 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
513 ewitab = _mm_slli_epi32(ewitab,2);
514 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
515 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
516 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
517 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
518 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
519 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
520 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
521 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
522 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
524 /* Update potential sum for this i atom from the interaction with this j atom. */
525 velecsum = _mm_add_ps(velecsum,velec);
529 /* Calculate temporary vectorial force */
530 tx = _mm_mul_ps(fscal,dx21);
531 ty = _mm_mul_ps(fscal,dy21);
532 tz = _mm_mul_ps(fscal,dz21);
534 /* Update vectorial force */
535 fix2 = _mm_add_ps(fix2,tx);
536 fiy2 = _mm_add_ps(fiy2,ty);
537 fiz2 = _mm_add_ps(fiz2,tz);
539 fjx1 = _mm_add_ps(fjx1,tx);
540 fjy1 = _mm_add_ps(fjy1,ty);
541 fjz1 = _mm_add_ps(fjz1,tz);
543 /**************************
544 * CALCULATE INTERACTIONS *
545 **************************/
547 r22 = _mm_mul_ps(rsq22,rinv22);
549 /* EWALD ELECTROSTATICS */
551 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
552 ewrt = _mm_mul_ps(r22,ewtabscale);
553 ewitab = _mm_cvttps_epi32(ewrt);
554 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
555 ewitab = _mm_slli_epi32(ewitab,2);
556 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
557 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
558 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
559 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
560 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
561 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
562 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
563 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
564 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
566 /* Update potential sum for this i atom from the interaction with this j atom. */
567 velecsum = _mm_add_ps(velecsum,velec);
571 /* Calculate temporary vectorial force */
572 tx = _mm_mul_ps(fscal,dx22);
573 ty = _mm_mul_ps(fscal,dy22);
574 tz = _mm_mul_ps(fscal,dz22);
576 /* Update vectorial force */
577 fix2 = _mm_add_ps(fix2,tx);
578 fiy2 = _mm_add_ps(fiy2,ty);
579 fiz2 = _mm_add_ps(fiz2,tz);
581 fjx2 = _mm_add_ps(fjx2,tx);
582 fjy2 = _mm_add_ps(fjy2,ty);
583 fjz2 = _mm_add_ps(fjz2,tz);
585 /**************************
586 * CALCULATE INTERACTIONS *
587 **************************/
589 r23 = _mm_mul_ps(rsq23,rinv23);
591 /* EWALD ELECTROSTATICS */
593 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
594 ewrt = _mm_mul_ps(r23,ewtabscale);
595 ewitab = _mm_cvttps_epi32(ewrt);
596 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
597 ewitab = _mm_slli_epi32(ewitab,2);
598 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
599 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
600 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
601 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
602 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
603 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
604 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
605 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
606 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
608 /* Update potential sum for this i atom from the interaction with this j atom. */
609 velecsum = _mm_add_ps(velecsum,velec);
613 /* Calculate temporary vectorial force */
614 tx = _mm_mul_ps(fscal,dx23);
615 ty = _mm_mul_ps(fscal,dy23);
616 tz = _mm_mul_ps(fscal,dz23);
618 /* Update vectorial force */
619 fix2 = _mm_add_ps(fix2,tx);
620 fiy2 = _mm_add_ps(fiy2,ty);
621 fiz2 = _mm_add_ps(fiz2,tz);
623 fjx3 = _mm_add_ps(fjx3,tx);
624 fjy3 = _mm_add_ps(fjy3,ty);
625 fjz3 = _mm_add_ps(fjz3,tz);
627 /**************************
628 * CALCULATE INTERACTIONS *
629 **************************/
631 r31 = _mm_mul_ps(rsq31,rinv31);
633 /* EWALD ELECTROSTATICS */
635 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
636 ewrt = _mm_mul_ps(r31,ewtabscale);
637 ewitab = _mm_cvttps_epi32(ewrt);
638 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
639 ewitab = _mm_slli_epi32(ewitab,2);
640 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
641 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
642 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
643 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
644 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
645 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
646 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
647 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
648 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
650 /* Update potential sum for this i atom from the interaction with this j atom. */
651 velecsum = _mm_add_ps(velecsum,velec);
655 /* Calculate temporary vectorial force */
656 tx = _mm_mul_ps(fscal,dx31);
657 ty = _mm_mul_ps(fscal,dy31);
658 tz = _mm_mul_ps(fscal,dz31);
660 /* Update vectorial force */
661 fix3 = _mm_add_ps(fix3,tx);
662 fiy3 = _mm_add_ps(fiy3,ty);
663 fiz3 = _mm_add_ps(fiz3,tz);
665 fjx1 = _mm_add_ps(fjx1,tx);
666 fjy1 = _mm_add_ps(fjy1,ty);
667 fjz1 = _mm_add_ps(fjz1,tz);
669 /**************************
670 * CALCULATE INTERACTIONS *
671 **************************/
673 r32 = _mm_mul_ps(rsq32,rinv32);
675 /* EWALD ELECTROSTATICS */
677 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
678 ewrt = _mm_mul_ps(r32,ewtabscale);
679 ewitab = _mm_cvttps_epi32(ewrt);
680 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
681 ewitab = _mm_slli_epi32(ewitab,2);
682 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
683 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
684 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
685 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
686 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
687 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
688 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
689 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
690 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
692 /* Update potential sum for this i atom from the interaction with this j atom. */
693 velecsum = _mm_add_ps(velecsum,velec);
697 /* Calculate temporary vectorial force */
698 tx = _mm_mul_ps(fscal,dx32);
699 ty = _mm_mul_ps(fscal,dy32);
700 tz = _mm_mul_ps(fscal,dz32);
702 /* Update vectorial force */
703 fix3 = _mm_add_ps(fix3,tx);
704 fiy3 = _mm_add_ps(fiy3,ty);
705 fiz3 = _mm_add_ps(fiz3,tz);
707 fjx2 = _mm_add_ps(fjx2,tx);
708 fjy2 = _mm_add_ps(fjy2,ty);
709 fjz2 = _mm_add_ps(fjz2,tz);
711 /**************************
712 * CALCULATE INTERACTIONS *
713 **************************/
715 r33 = _mm_mul_ps(rsq33,rinv33);
717 /* EWALD ELECTROSTATICS */
719 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
720 ewrt = _mm_mul_ps(r33,ewtabscale);
721 ewitab = _mm_cvttps_epi32(ewrt);
722 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
723 ewitab = _mm_slli_epi32(ewitab,2);
724 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
725 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
726 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
727 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
728 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
729 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
730 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
731 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
732 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
734 /* Update potential sum for this i atom from the interaction with this j atom. */
735 velecsum = _mm_add_ps(velecsum,velec);
739 /* Calculate temporary vectorial force */
740 tx = _mm_mul_ps(fscal,dx33);
741 ty = _mm_mul_ps(fscal,dy33);
742 tz = _mm_mul_ps(fscal,dz33);
744 /* Update vectorial force */
745 fix3 = _mm_add_ps(fix3,tx);
746 fiy3 = _mm_add_ps(fiy3,ty);
747 fiz3 = _mm_add_ps(fiz3,tz);
749 fjx3 = _mm_add_ps(fjx3,tx);
750 fjy3 = _mm_add_ps(fjy3,ty);
751 fjz3 = _mm_add_ps(fjz3,tz);
753 fjptrA = f+j_coord_offsetA;
754 fjptrB = f+j_coord_offsetB;
755 fjptrC = f+j_coord_offsetC;
756 fjptrD = f+j_coord_offsetD;
758 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
759 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
760 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
762 /* Inner loop uses 423 flops */
768 /* Get j neighbor index, and coordinate index */
769 jnrlistA = jjnr[jidx];
770 jnrlistB = jjnr[jidx+1];
771 jnrlistC = jjnr[jidx+2];
772 jnrlistD = jjnr[jidx+3];
773 /* Sign of each element will be negative for non-real atoms.
774 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
775 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
777 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
778 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
779 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
780 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
781 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
782 j_coord_offsetA = DIM*jnrA;
783 j_coord_offsetB = DIM*jnrB;
784 j_coord_offsetC = DIM*jnrC;
785 j_coord_offsetD = DIM*jnrD;
787 /* load j atom coordinates */
788 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
789 x+j_coord_offsetC,x+j_coord_offsetD,
790 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
791 &jy2,&jz2,&jx3,&jy3,&jz3);
793 /* Calculate displacement vector */
794 dx00 = _mm_sub_ps(ix0,jx0);
795 dy00 = _mm_sub_ps(iy0,jy0);
796 dz00 = _mm_sub_ps(iz0,jz0);
797 dx11 = _mm_sub_ps(ix1,jx1);
798 dy11 = _mm_sub_ps(iy1,jy1);
799 dz11 = _mm_sub_ps(iz1,jz1);
800 dx12 = _mm_sub_ps(ix1,jx2);
801 dy12 = _mm_sub_ps(iy1,jy2);
802 dz12 = _mm_sub_ps(iz1,jz2);
803 dx13 = _mm_sub_ps(ix1,jx3);
804 dy13 = _mm_sub_ps(iy1,jy3);
805 dz13 = _mm_sub_ps(iz1,jz3);
806 dx21 = _mm_sub_ps(ix2,jx1);
807 dy21 = _mm_sub_ps(iy2,jy1);
808 dz21 = _mm_sub_ps(iz2,jz1);
809 dx22 = _mm_sub_ps(ix2,jx2);
810 dy22 = _mm_sub_ps(iy2,jy2);
811 dz22 = _mm_sub_ps(iz2,jz2);
812 dx23 = _mm_sub_ps(ix2,jx3);
813 dy23 = _mm_sub_ps(iy2,jy3);
814 dz23 = _mm_sub_ps(iz2,jz3);
815 dx31 = _mm_sub_ps(ix3,jx1);
816 dy31 = _mm_sub_ps(iy3,jy1);
817 dz31 = _mm_sub_ps(iz3,jz1);
818 dx32 = _mm_sub_ps(ix3,jx2);
819 dy32 = _mm_sub_ps(iy3,jy2);
820 dz32 = _mm_sub_ps(iz3,jz2);
821 dx33 = _mm_sub_ps(ix3,jx3);
822 dy33 = _mm_sub_ps(iy3,jy3);
823 dz33 = _mm_sub_ps(iz3,jz3);
825 /* Calculate squared distance and things based on it */
826 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
827 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
828 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
829 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
830 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
831 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
832 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
833 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
834 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
835 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
837 rinv00 = sse41_invsqrt_f(rsq00);
838 rinv11 = sse41_invsqrt_f(rsq11);
839 rinv12 = sse41_invsqrt_f(rsq12);
840 rinv13 = sse41_invsqrt_f(rsq13);
841 rinv21 = sse41_invsqrt_f(rsq21);
842 rinv22 = sse41_invsqrt_f(rsq22);
843 rinv23 = sse41_invsqrt_f(rsq23);
844 rinv31 = sse41_invsqrt_f(rsq31);
845 rinv32 = sse41_invsqrt_f(rsq32);
846 rinv33 = sse41_invsqrt_f(rsq33);
848 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
849 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
850 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
851 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
852 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
853 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
854 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
855 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
856 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
857 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
859 fjx0 = _mm_setzero_ps();
860 fjy0 = _mm_setzero_ps();
861 fjz0 = _mm_setzero_ps();
862 fjx1 = _mm_setzero_ps();
863 fjy1 = _mm_setzero_ps();
864 fjz1 = _mm_setzero_ps();
865 fjx2 = _mm_setzero_ps();
866 fjy2 = _mm_setzero_ps();
867 fjz2 = _mm_setzero_ps();
868 fjx3 = _mm_setzero_ps();
869 fjy3 = _mm_setzero_ps();
870 fjz3 = _mm_setzero_ps();
872 /**************************
873 * CALCULATE INTERACTIONS *
874 **************************/
876 r00 = _mm_mul_ps(rsq00,rinv00);
877 r00 = _mm_andnot_ps(dummy_mask,r00);
879 /* Analytical LJ-PME */
880 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
881 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
882 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
883 exponent = sse41_exp_f(ewcljrsq);
884 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
885 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
886 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
887 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
888 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
889 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
890 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
891 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
893 /* Update potential sum for this i atom from the interaction with this j atom. */
894 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
895 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
899 fscal = _mm_andnot_ps(dummy_mask,fscal);
901 /* Calculate temporary vectorial force */
902 tx = _mm_mul_ps(fscal,dx00);
903 ty = _mm_mul_ps(fscal,dy00);
904 tz = _mm_mul_ps(fscal,dz00);
906 /* Update vectorial force */
907 fix0 = _mm_add_ps(fix0,tx);
908 fiy0 = _mm_add_ps(fiy0,ty);
909 fiz0 = _mm_add_ps(fiz0,tz);
911 fjx0 = _mm_add_ps(fjx0,tx);
912 fjy0 = _mm_add_ps(fjy0,ty);
913 fjz0 = _mm_add_ps(fjz0,tz);
915 /**************************
916 * CALCULATE INTERACTIONS *
917 **************************/
919 r11 = _mm_mul_ps(rsq11,rinv11);
920 r11 = _mm_andnot_ps(dummy_mask,r11);
922 /* EWALD ELECTROSTATICS */
924 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
925 ewrt = _mm_mul_ps(r11,ewtabscale);
926 ewitab = _mm_cvttps_epi32(ewrt);
927 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
928 ewitab = _mm_slli_epi32(ewitab,2);
929 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
930 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
931 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
932 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
933 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
934 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
935 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
936 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
937 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
939 /* Update potential sum for this i atom from the interaction with this j atom. */
940 velec = _mm_andnot_ps(dummy_mask,velec);
941 velecsum = _mm_add_ps(velecsum,velec);
945 fscal = _mm_andnot_ps(dummy_mask,fscal);
947 /* Calculate temporary vectorial force */
948 tx = _mm_mul_ps(fscal,dx11);
949 ty = _mm_mul_ps(fscal,dy11);
950 tz = _mm_mul_ps(fscal,dz11);
952 /* Update vectorial force */
953 fix1 = _mm_add_ps(fix1,tx);
954 fiy1 = _mm_add_ps(fiy1,ty);
955 fiz1 = _mm_add_ps(fiz1,tz);
957 fjx1 = _mm_add_ps(fjx1,tx);
958 fjy1 = _mm_add_ps(fjy1,ty);
959 fjz1 = _mm_add_ps(fjz1,tz);
961 /**************************
962 * CALCULATE INTERACTIONS *
963 **************************/
965 r12 = _mm_mul_ps(rsq12,rinv12);
966 r12 = _mm_andnot_ps(dummy_mask,r12);
968 /* EWALD ELECTROSTATICS */
970 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
971 ewrt = _mm_mul_ps(r12,ewtabscale);
972 ewitab = _mm_cvttps_epi32(ewrt);
973 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
974 ewitab = _mm_slli_epi32(ewitab,2);
975 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
976 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
977 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
978 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
979 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
980 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
981 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
982 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
983 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
985 /* Update potential sum for this i atom from the interaction with this j atom. */
986 velec = _mm_andnot_ps(dummy_mask,velec);
987 velecsum = _mm_add_ps(velecsum,velec);
991 fscal = _mm_andnot_ps(dummy_mask,fscal);
993 /* Calculate temporary vectorial force */
994 tx = _mm_mul_ps(fscal,dx12);
995 ty = _mm_mul_ps(fscal,dy12);
996 tz = _mm_mul_ps(fscal,dz12);
998 /* Update vectorial force */
999 fix1 = _mm_add_ps(fix1,tx);
1000 fiy1 = _mm_add_ps(fiy1,ty);
1001 fiz1 = _mm_add_ps(fiz1,tz);
1003 fjx2 = _mm_add_ps(fjx2,tx);
1004 fjy2 = _mm_add_ps(fjy2,ty);
1005 fjz2 = _mm_add_ps(fjz2,tz);
1007 /**************************
1008 * CALCULATE INTERACTIONS *
1009 **************************/
1011 r13 = _mm_mul_ps(rsq13,rinv13);
1012 r13 = _mm_andnot_ps(dummy_mask,r13);
1014 /* EWALD ELECTROSTATICS */
1016 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1017 ewrt = _mm_mul_ps(r13,ewtabscale);
1018 ewitab = _mm_cvttps_epi32(ewrt);
1019 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1020 ewitab = _mm_slli_epi32(ewitab,2);
1021 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1022 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1023 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1024 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1025 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1026 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1027 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1028 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1029 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1031 /* Update potential sum for this i atom from the interaction with this j atom. */
1032 velec = _mm_andnot_ps(dummy_mask,velec);
1033 velecsum = _mm_add_ps(velecsum,velec);
1037 fscal = _mm_andnot_ps(dummy_mask,fscal);
1039 /* Calculate temporary vectorial force */
1040 tx = _mm_mul_ps(fscal,dx13);
1041 ty = _mm_mul_ps(fscal,dy13);
1042 tz = _mm_mul_ps(fscal,dz13);
1044 /* Update vectorial force */
1045 fix1 = _mm_add_ps(fix1,tx);
1046 fiy1 = _mm_add_ps(fiy1,ty);
1047 fiz1 = _mm_add_ps(fiz1,tz);
1049 fjx3 = _mm_add_ps(fjx3,tx);
1050 fjy3 = _mm_add_ps(fjy3,ty);
1051 fjz3 = _mm_add_ps(fjz3,tz);
1053 /**************************
1054 * CALCULATE INTERACTIONS *
1055 **************************/
1057 r21 = _mm_mul_ps(rsq21,rinv21);
1058 r21 = _mm_andnot_ps(dummy_mask,r21);
1060 /* EWALD ELECTROSTATICS */
1062 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1063 ewrt = _mm_mul_ps(r21,ewtabscale);
1064 ewitab = _mm_cvttps_epi32(ewrt);
1065 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1066 ewitab = _mm_slli_epi32(ewitab,2);
1067 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1068 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1069 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1070 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1071 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1072 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1073 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1074 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1075 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1077 /* Update potential sum for this i atom from the interaction with this j atom. */
1078 velec = _mm_andnot_ps(dummy_mask,velec);
1079 velecsum = _mm_add_ps(velecsum,velec);
1083 fscal = _mm_andnot_ps(dummy_mask,fscal);
1085 /* Calculate temporary vectorial force */
1086 tx = _mm_mul_ps(fscal,dx21);
1087 ty = _mm_mul_ps(fscal,dy21);
1088 tz = _mm_mul_ps(fscal,dz21);
1090 /* Update vectorial force */
1091 fix2 = _mm_add_ps(fix2,tx);
1092 fiy2 = _mm_add_ps(fiy2,ty);
1093 fiz2 = _mm_add_ps(fiz2,tz);
1095 fjx1 = _mm_add_ps(fjx1,tx);
1096 fjy1 = _mm_add_ps(fjy1,ty);
1097 fjz1 = _mm_add_ps(fjz1,tz);
1099 /**************************
1100 * CALCULATE INTERACTIONS *
1101 **************************/
1103 r22 = _mm_mul_ps(rsq22,rinv22);
1104 r22 = _mm_andnot_ps(dummy_mask,r22);
1106 /* EWALD ELECTROSTATICS */
1108 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1109 ewrt = _mm_mul_ps(r22,ewtabscale);
1110 ewitab = _mm_cvttps_epi32(ewrt);
1111 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1112 ewitab = _mm_slli_epi32(ewitab,2);
1113 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1114 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1115 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1116 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1117 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1118 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1119 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1120 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1121 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1123 /* Update potential sum for this i atom from the interaction with this j atom. */
1124 velec = _mm_andnot_ps(dummy_mask,velec);
1125 velecsum = _mm_add_ps(velecsum,velec);
1129 fscal = _mm_andnot_ps(dummy_mask,fscal);
1131 /* Calculate temporary vectorial force */
1132 tx = _mm_mul_ps(fscal,dx22);
1133 ty = _mm_mul_ps(fscal,dy22);
1134 tz = _mm_mul_ps(fscal,dz22);
1136 /* Update vectorial force */
1137 fix2 = _mm_add_ps(fix2,tx);
1138 fiy2 = _mm_add_ps(fiy2,ty);
1139 fiz2 = _mm_add_ps(fiz2,tz);
1141 fjx2 = _mm_add_ps(fjx2,tx);
1142 fjy2 = _mm_add_ps(fjy2,ty);
1143 fjz2 = _mm_add_ps(fjz2,tz);
1145 /**************************
1146 * CALCULATE INTERACTIONS *
1147 **************************/
1149 r23 = _mm_mul_ps(rsq23,rinv23);
1150 r23 = _mm_andnot_ps(dummy_mask,r23);
1152 /* EWALD ELECTROSTATICS */
1154 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1155 ewrt = _mm_mul_ps(r23,ewtabscale);
1156 ewitab = _mm_cvttps_epi32(ewrt);
1157 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1158 ewitab = _mm_slli_epi32(ewitab,2);
1159 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1160 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1161 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1162 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1163 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1164 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1165 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1166 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1167 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1169 /* Update potential sum for this i atom from the interaction with this j atom. */
1170 velec = _mm_andnot_ps(dummy_mask,velec);
1171 velecsum = _mm_add_ps(velecsum,velec);
1175 fscal = _mm_andnot_ps(dummy_mask,fscal);
1177 /* Calculate temporary vectorial force */
1178 tx = _mm_mul_ps(fscal,dx23);
1179 ty = _mm_mul_ps(fscal,dy23);
1180 tz = _mm_mul_ps(fscal,dz23);
1182 /* Update vectorial force */
1183 fix2 = _mm_add_ps(fix2,tx);
1184 fiy2 = _mm_add_ps(fiy2,ty);
1185 fiz2 = _mm_add_ps(fiz2,tz);
1187 fjx3 = _mm_add_ps(fjx3,tx);
1188 fjy3 = _mm_add_ps(fjy3,ty);
1189 fjz3 = _mm_add_ps(fjz3,tz);
1191 /**************************
1192 * CALCULATE INTERACTIONS *
1193 **************************/
1195 r31 = _mm_mul_ps(rsq31,rinv31);
1196 r31 = _mm_andnot_ps(dummy_mask,r31);
1198 /* EWALD ELECTROSTATICS */
1200 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1201 ewrt = _mm_mul_ps(r31,ewtabscale);
1202 ewitab = _mm_cvttps_epi32(ewrt);
1203 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1204 ewitab = _mm_slli_epi32(ewitab,2);
1205 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1206 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1207 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1208 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1209 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1210 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1211 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1212 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1213 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1215 /* Update potential sum for this i atom from the interaction with this j atom. */
1216 velec = _mm_andnot_ps(dummy_mask,velec);
1217 velecsum = _mm_add_ps(velecsum,velec);
1221 fscal = _mm_andnot_ps(dummy_mask,fscal);
1223 /* Calculate temporary vectorial force */
1224 tx = _mm_mul_ps(fscal,dx31);
1225 ty = _mm_mul_ps(fscal,dy31);
1226 tz = _mm_mul_ps(fscal,dz31);
1228 /* Update vectorial force */
1229 fix3 = _mm_add_ps(fix3,tx);
1230 fiy3 = _mm_add_ps(fiy3,ty);
1231 fiz3 = _mm_add_ps(fiz3,tz);
1233 fjx1 = _mm_add_ps(fjx1,tx);
1234 fjy1 = _mm_add_ps(fjy1,ty);
1235 fjz1 = _mm_add_ps(fjz1,tz);
1237 /**************************
1238 * CALCULATE INTERACTIONS *
1239 **************************/
1241 r32 = _mm_mul_ps(rsq32,rinv32);
1242 r32 = _mm_andnot_ps(dummy_mask,r32);
1244 /* EWALD ELECTROSTATICS */
1246 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1247 ewrt = _mm_mul_ps(r32,ewtabscale);
1248 ewitab = _mm_cvttps_epi32(ewrt);
1249 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1250 ewitab = _mm_slli_epi32(ewitab,2);
1251 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1252 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1253 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1254 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1255 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1256 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1257 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1258 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1259 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1261 /* Update potential sum for this i atom from the interaction with this j atom. */
1262 velec = _mm_andnot_ps(dummy_mask,velec);
1263 velecsum = _mm_add_ps(velecsum,velec);
1267 fscal = _mm_andnot_ps(dummy_mask,fscal);
1269 /* Calculate temporary vectorial force */
1270 tx = _mm_mul_ps(fscal,dx32);
1271 ty = _mm_mul_ps(fscal,dy32);
1272 tz = _mm_mul_ps(fscal,dz32);
1274 /* Update vectorial force */
1275 fix3 = _mm_add_ps(fix3,tx);
1276 fiy3 = _mm_add_ps(fiy3,ty);
1277 fiz3 = _mm_add_ps(fiz3,tz);
1279 fjx2 = _mm_add_ps(fjx2,tx);
1280 fjy2 = _mm_add_ps(fjy2,ty);
1281 fjz2 = _mm_add_ps(fjz2,tz);
1283 /**************************
1284 * CALCULATE INTERACTIONS *
1285 **************************/
1287 r33 = _mm_mul_ps(rsq33,rinv33);
1288 r33 = _mm_andnot_ps(dummy_mask,r33);
1290 /* EWALD ELECTROSTATICS */
1292 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1293 ewrt = _mm_mul_ps(r33,ewtabscale);
1294 ewitab = _mm_cvttps_epi32(ewrt);
1295 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1296 ewitab = _mm_slli_epi32(ewitab,2);
1297 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1298 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1299 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1300 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1301 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1302 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1303 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1304 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1305 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1307 /* Update potential sum for this i atom from the interaction with this j atom. */
1308 velec = _mm_andnot_ps(dummy_mask,velec);
1309 velecsum = _mm_add_ps(velecsum,velec);
1313 fscal = _mm_andnot_ps(dummy_mask,fscal);
1315 /* Calculate temporary vectorial force */
1316 tx = _mm_mul_ps(fscal,dx33);
1317 ty = _mm_mul_ps(fscal,dy33);
1318 tz = _mm_mul_ps(fscal,dz33);
1320 /* Update vectorial force */
1321 fix3 = _mm_add_ps(fix3,tx);
1322 fiy3 = _mm_add_ps(fiy3,ty);
1323 fiz3 = _mm_add_ps(fiz3,tz);
1325 fjx3 = _mm_add_ps(fjx3,tx);
1326 fjy3 = _mm_add_ps(fjy3,ty);
1327 fjz3 = _mm_add_ps(fjz3,tz);
1329 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1330 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1331 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1332 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1334 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1335 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1336 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1338 /* Inner loop uses 433 flops */
1341 /* End of innermost loop */
1343 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1344 f+i_coord_offset,fshift+i_shift_offset);
1347 /* Update potential energies */
1348 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1349 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1351 /* Increment number of inner iterations */
1352 inneriter += j_index_end - j_index_start;
1354 /* Outer loop uses 26 flops */
1357 /* Increment number of outer iterations */
1360 /* Update outer/inner flops */
1362 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*433);
1365 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single
1366 * Electrostatics interaction: Ewald
1367 * VdW interaction: LJEwald
1368 * Geometry: Water4-Water4
1369 * Calculate force/pot: Force
1372 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single
1373 (t_nblist * gmx_restrict nlist,
1374 rvec * gmx_restrict xx,
1375 rvec * gmx_restrict ff,
1376 struct t_forcerec * gmx_restrict fr,
1377 t_mdatoms * gmx_restrict mdatoms,
1378 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1379 t_nrnb * gmx_restrict nrnb)
1381 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1382 * just 0 for non-waters.
1383 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1384 * jnr indices corresponding to data put in the four positions in the SIMD register.
1386 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1387 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1388 int jnrA,jnrB,jnrC,jnrD;
1389 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1390 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1391 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1392 real rcutoff_scalar;
1393 real *shiftvec,*fshift,*x,*f;
1394 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1395 real scratch[4*DIM];
1396 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1398 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1400 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1402 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1404 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1405 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1406 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1407 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1408 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1409 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1410 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1411 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1412 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1413 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1414 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1415 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1416 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1417 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1418 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1419 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1420 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1421 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1422 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1423 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1426 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1429 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1430 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1441 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1443 __m128 one_half = _mm_set1_ps(0.5);
1444 __m128 minus_one = _mm_set1_ps(-1.0);
1446 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1448 __m128 dummy_mask,cutoff_mask;
1449 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1450 __m128 one = _mm_set1_ps(1.0);
1451 __m128 two = _mm_set1_ps(2.0);
1457 jindex = nlist->jindex;
1459 shiftidx = nlist->shift;
1461 shiftvec = fr->shift_vec[0];
1462 fshift = fr->fshift[0];
1463 facel = _mm_set1_ps(fr->ic->epsfac);
1464 charge = mdatoms->chargeA;
1465 nvdwtype = fr->ntype;
1466 vdwparam = fr->nbfp;
1467 vdwtype = mdatoms->typeA;
1468 vdwgridparam = fr->ljpme_c6grid;
1469 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
1470 ewclj = _mm_set1_ps(fr->ic->ewaldcoeff_lj);
1471 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1473 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1474 ewtab = fr->ic->tabq_coul_F;
1475 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1476 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1478 /* Setup water-specific parameters */
1479 inr = nlist->iinr[0];
1480 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1481 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1482 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1483 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1485 jq1 = _mm_set1_ps(charge[inr+1]);
1486 jq2 = _mm_set1_ps(charge[inr+2]);
1487 jq3 = _mm_set1_ps(charge[inr+3]);
1488 vdwjidx0A = 2*vdwtype[inr+0];
1489 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1490 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1491 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1492 qq11 = _mm_mul_ps(iq1,jq1);
1493 qq12 = _mm_mul_ps(iq1,jq2);
1494 qq13 = _mm_mul_ps(iq1,jq3);
1495 qq21 = _mm_mul_ps(iq2,jq1);
1496 qq22 = _mm_mul_ps(iq2,jq2);
1497 qq23 = _mm_mul_ps(iq2,jq3);
1498 qq31 = _mm_mul_ps(iq3,jq1);
1499 qq32 = _mm_mul_ps(iq3,jq2);
1500 qq33 = _mm_mul_ps(iq3,jq3);
1502 /* Avoid stupid compiler warnings */
1503 jnrA = jnrB = jnrC = jnrD = 0;
1504 j_coord_offsetA = 0;
1505 j_coord_offsetB = 0;
1506 j_coord_offsetC = 0;
1507 j_coord_offsetD = 0;
1512 for(iidx=0;iidx<4*DIM;iidx++)
1514 scratch[iidx] = 0.0;
1517 /* Start outer loop over neighborlists */
1518 for(iidx=0; iidx<nri; iidx++)
1520 /* Load shift vector for this list */
1521 i_shift_offset = DIM*shiftidx[iidx];
1523 /* Load limits for loop over neighbors */
1524 j_index_start = jindex[iidx];
1525 j_index_end = jindex[iidx+1];
1527 /* Get outer coordinate index */
1529 i_coord_offset = DIM*inr;
1531 /* Load i particle coords and add shift vector */
1532 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1533 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1535 fix0 = _mm_setzero_ps();
1536 fiy0 = _mm_setzero_ps();
1537 fiz0 = _mm_setzero_ps();
1538 fix1 = _mm_setzero_ps();
1539 fiy1 = _mm_setzero_ps();
1540 fiz1 = _mm_setzero_ps();
1541 fix2 = _mm_setzero_ps();
1542 fiy2 = _mm_setzero_ps();
1543 fiz2 = _mm_setzero_ps();
1544 fix3 = _mm_setzero_ps();
1545 fiy3 = _mm_setzero_ps();
1546 fiz3 = _mm_setzero_ps();
1548 /* Start inner kernel loop */
1549 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1552 /* Get j neighbor index, and coordinate index */
1554 jnrB = jjnr[jidx+1];
1555 jnrC = jjnr[jidx+2];
1556 jnrD = jjnr[jidx+3];
1557 j_coord_offsetA = DIM*jnrA;
1558 j_coord_offsetB = DIM*jnrB;
1559 j_coord_offsetC = DIM*jnrC;
1560 j_coord_offsetD = DIM*jnrD;
1562 /* load j atom coordinates */
1563 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1564 x+j_coord_offsetC,x+j_coord_offsetD,
1565 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1566 &jy2,&jz2,&jx3,&jy3,&jz3);
1568 /* Calculate displacement vector */
1569 dx00 = _mm_sub_ps(ix0,jx0);
1570 dy00 = _mm_sub_ps(iy0,jy0);
1571 dz00 = _mm_sub_ps(iz0,jz0);
1572 dx11 = _mm_sub_ps(ix1,jx1);
1573 dy11 = _mm_sub_ps(iy1,jy1);
1574 dz11 = _mm_sub_ps(iz1,jz1);
1575 dx12 = _mm_sub_ps(ix1,jx2);
1576 dy12 = _mm_sub_ps(iy1,jy2);
1577 dz12 = _mm_sub_ps(iz1,jz2);
1578 dx13 = _mm_sub_ps(ix1,jx3);
1579 dy13 = _mm_sub_ps(iy1,jy3);
1580 dz13 = _mm_sub_ps(iz1,jz3);
1581 dx21 = _mm_sub_ps(ix2,jx1);
1582 dy21 = _mm_sub_ps(iy2,jy1);
1583 dz21 = _mm_sub_ps(iz2,jz1);
1584 dx22 = _mm_sub_ps(ix2,jx2);
1585 dy22 = _mm_sub_ps(iy2,jy2);
1586 dz22 = _mm_sub_ps(iz2,jz2);
1587 dx23 = _mm_sub_ps(ix2,jx3);
1588 dy23 = _mm_sub_ps(iy2,jy3);
1589 dz23 = _mm_sub_ps(iz2,jz3);
1590 dx31 = _mm_sub_ps(ix3,jx1);
1591 dy31 = _mm_sub_ps(iy3,jy1);
1592 dz31 = _mm_sub_ps(iz3,jz1);
1593 dx32 = _mm_sub_ps(ix3,jx2);
1594 dy32 = _mm_sub_ps(iy3,jy2);
1595 dz32 = _mm_sub_ps(iz3,jz2);
1596 dx33 = _mm_sub_ps(ix3,jx3);
1597 dy33 = _mm_sub_ps(iy3,jy3);
1598 dz33 = _mm_sub_ps(iz3,jz3);
1600 /* Calculate squared distance and things based on it */
1601 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1602 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1603 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1604 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1605 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1606 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1607 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1608 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1609 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1610 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1612 rinv00 = sse41_invsqrt_f(rsq00);
1613 rinv11 = sse41_invsqrt_f(rsq11);
1614 rinv12 = sse41_invsqrt_f(rsq12);
1615 rinv13 = sse41_invsqrt_f(rsq13);
1616 rinv21 = sse41_invsqrt_f(rsq21);
1617 rinv22 = sse41_invsqrt_f(rsq22);
1618 rinv23 = sse41_invsqrt_f(rsq23);
1619 rinv31 = sse41_invsqrt_f(rsq31);
1620 rinv32 = sse41_invsqrt_f(rsq32);
1621 rinv33 = sse41_invsqrt_f(rsq33);
1623 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1624 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1625 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1626 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1627 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1628 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1629 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1630 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1631 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1632 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1634 fjx0 = _mm_setzero_ps();
1635 fjy0 = _mm_setzero_ps();
1636 fjz0 = _mm_setzero_ps();
1637 fjx1 = _mm_setzero_ps();
1638 fjy1 = _mm_setzero_ps();
1639 fjz1 = _mm_setzero_ps();
1640 fjx2 = _mm_setzero_ps();
1641 fjy2 = _mm_setzero_ps();
1642 fjz2 = _mm_setzero_ps();
1643 fjx3 = _mm_setzero_ps();
1644 fjy3 = _mm_setzero_ps();
1645 fjz3 = _mm_setzero_ps();
1647 /**************************
1648 * CALCULATE INTERACTIONS *
1649 **************************/
1651 r00 = _mm_mul_ps(rsq00,rinv00);
1653 /* Analytical LJ-PME */
1654 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1655 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1656 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1657 exponent = sse41_exp_f(ewcljrsq);
1658 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1659 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1660 /* f6A = 6 * C6grid * (1 - poly) */
1661 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1662 /* f6B = C6grid * exponent * beta^6 */
1663 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1664 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1665 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1669 /* Calculate temporary vectorial force */
1670 tx = _mm_mul_ps(fscal,dx00);
1671 ty = _mm_mul_ps(fscal,dy00);
1672 tz = _mm_mul_ps(fscal,dz00);
1674 /* Update vectorial force */
1675 fix0 = _mm_add_ps(fix0,tx);
1676 fiy0 = _mm_add_ps(fiy0,ty);
1677 fiz0 = _mm_add_ps(fiz0,tz);
1679 fjx0 = _mm_add_ps(fjx0,tx);
1680 fjy0 = _mm_add_ps(fjy0,ty);
1681 fjz0 = _mm_add_ps(fjz0,tz);
1683 /**************************
1684 * CALCULATE INTERACTIONS *
1685 **************************/
1687 r11 = _mm_mul_ps(rsq11,rinv11);
1689 /* EWALD ELECTROSTATICS */
1691 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1692 ewrt = _mm_mul_ps(r11,ewtabscale);
1693 ewitab = _mm_cvttps_epi32(ewrt);
1694 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1695 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1696 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1698 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1699 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1703 /* Calculate temporary vectorial force */
1704 tx = _mm_mul_ps(fscal,dx11);
1705 ty = _mm_mul_ps(fscal,dy11);
1706 tz = _mm_mul_ps(fscal,dz11);
1708 /* Update vectorial force */
1709 fix1 = _mm_add_ps(fix1,tx);
1710 fiy1 = _mm_add_ps(fiy1,ty);
1711 fiz1 = _mm_add_ps(fiz1,tz);
1713 fjx1 = _mm_add_ps(fjx1,tx);
1714 fjy1 = _mm_add_ps(fjy1,ty);
1715 fjz1 = _mm_add_ps(fjz1,tz);
1717 /**************************
1718 * CALCULATE INTERACTIONS *
1719 **************************/
1721 r12 = _mm_mul_ps(rsq12,rinv12);
1723 /* EWALD ELECTROSTATICS */
1725 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1726 ewrt = _mm_mul_ps(r12,ewtabscale);
1727 ewitab = _mm_cvttps_epi32(ewrt);
1728 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1729 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1730 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1732 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1733 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1737 /* Calculate temporary vectorial force */
1738 tx = _mm_mul_ps(fscal,dx12);
1739 ty = _mm_mul_ps(fscal,dy12);
1740 tz = _mm_mul_ps(fscal,dz12);
1742 /* Update vectorial force */
1743 fix1 = _mm_add_ps(fix1,tx);
1744 fiy1 = _mm_add_ps(fiy1,ty);
1745 fiz1 = _mm_add_ps(fiz1,tz);
1747 fjx2 = _mm_add_ps(fjx2,tx);
1748 fjy2 = _mm_add_ps(fjy2,ty);
1749 fjz2 = _mm_add_ps(fjz2,tz);
1751 /**************************
1752 * CALCULATE INTERACTIONS *
1753 **************************/
1755 r13 = _mm_mul_ps(rsq13,rinv13);
1757 /* EWALD ELECTROSTATICS */
1759 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1760 ewrt = _mm_mul_ps(r13,ewtabscale);
1761 ewitab = _mm_cvttps_epi32(ewrt);
1762 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1763 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1764 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1766 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1767 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1771 /* Calculate temporary vectorial force */
1772 tx = _mm_mul_ps(fscal,dx13);
1773 ty = _mm_mul_ps(fscal,dy13);
1774 tz = _mm_mul_ps(fscal,dz13);
1776 /* Update vectorial force */
1777 fix1 = _mm_add_ps(fix1,tx);
1778 fiy1 = _mm_add_ps(fiy1,ty);
1779 fiz1 = _mm_add_ps(fiz1,tz);
1781 fjx3 = _mm_add_ps(fjx3,tx);
1782 fjy3 = _mm_add_ps(fjy3,ty);
1783 fjz3 = _mm_add_ps(fjz3,tz);
1785 /**************************
1786 * CALCULATE INTERACTIONS *
1787 **************************/
1789 r21 = _mm_mul_ps(rsq21,rinv21);
1791 /* EWALD ELECTROSTATICS */
1793 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1794 ewrt = _mm_mul_ps(r21,ewtabscale);
1795 ewitab = _mm_cvttps_epi32(ewrt);
1796 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1797 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1798 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1800 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1801 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1805 /* Calculate temporary vectorial force */
1806 tx = _mm_mul_ps(fscal,dx21);
1807 ty = _mm_mul_ps(fscal,dy21);
1808 tz = _mm_mul_ps(fscal,dz21);
1810 /* Update vectorial force */
1811 fix2 = _mm_add_ps(fix2,tx);
1812 fiy2 = _mm_add_ps(fiy2,ty);
1813 fiz2 = _mm_add_ps(fiz2,tz);
1815 fjx1 = _mm_add_ps(fjx1,tx);
1816 fjy1 = _mm_add_ps(fjy1,ty);
1817 fjz1 = _mm_add_ps(fjz1,tz);
1819 /**************************
1820 * CALCULATE INTERACTIONS *
1821 **************************/
1823 r22 = _mm_mul_ps(rsq22,rinv22);
1825 /* EWALD ELECTROSTATICS */
1827 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1828 ewrt = _mm_mul_ps(r22,ewtabscale);
1829 ewitab = _mm_cvttps_epi32(ewrt);
1830 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1831 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1832 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1834 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1835 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1839 /* Calculate temporary vectorial force */
1840 tx = _mm_mul_ps(fscal,dx22);
1841 ty = _mm_mul_ps(fscal,dy22);
1842 tz = _mm_mul_ps(fscal,dz22);
1844 /* Update vectorial force */
1845 fix2 = _mm_add_ps(fix2,tx);
1846 fiy2 = _mm_add_ps(fiy2,ty);
1847 fiz2 = _mm_add_ps(fiz2,tz);
1849 fjx2 = _mm_add_ps(fjx2,tx);
1850 fjy2 = _mm_add_ps(fjy2,ty);
1851 fjz2 = _mm_add_ps(fjz2,tz);
1853 /**************************
1854 * CALCULATE INTERACTIONS *
1855 **************************/
1857 r23 = _mm_mul_ps(rsq23,rinv23);
1859 /* EWALD ELECTROSTATICS */
1861 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1862 ewrt = _mm_mul_ps(r23,ewtabscale);
1863 ewitab = _mm_cvttps_epi32(ewrt);
1864 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1865 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1866 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1868 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1869 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1873 /* Calculate temporary vectorial force */
1874 tx = _mm_mul_ps(fscal,dx23);
1875 ty = _mm_mul_ps(fscal,dy23);
1876 tz = _mm_mul_ps(fscal,dz23);
1878 /* Update vectorial force */
1879 fix2 = _mm_add_ps(fix2,tx);
1880 fiy2 = _mm_add_ps(fiy2,ty);
1881 fiz2 = _mm_add_ps(fiz2,tz);
1883 fjx3 = _mm_add_ps(fjx3,tx);
1884 fjy3 = _mm_add_ps(fjy3,ty);
1885 fjz3 = _mm_add_ps(fjz3,tz);
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 r31 = _mm_mul_ps(rsq31,rinv31);
1893 /* EWALD ELECTROSTATICS */
1895 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1896 ewrt = _mm_mul_ps(r31,ewtabscale);
1897 ewitab = _mm_cvttps_epi32(ewrt);
1898 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1899 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1900 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1902 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1903 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1907 /* Calculate temporary vectorial force */
1908 tx = _mm_mul_ps(fscal,dx31);
1909 ty = _mm_mul_ps(fscal,dy31);
1910 tz = _mm_mul_ps(fscal,dz31);
1912 /* Update vectorial force */
1913 fix3 = _mm_add_ps(fix3,tx);
1914 fiy3 = _mm_add_ps(fiy3,ty);
1915 fiz3 = _mm_add_ps(fiz3,tz);
1917 fjx1 = _mm_add_ps(fjx1,tx);
1918 fjy1 = _mm_add_ps(fjy1,ty);
1919 fjz1 = _mm_add_ps(fjz1,tz);
1921 /**************************
1922 * CALCULATE INTERACTIONS *
1923 **************************/
1925 r32 = _mm_mul_ps(rsq32,rinv32);
1927 /* EWALD ELECTROSTATICS */
1929 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1930 ewrt = _mm_mul_ps(r32,ewtabscale);
1931 ewitab = _mm_cvttps_epi32(ewrt);
1932 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1933 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1934 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1936 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1937 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1941 /* Calculate temporary vectorial force */
1942 tx = _mm_mul_ps(fscal,dx32);
1943 ty = _mm_mul_ps(fscal,dy32);
1944 tz = _mm_mul_ps(fscal,dz32);
1946 /* Update vectorial force */
1947 fix3 = _mm_add_ps(fix3,tx);
1948 fiy3 = _mm_add_ps(fiy3,ty);
1949 fiz3 = _mm_add_ps(fiz3,tz);
1951 fjx2 = _mm_add_ps(fjx2,tx);
1952 fjy2 = _mm_add_ps(fjy2,ty);
1953 fjz2 = _mm_add_ps(fjz2,tz);
1955 /**************************
1956 * CALCULATE INTERACTIONS *
1957 **************************/
1959 r33 = _mm_mul_ps(rsq33,rinv33);
1961 /* EWALD ELECTROSTATICS */
1963 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1964 ewrt = _mm_mul_ps(r33,ewtabscale);
1965 ewitab = _mm_cvttps_epi32(ewrt);
1966 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1967 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1968 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1970 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1971 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1975 /* Calculate temporary vectorial force */
1976 tx = _mm_mul_ps(fscal,dx33);
1977 ty = _mm_mul_ps(fscal,dy33);
1978 tz = _mm_mul_ps(fscal,dz33);
1980 /* Update vectorial force */
1981 fix3 = _mm_add_ps(fix3,tx);
1982 fiy3 = _mm_add_ps(fiy3,ty);
1983 fiz3 = _mm_add_ps(fiz3,tz);
1985 fjx3 = _mm_add_ps(fjx3,tx);
1986 fjy3 = _mm_add_ps(fjy3,ty);
1987 fjz3 = _mm_add_ps(fjz3,tz);
1989 fjptrA = f+j_coord_offsetA;
1990 fjptrB = f+j_coord_offsetB;
1991 fjptrC = f+j_coord_offsetC;
1992 fjptrD = f+j_coord_offsetD;
1994 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1995 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1996 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1998 /* Inner loop uses 373 flops */
2001 if(jidx<j_index_end)
2004 /* Get j neighbor index, and coordinate index */
2005 jnrlistA = jjnr[jidx];
2006 jnrlistB = jjnr[jidx+1];
2007 jnrlistC = jjnr[jidx+2];
2008 jnrlistD = jjnr[jidx+3];
2009 /* Sign of each element will be negative for non-real atoms.
2010 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2011 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2013 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2014 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2015 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2016 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2017 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2018 j_coord_offsetA = DIM*jnrA;
2019 j_coord_offsetB = DIM*jnrB;
2020 j_coord_offsetC = DIM*jnrC;
2021 j_coord_offsetD = DIM*jnrD;
2023 /* load j atom coordinates */
2024 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2025 x+j_coord_offsetC,x+j_coord_offsetD,
2026 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2027 &jy2,&jz2,&jx3,&jy3,&jz3);
2029 /* Calculate displacement vector */
2030 dx00 = _mm_sub_ps(ix0,jx0);
2031 dy00 = _mm_sub_ps(iy0,jy0);
2032 dz00 = _mm_sub_ps(iz0,jz0);
2033 dx11 = _mm_sub_ps(ix1,jx1);
2034 dy11 = _mm_sub_ps(iy1,jy1);
2035 dz11 = _mm_sub_ps(iz1,jz1);
2036 dx12 = _mm_sub_ps(ix1,jx2);
2037 dy12 = _mm_sub_ps(iy1,jy2);
2038 dz12 = _mm_sub_ps(iz1,jz2);
2039 dx13 = _mm_sub_ps(ix1,jx3);
2040 dy13 = _mm_sub_ps(iy1,jy3);
2041 dz13 = _mm_sub_ps(iz1,jz3);
2042 dx21 = _mm_sub_ps(ix2,jx1);
2043 dy21 = _mm_sub_ps(iy2,jy1);
2044 dz21 = _mm_sub_ps(iz2,jz1);
2045 dx22 = _mm_sub_ps(ix2,jx2);
2046 dy22 = _mm_sub_ps(iy2,jy2);
2047 dz22 = _mm_sub_ps(iz2,jz2);
2048 dx23 = _mm_sub_ps(ix2,jx3);
2049 dy23 = _mm_sub_ps(iy2,jy3);
2050 dz23 = _mm_sub_ps(iz2,jz3);
2051 dx31 = _mm_sub_ps(ix3,jx1);
2052 dy31 = _mm_sub_ps(iy3,jy1);
2053 dz31 = _mm_sub_ps(iz3,jz1);
2054 dx32 = _mm_sub_ps(ix3,jx2);
2055 dy32 = _mm_sub_ps(iy3,jy2);
2056 dz32 = _mm_sub_ps(iz3,jz2);
2057 dx33 = _mm_sub_ps(ix3,jx3);
2058 dy33 = _mm_sub_ps(iy3,jy3);
2059 dz33 = _mm_sub_ps(iz3,jz3);
2061 /* Calculate squared distance and things based on it */
2062 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2063 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2064 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2065 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2066 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2067 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2068 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2069 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2070 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2071 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2073 rinv00 = sse41_invsqrt_f(rsq00);
2074 rinv11 = sse41_invsqrt_f(rsq11);
2075 rinv12 = sse41_invsqrt_f(rsq12);
2076 rinv13 = sse41_invsqrt_f(rsq13);
2077 rinv21 = sse41_invsqrt_f(rsq21);
2078 rinv22 = sse41_invsqrt_f(rsq22);
2079 rinv23 = sse41_invsqrt_f(rsq23);
2080 rinv31 = sse41_invsqrt_f(rsq31);
2081 rinv32 = sse41_invsqrt_f(rsq32);
2082 rinv33 = sse41_invsqrt_f(rsq33);
2084 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2085 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2086 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2087 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2088 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2089 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2090 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2091 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2092 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2093 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2095 fjx0 = _mm_setzero_ps();
2096 fjy0 = _mm_setzero_ps();
2097 fjz0 = _mm_setzero_ps();
2098 fjx1 = _mm_setzero_ps();
2099 fjy1 = _mm_setzero_ps();
2100 fjz1 = _mm_setzero_ps();
2101 fjx2 = _mm_setzero_ps();
2102 fjy2 = _mm_setzero_ps();
2103 fjz2 = _mm_setzero_ps();
2104 fjx3 = _mm_setzero_ps();
2105 fjy3 = _mm_setzero_ps();
2106 fjz3 = _mm_setzero_ps();
2108 /**************************
2109 * CALCULATE INTERACTIONS *
2110 **************************/
2112 r00 = _mm_mul_ps(rsq00,rinv00);
2113 r00 = _mm_andnot_ps(dummy_mask,r00);
2115 /* Analytical LJ-PME */
2116 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2117 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
2118 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
2119 exponent = sse41_exp_f(ewcljrsq);
2120 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2121 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2122 /* f6A = 6 * C6grid * (1 - poly) */
2123 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
2124 /* f6B = C6grid * exponent * beta^6 */
2125 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
2126 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2127 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2131 fscal = _mm_andnot_ps(dummy_mask,fscal);
2133 /* Calculate temporary vectorial force */
2134 tx = _mm_mul_ps(fscal,dx00);
2135 ty = _mm_mul_ps(fscal,dy00);
2136 tz = _mm_mul_ps(fscal,dz00);
2138 /* Update vectorial force */
2139 fix0 = _mm_add_ps(fix0,tx);
2140 fiy0 = _mm_add_ps(fiy0,ty);
2141 fiz0 = _mm_add_ps(fiz0,tz);
2143 fjx0 = _mm_add_ps(fjx0,tx);
2144 fjy0 = _mm_add_ps(fjy0,ty);
2145 fjz0 = _mm_add_ps(fjz0,tz);
2147 /**************************
2148 * CALCULATE INTERACTIONS *
2149 **************************/
2151 r11 = _mm_mul_ps(rsq11,rinv11);
2152 r11 = _mm_andnot_ps(dummy_mask,r11);
2154 /* EWALD ELECTROSTATICS */
2156 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2157 ewrt = _mm_mul_ps(r11,ewtabscale);
2158 ewitab = _mm_cvttps_epi32(ewrt);
2159 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2160 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2161 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2163 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2164 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2168 fscal = _mm_andnot_ps(dummy_mask,fscal);
2170 /* Calculate temporary vectorial force */
2171 tx = _mm_mul_ps(fscal,dx11);
2172 ty = _mm_mul_ps(fscal,dy11);
2173 tz = _mm_mul_ps(fscal,dz11);
2175 /* Update vectorial force */
2176 fix1 = _mm_add_ps(fix1,tx);
2177 fiy1 = _mm_add_ps(fiy1,ty);
2178 fiz1 = _mm_add_ps(fiz1,tz);
2180 fjx1 = _mm_add_ps(fjx1,tx);
2181 fjy1 = _mm_add_ps(fjy1,ty);
2182 fjz1 = _mm_add_ps(fjz1,tz);
2184 /**************************
2185 * CALCULATE INTERACTIONS *
2186 **************************/
2188 r12 = _mm_mul_ps(rsq12,rinv12);
2189 r12 = _mm_andnot_ps(dummy_mask,r12);
2191 /* EWALD ELECTROSTATICS */
2193 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2194 ewrt = _mm_mul_ps(r12,ewtabscale);
2195 ewitab = _mm_cvttps_epi32(ewrt);
2196 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2197 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2198 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2200 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2201 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2205 fscal = _mm_andnot_ps(dummy_mask,fscal);
2207 /* Calculate temporary vectorial force */
2208 tx = _mm_mul_ps(fscal,dx12);
2209 ty = _mm_mul_ps(fscal,dy12);
2210 tz = _mm_mul_ps(fscal,dz12);
2212 /* Update vectorial force */
2213 fix1 = _mm_add_ps(fix1,tx);
2214 fiy1 = _mm_add_ps(fiy1,ty);
2215 fiz1 = _mm_add_ps(fiz1,tz);
2217 fjx2 = _mm_add_ps(fjx2,tx);
2218 fjy2 = _mm_add_ps(fjy2,ty);
2219 fjz2 = _mm_add_ps(fjz2,tz);
2221 /**************************
2222 * CALCULATE INTERACTIONS *
2223 **************************/
2225 r13 = _mm_mul_ps(rsq13,rinv13);
2226 r13 = _mm_andnot_ps(dummy_mask,r13);
2228 /* EWALD ELECTROSTATICS */
2230 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2231 ewrt = _mm_mul_ps(r13,ewtabscale);
2232 ewitab = _mm_cvttps_epi32(ewrt);
2233 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2234 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2235 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2237 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2238 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2242 fscal = _mm_andnot_ps(dummy_mask,fscal);
2244 /* Calculate temporary vectorial force */
2245 tx = _mm_mul_ps(fscal,dx13);
2246 ty = _mm_mul_ps(fscal,dy13);
2247 tz = _mm_mul_ps(fscal,dz13);
2249 /* Update vectorial force */
2250 fix1 = _mm_add_ps(fix1,tx);
2251 fiy1 = _mm_add_ps(fiy1,ty);
2252 fiz1 = _mm_add_ps(fiz1,tz);
2254 fjx3 = _mm_add_ps(fjx3,tx);
2255 fjy3 = _mm_add_ps(fjy3,ty);
2256 fjz3 = _mm_add_ps(fjz3,tz);
2258 /**************************
2259 * CALCULATE INTERACTIONS *
2260 **************************/
2262 r21 = _mm_mul_ps(rsq21,rinv21);
2263 r21 = _mm_andnot_ps(dummy_mask,r21);
2265 /* EWALD ELECTROSTATICS */
2267 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2268 ewrt = _mm_mul_ps(r21,ewtabscale);
2269 ewitab = _mm_cvttps_epi32(ewrt);
2270 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2271 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2272 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2274 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2275 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2279 fscal = _mm_andnot_ps(dummy_mask,fscal);
2281 /* Calculate temporary vectorial force */
2282 tx = _mm_mul_ps(fscal,dx21);
2283 ty = _mm_mul_ps(fscal,dy21);
2284 tz = _mm_mul_ps(fscal,dz21);
2286 /* Update vectorial force */
2287 fix2 = _mm_add_ps(fix2,tx);
2288 fiy2 = _mm_add_ps(fiy2,ty);
2289 fiz2 = _mm_add_ps(fiz2,tz);
2291 fjx1 = _mm_add_ps(fjx1,tx);
2292 fjy1 = _mm_add_ps(fjy1,ty);
2293 fjz1 = _mm_add_ps(fjz1,tz);
2295 /**************************
2296 * CALCULATE INTERACTIONS *
2297 **************************/
2299 r22 = _mm_mul_ps(rsq22,rinv22);
2300 r22 = _mm_andnot_ps(dummy_mask,r22);
2302 /* EWALD ELECTROSTATICS */
2304 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2305 ewrt = _mm_mul_ps(r22,ewtabscale);
2306 ewitab = _mm_cvttps_epi32(ewrt);
2307 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2308 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2309 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2311 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2312 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2316 fscal = _mm_andnot_ps(dummy_mask,fscal);
2318 /* Calculate temporary vectorial force */
2319 tx = _mm_mul_ps(fscal,dx22);
2320 ty = _mm_mul_ps(fscal,dy22);
2321 tz = _mm_mul_ps(fscal,dz22);
2323 /* Update vectorial force */
2324 fix2 = _mm_add_ps(fix2,tx);
2325 fiy2 = _mm_add_ps(fiy2,ty);
2326 fiz2 = _mm_add_ps(fiz2,tz);
2328 fjx2 = _mm_add_ps(fjx2,tx);
2329 fjy2 = _mm_add_ps(fjy2,ty);
2330 fjz2 = _mm_add_ps(fjz2,tz);
2332 /**************************
2333 * CALCULATE INTERACTIONS *
2334 **************************/
2336 r23 = _mm_mul_ps(rsq23,rinv23);
2337 r23 = _mm_andnot_ps(dummy_mask,r23);
2339 /* EWALD ELECTROSTATICS */
2341 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2342 ewrt = _mm_mul_ps(r23,ewtabscale);
2343 ewitab = _mm_cvttps_epi32(ewrt);
2344 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2345 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2346 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2348 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2349 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2353 fscal = _mm_andnot_ps(dummy_mask,fscal);
2355 /* Calculate temporary vectorial force */
2356 tx = _mm_mul_ps(fscal,dx23);
2357 ty = _mm_mul_ps(fscal,dy23);
2358 tz = _mm_mul_ps(fscal,dz23);
2360 /* Update vectorial force */
2361 fix2 = _mm_add_ps(fix2,tx);
2362 fiy2 = _mm_add_ps(fiy2,ty);
2363 fiz2 = _mm_add_ps(fiz2,tz);
2365 fjx3 = _mm_add_ps(fjx3,tx);
2366 fjy3 = _mm_add_ps(fjy3,ty);
2367 fjz3 = _mm_add_ps(fjz3,tz);
2369 /**************************
2370 * CALCULATE INTERACTIONS *
2371 **************************/
2373 r31 = _mm_mul_ps(rsq31,rinv31);
2374 r31 = _mm_andnot_ps(dummy_mask,r31);
2376 /* EWALD ELECTROSTATICS */
2378 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2379 ewrt = _mm_mul_ps(r31,ewtabscale);
2380 ewitab = _mm_cvttps_epi32(ewrt);
2381 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2382 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2383 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2385 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2386 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2390 fscal = _mm_andnot_ps(dummy_mask,fscal);
2392 /* Calculate temporary vectorial force */
2393 tx = _mm_mul_ps(fscal,dx31);
2394 ty = _mm_mul_ps(fscal,dy31);
2395 tz = _mm_mul_ps(fscal,dz31);
2397 /* Update vectorial force */
2398 fix3 = _mm_add_ps(fix3,tx);
2399 fiy3 = _mm_add_ps(fiy3,ty);
2400 fiz3 = _mm_add_ps(fiz3,tz);
2402 fjx1 = _mm_add_ps(fjx1,tx);
2403 fjy1 = _mm_add_ps(fjy1,ty);
2404 fjz1 = _mm_add_ps(fjz1,tz);
2406 /**************************
2407 * CALCULATE INTERACTIONS *
2408 **************************/
2410 r32 = _mm_mul_ps(rsq32,rinv32);
2411 r32 = _mm_andnot_ps(dummy_mask,r32);
2413 /* EWALD ELECTROSTATICS */
2415 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2416 ewrt = _mm_mul_ps(r32,ewtabscale);
2417 ewitab = _mm_cvttps_epi32(ewrt);
2418 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2419 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2420 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2422 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2423 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2427 fscal = _mm_andnot_ps(dummy_mask,fscal);
2429 /* Calculate temporary vectorial force */
2430 tx = _mm_mul_ps(fscal,dx32);
2431 ty = _mm_mul_ps(fscal,dy32);
2432 tz = _mm_mul_ps(fscal,dz32);
2434 /* Update vectorial force */
2435 fix3 = _mm_add_ps(fix3,tx);
2436 fiy3 = _mm_add_ps(fiy3,ty);
2437 fiz3 = _mm_add_ps(fiz3,tz);
2439 fjx2 = _mm_add_ps(fjx2,tx);
2440 fjy2 = _mm_add_ps(fjy2,ty);
2441 fjz2 = _mm_add_ps(fjz2,tz);
2443 /**************************
2444 * CALCULATE INTERACTIONS *
2445 **************************/
2447 r33 = _mm_mul_ps(rsq33,rinv33);
2448 r33 = _mm_andnot_ps(dummy_mask,r33);
2450 /* EWALD ELECTROSTATICS */
2452 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2453 ewrt = _mm_mul_ps(r33,ewtabscale);
2454 ewitab = _mm_cvttps_epi32(ewrt);
2455 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2456 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2457 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2459 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2460 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2464 fscal = _mm_andnot_ps(dummy_mask,fscal);
2466 /* Calculate temporary vectorial force */
2467 tx = _mm_mul_ps(fscal,dx33);
2468 ty = _mm_mul_ps(fscal,dy33);
2469 tz = _mm_mul_ps(fscal,dz33);
2471 /* Update vectorial force */
2472 fix3 = _mm_add_ps(fix3,tx);
2473 fiy3 = _mm_add_ps(fiy3,ty);
2474 fiz3 = _mm_add_ps(fiz3,tz);
2476 fjx3 = _mm_add_ps(fjx3,tx);
2477 fjy3 = _mm_add_ps(fjy3,ty);
2478 fjz3 = _mm_add_ps(fjz3,tz);
2480 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2481 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2482 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2483 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2485 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2486 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2487 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2489 /* Inner loop uses 383 flops */
2492 /* End of innermost loop */
2494 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2495 f+i_coord_offset,fshift+i_shift_offset);
2497 /* Increment number of inner iterations */
2498 inneriter += j_index_end - j_index_start;
2500 /* Outer loop uses 24 flops */
2503 /* Increment number of outer iterations */
2506 /* Update outer/inner flops */
2508 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*383);