2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LJEwald
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_sse4_1_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
127 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
129 __m128 one_half = _mm_set1_ps(0.5);
130 __m128 minus_one = _mm_set1_ps(-1.0);
132 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
134 __m128 dummy_mask,cutoff_mask;
135 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
136 __m128 one = _mm_set1_ps(1.0);
137 __m128 two = _mm_set1_ps(2.0);
143 jindex = nlist->jindex;
145 shiftidx = nlist->shift;
147 shiftvec = fr->shift_vec[0];
148 fshift = fr->fshift[0];
149 facel = _mm_set1_ps(fr->epsfac);
150 charge = mdatoms->chargeA;
151 nvdwtype = fr->ntype;
153 vdwtype = mdatoms->typeA;
154 vdwgridparam = fr->ljpme_c6grid;
155 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
156 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
157 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
159 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
160 ewtab = fr->ic->tabq_coul_FDV0;
161 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
162 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
164 /* Setup water-specific parameters */
165 inr = nlist->iinr[0];
166 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
167 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
168 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
169 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
171 jq1 = _mm_set1_ps(charge[inr+1]);
172 jq2 = _mm_set1_ps(charge[inr+2]);
173 jq3 = _mm_set1_ps(charge[inr+3]);
174 vdwjidx0A = 2*vdwtype[inr+0];
175 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
176 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
177 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
178 qq11 = _mm_mul_ps(iq1,jq1);
179 qq12 = _mm_mul_ps(iq1,jq2);
180 qq13 = _mm_mul_ps(iq1,jq3);
181 qq21 = _mm_mul_ps(iq2,jq1);
182 qq22 = _mm_mul_ps(iq2,jq2);
183 qq23 = _mm_mul_ps(iq2,jq3);
184 qq31 = _mm_mul_ps(iq3,jq1);
185 qq32 = _mm_mul_ps(iq3,jq2);
186 qq33 = _mm_mul_ps(iq3,jq3);
188 /* Avoid stupid compiler warnings */
189 jnrA = jnrB = jnrC = jnrD = 0;
198 for(iidx=0;iidx<4*DIM;iidx++)
203 /* Start outer loop over neighborlists */
204 for(iidx=0; iidx<nri; iidx++)
206 /* Load shift vector for this list */
207 i_shift_offset = DIM*shiftidx[iidx];
209 /* Load limits for loop over neighbors */
210 j_index_start = jindex[iidx];
211 j_index_end = jindex[iidx+1];
213 /* Get outer coordinate index */
215 i_coord_offset = DIM*inr;
217 /* Load i particle coords and add shift vector */
218 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
219 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
221 fix0 = _mm_setzero_ps();
222 fiy0 = _mm_setzero_ps();
223 fiz0 = _mm_setzero_ps();
224 fix1 = _mm_setzero_ps();
225 fiy1 = _mm_setzero_ps();
226 fiz1 = _mm_setzero_ps();
227 fix2 = _mm_setzero_ps();
228 fiy2 = _mm_setzero_ps();
229 fiz2 = _mm_setzero_ps();
230 fix3 = _mm_setzero_ps();
231 fiy3 = _mm_setzero_ps();
232 fiz3 = _mm_setzero_ps();
234 /* Reset potential sums */
235 velecsum = _mm_setzero_ps();
236 vvdwsum = _mm_setzero_ps();
238 /* Start inner kernel loop */
239 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
242 /* Get j neighbor index, and coordinate index */
247 j_coord_offsetA = DIM*jnrA;
248 j_coord_offsetB = DIM*jnrB;
249 j_coord_offsetC = DIM*jnrC;
250 j_coord_offsetD = DIM*jnrD;
252 /* load j atom coordinates */
253 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
254 x+j_coord_offsetC,x+j_coord_offsetD,
255 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
256 &jy2,&jz2,&jx3,&jy3,&jz3);
258 /* Calculate displacement vector */
259 dx00 = _mm_sub_ps(ix0,jx0);
260 dy00 = _mm_sub_ps(iy0,jy0);
261 dz00 = _mm_sub_ps(iz0,jz0);
262 dx11 = _mm_sub_ps(ix1,jx1);
263 dy11 = _mm_sub_ps(iy1,jy1);
264 dz11 = _mm_sub_ps(iz1,jz1);
265 dx12 = _mm_sub_ps(ix1,jx2);
266 dy12 = _mm_sub_ps(iy1,jy2);
267 dz12 = _mm_sub_ps(iz1,jz2);
268 dx13 = _mm_sub_ps(ix1,jx3);
269 dy13 = _mm_sub_ps(iy1,jy3);
270 dz13 = _mm_sub_ps(iz1,jz3);
271 dx21 = _mm_sub_ps(ix2,jx1);
272 dy21 = _mm_sub_ps(iy2,jy1);
273 dz21 = _mm_sub_ps(iz2,jz1);
274 dx22 = _mm_sub_ps(ix2,jx2);
275 dy22 = _mm_sub_ps(iy2,jy2);
276 dz22 = _mm_sub_ps(iz2,jz2);
277 dx23 = _mm_sub_ps(ix2,jx3);
278 dy23 = _mm_sub_ps(iy2,jy3);
279 dz23 = _mm_sub_ps(iz2,jz3);
280 dx31 = _mm_sub_ps(ix3,jx1);
281 dy31 = _mm_sub_ps(iy3,jy1);
282 dz31 = _mm_sub_ps(iz3,jz1);
283 dx32 = _mm_sub_ps(ix3,jx2);
284 dy32 = _mm_sub_ps(iy3,jy2);
285 dz32 = _mm_sub_ps(iz3,jz2);
286 dx33 = _mm_sub_ps(ix3,jx3);
287 dy33 = _mm_sub_ps(iy3,jy3);
288 dz33 = _mm_sub_ps(iz3,jz3);
290 /* Calculate squared distance and things based on it */
291 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
292 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
293 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
294 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
295 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
296 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
297 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
298 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
299 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
300 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
302 rinv00 = gmx_mm_invsqrt_ps(rsq00);
303 rinv11 = gmx_mm_invsqrt_ps(rsq11);
304 rinv12 = gmx_mm_invsqrt_ps(rsq12);
305 rinv13 = gmx_mm_invsqrt_ps(rsq13);
306 rinv21 = gmx_mm_invsqrt_ps(rsq21);
307 rinv22 = gmx_mm_invsqrt_ps(rsq22);
308 rinv23 = gmx_mm_invsqrt_ps(rsq23);
309 rinv31 = gmx_mm_invsqrt_ps(rsq31);
310 rinv32 = gmx_mm_invsqrt_ps(rsq32);
311 rinv33 = gmx_mm_invsqrt_ps(rsq33);
313 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
314 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
315 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
316 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
317 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
318 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
319 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
320 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
321 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
322 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
324 fjx0 = _mm_setzero_ps();
325 fjy0 = _mm_setzero_ps();
326 fjz0 = _mm_setzero_ps();
327 fjx1 = _mm_setzero_ps();
328 fjy1 = _mm_setzero_ps();
329 fjz1 = _mm_setzero_ps();
330 fjx2 = _mm_setzero_ps();
331 fjy2 = _mm_setzero_ps();
332 fjz2 = _mm_setzero_ps();
333 fjx3 = _mm_setzero_ps();
334 fjy3 = _mm_setzero_ps();
335 fjz3 = _mm_setzero_ps();
337 /**************************
338 * CALCULATE INTERACTIONS *
339 **************************/
341 r00 = _mm_mul_ps(rsq00,rinv00);
343 /* Analytical LJ-PME */
344 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
345 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
346 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
347 exponent = gmx_simd_exp_r(ewcljrsq);
348 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
349 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
350 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
351 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
352 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
353 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
354 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
355 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
362 /* Calculate temporary vectorial force */
363 tx = _mm_mul_ps(fscal,dx00);
364 ty = _mm_mul_ps(fscal,dy00);
365 tz = _mm_mul_ps(fscal,dz00);
367 /* Update vectorial force */
368 fix0 = _mm_add_ps(fix0,tx);
369 fiy0 = _mm_add_ps(fiy0,ty);
370 fiz0 = _mm_add_ps(fiz0,tz);
372 fjx0 = _mm_add_ps(fjx0,tx);
373 fjy0 = _mm_add_ps(fjy0,ty);
374 fjz0 = _mm_add_ps(fjz0,tz);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r11 = _mm_mul_ps(rsq11,rinv11);
382 /* EWALD ELECTROSTATICS */
384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
385 ewrt = _mm_mul_ps(r11,ewtabscale);
386 ewitab = _mm_cvttps_epi32(ewrt);
387 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
388 ewitab = _mm_slli_epi32(ewitab,2);
389 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
390 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
391 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
392 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
393 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
394 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
395 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
396 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
397 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velecsum = _mm_add_ps(velecsum,velec);
404 /* Calculate temporary vectorial force */
405 tx = _mm_mul_ps(fscal,dx11);
406 ty = _mm_mul_ps(fscal,dy11);
407 tz = _mm_mul_ps(fscal,dz11);
409 /* Update vectorial force */
410 fix1 = _mm_add_ps(fix1,tx);
411 fiy1 = _mm_add_ps(fiy1,ty);
412 fiz1 = _mm_add_ps(fiz1,tz);
414 fjx1 = _mm_add_ps(fjx1,tx);
415 fjy1 = _mm_add_ps(fjy1,ty);
416 fjz1 = _mm_add_ps(fjz1,tz);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 r12 = _mm_mul_ps(rsq12,rinv12);
424 /* EWALD ELECTROSTATICS */
426 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
427 ewrt = _mm_mul_ps(r12,ewtabscale);
428 ewitab = _mm_cvttps_epi32(ewrt);
429 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
430 ewitab = _mm_slli_epi32(ewitab,2);
431 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
432 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
433 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
434 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
435 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
436 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
437 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
438 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
439 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
441 /* Update potential sum for this i atom from the interaction with this j atom. */
442 velecsum = _mm_add_ps(velecsum,velec);
446 /* Calculate temporary vectorial force */
447 tx = _mm_mul_ps(fscal,dx12);
448 ty = _mm_mul_ps(fscal,dy12);
449 tz = _mm_mul_ps(fscal,dz12);
451 /* Update vectorial force */
452 fix1 = _mm_add_ps(fix1,tx);
453 fiy1 = _mm_add_ps(fiy1,ty);
454 fiz1 = _mm_add_ps(fiz1,tz);
456 fjx2 = _mm_add_ps(fjx2,tx);
457 fjy2 = _mm_add_ps(fjy2,ty);
458 fjz2 = _mm_add_ps(fjz2,tz);
460 /**************************
461 * CALCULATE INTERACTIONS *
462 **************************/
464 r13 = _mm_mul_ps(rsq13,rinv13);
466 /* EWALD ELECTROSTATICS */
468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
469 ewrt = _mm_mul_ps(r13,ewtabscale);
470 ewitab = _mm_cvttps_epi32(ewrt);
471 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
472 ewitab = _mm_slli_epi32(ewitab,2);
473 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
474 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
475 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
476 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
477 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
478 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
479 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
480 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
481 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velecsum = _mm_add_ps(velecsum,velec);
488 /* Calculate temporary vectorial force */
489 tx = _mm_mul_ps(fscal,dx13);
490 ty = _mm_mul_ps(fscal,dy13);
491 tz = _mm_mul_ps(fscal,dz13);
493 /* Update vectorial force */
494 fix1 = _mm_add_ps(fix1,tx);
495 fiy1 = _mm_add_ps(fiy1,ty);
496 fiz1 = _mm_add_ps(fiz1,tz);
498 fjx3 = _mm_add_ps(fjx3,tx);
499 fjy3 = _mm_add_ps(fjy3,ty);
500 fjz3 = _mm_add_ps(fjz3,tz);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 r21 = _mm_mul_ps(rsq21,rinv21);
508 /* EWALD ELECTROSTATICS */
510 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511 ewrt = _mm_mul_ps(r21,ewtabscale);
512 ewitab = _mm_cvttps_epi32(ewrt);
513 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
514 ewitab = _mm_slli_epi32(ewitab,2);
515 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
516 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
517 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
518 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
519 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
520 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
521 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
522 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
523 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum = _mm_add_ps(velecsum,velec);
530 /* Calculate temporary vectorial force */
531 tx = _mm_mul_ps(fscal,dx21);
532 ty = _mm_mul_ps(fscal,dy21);
533 tz = _mm_mul_ps(fscal,dz21);
535 /* Update vectorial force */
536 fix2 = _mm_add_ps(fix2,tx);
537 fiy2 = _mm_add_ps(fiy2,ty);
538 fiz2 = _mm_add_ps(fiz2,tz);
540 fjx1 = _mm_add_ps(fjx1,tx);
541 fjy1 = _mm_add_ps(fjy1,ty);
542 fjz1 = _mm_add_ps(fjz1,tz);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 r22 = _mm_mul_ps(rsq22,rinv22);
550 /* EWALD ELECTROSTATICS */
552 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
553 ewrt = _mm_mul_ps(r22,ewtabscale);
554 ewitab = _mm_cvttps_epi32(ewrt);
555 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
556 ewitab = _mm_slli_epi32(ewitab,2);
557 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
558 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
559 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
560 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
561 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
562 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
563 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
564 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
565 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
567 /* Update potential sum for this i atom from the interaction with this j atom. */
568 velecsum = _mm_add_ps(velecsum,velec);
572 /* Calculate temporary vectorial force */
573 tx = _mm_mul_ps(fscal,dx22);
574 ty = _mm_mul_ps(fscal,dy22);
575 tz = _mm_mul_ps(fscal,dz22);
577 /* Update vectorial force */
578 fix2 = _mm_add_ps(fix2,tx);
579 fiy2 = _mm_add_ps(fiy2,ty);
580 fiz2 = _mm_add_ps(fiz2,tz);
582 fjx2 = _mm_add_ps(fjx2,tx);
583 fjy2 = _mm_add_ps(fjy2,ty);
584 fjz2 = _mm_add_ps(fjz2,tz);
586 /**************************
587 * CALCULATE INTERACTIONS *
588 **************************/
590 r23 = _mm_mul_ps(rsq23,rinv23);
592 /* EWALD ELECTROSTATICS */
594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
595 ewrt = _mm_mul_ps(r23,ewtabscale);
596 ewitab = _mm_cvttps_epi32(ewrt);
597 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
598 ewitab = _mm_slli_epi32(ewitab,2);
599 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
600 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
601 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
602 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
603 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
604 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
605 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
606 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
607 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velecsum = _mm_add_ps(velecsum,velec);
614 /* Calculate temporary vectorial force */
615 tx = _mm_mul_ps(fscal,dx23);
616 ty = _mm_mul_ps(fscal,dy23);
617 tz = _mm_mul_ps(fscal,dz23);
619 /* Update vectorial force */
620 fix2 = _mm_add_ps(fix2,tx);
621 fiy2 = _mm_add_ps(fiy2,ty);
622 fiz2 = _mm_add_ps(fiz2,tz);
624 fjx3 = _mm_add_ps(fjx3,tx);
625 fjy3 = _mm_add_ps(fjy3,ty);
626 fjz3 = _mm_add_ps(fjz3,tz);
628 /**************************
629 * CALCULATE INTERACTIONS *
630 **************************/
632 r31 = _mm_mul_ps(rsq31,rinv31);
634 /* EWALD ELECTROSTATICS */
636 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
637 ewrt = _mm_mul_ps(r31,ewtabscale);
638 ewitab = _mm_cvttps_epi32(ewrt);
639 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
640 ewitab = _mm_slli_epi32(ewitab,2);
641 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
642 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
643 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
644 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
645 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
646 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
647 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
648 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
649 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
651 /* Update potential sum for this i atom from the interaction with this j atom. */
652 velecsum = _mm_add_ps(velecsum,velec);
656 /* Calculate temporary vectorial force */
657 tx = _mm_mul_ps(fscal,dx31);
658 ty = _mm_mul_ps(fscal,dy31);
659 tz = _mm_mul_ps(fscal,dz31);
661 /* Update vectorial force */
662 fix3 = _mm_add_ps(fix3,tx);
663 fiy3 = _mm_add_ps(fiy3,ty);
664 fiz3 = _mm_add_ps(fiz3,tz);
666 fjx1 = _mm_add_ps(fjx1,tx);
667 fjy1 = _mm_add_ps(fjy1,ty);
668 fjz1 = _mm_add_ps(fjz1,tz);
670 /**************************
671 * CALCULATE INTERACTIONS *
672 **************************/
674 r32 = _mm_mul_ps(rsq32,rinv32);
676 /* EWALD ELECTROSTATICS */
678 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
679 ewrt = _mm_mul_ps(r32,ewtabscale);
680 ewitab = _mm_cvttps_epi32(ewrt);
681 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
682 ewitab = _mm_slli_epi32(ewitab,2);
683 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
684 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
685 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
686 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
687 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
688 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
689 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
690 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
691 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velecsum = _mm_add_ps(velecsum,velec);
698 /* Calculate temporary vectorial force */
699 tx = _mm_mul_ps(fscal,dx32);
700 ty = _mm_mul_ps(fscal,dy32);
701 tz = _mm_mul_ps(fscal,dz32);
703 /* Update vectorial force */
704 fix3 = _mm_add_ps(fix3,tx);
705 fiy3 = _mm_add_ps(fiy3,ty);
706 fiz3 = _mm_add_ps(fiz3,tz);
708 fjx2 = _mm_add_ps(fjx2,tx);
709 fjy2 = _mm_add_ps(fjy2,ty);
710 fjz2 = _mm_add_ps(fjz2,tz);
712 /**************************
713 * CALCULATE INTERACTIONS *
714 **************************/
716 r33 = _mm_mul_ps(rsq33,rinv33);
718 /* EWALD ELECTROSTATICS */
720 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
721 ewrt = _mm_mul_ps(r33,ewtabscale);
722 ewitab = _mm_cvttps_epi32(ewrt);
723 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
724 ewitab = _mm_slli_epi32(ewitab,2);
725 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
726 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
727 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
728 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
729 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
730 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
731 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
732 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
733 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
735 /* Update potential sum for this i atom from the interaction with this j atom. */
736 velecsum = _mm_add_ps(velecsum,velec);
740 /* Calculate temporary vectorial force */
741 tx = _mm_mul_ps(fscal,dx33);
742 ty = _mm_mul_ps(fscal,dy33);
743 tz = _mm_mul_ps(fscal,dz33);
745 /* Update vectorial force */
746 fix3 = _mm_add_ps(fix3,tx);
747 fiy3 = _mm_add_ps(fiy3,ty);
748 fiz3 = _mm_add_ps(fiz3,tz);
750 fjx3 = _mm_add_ps(fjx3,tx);
751 fjy3 = _mm_add_ps(fjy3,ty);
752 fjz3 = _mm_add_ps(fjz3,tz);
754 fjptrA = f+j_coord_offsetA;
755 fjptrB = f+j_coord_offsetB;
756 fjptrC = f+j_coord_offsetC;
757 fjptrD = f+j_coord_offsetD;
759 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
760 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
761 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
763 /* Inner loop uses 423 flops */
769 /* Get j neighbor index, and coordinate index */
770 jnrlistA = jjnr[jidx];
771 jnrlistB = jjnr[jidx+1];
772 jnrlistC = jjnr[jidx+2];
773 jnrlistD = jjnr[jidx+3];
774 /* Sign of each element will be negative for non-real atoms.
775 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
776 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
778 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
779 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
780 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
781 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
782 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
783 j_coord_offsetA = DIM*jnrA;
784 j_coord_offsetB = DIM*jnrB;
785 j_coord_offsetC = DIM*jnrC;
786 j_coord_offsetD = DIM*jnrD;
788 /* load j atom coordinates */
789 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
790 x+j_coord_offsetC,x+j_coord_offsetD,
791 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
792 &jy2,&jz2,&jx3,&jy3,&jz3);
794 /* Calculate displacement vector */
795 dx00 = _mm_sub_ps(ix0,jx0);
796 dy00 = _mm_sub_ps(iy0,jy0);
797 dz00 = _mm_sub_ps(iz0,jz0);
798 dx11 = _mm_sub_ps(ix1,jx1);
799 dy11 = _mm_sub_ps(iy1,jy1);
800 dz11 = _mm_sub_ps(iz1,jz1);
801 dx12 = _mm_sub_ps(ix1,jx2);
802 dy12 = _mm_sub_ps(iy1,jy2);
803 dz12 = _mm_sub_ps(iz1,jz2);
804 dx13 = _mm_sub_ps(ix1,jx3);
805 dy13 = _mm_sub_ps(iy1,jy3);
806 dz13 = _mm_sub_ps(iz1,jz3);
807 dx21 = _mm_sub_ps(ix2,jx1);
808 dy21 = _mm_sub_ps(iy2,jy1);
809 dz21 = _mm_sub_ps(iz2,jz1);
810 dx22 = _mm_sub_ps(ix2,jx2);
811 dy22 = _mm_sub_ps(iy2,jy2);
812 dz22 = _mm_sub_ps(iz2,jz2);
813 dx23 = _mm_sub_ps(ix2,jx3);
814 dy23 = _mm_sub_ps(iy2,jy3);
815 dz23 = _mm_sub_ps(iz2,jz3);
816 dx31 = _mm_sub_ps(ix3,jx1);
817 dy31 = _mm_sub_ps(iy3,jy1);
818 dz31 = _mm_sub_ps(iz3,jz1);
819 dx32 = _mm_sub_ps(ix3,jx2);
820 dy32 = _mm_sub_ps(iy3,jy2);
821 dz32 = _mm_sub_ps(iz3,jz2);
822 dx33 = _mm_sub_ps(ix3,jx3);
823 dy33 = _mm_sub_ps(iy3,jy3);
824 dz33 = _mm_sub_ps(iz3,jz3);
826 /* Calculate squared distance and things based on it */
827 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
828 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
829 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
830 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
831 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
832 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
833 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
834 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
835 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
836 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
838 rinv00 = gmx_mm_invsqrt_ps(rsq00);
839 rinv11 = gmx_mm_invsqrt_ps(rsq11);
840 rinv12 = gmx_mm_invsqrt_ps(rsq12);
841 rinv13 = gmx_mm_invsqrt_ps(rsq13);
842 rinv21 = gmx_mm_invsqrt_ps(rsq21);
843 rinv22 = gmx_mm_invsqrt_ps(rsq22);
844 rinv23 = gmx_mm_invsqrt_ps(rsq23);
845 rinv31 = gmx_mm_invsqrt_ps(rsq31);
846 rinv32 = gmx_mm_invsqrt_ps(rsq32);
847 rinv33 = gmx_mm_invsqrt_ps(rsq33);
849 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
850 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
851 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
852 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
853 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
854 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
855 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
856 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
857 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
858 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
860 fjx0 = _mm_setzero_ps();
861 fjy0 = _mm_setzero_ps();
862 fjz0 = _mm_setzero_ps();
863 fjx1 = _mm_setzero_ps();
864 fjy1 = _mm_setzero_ps();
865 fjz1 = _mm_setzero_ps();
866 fjx2 = _mm_setzero_ps();
867 fjy2 = _mm_setzero_ps();
868 fjz2 = _mm_setzero_ps();
869 fjx3 = _mm_setzero_ps();
870 fjy3 = _mm_setzero_ps();
871 fjz3 = _mm_setzero_ps();
873 /**************************
874 * CALCULATE INTERACTIONS *
875 **************************/
877 r00 = _mm_mul_ps(rsq00,rinv00);
878 r00 = _mm_andnot_ps(dummy_mask,r00);
880 /* Analytical LJ-PME */
881 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
882 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
883 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
884 exponent = gmx_simd_exp_r(ewcljrsq);
885 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
886 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
887 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
888 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
889 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
890 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
891 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
892 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
894 /* Update potential sum for this i atom from the interaction with this j atom. */
895 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
896 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
900 fscal = _mm_andnot_ps(dummy_mask,fscal);
902 /* Calculate temporary vectorial force */
903 tx = _mm_mul_ps(fscal,dx00);
904 ty = _mm_mul_ps(fscal,dy00);
905 tz = _mm_mul_ps(fscal,dz00);
907 /* Update vectorial force */
908 fix0 = _mm_add_ps(fix0,tx);
909 fiy0 = _mm_add_ps(fiy0,ty);
910 fiz0 = _mm_add_ps(fiz0,tz);
912 fjx0 = _mm_add_ps(fjx0,tx);
913 fjy0 = _mm_add_ps(fjy0,ty);
914 fjz0 = _mm_add_ps(fjz0,tz);
916 /**************************
917 * CALCULATE INTERACTIONS *
918 **************************/
920 r11 = _mm_mul_ps(rsq11,rinv11);
921 r11 = _mm_andnot_ps(dummy_mask,r11);
923 /* EWALD ELECTROSTATICS */
925 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
926 ewrt = _mm_mul_ps(r11,ewtabscale);
927 ewitab = _mm_cvttps_epi32(ewrt);
928 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
929 ewitab = _mm_slli_epi32(ewitab,2);
930 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
931 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
932 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
933 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
934 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
935 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
936 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
937 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
938 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
940 /* Update potential sum for this i atom from the interaction with this j atom. */
941 velec = _mm_andnot_ps(dummy_mask,velec);
942 velecsum = _mm_add_ps(velecsum,velec);
946 fscal = _mm_andnot_ps(dummy_mask,fscal);
948 /* Calculate temporary vectorial force */
949 tx = _mm_mul_ps(fscal,dx11);
950 ty = _mm_mul_ps(fscal,dy11);
951 tz = _mm_mul_ps(fscal,dz11);
953 /* Update vectorial force */
954 fix1 = _mm_add_ps(fix1,tx);
955 fiy1 = _mm_add_ps(fiy1,ty);
956 fiz1 = _mm_add_ps(fiz1,tz);
958 fjx1 = _mm_add_ps(fjx1,tx);
959 fjy1 = _mm_add_ps(fjy1,ty);
960 fjz1 = _mm_add_ps(fjz1,tz);
962 /**************************
963 * CALCULATE INTERACTIONS *
964 **************************/
966 r12 = _mm_mul_ps(rsq12,rinv12);
967 r12 = _mm_andnot_ps(dummy_mask,r12);
969 /* EWALD ELECTROSTATICS */
971 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
972 ewrt = _mm_mul_ps(r12,ewtabscale);
973 ewitab = _mm_cvttps_epi32(ewrt);
974 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
975 ewitab = _mm_slli_epi32(ewitab,2);
976 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
977 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
978 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
979 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
980 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
981 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
982 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
983 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
984 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
986 /* Update potential sum for this i atom from the interaction with this j atom. */
987 velec = _mm_andnot_ps(dummy_mask,velec);
988 velecsum = _mm_add_ps(velecsum,velec);
992 fscal = _mm_andnot_ps(dummy_mask,fscal);
994 /* Calculate temporary vectorial force */
995 tx = _mm_mul_ps(fscal,dx12);
996 ty = _mm_mul_ps(fscal,dy12);
997 tz = _mm_mul_ps(fscal,dz12);
999 /* Update vectorial force */
1000 fix1 = _mm_add_ps(fix1,tx);
1001 fiy1 = _mm_add_ps(fiy1,ty);
1002 fiz1 = _mm_add_ps(fiz1,tz);
1004 fjx2 = _mm_add_ps(fjx2,tx);
1005 fjy2 = _mm_add_ps(fjy2,ty);
1006 fjz2 = _mm_add_ps(fjz2,tz);
1008 /**************************
1009 * CALCULATE INTERACTIONS *
1010 **************************/
1012 r13 = _mm_mul_ps(rsq13,rinv13);
1013 r13 = _mm_andnot_ps(dummy_mask,r13);
1015 /* EWALD ELECTROSTATICS */
1017 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1018 ewrt = _mm_mul_ps(r13,ewtabscale);
1019 ewitab = _mm_cvttps_epi32(ewrt);
1020 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1021 ewitab = _mm_slli_epi32(ewitab,2);
1022 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1023 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1024 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1025 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1026 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1027 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1028 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1029 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1030 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1032 /* Update potential sum for this i atom from the interaction with this j atom. */
1033 velec = _mm_andnot_ps(dummy_mask,velec);
1034 velecsum = _mm_add_ps(velecsum,velec);
1038 fscal = _mm_andnot_ps(dummy_mask,fscal);
1040 /* Calculate temporary vectorial force */
1041 tx = _mm_mul_ps(fscal,dx13);
1042 ty = _mm_mul_ps(fscal,dy13);
1043 tz = _mm_mul_ps(fscal,dz13);
1045 /* Update vectorial force */
1046 fix1 = _mm_add_ps(fix1,tx);
1047 fiy1 = _mm_add_ps(fiy1,ty);
1048 fiz1 = _mm_add_ps(fiz1,tz);
1050 fjx3 = _mm_add_ps(fjx3,tx);
1051 fjy3 = _mm_add_ps(fjy3,ty);
1052 fjz3 = _mm_add_ps(fjz3,tz);
1054 /**************************
1055 * CALCULATE INTERACTIONS *
1056 **************************/
1058 r21 = _mm_mul_ps(rsq21,rinv21);
1059 r21 = _mm_andnot_ps(dummy_mask,r21);
1061 /* EWALD ELECTROSTATICS */
1063 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1064 ewrt = _mm_mul_ps(r21,ewtabscale);
1065 ewitab = _mm_cvttps_epi32(ewrt);
1066 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1067 ewitab = _mm_slli_epi32(ewitab,2);
1068 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1069 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1070 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1071 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1072 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1073 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1074 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1075 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1076 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1078 /* Update potential sum for this i atom from the interaction with this j atom. */
1079 velec = _mm_andnot_ps(dummy_mask,velec);
1080 velecsum = _mm_add_ps(velecsum,velec);
1084 fscal = _mm_andnot_ps(dummy_mask,fscal);
1086 /* Calculate temporary vectorial force */
1087 tx = _mm_mul_ps(fscal,dx21);
1088 ty = _mm_mul_ps(fscal,dy21);
1089 tz = _mm_mul_ps(fscal,dz21);
1091 /* Update vectorial force */
1092 fix2 = _mm_add_ps(fix2,tx);
1093 fiy2 = _mm_add_ps(fiy2,ty);
1094 fiz2 = _mm_add_ps(fiz2,tz);
1096 fjx1 = _mm_add_ps(fjx1,tx);
1097 fjy1 = _mm_add_ps(fjy1,ty);
1098 fjz1 = _mm_add_ps(fjz1,tz);
1100 /**************************
1101 * CALCULATE INTERACTIONS *
1102 **************************/
1104 r22 = _mm_mul_ps(rsq22,rinv22);
1105 r22 = _mm_andnot_ps(dummy_mask,r22);
1107 /* EWALD ELECTROSTATICS */
1109 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1110 ewrt = _mm_mul_ps(r22,ewtabscale);
1111 ewitab = _mm_cvttps_epi32(ewrt);
1112 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1113 ewitab = _mm_slli_epi32(ewitab,2);
1114 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1115 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1116 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1117 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1118 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1119 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1120 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1121 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1122 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1124 /* Update potential sum for this i atom from the interaction with this j atom. */
1125 velec = _mm_andnot_ps(dummy_mask,velec);
1126 velecsum = _mm_add_ps(velecsum,velec);
1130 fscal = _mm_andnot_ps(dummy_mask,fscal);
1132 /* Calculate temporary vectorial force */
1133 tx = _mm_mul_ps(fscal,dx22);
1134 ty = _mm_mul_ps(fscal,dy22);
1135 tz = _mm_mul_ps(fscal,dz22);
1137 /* Update vectorial force */
1138 fix2 = _mm_add_ps(fix2,tx);
1139 fiy2 = _mm_add_ps(fiy2,ty);
1140 fiz2 = _mm_add_ps(fiz2,tz);
1142 fjx2 = _mm_add_ps(fjx2,tx);
1143 fjy2 = _mm_add_ps(fjy2,ty);
1144 fjz2 = _mm_add_ps(fjz2,tz);
1146 /**************************
1147 * CALCULATE INTERACTIONS *
1148 **************************/
1150 r23 = _mm_mul_ps(rsq23,rinv23);
1151 r23 = _mm_andnot_ps(dummy_mask,r23);
1153 /* EWALD ELECTROSTATICS */
1155 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1156 ewrt = _mm_mul_ps(r23,ewtabscale);
1157 ewitab = _mm_cvttps_epi32(ewrt);
1158 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1159 ewitab = _mm_slli_epi32(ewitab,2);
1160 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1161 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1162 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1163 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1164 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1165 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1166 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1167 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1168 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1170 /* Update potential sum for this i atom from the interaction with this j atom. */
1171 velec = _mm_andnot_ps(dummy_mask,velec);
1172 velecsum = _mm_add_ps(velecsum,velec);
1176 fscal = _mm_andnot_ps(dummy_mask,fscal);
1178 /* Calculate temporary vectorial force */
1179 tx = _mm_mul_ps(fscal,dx23);
1180 ty = _mm_mul_ps(fscal,dy23);
1181 tz = _mm_mul_ps(fscal,dz23);
1183 /* Update vectorial force */
1184 fix2 = _mm_add_ps(fix2,tx);
1185 fiy2 = _mm_add_ps(fiy2,ty);
1186 fiz2 = _mm_add_ps(fiz2,tz);
1188 fjx3 = _mm_add_ps(fjx3,tx);
1189 fjy3 = _mm_add_ps(fjy3,ty);
1190 fjz3 = _mm_add_ps(fjz3,tz);
1192 /**************************
1193 * CALCULATE INTERACTIONS *
1194 **************************/
1196 r31 = _mm_mul_ps(rsq31,rinv31);
1197 r31 = _mm_andnot_ps(dummy_mask,r31);
1199 /* EWALD ELECTROSTATICS */
1201 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1202 ewrt = _mm_mul_ps(r31,ewtabscale);
1203 ewitab = _mm_cvttps_epi32(ewrt);
1204 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1205 ewitab = _mm_slli_epi32(ewitab,2);
1206 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1207 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1208 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1209 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1210 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1211 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1212 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1213 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1214 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1216 /* Update potential sum for this i atom from the interaction with this j atom. */
1217 velec = _mm_andnot_ps(dummy_mask,velec);
1218 velecsum = _mm_add_ps(velecsum,velec);
1222 fscal = _mm_andnot_ps(dummy_mask,fscal);
1224 /* Calculate temporary vectorial force */
1225 tx = _mm_mul_ps(fscal,dx31);
1226 ty = _mm_mul_ps(fscal,dy31);
1227 tz = _mm_mul_ps(fscal,dz31);
1229 /* Update vectorial force */
1230 fix3 = _mm_add_ps(fix3,tx);
1231 fiy3 = _mm_add_ps(fiy3,ty);
1232 fiz3 = _mm_add_ps(fiz3,tz);
1234 fjx1 = _mm_add_ps(fjx1,tx);
1235 fjy1 = _mm_add_ps(fjy1,ty);
1236 fjz1 = _mm_add_ps(fjz1,tz);
1238 /**************************
1239 * CALCULATE INTERACTIONS *
1240 **************************/
1242 r32 = _mm_mul_ps(rsq32,rinv32);
1243 r32 = _mm_andnot_ps(dummy_mask,r32);
1245 /* EWALD ELECTROSTATICS */
1247 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1248 ewrt = _mm_mul_ps(r32,ewtabscale);
1249 ewitab = _mm_cvttps_epi32(ewrt);
1250 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1251 ewitab = _mm_slli_epi32(ewitab,2);
1252 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1253 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1254 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1255 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1256 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1257 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1258 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1259 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1260 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1262 /* Update potential sum for this i atom from the interaction with this j atom. */
1263 velec = _mm_andnot_ps(dummy_mask,velec);
1264 velecsum = _mm_add_ps(velecsum,velec);
1268 fscal = _mm_andnot_ps(dummy_mask,fscal);
1270 /* Calculate temporary vectorial force */
1271 tx = _mm_mul_ps(fscal,dx32);
1272 ty = _mm_mul_ps(fscal,dy32);
1273 tz = _mm_mul_ps(fscal,dz32);
1275 /* Update vectorial force */
1276 fix3 = _mm_add_ps(fix3,tx);
1277 fiy3 = _mm_add_ps(fiy3,ty);
1278 fiz3 = _mm_add_ps(fiz3,tz);
1280 fjx2 = _mm_add_ps(fjx2,tx);
1281 fjy2 = _mm_add_ps(fjy2,ty);
1282 fjz2 = _mm_add_ps(fjz2,tz);
1284 /**************************
1285 * CALCULATE INTERACTIONS *
1286 **************************/
1288 r33 = _mm_mul_ps(rsq33,rinv33);
1289 r33 = _mm_andnot_ps(dummy_mask,r33);
1291 /* EWALD ELECTROSTATICS */
1293 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1294 ewrt = _mm_mul_ps(r33,ewtabscale);
1295 ewitab = _mm_cvttps_epi32(ewrt);
1296 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1297 ewitab = _mm_slli_epi32(ewitab,2);
1298 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1299 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1300 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1301 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1302 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1303 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1304 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1305 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1306 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1308 /* Update potential sum for this i atom from the interaction with this j atom. */
1309 velec = _mm_andnot_ps(dummy_mask,velec);
1310 velecsum = _mm_add_ps(velecsum,velec);
1314 fscal = _mm_andnot_ps(dummy_mask,fscal);
1316 /* Calculate temporary vectorial force */
1317 tx = _mm_mul_ps(fscal,dx33);
1318 ty = _mm_mul_ps(fscal,dy33);
1319 tz = _mm_mul_ps(fscal,dz33);
1321 /* Update vectorial force */
1322 fix3 = _mm_add_ps(fix3,tx);
1323 fiy3 = _mm_add_ps(fiy3,ty);
1324 fiz3 = _mm_add_ps(fiz3,tz);
1326 fjx3 = _mm_add_ps(fjx3,tx);
1327 fjy3 = _mm_add_ps(fjy3,ty);
1328 fjz3 = _mm_add_ps(fjz3,tz);
1330 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1331 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1332 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1333 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1335 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1336 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1337 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1339 /* Inner loop uses 433 flops */
1342 /* End of innermost loop */
1344 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1345 f+i_coord_offset,fshift+i_shift_offset);
1348 /* Update potential energies */
1349 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1350 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1352 /* Increment number of inner iterations */
1353 inneriter += j_index_end - j_index_start;
1355 /* Outer loop uses 26 flops */
1358 /* Increment number of outer iterations */
1361 /* Update outer/inner flops */
1363 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*433);
1366 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single
1367 * Electrostatics interaction: Ewald
1368 * VdW interaction: LJEwald
1369 * Geometry: Water4-Water4
1370 * Calculate force/pot: Force
1373 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_sse4_1_single
1374 (t_nblist * gmx_restrict nlist,
1375 rvec * gmx_restrict xx,
1376 rvec * gmx_restrict ff,
1377 t_forcerec * gmx_restrict fr,
1378 t_mdatoms * gmx_restrict mdatoms,
1379 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1380 t_nrnb * gmx_restrict nrnb)
1382 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1383 * just 0 for non-waters.
1384 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1385 * jnr indices corresponding to data put in the four positions in the SIMD register.
1387 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1388 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1389 int jnrA,jnrB,jnrC,jnrD;
1390 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1391 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1392 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1393 real rcutoff_scalar;
1394 real *shiftvec,*fshift,*x,*f;
1395 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1396 real scratch[4*DIM];
1397 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1399 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1401 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1403 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1405 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1406 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1407 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1408 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1409 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1410 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1411 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1412 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1413 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1414 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1415 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1416 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1417 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1418 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1419 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1420 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1421 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1422 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1423 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1424 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1427 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1430 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1431 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1442 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1444 __m128 one_half = _mm_set1_ps(0.5);
1445 __m128 minus_one = _mm_set1_ps(-1.0);
1447 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1449 __m128 dummy_mask,cutoff_mask;
1450 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1451 __m128 one = _mm_set1_ps(1.0);
1452 __m128 two = _mm_set1_ps(2.0);
1458 jindex = nlist->jindex;
1460 shiftidx = nlist->shift;
1462 shiftvec = fr->shift_vec[0];
1463 fshift = fr->fshift[0];
1464 facel = _mm_set1_ps(fr->epsfac);
1465 charge = mdatoms->chargeA;
1466 nvdwtype = fr->ntype;
1467 vdwparam = fr->nbfp;
1468 vdwtype = mdatoms->typeA;
1469 vdwgridparam = fr->ljpme_c6grid;
1470 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
1471 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
1472 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1474 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1475 ewtab = fr->ic->tabq_coul_F;
1476 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1477 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1479 /* Setup water-specific parameters */
1480 inr = nlist->iinr[0];
1481 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1482 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1483 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1484 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1486 jq1 = _mm_set1_ps(charge[inr+1]);
1487 jq2 = _mm_set1_ps(charge[inr+2]);
1488 jq3 = _mm_set1_ps(charge[inr+3]);
1489 vdwjidx0A = 2*vdwtype[inr+0];
1490 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1491 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1492 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1493 qq11 = _mm_mul_ps(iq1,jq1);
1494 qq12 = _mm_mul_ps(iq1,jq2);
1495 qq13 = _mm_mul_ps(iq1,jq3);
1496 qq21 = _mm_mul_ps(iq2,jq1);
1497 qq22 = _mm_mul_ps(iq2,jq2);
1498 qq23 = _mm_mul_ps(iq2,jq3);
1499 qq31 = _mm_mul_ps(iq3,jq1);
1500 qq32 = _mm_mul_ps(iq3,jq2);
1501 qq33 = _mm_mul_ps(iq3,jq3);
1503 /* Avoid stupid compiler warnings */
1504 jnrA = jnrB = jnrC = jnrD = 0;
1505 j_coord_offsetA = 0;
1506 j_coord_offsetB = 0;
1507 j_coord_offsetC = 0;
1508 j_coord_offsetD = 0;
1513 for(iidx=0;iidx<4*DIM;iidx++)
1515 scratch[iidx] = 0.0;
1518 /* Start outer loop over neighborlists */
1519 for(iidx=0; iidx<nri; iidx++)
1521 /* Load shift vector for this list */
1522 i_shift_offset = DIM*shiftidx[iidx];
1524 /* Load limits for loop over neighbors */
1525 j_index_start = jindex[iidx];
1526 j_index_end = jindex[iidx+1];
1528 /* Get outer coordinate index */
1530 i_coord_offset = DIM*inr;
1532 /* Load i particle coords and add shift vector */
1533 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1534 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1536 fix0 = _mm_setzero_ps();
1537 fiy0 = _mm_setzero_ps();
1538 fiz0 = _mm_setzero_ps();
1539 fix1 = _mm_setzero_ps();
1540 fiy1 = _mm_setzero_ps();
1541 fiz1 = _mm_setzero_ps();
1542 fix2 = _mm_setzero_ps();
1543 fiy2 = _mm_setzero_ps();
1544 fiz2 = _mm_setzero_ps();
1545 fix3 = _mm_setzero_ps();
1546 fiy3 = _mm_setzero_ps();
1547 fiz3 = _mm_setzero_ps();
1549 /* Start inner kernel loop */
1550 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1553 /* Get j neighbor index, and coordinate index */
1555 jnrB = jjnr[jidx+1];
1556 jnrC = jjnr[jidx+2];
1557 jnrD = jjnr[jidx+3];
1558 j_coord_offsetA = DIM*jnrA;
1559 j_coord_offsetB = DIM*jnrB;
1560 j_coord_offsetC = DIM*jnrC;
1561 j_coord_offsetD = DIM*jnrD;
1563 /* load j atom coordinates */
1564 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1565 x+j_coord_offsetC,x+j_coord_offsetD,
1566 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1567 &jy2,&jz2,&jx3,&jy3,&jz3);
1569 /* Calculate displacement vector */
1570 dx00 = _mm_sub_ps(ix0,jx0);
1571 dy00 = _mm_sub_ps(iy0,jy0);
1572 dz00 = _mm_sub_ps(iz0,jz0);
1573 dx11 = _mm_sub_ps(ix1,jx1);
1574 dy11 = _mm_sub_ps(iy1,jy1);
1575 dz11 = _mm_sub_ps(iz1,jz1);
1576 dx12 = _mm_sub_ps(ix1,jx2);
1577 dy12 = _mm_sub_ps(iy1,jy2);
1578 dz12 = _mm_sub_ps(iz1,jz2);
1579 dx13 = _mm_sub_ps(ix1,jx3);
1580 dy13 = _mm_sub_ps(iy1,jy3);
1581 dz13 = _mm_sub_ps(iz1,jz3);
1582 dx21 = _mm_sub_ps(ix2,jx1);
1583 dy21 = _mm_sub_ps(iy2,jy1);
1584 dz21 = _mm_sub_ps(iz2,jz1);
1585 dx22 = _mm_sub_ps(ix2,jx2);
1586 dy22 = _mm_sub_ps(iy2,jy2);
1587 dz22 = _mm_sub_ps(iz2,jz2);
1588 dx23 = _mm_sub_ps(ix2,jx3);
1589 dy23 = _mm_sub_ps(iy2,jy3);
1590 dz23 = _mm_sub_ps(iz2,jz3);
1591 dx31 = _mm_sub_ps(ix3,jx1);
1592 dy31 = _mm_sub_ps(iy3,jy1);
1593 dz31 = _mm_sub_ps(iz3,jz1);
1594 dx32 = _mm_sub_ps(ix3,jx2);
1595 dy32 = _mm_sub_ps(iy3,jy2);
1596 dz32 = _mm_sub_ps(iz3,jz2);
1597 dx33 = _mm_sub_ps(ix3,jx3);
1598 dy33 = _mm_sub_ps(iy3,jy3);
1599 dz33 = _mm_sub_ps(iz3,jz3);
1601 /* Calculate squared distance and things based on it */
1602 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1603 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1604 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1605 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1606 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1607 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1608 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1609 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1610 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1611 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1613 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1614 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1615 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1616 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1617 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1618 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1619 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1620 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1621 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1622 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1624 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1625 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1626 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1627 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1628 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1629 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1630 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1631 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1632 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1633 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1635 fjx0 = _mm_setzero_ps();
1636 fjy0 = _mm_setzero_ps();
1637 fjz0 = _mm_setzero_ps();
1638 fjx1 = _mm_setzero_ps();
1639 fjy1 = _mm_setzero_ps();
1640 fjz1 = _mm_setzero_ps();
1641 fjx2 = _mm_setzero_ps();
1642 fjy2 = _mm_setzero_ps();
1643 fjz2 = _mm_setzero_ps();
1644 fjx3 = _mm_setzero_ps();
1645 fjy3 = _mm_setzero_ps();
1646 fjz3 = _mm_setzero_ps();
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 r00 = _mm_mul_ps(rsq00,rinv00);
1654 /* Analytical LJ-PME */
1655 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1656 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1657 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1658 exponent = gmx_simd_exp_r(ewcljrsq);
1659 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1660 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1661 /* f6A = 6 * C6grid * (1 - poly) */
1662 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1663 /* f6B = C6grid * exponent * beta^6 */
1664 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1665 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1666 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1670 /* Calculate temporary vectorial force */
1671 tx = _mm_mul_ps(fscal,dx00);
1672 ty = _mm_mul_ps(fscal,dy00);
1673 tz = _mm_mul_ps(fscal,dz00);
1675 /* Update vectorial force */
1676 fix0 = _mm_add_ps(fix0,tx);
1677 fiy0 = _mm_add_ps(fiy0,ty);
1678 fiz0 = _mm_add_ps(fiz0,tz);
1680 fjx0 = _mm_add_ps(fjx0,tx);
1681 fjy0 = _mm_add_ps(fjy0,ty);
1682 fjz0 = _mm_add_ps(fjz0,tz);
1684 /**************************
1685 * CALCULATE INTERACTIONS *
1686 **************************/
1688 r11 = _mm_mul_ps(rsq11,rinv11);
1690 /* EWALD ELECTROSTATICS */
1692 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1693 ewrt = _mm_mul_ps(r11,ewtabscale);
1694 ewitab = _mm_cvttps_epi32(ewrt);
1695 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1696 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1697 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1699 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1700 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1704 /* Calculate temporary vectorial force */
1705 tx = _mm_mul_ps(fscal,dx11);
1706 ty = _mm_mul_ps(fscal,dy11);
1707 tz = _mm_mul_ps(fscal,dz11);
1709 /* Update vectorial force */
1710 fix1 = _mm_add_ps(fix1,tx);
1711 fiy1 = _mm_add_ps(fiy1,ty);
1712 fiz1 = _mm_add_ps(fiz1,tz);
1714 fjx1 = _mm_add_ps(fjx1,tx);
1715 fjy1 = _mm_add_ps(fjy1,ty);
1716 fjz1 = _mm_add_ps(fjz1,tz);
1718 /**************************
1719 * CALCULATE INTERACTIONS *
1720 **************************/
1722 r12 = _mm_mul_ps(rsq12,rinv12);
1724 /* EWALD ELECTROSTATICS */
1726 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1727 ewrt = _mm_mul_ps(r12,ewtabscale);
1728 ewitab = _mm_cvttps_epi32(ewrt);
1729 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1730 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1731 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1733 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1734 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1738 /* Calculate temporary vectorial force */
1739 tx = _mm_mul_ps(fscal,dx12);
1740 ty = _mm_mul_ps(fscal,dy12);
1741 tz = _mm_mul_ps(fscal,dz12);
1743 /* Update vectorial force */
1744 fix1 = _mm_add_ps(fix1,tx);
1745 fiy1 = _mm_add_ps(fiy1,ty);
1746 fiz1 = _mm_add_ps(fiz1,tz);
1748 fjx2 = _mm_add_ps(fjx2,tx);
1749 fjy2 = _mm_add_ps(fjy2,ty);
1750 fjz2 = _mm_add_ps(fjz2,tz);
1752 /**************************
1753 * CALCULATE INTERACTIONS *
1754 **************************/
1756 r13 = _mm_mul_ps(rsq13,rinv13);
1758 /* EWALD ELECTROSTATICS */
1760 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1761 ewrt = _mm_mul_ps(r13,ewtabscale);
1762 ewitab = _mm_cvttps_epi32(ewrt);
1763 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1764 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1765 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1767 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1768 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1772 /* Calculate temporary vectorial force */
1773 tx = _mm_mul_ps(fscal,dx13);
1774 ty = _mm_mul_ps(fscal,dy13);
1775 tz = _mm_mul_ps(fscal,dz13);
1777 /* Update vectorial force */
1778 fix1 = _mm_add_ps(fix1,tx);
1779 fiy1 = _mm_add_ps(fiy1,ty);
1780 fiz1 = _mm_add_ps(fiz1,tz);
1782 fjx3 = _mm_add_ps(fjx3,tx);
1783 fjy3 = _mm_add_ps(fjy3,ty);
1784 fjz3 = _mm_add_ps(fjz3,tz);
1786 /**************************
1787 * CALCULATE INTERACTIONS *
1788 **************************/
1790 r21 = _mm_mul_ps(rsq21,rinv21);
1792 /* EWALD ELECTROSTATICS */
1794 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1795 ewrt = _mm_mul_ps(r21,ewtabscale);
1796 ewitab = _mm_cvttps_epi32(ewrt);
1797 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1798 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1799 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1801 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1802 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1806 /* Calculate temporary vectorial force */
1807 tx = _mm_mul_ps(fscal,dx21);
1808 ty = _mm_mul_ps(fscal,dy21);
1809 tz = _mm_mul_ps(fscal,dz21);
1811 /* Update vectorial force */
1812 fix2 = _mm_add_ps(fix2,tx);
1813 fiy2 = _mm_add_ps(fiy2,ty);
1814 fiz2 = _mm_add_ps(fiz2,tz);
1816 fjx1 = _mm_add_ps(fjx1,tx);
1817 fjy1 = _mm_add_ps(fjy1,ty);
1818 fjz1 = _mm_add_ps(fjz1,tz);
1820 /**************************
1821 * CALCULATE INTERACTIONS *
1822 **************************/
1824 r22 = _mm_mul_ps(rsq22,rinv22);
1826 /* EWALD ELECTROSTATICS */
1828 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1829 ewrt = _mm_mul_ps(r22,ewtabscale);
1830 ewitab = _mm_cvttps_epi32(ewrt);
1831 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1832 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1833 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1835 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1836 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1840 /* Calculate temporary vectorial force */
1841 tx = _mm_mul_ps(fscal,dx22);
1842 ty = _mm_mul_ps(fscal,dy22);
1843 tz = _mm_mul_ps(fscal,dz22);
1845 /* Update vectorial force */
1846 fix2 = _mm_add_ps(fix2,tx);
1847 fiy2 = _mm_add_ps(fiy2,ty);
1848 fiz2 = _mm_add_ps(fiz2,tz);
1850 fjx2 = _mm_add_ps(fjx2,tx);
1851 fjy2 = _mm_add_ps(fjy2,ty);
1852 fjz2 = _mm_add_ps(fjz2,tz);
1854 /**************************
1855 * CALCULATE INTERACTIONS *
1856 **************************/
1858 r23 = _mm_mul_ps(rsq23,rinv23);
1860 /* EWALD ELECTROSTATICS */
1862 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1863 ewrt = _mm_mul_ps(r23,ewtabscale);
1864 ewitab = _mm_cvttps_epi32(ewrt);
1865 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1866 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1867 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1869 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1870 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1874 /* Calculate temporary vectorial force */
1875 tx = _mm_mul_ps(fscal,dx23);
1876 ty = _mm_mul_ps(fscal,dy23);
1877 tz = _mm_mul_ps(fscal,dz23);
1879 /* Update vectorial force */
1880 fix2 = _mm_add_ps(fix2,tx);
1881 fiy2 = _mm_add_ps(fiy2,ty);
1882 fiz2 = _mm_add_ps(fiz2,tz);
1884 fjx3 = _mm_add_ps(fjx3,tx);
1885 fjy3 = _mm_add_ps(fjy3,ty);
1886 fjz3 = _mm_add_ps(fjz3,tz);
1888 /**************************
1889 * CALCULATE INTERACTIONS *
1890 **************************/
1892 r31 = _mm_mul_ps(rsq31,rinv31);
1894 /* EWALD ELECTROSTATICS */
1896 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1897 ewrt = _mm_mul_ps(r31,ewtabscale);
1898 ewitab = _mm_cvttps_epi32(ewrt);
1899 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1900 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1901 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1903 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1904 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1908 /* Calculate temporary vectorial force */
1909 tx = _mm_mul_ps(fscal,dx31);
1910 ty = _mm_mul_ps(fscal,dy31);
1911 tz = _mm_mul_ps(fscal,dz31);
1913 /* Update vectorial force */
1914 fix3 = _mm_add_ps(fix3,tx);
1915 fiy3 = _mm_add_ps(fiy3,ty);
1916 fiz3 = _mm_add_ps(fiz3,tz);
1918 fjx1 = _mm_add_ps(fjx1,tx);
1919 fjy1 = _mm_add_ps(fjy1,ty);
1920 fjz1 = _mm_add_ps(fjz1,tz);
1922 /**************************
1923 * CALCULATE INTERACTIONS *
1924 **************************/
1926 r32 = _mm_mul_ps(rsq32,rinv32);
1928 /* EWALD ELECTROSTATICS */
1930 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1931 ewrt = _mm_mul_ps(r32,ewtabscale);
1932 ewitab = _mm_cvttps_epi32(ewrt);
1933 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1934 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1935 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1937 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1938 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1942 /* Calculate temporary vectorial force */
1943 tx = _mm_mul_ps(fscal,dx32);
1944 ty = _mm_mul_ps(fscal,dy32);
1945 tz = _mm_mul_ps(fscal,dz32);
1947 /* Update vectorial force */
1948 fix3 = _mm_add_ps(fix3,tx);
1949 fiy3 = _mm_add_ps(fiy3,ty);
1950 fiz3 = _mm_add_ps(fiz3,tz);
1952 fjx2 = _mm_add_ps(fjx2,tx);
1953 fjy2 = _mm_add_ps(fjy2,ty);
1954 fjz2 = _mm_add_ps(fjz2,tz);
1956 /**************************
1957 * CALCULATE INTERACTIONS *
1958 **************************/
1960 r33 = _mm_mul_ps(rsq33,rinv33);
1962 /* EWALD ELECTROSTATICS */
1964 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1965 ewrt = _mm_mul_ps(r33,ewtabscale);
1966 ewitab = _mm_cvttps_epi32(ewrt);
1967 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1968 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1969 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1971 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1972 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1976 /* Calculate temporary vectorial force */
1977 tx = _mm_mul_ps(fscal,dx33);
1978 ty = _mm_mul_ps(fscal,dy33);
1979 tz = _mm_mul_ps(fscal,dz33);
1981 /* Update vectorial force */
1982 fix3 = _mm_add_ps(fix3,tx);
1983 fiy3 = _mm_add_ps(fiy3,ty);
1984 fiz3 = _mm_add_ps(fiz3,tz);
1986 fjx3 = _mm_add_ps(fjx3,tx);
1987 fjy3 = _mm_add_ps(fjy3,ty);
1988 fjz3 = _mm_add_ps(fjz3,tz);
1990 fjptrA = f+j_coord_offsetA;
1991 fjptrB = f+j_coord_offsetB;
1992 fjptrC = f+j_coord_offsetC;
1993 fjptrD = f+j_coord_offsetD;
1995 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1996 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1997 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1999 /* Inner loop uses 373 flops */
2002 if(jidx<j_index_end)
2005 /* Get j neighbor index, and coordinate index */
2006 jnrlistA = jjnr[jidx];
2007 jnrlistB = jjnr[jidx+1];
2008 jnrlistC = jjnr[jidx+2];
2009 jnrlistD = jjnr[jidx+3];
2010 /* Sign of each element will be negative for non-real atoms.
2011 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2012 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2014 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2015 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2016 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2017 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2018 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2019 j_coord_offsetA = DIM*jnrA;
2020 j_coord_offsetB = DIM*jnrB;
2021 j_coord_offsetC = DIM*jnrC;
2022 j_coord_offsetD = DIM*jnrD;
2024 /* load j atom coordinates */
2025 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2026 x+j_coord_offsetC,x+j_coord_offsetD,
2027 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2028 &jy2,&jz2,&jx3,&jy3,&jz3);
2030 /* Calculate displacement vector */
2031 dx00 = _mm_sub_ps(ix0,jx0);
2032 dy00 = _mm_sub_ps(iy0,jy0);
2033 dz00 = _mm_sub_ps(iz0,jz0);
2034 dx11 = _mm_sub_ps(ix1,jx1);
2035 dy11 = _mm_sub_ps(iy1,jy1);
2036 dz11 = _mm_sub_ps(iz1,jz1);
2037 dx12 = _mm_sub_ps(ix1,jx2);
2038 dy12 = _mm_sub_ps(iy1,jy2);
2039 dz12 = _mm_sub_ps(iz1,jz2);
2040 dx13 = _mm_sub_ps(ix1,jx3);
2041 dy13 = _mm_sub_ps(iy1,jy3);
2042 dz13 = _mm_sub_ps(iz1,jz3);
2043 dx21 = _mm_sub_ps(ix2,jx1);
2044 dy21 = _mm_sub_ps(iy2,jy1);
2045 dz21 = _mm_sub_ps(iz2,jz1);
2046 dx22 = _mm_sub_ps(ix2,jx2);
2047 dy22 = _mm_sub_ps(iy2,jy2);
2048 dz22 = _mm_sub_ps(iz2,jz2);
2049 dx23 = _mm_sub_ps(ix2,jx3);
2050 dy23 = _mm_sub_ps(iy2,jy3);
2051 dz23 = _mm_sub_ps(iz2,jz3);
2052 dx31 = _mm_sub_ps(ix3,jx1);
2053 dy31 = _mm_sub_ps(iy3,jy1);
2054 dz31 = _mm_sub_ps(iz3,jz1);
2055 dx32 = _mm_sub_ps(ix3,jx2);
2056 dy32 = _mm_sub_ps(iy3,jy2);
2057 dz32 = _mm_sub_ps(iz3,jz2);
2058 dx33 = _mm_sub_ps(ix3,jx3);
2059 dy33 = _mm_sub_ps(iy3,jy3);
2060 dz33 = _mm_sub_ps(iz3,jz3);
2062 /* Calculate squared distance and things based on it */
2063 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2064 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2065 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2066 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2067 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2068 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2069 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2070 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2071 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2072 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2074 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2075 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2076 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2077 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2078 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2079 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2080 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2081 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2082 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2083 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2085 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2086 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2087 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2088 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2089 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2090 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2091 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2092 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2093 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2094 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2096 fjx0 = _mm_setzero_ps();
2097 fjy0 = _mm_setzero_ps();
2098 fjz0 = _mm_setzero_ps();
2099 fjx1 = _mm_setzero_ps();
2100 fjy1 = _mm_setzero_ps();
2101 fjz1 = _mm_setzero_ps();
2102 fjx2 = _mm_setzero_ps();
2103 fjy2 = _mm_setzero_ps();
2104 fjz2 = _mm_setzero_ps();
2105 fjx3 = _mm_setzero_ps();
2106 fjy3 = _mm_setzero_ps();
2107 fjz3 = _mm_setzero_ps();
2109 /**************************
2110 * CALCULATE INTERACTIONS *
2111 **************************/
2113 r00 = _mm_mul_ps(rsq00,rinv00);
2114 r00 = _mm_andnot_ps(dummy_mask,r00);
2116 /* Analytical LJ-PME */
2117 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2118 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
2119 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
2120 exponent = gmx_simd_exp_r(ewcljrsq);
2121 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2122 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2123 /* f6A = 6 * C6grid * (1 - poly) */
2124 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
2125 /* f6B = C6grid * exponent * beta^6 */
2126 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
2127 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2128 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2132 fscal = _mm_andnot_ps(dummy_mask,fscal);
2134 /* Calculate temporary vectorial force */
2135 tx = _mm_mul_ps(fscal,dx00);
2136 ty = _mm_mul_ps(fscal,dy00);
2137 tz = _mm_mul_ps(fscal,dz00);
2139 /* Update vectorial force */
2140 fix0 = _mm_add_ps(fix0,tx);
2141 fiy0 = _mm_add_ps(fiy0,ty);
2142 fiz0 = _mm_add_ps(fiz0,tz);
2144 fjx0 = _mm_add_ps(fjx0,tx);
2145 fjy0 = _mm_add_ps(fjy0,ty);
2146 fjz0 = _mm_add_ps(fjz0,tz);
2148 /**************************
2149 * CALCULATE INTERACTIONS *
2150 **************************/
2152 r11 = _mm_mul_ps(rsq11,rinv11);
2153 r11 = _mm_andnot_ps(dummy_mask,r11);
2155 /* EWALD ELECTROSTATICS */
2157 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2158 ewrt = _mm_mul_ps(r11,ewtabscale);
2159 ewitab = _mm_cvttps_epi32(ewrt);
2160 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2161 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2162 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2164 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2165 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2169 fscal = _mm_andnot_ps(dummy_mask,fscal);
2171 /* Calculate temporary vectorial force */
2172 tx = _mm_mul_ps(fscal,dx11);
2173 ty = _mm_mul_ps(fscal,dy11);
2174 tz = _mm_mul_ps(fscal,dz11);
2176 /* Update vectorial force */
2177 fix1 = _mm_add_ps(fix1,tx);
2178 fiy1 = _mm_add_ps(fiy1,ty);
2179 fiz1 = _mm_add_ps(fiz1,tz);
2181 fjx1 = _mm_add_ps(fjx1,tx);
2182 fjy1 = _mm_add_ps(fjy1,ty);
2183 fjz1 = _mm_add_ps(fjz1,tz);
2185 /**************************
2186 * CALCULATE INTERACTIONS *
2187 **************************/
2189 r12 = _mm_mul_ps(rsq12,rinv12);
2190 r12 = _mm_andnot_ps(dummy_mask,r12);
2192 /* EWALD ELECTROSTATICS */
2194 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2195 ewrt = _mm_mul_ps(r12,ewtabscale);
2196 ewitab = _mm_cvttps_epi32(ewrt);
2197 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2198 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2199 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2201 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2202 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2206 fscal = _mm_andnot_ps(dummy_mask,fscal);
2208 /* Calculate temporary vectorial force */
2209 tx = _mm_mul_ps(fscal,dx12);
2210 ty = _mm_mul_ps(fscal,dy12);
2211 tz = _mm_mul_ps(fscal,dz12);
2213 /* Update vectorial force */
2214 fix1 = _mm_add_ps(fix1,tx);
2215 fiy1 = _mm_add_ps(fiy1,ty);
2216 fiz1 = _mm_add_ps(fiz1,tz);
2218 fjx2 = _mm_add_ps(fjx2,tx);
2219 fjy2 = _mm_add_ps(fjy2,ty);
2220 fjz2 = _mm_add_ps(fjz2,tz);
2222 /**************************
2223 * CALCULATE INTERACTIONS *
2224 **************************/
2226 r13 = _mm_mul_ps(rsq13,rinv13);
2227 r13 = _mm_andnot_ps(dummy_mask,r13);
2229 /* EWALD ELECTROSTATICS */
2231 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2232 ewrt = _mm_mul_ps(r13,ewtabscale);
2233 ewitab = _mm_cvttps_epi32(ewrt);
2234 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2235 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2236 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2238 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2239 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2243 fscal = _mm_andnot_ps(dummy_mask,fscal);
2245 /* Calculate temporary vectorial force */
2246 tx = _mm_mul_ps(fscal,dx13);
2247 ty = _mm_mul_ps(fscal,dy13);
2248 tz = _mm_mul_ps(fscal,dz13);
2250 /* Update vectorial force */
2251 fix1 = _mm_add_ps(fix1,tx);
2252 fiy1 = _mm_add_ps(fiy1,ty);
2253 fiz1 = _mm_add_ps(fiz1,tz);
2255 fjx3 = _mm_add_ps(fjx3,tx);
2256 fjy3 = _mm_add_ps(fjy3,ty);
2257 fjz3 = _mm_add_ps(fjz3,tz);
2259 /**************************
2260 * CALCULATE INTERACTIONS *
2261 **************************/
2263 r21 = _mm_mul_ps(rsq21,rinv21);
2264 r21 = _mm_andnot_ps(dummy_mask,r21);
2266 /* EWALD ELECTROSTATICS */
2268 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2269 ewrt = _mm_mul_ps(r21,ewtabscale);
2270 ewitab = _mm_cvttps_epi32(ewrt);
2271 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2272 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2273 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2275 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2276 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2280 fscal = _mm_andnot_ps(dummy_mask,fscal);
2282 /* Calculate temporary vectorial force */
2283 tx = _mm_mul_ps(fscal,dx21);
2284 ty = _mm_mul_ps(fscal,dy21);
2285 tz = _mm_mul_ps(fscal,dz21);
2287 /* Update vectorial force */
2288 fix2 = _mm_add_ps(fix2,tx);
2289 fiy2 = _mm_add_ps(fiy2,ty);
2290 fiz2 = _mm_add_ps(fiz2,tz);
2292 fjx1 = _mm_add_ps(fjx1,tx);
2293 fjy1 = _mm_add_ps(fjy1,ty);
2294 fjz1 = _mm_add_ps(fjz1,tz);
2296 /**************************
2297 * CALCULATE INTERACTIONS *
2298 **************************/
2300 r22 = _mm_mul_ps(rsq22,rinv22);
2301 r22 = _mm_andnot_ps(dummy_mask,r22);
2303 /* EWALD ELECTROSTATICS */
2305 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2306 ewrt = _mm_mul_ps(r22,ewtabscale);
2307 ewitab = _mm_cvttps_epi32(ewrt);
2308 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2309 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2310 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2312 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2313 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2317 fscal = _mm_andnot_ps(dummy_mask,fscal);
2319 /* Calculate temporary vectorial force */
2320 tx = _mm_mul_ps(fscal,dx22);
2321 ty = _mm_mul_ps(fscal,dy22);
2322 tz = _mm_mul_ps(fscal,dz22);
2324 /* Update vectorial force */
2325 fix2 = _mm_add_ps(fix2,tx);
2326 fiy2 = _mm_add_ps(fiy2,ty);
2327 fiz2 = _mm_add_ps(fiz2,tz);
2329 fjx2 = _mm_add_ps(fjx2,tx);
2330 fjy2 = _mm_add_ps(fjy2,ty);
2331 fjz2 = _mm_add_ps(fjz2,tz);
2333 /**************************
2334 * CALCULATE INTERACTIONS *
2335 **************************/
2337 r23 = _mm_mul_ps(rsq23,rinv23);
2338 r23 = _mm_andnot_ps(dummy_mask,r23);
2340 /* EWALD ELECTROSTATICS */
2342 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2343 ewrt = _mm_mul_ps(r23,ewtabscale);
2344 ewitab = _mm_cvttps_epi32(ewrt);
2345 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2346 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2347 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2349 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2350 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2354 fscal = _mm_andnot_ps(dummy_mask,fscal);
2356 /* Calculate temporary vectorial force */
2357 tx = _mm_mul_ps(fscal,dx23);
2358 ty = _mm_mul_ps(fscal,dy23);
2359 tz = _mm_mul_ps(fscal,dz23);
2361 /* Update vectorial force */
2362 fix2 = _mm_add_ps(fix2,tx);
2363 fiy2 = _mm_add_ps(fiy2,ty);
2364 fiz2 = _mm_add_ps(fiz2,tz);
2366 fjx3 = _mm_add_ps(fjx3,tx);
2367 fjy3 = _mm_add_ps(fjy3,ty);
2368 fjz3 = _mm_add_ps(fjz3,tz);
2370 /**************************
2371 * CALCULATE INTERACTIONS *
2372 **************************/
2374 r31 = _mm_mul_ps(rsq31,rinv31);
2375 r31 = _mm_andnot_ps(dummy_mask,r31);
2377 /* EWALD ELECTROSTATICS */
2379 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2380 ewrt = _mm_mul_ps(r31,ewtabscale);
2381 ewitab = _mm_cvttps_epi32(ewrt);
2382 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2383 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2384 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2386 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2387 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2391 fscal = _mm_andnot_ps(dummy_mask,fscal);
2393 /* Calculate temporary vectorial force */
2394 tx = _mm_mul_ps(fscal,dx31);
2395 ty = _mm_mul_ps(fscal,dy31);
2396 tz = _mm_mul_ps(fscal,dz31);
2398 /* Update vectorial force */
2399 fix3 = _mm_add_ps(fix3,tx);
2400 fiy3 = _mm_add_ps(fiy3,ty);
2401 fiz3 = _mm_add_ps(fiz3,tz);
2403 fjx1 = _mm_add_ps(fjx1,tx);
2404 fjy1 = _mm_add_ps(fjy1,ty);
2405 fjz1 = _mm_add_ps(fjz1,tz);
2407 /**************************
2408 * CALCULATE INTERACTIONS *
2409 **************************/
2411 r32 = _mm_mul_ps(rsq32,rinv32);
2412 r32 = _mm_andnot_ps(dummy_mask,r32);
2414 /* EWALD ELECTROSTATICS */
2416 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2417 ewrt = _mm_mul_ps(r32,ewtabscale);
2418 ewitab = _mm_cvttps_epi32(ewrt);
2419 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2420 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2421 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2423 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2424 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2428 fscal = _mm_andnot_ps(dummy_mask,fscal);
2430 /* Calculate temporary vectorial force */
2431 tx = _mm_mul_ps(fscal,dx32);
2432 ty = _mm_mul_ps(fscal,dy32);
2433 tz = _mm_mul_ps(fscal,dz32);
2435 /* Update vectorial force */
2436 fix3 = _mm_add_ps(fix3,tx);
2437 fiy3 = _mm_add_ps(fiy3,ty);
2438 fiz3 = _mm_add_ps(fiz3,tz);
2440 fjx2 = _mm_add_ps(fjx2,tx);
2441 fjy2 = _mm_add_ps(fjy2,ty);
2442 fjz2 = _mm_add_ps(fjz2,tz);
2444 /**************************
2445 * CALCULATE INTERACTIONS *
2446 **************************/
2448 r33 = _mm_mul_ps(rsq33,rinv33);
2449 r33 = _mm_andnot_ps(dummy_mask,r33);
2451 /* EWALD ELECTROSTATICS */
2453 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2454 ewrt = _mm_mul_ps(r33,ewtabscale);
2455 ewitab = _mm_cvttps_epi32(ewrt);
2456 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2457 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2458 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2460 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2461 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2465 fscal = _mm_andnot_ps(dummy_mask,fscal);
2467 /* Calculate temporary vectorial force */
2468 tx = _mm_mul_ps(fscal,dx33);
2469 ty = _mm_mul_ps(fscal,dy33);
2470 tz = _mm_mul_ps(fscal,dz33);
2472 /* Update vectorial force */
2473 fix3 = _mm_add_ps(fix3,tx);
2474 fiy3 = _mm_add_ps(fiy3,ty);
2475 fiz3 = _mm_add_ps(fiz3,tz);
2477 fjx3 = _mm_add_ps(fjx3,tx);
2478 fjy3 = _mm_add_ps(fjy3,ty);
2479 fjz3 = _mm_add_ps(fjz3,tz);
2481 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2482 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2483 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2484 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2486 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2487 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2488 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2490 /* Inner loop uses 383 flops */
2493 /* End of innermost loop */
2495 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2496 f+i_coord_offset,fshift+i_shift_offset);
2498 /* Increment number of inner iterations */
2499 inneriter += j_index_end - j_index_start;
2501 /* Outer loop uses 24 flops */
2504 /* Increment number of outer iterations */
2507 /* Update outer/inner flops */
2509 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*383);