2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LJEwald
54 * Geometry: Water4-Particle
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_VF_sse4_1_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
94 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
95 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
96 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
97 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
100 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
103 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
104 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
109 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
111 __m128 one_half = _mm_set1_ps(0.5);
112 __m128 minus_one = _mm_set1_ps(-1.0);
114 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
116 __m128 dummy_mask,cutoff_mask;
117 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118 __m128 one = _mm_set1_ps(1.0);
119 __m128 two = _mm_set1_ps(2.0);
125 jindex = nlist->jindex;
127 shiftidx = nlist->shift;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = _mm_set1_ps(fr->epsfac);
132 charge = mdatoms->chargeA;
133 nvdwtype = fr->ntype;
135 vdwtype = mdatoms->typeA;
136 vdwgridparam = fr->ljpme_c6grid;
137 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
138 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
139 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
141 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
142 ewtab = fr->ic->tabq_coul_FDV0;
143 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
144 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
146 /* Setup water-specific parameters */
147 inr = nlist->iinr[0];
148 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
149 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
150 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
151 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
153 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
154 rcutoff_scalar = fr->rcoulomb;
155 rcutoff = _mm_set1_ps(rcutoff_scalar);
156 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
158 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
159 rvdw = _mm_set1_ps(fr->rvdw);
161 /* Avoid stupid compiler warnings */
162 jnrA = jnrB = jnrC = jnrD = 0;
171 for(iidx=0;iidx<4*DIM;iidx++)
176 /* Start outer loop over neighborlists */
177 for(iidx=0; iidx<nri; iidx++)
179 /* Load shift vector for this list */
180 i_shift_offset = DIM*shiftidx[iidx];
182 /* Load limits for loop over neighbors */
183 j_index_start = jindex[iidx];
184 j_index_end = jindex[iidx+1];
186 /* Get outer coordinate index */
188 i_coord_offset = DIM*inr;
190 /* Load i particle coords and add shift vector */
191 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
192 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
194 fix0 = _mm_setzero_ps();
195 fiy0 = _mm_setzero_ps();
196 fiz0 = _mm_setzero_ps();
197 fix1 = _mm_setzero_ps();
198 fiy1 = _mm_setzero_ps();
199 fiz1 = _mm_setzero_ps();
200 fix2 = _mm_setzero_ps();
201 fiy2 = _mm_setzero_ps();
202 fiz2 = _mm_setzero_ps();
203 fix3 = _mm_setzero_ps();
204 fiy3 = _mm_setzero_ps();
205 fiz3 = _mm_setzero_ps();
207 /* Reset potential sums */
208 velecsum = _mm_setzero_ps();
209 vvdwsum = _mm_setzero_ps();
211 /* Start inner kernel loop */
212 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
215 /* Get j neighbor index, and coordinate index */
220 j_coord_offsetA = DIM*jnrA;
221 j_coord_offsetB = DIM*jnrB;
222 j_coord_offsetC = DIM*jnrC;
223 j_coord_offsetD = DIM*jnrD;
225 /* load j atom coordinates */
226 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
227 x+j_coord_offsetC,x+j_coord_offsetD,
230 /* Calculate displacement vector */
231 dx00 = _mm_sub_ps(ix0,jx0);
232 dy00 = _mm_sub_ps(iy0,jy0);
233 dz00 = _mm_sub_ps(iz0,jz0);
234 dx10 = _mm_sub_ps(ix1,jx0);
235 dy10 = _mm_sub_ps(iy1,jy0);
236 dz10 = _mm_sub_ps(iz1,jz0);
237 dx20 = _mm_sub_ps(ix2,jx0);
238 dy20 = _mm_sub_ps(iy2,jy0);
239 dz20 = _mm_sub_ps(iz2,jz0);
240 dx30 = _mm_sub_ps(ix3,jx0);
241 dy30 = _mm_sub_ps(iy3,jy0);
242 dz30 = _mm_sub_ps(iz3,jz0);
244 /* Calculate squared distance and things based on it */
245 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
246 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
247 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
248 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
250 rinv00 = gmx_mm_invsqrt_ps(rsq00);
251 rinv10 = gmx_mm_invsqrt_ps(rsq10);
252 rinv20 = gmx_mm_invsqrt_ps(rsq20);
253 rinv30 = gmx_mm_invsqrt_ps(rsq30);
255 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
256 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
257 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
258 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
260 /* Load parameters for j particles */
261 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
262 charge+jnrC+0,charge+jnrD+0);
263 vdwjidx0A = 2*vdwtype[jnrA+0];
264 vdwjidx0B = 2*vdwtype[jnrB+0];
265 vdwjidx0C = 2*vdwtype[jnrC+0];
266 vdwjidx0D = 2*vdwtype[jnrD+0];
268 fjx0 = _mm_setzero_ps();
269 fjy0 = _mm_setzero_ps();
270 fjz0 = _mm_setzero_ps();
272 /**************************
273 * CALCULATE INTERACTIONS *
274 **************************/
276 if (gmx_mm_any_lt(rsq00,rcutoff2))
279 r00 = _mm_mul_ps(rsq00,rinv00);
281 /* Compute parameters for interactions between i and j atoms */
282 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
283 vdwparam+vdwioffset0+vdwjidx0B,
284 vdwparam+vdwioffset0+vdwjidx0C,
285 vdwparam+vdwioffset0+vdwjidx0D,
288 c6grid_00 = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
289 vdwgridparam+vdwioffset0+vdwjidx0B,
290 vdwgridparam+vdwioffset0+vdwjidx0C,
291 vdwgridparam+vdwioffset0+vdwjidx0D);
293 /* Analytical LJ-PME */
294 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
295 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
296 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
297 exponent = gmx_simd_exp_r(ewcljrsq);
298 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
299 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
300 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
301 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
302 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
303 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
304 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
305 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
306 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
308 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
310 /* Update potential sum for this i atom from the interaction with this j atom. */
311 vvdw = _mm_and_ps(vvdw,cutoff_mask);
312 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
316 fscal = _mm_and_ps(fscal,cutoff_mask);
318 /* Calculate temporary vectorial force */
319 tx = _mm_mul_ps(fscal,dx00);
320 ty = _mm_mul_ps(fscal,dy00);
321 tz = _mm_mul_ps(fscal,dz00);
323 /* Update vectorial force */
324 fix0 = _mm_add_ps(fix0,tx);
325 fiy0 = _mm_add_ps(fiy0,ty);
326 fiz0 = _mm_add_ps(fiz0,tz);
328 fjx0 = _mm_add_ps(fjx0,tx);
329 fjy0 = _mm_add_ps(fjy0,ty);
330 fjz0 = _mm_add_ps(fjz0,tz);
334 /**************************
335 * CALCULATE INTERACTIONS *
336 **************************/
338 if (gmx_mm_any_lt(rsq10,rcutoff2))
341 r10 = _mm_mul_ps(rsq10,rinv10);
343 /* Compute parameters for interactions between i and j atoms */
344 qq10 = _mm_mul_ps(iq1,jq0);
346 /* EWALD ELECTROSTATICS */
348 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
349 ewrt = _mm_mul_ps(r10,ewtabscale);
350 ewitab = _mm_cvttps_epi32(ewrt);
351 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
352 ewitab = _mm_slli_epi32(ewitab,2);
353 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
354 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
355 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
356 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
357 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
358 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
359 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
360 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
361 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
363 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
365 /* Update potential sum for this i atom from the interaction with this j atom. */
366 velec = _mm_and_ps(velec,cutoff_mask);
367 velecsum = _mm_add_ps(velecsum,velec);
371 fscal = _mm_and_ps(fscal,cutoff_mask);
373 /* Calculate temporary vectorial force */
374 tx = _mm_mul_ps(fscal,dx10);
375 ty = _mm_mul_ps(fscal,dy10);
376 tz = _mm_mul_ps(fscal,dz10);
378 /* Update vectorial force */
379 fix1 = _mm_add_ps(fix1,tx);
380 fiy1 = _mm_add_ps(fiy1,ty);
381 fiz1 = _mm_add_ps(fiz1,tz);
383 fjx0 = _mm_add_ps(fjx0,tx);
384 fjy0 = _mm_add_ps(fjy0,ty);
385 fjz0 = _mm_add_ps(fjz0,tz);
389 /**************************
390 * CALCULATE INTERACTIONS *
391 **************************/
393 if (gmx_mm_any_lt(rsq20,rcutoff2))
396 r20 = _mm_mul_ps(rsq20,rinv20);
398 /* Compute parameters for interactions between i and j atoms */
399 qq20 = _mm_mul_ps(iq2,jq0);
401 /* EWALD ELECTROSTATICS */
403 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
404 ewrt = _mm_mul_ps(r20,ewtabscale);
405 ewitab = _mm_cvttps_epi32(ewrt);
406 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
407 ewitab = _mm_slli_epi32(ewitab,2);
408 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
409 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
410 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
411 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
412 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
413 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
414 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
415 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
416 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
418 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
420 /* Update potential sum for this i atom from the interaction with this j atom. */
421 velec = _mm_and_ps(velec,cutoff_mask);
422 velecsum = _mm_add_ps(velecsum,velec);
426 fscal = _mm_and_ps(fscal,cutoff_mask);
428 /* Calculate temporary vectorial force */
429 tx = _mm_mul_ps(fscal,dx20);
430 ty = _mm_mul_ps(fscal,dy20);
431 tz = _mm_mul_ps(fscal,dz20);
433 /* Update vectorial force */
434 fix2 = _mm_add_ps(fix2,tx);
435 fiy2 = _mm_add_ps(fiy2,ty);
436 fiz2 = _mm_add_ps(fiz2,tz);
438 fjx0 = _mm_add_ps(fjx0,tx);
439 fjy0 = _mm_add_ps(fjy0,ty);
440 fjz0 = _mm_add_ps(fjz0,tz);
444 /**************************
445 * CALCULATE INTERACTIONS *
446 **************************/
448 if (gmx_mm_any_lt(rsq30,rcutoff2))
451 r30 = _mm_mul_ps(rsq30,rinv30);
453 /* Compute parameters for interactions between i and j atoms */
454 qq30 = _mm_mul_ps(iq3,jq0);
456 /* EWALD ELECTROSTATICS */
458 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
459 ewrt = _mm_mul_ps(r30,ewtabscale);
460 ewitab = _mm_cvttps_epi32(ewrt);
461 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
462 ewitab = _mm_slli_epi32(ewitab,2);
463 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
464 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
465 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
466 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
467 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
468 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
469 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
470 velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
471 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
473 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
475 /* Update potential sum for this i atom from the interaction with this j atom. */
476 velec = _mm_and_ps(velec,cutoff_mask);
477 velecsum = _mm_add_ps(velecsum,velec);
481 fscal = _mm_and_ps(fscal,cutoff_mask);
483 /* Calculate temporary vectorial force */
484 tx = _mm_mul_ps(fscal,dx30);
485 ty = _mm_mul_ps(fscal,dy30);
486 tz = _mm_mul_ps(fscal,dz30);
488 /* Update vectorial force */
489 fix3 = _mm_add_ps(fix3,tx);
490 fiy3 = _mm_add_ps(fiy3,ty);
491 fiz3 = _mm_add_ps(fiz3,tz);
493 fjx0 = _mm_add_ps(fjx0,tx);
494 fjy0 = _mm_add_ps(fjy0,ty);
495 fjz0 = _mm_add_ps(fjz0,tz);
499 fjptrA = f+j_coord_offsetA;
500 fjptrB = f+j_coord_offsetB;
501 fjptrC = f+j_coord_offsetC;
502 fjptrD = f+j_coord_offsetD;
504 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
506 /* Inner loop uses 200 flops */
512 /* Get j neighbor index, and coordinate index */
513 jnrlistA = jjnr[jidx];
514 jnrlistB = jjnr[jidx+1];
515 jnrlistC = jjnr[jidx+2];
516 jnrlistD = jjnr[jidx+3];
517 /* Sign of each element will be negative for non-real atoms.
518 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
519 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
521 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
522 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
523 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
524 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
525 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
526 j_coord_offsetA = DIM*jnrA;
527 j_coord_offsetB = DIM*jnrB;
528 j_coord_offsetC = DIM*jnrC;
529 j_coord_offsetD = DIM*jnrD;
531 /* load j atom coordinates */
532 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
533 x+j_coord_offsetC,x+j_coord_offsetD,
536 /* Calculate displacement vector */
537 dx00 = _mm_sub_ps(ix0,jx0);
538 dy00 = _mm_sub_ps(iy0,jy0);
539 dz00 = _mm_sub_ps(iz0,jz0);
540 dx10 = _mm_sub_ps(ix1,jx0);
541 dy10 = _mm_sub_ps(iy1,jy0);
542 dz10 = _mm_sub_ps(iz1,jz0);
543 dx20 = _mm_sub_ps(ix2,jx0);
544 dy20 = _mm_sub_ps(iy2,jy0);
545 dz20 = _mm_sub_ps(iz2,jz0);
546 dx30 = _mm_sub_ps(ix3,jx0);
547 dy30 = _mm_sub_ps(iy3,jy0);
548 dz30 = _mm_sub_ps(iz3,jz0);
550 /* Calculate squared distance and things based on it */
551 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
552 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
553 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
554 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
556 rinv00 = gmx_mm_invsqrt_ps(rsq00);
557 rinv10 = gmx_mm_invsqrt_ps(rsq10);
558 rinv20 = gmx_mm_invsqrt_ps(rsq20);
559 rinv30 = gmx_mm_invsqrt_ps(rsq30);
561 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
562 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
563 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
564 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
566 /* Load parameters for j particles */
567 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
568 charge+jnrC+0,charge+jnrD+0);
569 vdwjidx0A = 2*vdwtype[jnrA+0];
570 vdwjidx0B = 2*vdwtype[jnrB+0];
571 vdwjidx0C = 2*vdwtype[jnrC+0];
572 vdwjidx0D = 2*vdwtype[jnrD+0];
574 fjx0 = _mm_setzero_ps();
575 fjy0 = _mm_setzero_ps();
576 fjz0 = _mm_setzero_ps();
578 /**************************
579 * CALCULATE INTERACTIONS *
580 **************************/
582 if (gmx_mm_any_lt(rsq00,rcutoff2))
585 r00 = _mm_mul_ps(rsq00,rinv00);
586 r00 = _mm_andnot_ps(dummy_mask,r00);
588 /* Compute parameters for interactions between i and j atoms */
589 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
590 vdwparam+vdwioffset0+vdwjidx0B,
591 vdwparam+vdwioffset0+vdwjidx0C,
592 vdwparam+vdwioffset0+vdwjidx0D,
595 c6grid_00 = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
596 vdwgridparam+vdwioffset0+vdwjidx0B,
597 vdwgridparam+vdwioffset0+vdwjidx0C,
598 vdwgridparam+vdwioffset0+vdwjidx0D);
600 /* Analytical LJ-PME */
601 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
602 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
603 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
604 exponent = gmx_simd_exp_r(ewcljrsq);
605 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
606 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
607 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
608 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
609 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
610 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
611 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
612 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
613 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
615 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
617 /* Update potential sum for this i atom from the interaction with this j atom. */
618 vvdw = _mm_and_ps(vvdw,cutoff_mask);
619 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
620 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
624 fscal = _mm_and_ps(fscal,cutoff_mask);
626 fscal = _mm_andnot_ps(dummy_mask,fscal);
628 /* Calculate temporary vectorial force */
629 tx = _mm_mul_ps(fscal,dx00);
630 ty = _mm_mul_ps(fscal,dy00);
631 tz = _mm_mul_ps(fscal,dz00);
633 /* Update vectorial force */
634 fix0 = _mm_add_ps(fix0,tx);
635 fiy0 = _mm_add_ps(fiy0,ty);
636 fiz0 = _mm_add_ps(fiz0,tz);
638 fjx0 = _mm_add_ps(fjx0,tx);
639 fjy0 = _mm_add_ps(fjy0,ty);
640 fjz0 = _mm_add_ps(fjz0,tz);
644 /**************************
645 * CALCULATE INTERACTIONS *
646 **************************/
648 if (gmx_mm_any_lt(rsq10,rcutoff2))
651 r10 = _mm_mul_ps(rsq10,rinv10);
652 r10 = _mm_andnot_ps(dummy_mask,r10);
654 /* Compute parameters for interactions between i and j atoms */
655 qq10 = _mm_mul_ps(iq1,jq0);
657 /* EWALD ELECTROSTATICS */
659 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
660 ewrt = _mm_mul_ps(r10,ewtabscale);
661 ewitab = _mm_cvttps_epi32(ewrt);
662 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
663 ewitab = _mm_slli_epi32(ewitab,2);
664 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
665 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
666 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
667 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
668 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
669 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
670 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
671 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
672 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
674 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
676 /* Update potential sum for this i atom from the interaction with this j atom. */
677 velec = _mm_and_ps(velec,cutoff_mask);
678 velec = _mm_andnot_ps(dummy_mask,velec);
679 velecsum = _mm_add_ps(velecsum,velec);
683 fscal = _mm_and_ps(fscal,cutoff_mask);
685 fscal = _mm_andnot_ps(dummy_mask,fscal);
687 /* Calculate temporary vectorial force */
688 tx = _mm_mul_ps(fscal,dx10);
689 ty = _mm_mul_ps(fscal,dy10);
690 tz = _mm_mul_ps(fscal,dz10);
692 /* Update vectorial force */
693 fix1 = _mm_add_ps(fix1,tx);
694 fiy1 = _mm_add_ps(fiy1,ty);
695 fiz1 = _mm_add_ps(fiz1,tz);
697 fjx0 = _mm_add_ps(fjx0,tx);
698 fjy0 = _mm_add_ps(fjy0,ty);
699 fjz0 = _mm_add_ps(fjz0,tz);
703 /**************************
704 * CALCULATE INTERACTIONS *
705 **************************/
707 if (gmx_mm_any_lt(rsq20,rcutoff2))
710 r20 = _mm_mul_ps(rsq20,rinv20);
711 r20 = _mm_andnot_ps(dummy_mask,r20);
713 /* Compute parameters for interactions between i and j atoms */
714 qq20 = _mm_mul_ps(iq2,jq0);
716 /* EWALD ELECTROSTATICS */
718 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
719 ewrt = _mm_mul_ps(r20,ewtabscale);
720 ewitab = _mm_cvttps_epi32(ewrt);
721 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
722 ewitab = _mm_slli_epi32(ewitab,2);
723 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
724 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
725 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
726 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
727 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
728 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
729 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
730 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
731 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
733 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
735 /* Update potential sum for this i atom from the interaction with this j atom. */
736 velec = _mm_and_ps(velec,cutoff_mask);
737 velec = _mm_andnot_ps(dummy_mask,velec);
738 velecsum = _mm_add_ps(velecsum,velec);
742 fscal = _mm_and_ps(fscal,cutoff_mask);
744 fscal = _mm_andnot_ps(dummy_mask,fscal);
746 /* Calculate temporary vectorial force */
747 tx = _mm_mul_ps(fscal,dx20);
748 ty = _mm_mul_ps(fscal,dy20);
749 tz = _mm_mul_ps(fscal,dz20);
751 /* Update vectorial force */
752 fix2 = _mm_add_ps(fix2,tx);
753 fiy2 = _mm_add_ps(fiy2,ty);
754 fiz2 = _mm_add_ps(fiz2,tz);
756 fjx0 = _mm_add_ps(fjx0,tx);
757 fjy0 = _mm_add_ps(fjy0,ty);
758 fjz0 = _mm_add_ps(fjz0,tz);
762 /**************************
763 * CALCULATE INTERACTIONS *
764 **************************/
766 if (gmx_mm_any_lt(rsq30,rcutoff2))
769 r30 = _mm_mul_ps(rsq30,rinv30);
770 r30 = _mm_andnot_ps(dummy_mask,r30);
772 /* Compute parameters for interactions between i and j atoms */
773 qq30 = _mm_mul_ps(iq3,jq0);
775 /* EWALD ELECTROSTATICS */
777 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
778 ewrt = _mm_mul_ps(r30,ewtabscale);
779 ewitab = _mm_cvttps_epi32(ewrt);
780 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
781 ewitab = _mm_slli_epi32(ewitab,2);
782 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
783 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
784 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
785 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
786 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
787 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
788 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
789 velec = _mm_mul_ps(qq30,_mm_sub_ps(_mm_sub_ps(rinv30,sh_ewald),velec));
790 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
792 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
794 /* Update potential sum for this i atom from the interaction with this j atom. */
795 velec = _mm_and_ps(velec,cutoff_mask);
796 velec = _mm_andnot_ps(dummy_mask,velec);
797 velecsum = _mm_add_ps(velecsum,velec);
801 fscal = _mm_and_ps(fscal,cutoff_mask);
803 fscal = _mm_andnot_ps(dummy_mask,fscal);
805 /* Calculate temporary vectorial force */
806 tx = _mm_mul_ps(fscal,dx30);
807 ty = _mm_mul_ps(fscal,dy30);
808 tz = _mm_mul_ps(fscal,dz30);
810 /* Update vectorial force */
811 fix3 = _mm_add_ps(fix3,tx);
812 fiy3 = _mm_add_ps(fiy3,ty);
813 fiz3 = _mm_add_ps(fiz3,tz);
815 fjx0 = _mm_add_ps(fjx0,tx);
816 fjy0 = _mm_add_ps(fjy0,ty);
817 fjz0 = _mm_add_ps(fjz0,tz);
821 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
822 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
823 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
824 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
826 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
828 /* Inner loop uses 204 flops */
831 /* End of innermost loop */
833 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
834 f+i_coord_offset,fshift+i_shift_offset);
837 /* Update potential energies */
838 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
839 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
841 /* Increment number of inner iterations */
842 inneriter += j_index_end - j_index_start;
844 /* Outer loop uses 26 flops */
847 /* Increment number of outer iterations */
850 /* Update outer/inner flops */
852 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*204);
855 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_single
856 * Electrostatics interaction: Ewald
857 * VdW interaction: LJEwald
858 * Geometry: Water4-Particle
859 * Calculate force/pot: Force
862 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4P1_F_sse4_1_single
863 (t_nblist * gmx_restrict nlist,
864 rvec * gmx_restrict xx,
865 rvec * gmx_restrict ff,
866 t_forcerec * gmx_restrict fr,
867 t_mdatoms * gmx_restrict mdatoms,
868 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
869 t_nrnb * gmx_restrict nrnb)
871 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
872 * just 0 for non-waters.
873 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
874 * jnr indices corresponding to data put in the four positions in the SIMD register.
876 int i_shift_offset,i_coord_offset,outeriter,inneriter;
877 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
878 int jnrA,jnrB,jnrC,jnrD;
879 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
880 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
881 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
883 real *shiftvec,*fshift,*x,*f;
884 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
886 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
888 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
890 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
892 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
894 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
895 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
896 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
897 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
898 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
899 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
900 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
901 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
904 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
907 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
908 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
913 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
915 __m128 one_half = _mm_set1_ps(0.5);
916 __m128 minus_one = _mm_set1_ps(-1.0);
918 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
920 __m128 dummy_mask,cutoff_mask;
921 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
922 __m128 one = _mm_set1_ps(1.0);
923 __m128 two = _mm_set1_ps(2.0);
929 jindex = nlist->jindex;
931 shiftidx = nlist->shift;
933 shiftvec = fr->shift_vec[0];
934 fshift = fr->fshift[0];
935 facel = _mm_set1_ps(fr->epsfac);
936 charge = mdatoms->chargeA;
937 nvdwtype = fr->ntype;
939 vdwtype = mdatoms->typeA;
940 vdwgridparam = fr->ljpme_c6grid;
941 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
942 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
943 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
945 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
946 ewtab = fr->ic->tabq_coul_F;
947 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
948 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
950 /* Setup water-specific parameters */
951 inr = nlist->iinr[0];
952 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
953 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
954 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
955 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
957 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
958 rcutoff_scalar = fr->rcoulomb;
959 rcutoff = _mm_set1_ps(rcutoff_scalar);
960 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
962 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
963 rvdw = _mm_set1_ps(fr->rvdw);
965 /* Avoid stupid compiler warnings */
966 jnrA = jnrB = jnrC = jnrD = 0;
975 for(iidx=0;iidx<4*DIM;iidx++)
980 /* Start outer loop over neighborlists */
981 for(iidx=0; iidx<nri; iidx++)
983 /* Load shift vector for this list */
984 i_shift_offset = DIM*shiftidx[iidx];
986 /* Load limits for loop over neighbors */
987 j_index_start = jindex[iidx];
988 j_index_end = jindex[iidx+1];
990 /* Get outer coordinate index */
992 i_coord_offset = DIM*inr;
994 /* Load i particle coords and add shift vector */
995 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
996 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
998 fix0 = _mm_setzero_ps();
999 fiy0 = _mm_setzero_ps();
1000 fiz0 = _mm_setzero_ps();
1001 fix1 = _mm_setzero_ps();
1002 fiy1 = _mm_setzero_ps();
1003 fiz1 = _mm_setzero_ps();
1004 fix2 = _mm_setzero_ps();
1005 fiy2 = _mm_setzero_ps();
1006 fiz2 = _mm_setzero_ps();
1007 fix3 = _mm_setzero_ps();
1008 fiy3 = _mm_setzero_ps();
1009 fiz3 = _mm_setzero_ps();
1011 /* Start inner kernel loop */
1012 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1015 /* Get j neighbor index, and coordinate index */
1017 jnrB = jjnr[jidx+1];
1018 jnrC = jjnr[jidx+2];
1019 jnrD = jjnr[jidx+3];
1020 j_coord_offsetA = DIM*jnrA;
1021 j_coord_offsetB = DIM*jnrB;
1022 j_coord_offsetC = DIM*jnrC;
1023 j_coord_offsetD = DIM*jnrD;
1025 /* load j atom coordinates */
1026 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1027 x+j_coord_offsetC,x+j_coord_offsetD,
1030 /* Calculate displacement vector */
1031 dx00 = _mm_sub_ps(ix0,jx0);
1032 dy00 = _mm_sub_ps(iy0,jy0);
1033 dz00 = _mm_sub_ps(iz0,jz0);
1034 dx10 = _mm_sub_ps(ix1,jx0);
1035 dy10 = _mm_sub_ps(iy1,jy0);
1036 dz10 = _mm_sub_ps(iz1,jz0);
1037 dx20 = _mm_sub_ps(ix2,jx0);
1038 dy20 = _mm_sub_ps(iy2,jy0);
1039 dz20 = _mm_sub_ps(iz2,jz0);
1040 dx30 = _mm_sub_ps(ix3,jx0);
1041 dy30 = _mm_sub_ps(iy3,jy0);
1042 dz30 = _mm_sub_ps(iz3,jz0);
1044 /* Calculate squared distance and things based on it */
1045 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1046 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1047 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1048 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1050 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1051 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1052 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1053 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1055 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1056 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1057 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1058 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1060 /* Load parameters for j particles */
1061 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1062 charge+jnrC+0,charge+jnrD+0);
1063 vdwjidx0A = 2*vdwtype[jnrA+0];
1064 vdwjidx0B = 2*vdwtype[jnrB+0];
1065 vdwjidx0C = 2*vdwtype[jnrC+0];
1066 vdwjidx0D = 2*vdwtype[jnrD+0];
1068 fjx0 = _mm_setzero_ps();
1069 fjy0 = _mm_setzero_ps();
1070 fjz0 = _mm_setzero_ps();
1072 /**************************
1073 * CALCULATE INTERACTIONS *
1074 **************************/
1076 if (gmx_mm_any_lt(rsq00,rcutoff2))
1079 r00 = _mm_mul_ps(rsq00,rinv00);
1081 /* Compute parameters for interactions between i and j atoms */
1082 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1083 vdwparam+vdwioffset0+vdwjidx0B,
1084 vdwparam+vdwioffset0+vdwjidx0C,
1085 vdwparam+vdwioffset0+vdwjidx0D,
1088 c6grid_00 = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
1089 vdwgridparam+vdwioffset0+vdwjidx0B,
1090 vdwgridparam+vdwioffset0+vdwjidx0C,
1091 vdwgridparam+vdwioffset0+vdwjidx0D);
1093 /* Analytical LJ-PME */
1094 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1095 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1096 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1097 exponent = gmx_simd_exp_r(ewcljrsq);
1098 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1099 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1100 /* f6A = 6 * C6grid * (1 - poly) */
1101 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1102 /* f6B = C6grid * exponent * beta^6 */
1103 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1104 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1105 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1107 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1111 fscal = _mm_and_ps(fscal,cutoff_mask);
1113 /* Calculate temporary vectorial force */
1114 tx = _mm_mul_ps(fscal,dx00);
1115 ty = _mm_mul_ps(fscal,dy00);
1116 tz = _mm_mul_ps(fscal,dz00);
1118 /* Update vectorial force */
1119 fix0 = _mm_add_ps(fix0,tx);
1120 fiy0 = _mm_add_ps(fiy0,ty);
1121 fiz0 = _mm_add_ps(fiz0,tz);
1123 fjx0 = _mm_add_ps(fjx0,tx);
1124 fjy0 = _mm_add_ps(fjy0,ty);
1125 fjz0 = _mm_add_ps(fjz0,tz);
1129 /**************************
1130 * CALCULATE INTERACTIONS *
1131 **************************/
1133 if (gmx_mm_any_lt(rsq10,rcutoff2))
1136 r10 = _mm_mul_ps(rsq10,rinv10);
1138 /* Compute parameters for interactions between i and j atoms */
1139 qq10 = _mm_mul_ps(iq1,jq0);
1141 /* EWALD ELECTROSTATICS */
1143 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1144 ewrt = _mm_mul_ps(r10,ewtabscale);
1145 ewitab = _mm_cvttps_epi32(ewrt);
1146 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1147 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1148 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1150 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1151 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1153 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1157 fscal = _mm_and_ps(fscal,cutoff_mask);
1159 /* Calculate temporary vectorial force */
1160 tx = _mm_mul_ps(fscal,dx10);
1161 ty = _mm_mul_ps(fscal,dy10);
1162 tz = _mm_mul_ps(fscal,dz10);
1164 /* Update vectorial force */
1165 fix1 = _mm_add_ps(fix1,tx);
1166 fiy1 = _mm_add_ps(fiy1,ty);
1167 fiz1 = _mm_add_ps(fiz1,tz);
1169 fjx0 = _mm_add_ps(fjx0,tx);
1170 fjy0 = _mm_add_ps(fjy0,ty);
1171 fjz0 = _mm_add_ps(fjz0,tz);
1175 /**************************
1176 * CALCULATE INTERACTIONS *
1177 **************************/
1179 if (gmx_mm_any_lt(rsq20,rcutoff2))
1182 r20 = _mm_mul_ps(rsq20,rinv20);
1184 /* Compute parameters for interactions between i and j atoms */
1185 qq20 = _mm_mul_ps(iq2,jq0);
1187 /* EWALD ELECTROSTATICS */
1189 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1190 ewrt = _mm_mul_ps(r20,ewtabscale);
1191 ewitab = _mm_cvttps_epi32(ewrt);
1192 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1193 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1194 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1196 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1197 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1199 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1203 fscal = _mm_and_ps(fscal,cutoff_mask);
1205 /* Calculate temporary vectorial force */
1206 tx = _mm_mul_ps(fscal,dx20);
1207 ty = _mm_mul_ps(fscal,dy20);
1208 tz = _mm_mul_ps(fscal,dz20);
1210 /* Update vectorial force */
1211 fix2 = _mm_add_ps(fix2,tx);
1212 fiy2 = _mm_add_ps(fiy2,ty);
1213 fiz2 = _mm_add_ps(fiz2,tz);
1215 fjx0 = _mm_add_ps(fjx0,tx);
1216 fjy0 = _mm_add_ps(fjy0,ty);
1217 fjz0 = _mm_add_ps(fjz0,tz);
1221 /**************************
1222 * CALCULATE INTERACTIONS *
1223 **************************/
1225 if (gmx_mm_any_lt(rsq30,rcutoff2))
1228 r30 = _mm_mul_ps(rsq30,rinv30);
1230 /* Compute parameters for interactions between i and j atoms */
1231 qq30 = _mm_mul_ps(iq3,jq0);
1233 /* EWALD ELECTROSTATICS */
1235 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1236 ewrt = _mm_mul_ps(r30,ewtabscale);
1237 ewitab = _mm_cvttps_epi32(ewrt);
1238 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1239 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1240 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1242 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1243 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1245 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1249 fscal = _mm_and_ps(fscal,cutoff_mask);
1251 /* Calculate temporary vectorial force */
1252 tx = _mm_mul_ps(fscal,dx30);
1253 ty = _mm_mul_ps(fscal,dy30);
1254 tz = _mm_mul_ps(fscal,dz30);
1256 /* Update vectorial force */
1257 fix3 = _mm_add_ps(fix3,tx);
1258 fiy3 = _mm_add_ps(fiy3,ty);
1259 fiz3 = _mm_add_ps(fiz3,tz);
1261 fjx0 = _mm_add_ps(fjx0,tx);
1262 fjy0 = _mm_add_ps(fjy0,ty);
1263 fjz0 = _mm_add_ps(fjz0,tz);
1267 fjptrA = f+j_coord_offsetA;
1268 fjptrB = f+j_coord_offsetB;
1269 fjptrC = f+j_coord_offsetC;
1270 fjptrD = f+j_coord_offsetD;
1272 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1274 /* Inner loop uses 166 flops */
1277 if(jidx<j_index_end)
1280 /* Get j neighbor index, and coordinate index */
1281 jnrlistA = jjnr[jidx];
1282 jnrlistB = jjnr[jidx+1];
1283 jnrlistC = jjnr[jidx+2];
1284 jnrlistD = jjnr[jidx+3];
1285 /* Sign of each element will be negative for non-real atoms.
1286 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1287 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1289 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1290 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1291 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1292 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1293 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1294 j_coord_offsetA = DIM*jnrA;
1295 j_coord_offsetB = DIM*jnrB;
1296 j_coord_offsetC = DIM*jnrC;
1297 j_coord_offsetD = DIM*jnrD;
1299 /* load j atom coordinates */
1300 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1301 x+j_coord_offsetC,x+j_coord_offsetD,
1304 /* Calculate displacement vector */
1305 dx00 = _mm_sub_ps(ix0,jx0);
1306 dy00 = _mm_sub_ps(iy0,jy0);
1307 dz00 = _mm_sub_ps(iz0,jz0);
1308 dx10 = _mm_sub_ps(ix1,jx0);
1309 dy10 = _mm_sub_ps(iy1,jy0);
1310 dz10 = _mm_sub_ps(iz1,jz0);
1311 dx20 = _mm_sub_ps(ix2,jx0);
1312 dy20 = _mm_sub_ps(iy2,jy0);
1313 dz20 = _mm_sub_ps(iz2,jz0);
1314 dx30 = _mm_sub_ps(ix3,jx0);
1315 dy30 = _mm_sub_ps(iy3,jy0);
1316 dz30 = _mm_sub_ps(iz3,jz0);
1318 /* Calculate squared distance and things based on it */
1319 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1320 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1321 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1322 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1324 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1325 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1326 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1327 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1329 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1330 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1331 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1332 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1334 /* Load parameters for j particles */
1335 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1336 charge+jnrC+0,charge+jnrD+0);
1337 vdwjidx0A = 2*vdwtype[jnrA+0];
1338 vdwjidx0B = 2*vdwtype[jnrB+0];
1339 vdwjidx0C = 2*vdwtype[jnrC+0];
1340 vdwjidx0D = 2*vdwtype[jnrD+0];
1342 fjx0 = _mm_setzero_ps();
1343 fjy0 = _mm_setzero_ps();
1344 fjz0 = _mm_setzero_ps();
1346 /**************************
1347 * CALCULATE INTERACTIONS *
1348 **************************/
1350 if (gmx_mm_any_lt(rsq00,rcutoff2))
1353 r00 = _mm_mul_ps(rsq00,rinv00);
1354 r00 = _mm_andnot_ps(dummy_mask,r00);
1356 /* Compute parameters for interactions between i and j atoms */
1357 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1358 vdwparam+vdwioffset0+vdwjidx0B,
1359 vdwparam+vdwioffset0+vdwjidx0C,
1360 vdwparam+vdwioffset0+vdwjidx0D,
1363 c6grid_00 = gmx_mm_load_4real_swizzle_ps(vdwgridparam+vdwioffset0+vdwjidx0A,
1364 vdwgridparam+vdwioffset0+vdwjidx0B,
1365 vdwgridparam+vdwioffset0+vdwjidx0C,
1366 vdwgridparam+vdwioffset0+vdwjidx0D);
1368 /* Analytical LJ-PME */
1369 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1370 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1371 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1372 exponent = gmx_simd_exp_r(ewcljrsq);
1373 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1374 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1375 /* f6A = 6 * C6grid * (1 - poly) */
1376 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1377 /* f6B = C6grid * exponent * beta^6 */
1378 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1379 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1380 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1382 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1386 fscal = _mm_and_ps(fscal,cutoff_mask);
1388 fscal = _mm_andnot_ps(dummy_mask,fscal);
1390 /* Calculate temporary vectorial force */
1391 tx = _mm_mul_ps(fscal,dx00);
1392 ty = _mm_mul_ps(fscal,dy00);
1393 tz = _mm_mul_ps(fscal,dz00);
1395 /* Update vectorial force */
1396 fix0 = _mm_add_ps(fix0,tx);
1397 fiy0 = _mm_add_ps(fiy0,ty);
1398 fiz0 = _mm_add_ps(fiz0,tz);
1400 fjx0 = _mm_add_ps(fjx0,tx);
1401 fjy0 = _mm_add_ps(fjy0,ty);
1402 fjz0 = _mm_add_ps(fjz0,tz);
1406 /**************************
1407 * CALCULATE INTERACTIONS *
1408 **************************/
1410 if (gmx_mm_any_lt(rsq10,rcutoff2))
1413 r10 = _mm_mul_ps(rsq10,rinv10);
1414 r10 = _mm_andnot_ps(dummy_mask,r10);
1416 /* Compute parameters for interactions between i and j atoms */
1417 qq10 = _mm_mul_ps(iq1,jq0);
1419 /* EWALD ELECTROSTATICS */
1421 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1422 ewrt = _mm_mul_ps(r10,ewtabscale);
1423 ewitab = _mm_cvttps_epi32(ewrt);
1424 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1425 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1426 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1428 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1429 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1431 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1435 fscal = _mm_and_ps(fscal,cutoff_mask);
1437 fscal = _mm_andnot_ps(dummy_mask,fscal);
1439 /* Calculate temporary vectorial force */
1440 tx = _mm_mul_ps(fscal,dx10);
1441 ty = _mm_mul_ps(fscal,dy10);
1442 tz = _mm_mul_ps(fscal,dz10);
1444 /* Update vectorial force */
1445 fix1 = _mm_add_ps(fix1,tx);
1446 fiy1 = _mm_add_ps(fiy1,ty);
1447 fiz1 = _mm_add_ps(fiz1,tz);
1449 fjx0 = _mm_add_ps(fjx0,tx);
1450 fjy0 = _mm_add_ps(fjy0,ty);
1451 fjz0 = _mm_add_ps(fjz0,tz);
1455 /**************************
1456 * CALCULATE INTERACTIONS *
1457 **************************/
1459 if (gmx_mm_any_lt(rsq20,rcutoff2))
1462 r20 = _mm_mul_ps(rsq20,rinv20);
1463 r20 = _mm_andnot_ps(dummy_mask,r20);
1465 /* Compute parameters for interactions between i and j atoms */
1466 qq20 = _mm_mul_ps(iq2,jq0);
1468 /* EWALD ELECTROSTATICS */
1470 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1471 ewrt = _mm_mul_ps(r20,ewtabscale);
1472 ewitab = _mm_cvttps_epi32(ewrt);
1473 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1474 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1475 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1477 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1478 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1480 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1484 fscal = _mm_and_ps(fscal,cutoff_mask);
1486 fscal = _mm_andnot_ps(dummy_mask,fscal);
1488 /* Calculate temporary vectorial force */
1489 tx = _mm_mul_ps(fscal,dx20);
1490 ty = _mm_mul_ps(fscal,dy20);
1491 tz = _mm_mul_ps(fscal,dz20);
1493 /* Update vectorial force */
1494 fix2 = _mm_add_ps(fix2,tx);
1495 fiy2 = _mm_add_ps(fiy2,ty);
1496 fiz2 = _mm_add_ps(fiz2,tz);
1498 fjx0 = _mm_add_ps(fjx0,tx);
1499 fjy0 = _mm_add_ps(fjy0,ty);
1500 fjz0 = _mm_add_ps(fjz0,tz);
1504 /**************************
1505 * CALCULATE INTERACTIONS *
1506 **************************/
1508 if (gmx_mm_any_lt(rsq30,rcutoff2))
1511 r30 = _mm_mul_ps(rsq30,rinv30);
1512 r30 = _mm_andnot_ps(dummy_mask,r30);
1514 /* Compute parameters for interactions between i and j atoms */
1515 qq30 = _mm_mul_ps(iq3,jq0);
1517 /* EWALD ELECTROSTATICS */
1519 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1520 ewrt = _mm_mul_ps(r30,ewtabscale);
1521 ewitab = _mm_cvttps_epi32(ewrt);
1522 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1523 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1524 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1526 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1527 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1529 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1533 fscal = _mm_and_ps(fscal,cutoff_mask);
1535 fscal = _mm_andnot_ps(dummy_mask,fscal);
1537 /* Calculate temporary vectorial force */
1538 tx = _mm_mul_ps(fscal,dx30);
1539 ty = _mm_mul_ps(fscal,dy30);
1540 tz = _mm_mul_ps(fscal,dz30);
1542 /* Update vectorial force */
1543 fix3 = _mm_add_ps(fix3,tx);
1544 fiy3 = _mm_add_ps(fiy3,ty);
1545 fiz3 = _mm_add_ps(fiz3,tz);
1547 fjx0 = _mm_add_ps(fjx0,tx);
1548 fjy0 = _mm_add_ps(fjy0,ty);
1549 fjz0 = _mm_add_ps(fjz0,tz);
1553 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1554 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1555 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1556 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1558 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1560 /* Inner loop uses 170 flops */
1563 /* End of innermost loop */
1565 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1566 f+i_coord_offset,fshift+i_shift_offset);
1568 /* Increment number of inner iterations */
1569 inneriter += j_index_end - j_index_start;
1571 /* Outer loop uses 24 flops */
1574 /* Increment number of outer iterations */
1577 /* Update outer/inner flops */
1579 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*170);