2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_VF_sse2_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
94 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
95 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
96 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
99 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
103 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
105 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
107 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
108 real rswitch_scalar,d_scalar;
109 __m128 dummy_mask,cutoff_mask;
110 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
111 __m128 one = _mm_set1_ps(1.0);
112 __m128 two = _mm_set1_ps(2.0);
118 jindex = nlist->jindex;
120 shiftidx = nlist->shift;
122 shiftvec = fr->shift_vec[0];
123 fshift = fr->fshift[0];
124 facel = _mm_set1_ps(fr->ic->epsfac);
125 charge = mdatoms->chargeA;
126 nvdwtype = fr->ntype;
128 vdwtype = mdatoms->typeA;
130 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
131 ewtab = fr->ic->tabq_coul_FDV0;
132 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
133 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
135 /* Setup water-specific parameters */
136 inr = nlist->iinr[0];
137 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
138 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
139 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
140 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
142 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
143 rcutoff_scalar = fr->ic->rcoulomb;
144 rcutoff = _mm_set1_ps(rcutoff_scalar);
145 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
147 rswitch_scalar = fr->ic->rcoulomb_switch;
148 rswitch = _mm_set1_ps(rswitch_scalar);
149 /* Setup switch parameters */
150 d_scalar = rcutoff_scalar-rswitch_scalar;
151 d = _mm_set1_ps(d_scalar);
152 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
153 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
154 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
155 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
156 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
157 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
159 /* Avoid stupid compiler warnings */
160 jnrA = jnrB = jnrC = jnrD = 0;
169 for(iidx=0;iidx<4*DIM;iidx++)
174 /* Start outer loop over neighborlists */
175 for(iidx=0; iidx<nri; iidx++)
177 /* Load shift vector for this list */
178 i_shift_offset = DIM*shiftidx[iidx];
180 /* Load limits for loop over neighbors */
181 j_index_start = jindex[iidx];
182 j_index_end = jindex[iidx+1];
184 /* Get outer coordinate index */
186 i_coord_offset = DIM*inr;
188 /* Load i particle coords and add shift vector */
189 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
190 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
192 fix0 = _mm_setzero_ps();
193 fiy0 = _mm_setzero_ps();
194 fiz0 = _mm_setzero_ps();
195 fix1 = _mm_setzero_ps();
196 fiy1 = _mm_setzero_ps();
197 fiz1 = _mm_setzero_ps();
198 fix2 = _mm_setzero_ps();
199 fiy2 = _mm_setzero_ps();
200 fiz2 = _mm_setzero_ps();
201 fix3 = _mm_setzero_ps();
202 fiy3 = _mm_setzero_ps();
203 fiz3 = _mm_setzero_ps();
205 /* Reset potential sums */
206 velecsum = _mm_setzero_ps();
207 vvdwsum = _mm_setzero_ps();
209 /* Start inner kernel loop */
210 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
213 /* Get j neighbor index, and coordinate index */
218 j_coord_offsetA = DIM*jnrA;
219 j_coord_offsetB = DIM*jnrB;
220 j_coord_offsetC = DIM*jnrC;
221 j_coord_offsetD = DIM*jnrD;
223 /* load j atom coordinates */
224 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
225 x+j_coord_offsetC,x+j_coord_offsetD,
228 /* Calculate displacement vector */
229 dx00 = _mm_sub_ps(ix0,jx0);
230 dy00 = _mm_sub_ps(iy0,jy0);
231 dz00 = _mm_sub_ps(iz0,jz0);
232 dx10 = _mm_sub_ps(ix1,jx0);
233 dy10 = _mm_sub_ps(iy1,jy0);
234 dz10 = _mm_sub_ps(iz1,jz0);
235 dx20 = _mm_sub_ps(ix2,jx0);
236 dy20 = _mm_sub_ps(iy2,jy0);
237 dz20 = _mm_sub_ps(iz2,jz0);
238 dx30 = _mm_sub_ps(ix3,jx0);
239 dy30 = _mm_sub_ps(iy3,jy0);
240 dz30 = _mm_sub_ps(iz3,jz0);
242 /* Calculate squared distance and things based on it */
243 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
244 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
245 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
246 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
248 rinv00 = sse2_invsqrt_f(rsq00);
249 rinv10 = sse2_invsqrt_f(rsq10);
250 rinv20 = sse2_invsqrt_f(rsq20);
251 rinv30 = sse2_invsqrt_f(rsq30);
253 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
254 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
255 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
256 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
258 /* Load parameters for j particles */
259 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
260 charge+jnrC+0,charge+jnrD+0);
261 vdwjidx0A = 2*vdwtype[jnrA+0];
262 vdwjidx0B = 2*vdwtype[jnrB+0];
263 vdwjidx0C = 2*vdwtype[jnrC+0];
264 vdwjidx0D = 2*vdwtype[jnrD+0];
266 fjx0 = _mm_setzero_ps();
267 fjy0 = _mm_setzero_ps();
268 fjz0 = _mm_setzero_ps();
270 /**************************
271 * CALCULATE INTERACTIONS *
272 **************************/
274 if (gmx_mm_any_lt(rsq00,rcutoff2))
277 r00 = _mm_mul_ps(rsq00,rinv00);
279 /* Compute parameters for interactions between i and j atoms */
280 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
281 vdwparam+vdwioffset0+vdwjidx0B,
282 vdwparam+vdwioffset0+vdwjidx0C,
283 vdwparam+vdwioffset0+vdwjidx0D,
286 /* LENNARD-JONES DISPERSION/REPULSION */
288 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
289 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
290 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
291 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
292 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
294 d = _mm_sub_ps(r00,rswitch);
295 d = _mm_max_ps(d,_mm_setzero_ps());
296 d2 = _mm_mul_ps(d,d);
297 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
299 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
301 /* Evaluate switch function */
302 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
303 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
304 vvdw = _mm_mul_ps(vvdw,sw);
305 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
307 /* Update potential sum for this i atom from the interaction with this j atom. */
308 vvdw = _mm_and_ps(vvdw,cutoff_mask);
309 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
313 fscal = _mm_and_ps(fscal,cutoff_mask);
315 /* Calculate temporary vectorial force */
316 tx = _mm_mul_ps(fscal,dx00);
317 ty = _mm_mul_ps(fscal,dy00);
318 tz = _mm_mul_ps(fscal,dz00);
320 /* Update vectorial force */
321 fix0 = _mm_add_ps(fix0,tx);
322 fiy0 = _mm_add_ps(fiy0,ty);
323 fiz0 = _mm_add_ps(fiz0,tz);
325 fjx0 = _mm_add_ps(fjx0,tx);
326 fjy0 = _mm_add_ps(fjy0,ty);
327 fjz0 = _mm_add_ps(fjz0,tz);
331 /**************************
332 * CALCULATE INTERACTIONS *
333 **************************/
335 if (gmx_mm_any_lt(rsq10,rcutoff2))
338 r10 = _mm_mul_ps(rsq10,rinv10);
340 /* Compute parameters for interactions between i and j atoms */
341 qq10 = _mm_mul_ps(iq1,jq0);
343 /* EWALD ELECTROSTATICS */
345 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
346 ewrt = _mm_mul_ps(r10,ewtabscale);
347 ewitab = _mm_cvttps_epi32(ewrt);
348 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
349 ewitab = _mm_slli_epi32(ewitab,2);
350 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
351 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
352 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
353 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
354 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
355 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
356 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
357 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
358 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
360 d = _mm_sub_ps(r10,rswitch);
361 d = _mm_max_ps(d,_mm_setzero_ps());
362 d2 = _mm_mul_ps(d,d);
363 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
365 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
367 /* Evaluate switch function */
368 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
369 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
370 velec = _mm_mul_ps(velec,sw);
371 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
373 /* Update potential sum for this i atom from the interaction with this j atom. */
374 velec = _mm_and_ps(velec,cutoff_mask);
375 velecsum = _mm_add_ps(velecsum,velec);
379 fscal = _mm_and_ps(fscal,cutoff_mask);
381 /* Calculate temporary vectorial force */
382 tx = _mm_mul_ps(fscal,dx10);
383 ty = _mm_mul_ps(fscal,dy10);
384 tz = _mm_mul_ps(fscal,dz10);
386 /* Update vectorial force */
387 fix1 = _mm_add_ps(fix1,tx);
388 fiy1 = _mm_add_ps(fiy1,ty);
389 fiz1 = _mm_add_ps(fiz1,tz);
391 fjx0 = _mm_add_ps(fjx0,tx);
392 fjy0 = _mm_add_ps(fjy0,ty);
393 fjz0 = _mm_add_ps(fjz0,tz);
397 /**************************
398 * CALCULATE INTERACTIONS *
399 **************************/
401 if (gmx_mm_any_lt(rsq20,rcutoff2))
404 r20 = _mm_mul_ps(rsq20,rinv20);
406 /* Compute parameters for interactions between i and j atoms */
407 qq20 = _mm_mul_ps(iq2,jq0);
409 /* EWALD ELECTROSTATICS */
411 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
412 ewrt = _mm_mul_ps(r20,ewtabscale);
413 ewitab = _mm_cvttps_epi32(ewrt);
414 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
415 ewitab = _mm_slli_epi32(ewitab,2);
416 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
417 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
418 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
419 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
420 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
421 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
422 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
423 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
424 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
426 d = _mm_sub_ps(r20,rswitch);
427 d = _mm_max_ps(d,_mm_setzero_ps());
428 d2 = _mm_mul_ps(d,d);
429 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
431 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
433 /* Evaluate switch function */
434 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
435 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
436 velec = _mm_mul_ps(velec,sw);
437 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
439 /* Update potential sum for this i atom from the interaction with this j atom. */
440 velec = _mm_and_ps(velec,cutoff_mask);
441 velecsum = _mm_add_ps(velecsum,velec);
445 fscal = _mm_and_ps(fscal,cutoff_mask);
447 /* Calculate temporary vectorial force */
448 tx = _mm_mul_ps(fscal,dx20);
449 ty = _mm_mul_ps(fscal,dy20);
450 tz = _mm_mul_ps(fscal,dz20);
452 /* Update vectorial force */
453 fix2 = _mm_add_ps(fix2,tx);
454 fiy2 = _mm_add_ps(fiy2,ty);
455 fiz2 = _mm_add_ps(fiz2,tz);
457 fjx0 = _mm_add_ps(fjx0,tx);
458 fjy0 = _mm_add_ps(fjy0,ty);
459 fjz0 = _mm_add_ps(fjz0,tz);
463 /**************************
464 * CALCULATE INTERACTIONS *
465 **************************/
467 if (gmx_mm_any_lt(rsq30,rcutoff2))
470 r30 = _mm_mul_ps(rsq30,rinv30);
472 /* Compute parameters for interactions between i and j atoms */
473 qq30 = _mm_mul_ps(iq3,jq0);
475 /* EWALD ELECTROSTATICS */
477 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
478 ewrt = _mm_mul_ps(r30,ewtabscale);
479 ewitab = _mm_cvttps_epi32(ewrt);
480 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
481 ewitab = _mm_slli_epi32(ewitab,2);
482 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
483 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
484 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
485 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
486 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
487 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
488 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
489 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
490 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
492 d = _mm_sub_ps(r30,rswitch);
493 d = _mm_max_ps(d,_mm_setzero_ps());
494 d2 = _mm_mul_ps(d,d);
495 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
497 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
499 /* Evaluate switch function */
500 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
501 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
502 velec = _mm_mul_ps(velec,sw);
503 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
505 /* Update potential sum for this i atom from the interaction with this j atom. */
506 velec = _mm_and_ps(velec,cutoff_mask);
507 velecsum = _mm_add_ps(velecsum,velec);
511 fscal = _mm_and_ps(fscal,cutoff_mask);
513 /* Calculate temporary vectorial force */
514 tx = _mm_mul_ps(fscal,dx30);
515 ty = _mm_mul_ps(fscal,dy30);
516 tz = _mm_mul_ps(fscal,dz30);
518 /* Update vectorial force */
519 fix3 = _mm_add_ps(fix3,tx);
520 fiy3 = _mm_add_ps(fiy3,ty);
521 fiz3 = _mm_add_ps(fiz3,tz);
523 fjx0 = _mm_add_ps(fjx0,tx);
524 fjy0 = _mm_add_ps(fjy0,ty);
525 fjz0 = _mm_add_ps(fjz0,tz);
529 fjptrA = f+j_coord_offsetA;
530 fjptrB = f+j_coord_offsetB;
531 fjptrC = f+j_coord_offsetC;
532 fjptrD = f+j_coord_offsetD;
534 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
536 /* Inner loop uses 254 flops */
542 /* Get j neighbor index, and coordinate index */
543 jnrlistA = jjnr[jidx];
544 jnrlistB = jjnr[jidx+1];
545 jnrlistC = jjnr[jidx+2];
546 jnrlistD = jjnr[jidx+3];
547 /* Sign of each element will be negative for non-real atoms.
548 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
549 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
551 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
552 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
553 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
554 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
555 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
556 j_coord_offsetA = DIM*jnrA;
557 j_coord_offsetB = DIM*jnrB;
558 j_coord_offsetC = DIM*jnrC;
559 j_coord_offsetD = DIM*jnrD;
561 /* load j atom coordinates */
562 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
563 x+j_coord_offsetC,x+j_coord_offsetD,
566 /* Calculate displacement vector */
567 dx00 = _mm_sub_ps(ix0,jx0);
568 dy00 = _mm_sub_ps(iy0,jy0);
569 dz00 = _mm_sub_ps(iz0,jz0);
570 dx10 = _mm_sub_ps(ix1,jx0);
571 dy10 = _mm_sub_ps(iy1,jy0);
572 dz10 = _mm_sub_ps(iz1,jz0);
573 dx20 = _mm_sub_ps(ix2,jx0);
574 dy20 = _mm_sub_ps(iy2,jy0);
575 dz20 = _mm_sub_ps(iz2,jz0);
576 dx30 = _mm_sub_ps(ix3,jx0);
577 dy30 = _mm_sub_ps(iy3,jy0);
578 dz30 = _mm_sub_ps(iz3,jz0);
580 /* Calculate squared distance and things based on it */
581 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
582 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
583 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
584 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
586 rinv00 = sse2_invsqrt_f(rsq00);
587 rinv10 = sse2_invsqrt_f(rsq10);
588 rinv20 = sse2_invsqrt_f(rsq20);
589 rinv30 = sse2_invsqrt_f(rsq30);
591 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
592 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
593 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
594 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
596 /* Load parameters for j particles */
597 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
598 charge+jnrC+0,charge+jnrD+0);
599 vdwjidx0A = 2*vdwtype[jnrA+0];
600 vdwjidx0B = 2*vdwtype[jnrB+0];
601 vdwjidx0C = 2*vdwtype[jnrC+0];
602 vdwjidx0D = 2*vdwtype[jnrD+0];
604 fjx0 = _mm_setzero_ps();
605 fjy0 = _mm_setzero_ps();
606 fjz0 = _mm_setzero_ps();
608 /**************************
609 * CALCULATE INTERACTIONS *
610 **************************/
612 if (gmx_mm_any_lt(rsq00,rcutoff2))
615 r00 = _mm_mul_ps(rsq00,rinv00);
616 r00 = _mm_andnot_ps(dummy_mask,r00);
618 /* Compute parameters for interactions between i and j atoms */
619 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
620 vdwparam+vdwioffset0+vdwjidx0B,
621 vdwparam+vdwioffset0+vdwjidx0C,
622 vdwparam+vdwioffset0+vdwjidx0D,
625 /* LENNARD-JONES DISPERSION/REPULSION */
627 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
628 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
629 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
630 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
631 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
633 d = _mm_sub_ps(r00,rswitch);
634 d = _mm_max_ps(d,_mm_setzero_ps());
635 d2 = _mm_mul_ps(d,d);
636 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
638 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
640 /* Evaluate switch function */
641 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
642 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
643 vvdw = _mm_mul_ps(vvdw,sw);
644 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
646 /* Update potential sum for this i atom from the interaction with this j atom. */
647 vvdw = _mm_and_ps(vvdw,cutoff_mask);
648 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
649 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
653 fscal = _mm_and_ps(fscal,cutoff_mask);
655 fscal = _mm_andnot_ps(dummy_mask,fscal);
657 /* Calculate temporary vectorial force */
658 tx = _mm_mul_ps(fscal,dx00);
659 ty = _mm_mul_ps(fscal,dy00);
660 tz = _mm_mul_ps(fscal,dz00);
662 /* Update vectorial force */
663 fix0 = _mm_add_ps(fix0,tx);
664 fiy0 = _mm_add_ps(fiy0,ty);
665 fiz0 = _mm_add_ps(fiz0,tz);
667 fjx0 = _mm_add_ps(fjx0,tx);
668 fjy0 = _mm_add_ps(fjy0,ty);
669 fjz0 = _mm_add_ps(fjz0,tz);
673 /**************************
674 * CALCULATE INTERACTIONS *
675 **************************/
677 if (gmx_mm_any_lt(rsq10,rcutoff2))
680 r10 = _mm_mul_ps(rsq10,rinv10);
681 r10 = _mm_andnot_ps(dummy_mask,r10);
683 /* Compute parameters for interactions between i and j atoms */
684 qq10 = _mm_mul_ps(iq1,jq0);
686 /* EWALD ELECTROSTATICS */
688 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
689 ewrt = _mm_mul_ps(r10,ewtabscale);
690 ewitab = _mm_cvttps_epi32(ewrt);
691 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
692 ewitab = _mm_slli_epi32(ewitab,2);
693 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
694 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
695 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
696 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
697 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
698 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
699 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
700 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
701 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
703 d = _mm_sub_ps(r10,rswitch);
704 d = _mm_max_ps(d,_mm_setzero_ps());
705 d2 = _mm_mul_ps(d,d);
706 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
708 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
710 /* Evaluate switch function */
711 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
712 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
713 velec = _mm_mul_ps(velec,sw);
714 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
716 /* Update potential sum for this i atom from the interaction with this j atom. */
717 velec = _mm_and_ps(velec,cutoff_mask);
718 velec = _mm_andnot_ps(dummy_mask,velec);
719 velecsum = _mm_add_ps(velecsum,velec);
723 fscal = _mm_and_ps(fscal,cutoff_mask);
725 fscal = _mm_andnot_ps(dummy_mask,fscal);
727 /* Calculate temporary vectorial force */
728 tx = _mm_mul_ps(fscal,dx10);
729 ty = _mm_mul_ps(fscal,dy10);
730 tz = _mm_mul_ps(fscal,dz10);
732 /* Update vectorial force */
733 fix1 = _mm_add_ps(fix1,tx);
734 fiy1 = _mm_add_ps(fiy1,ty);
735 fiz1 = _mm_add_ps(fiz1,tz);
737 fjx0 = _mm_add_ps(fjx0,tx);
738 fjy0 = _mm_add_ps(fjy0,ty);
739 fjz0 = _mm_add_ps(fjz0,tz);
743 /**************************
744 * CALCULATE INTERACTIONS *
745 **************************/
747 if (gmx_mm_any_lt(rsq20,rcutoff2))
750 r20 = _mm_mul_ps(rsq20,rinv20);
751 r20 = _mm_andnot_ps(dummy_mask,r20);
753 /* Compute parameters for interactions between i and j atoms */
754 qq20 = _mm_mul_ps(iq2,jq0);
756 /* EWALD ELECTROSTATICS */
758 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
759 ewrt = _mm_mul_ps(r20,ewtabscale);
760 ewitab = _mm_cvttps_epi32(ewrt);
761 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
762 ewitab = _mm_slli_epi32(ewitab,2);
763 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
764 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
765 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
766 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
767 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
768 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
769 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
770 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
771 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
773 d = _mm_sub_ps(r20,rswitch);
774 d = _mm_max_ps(d,_mm_setzero_ps());
775 d2 = _mm_mul_ps(d,d);
776 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
778 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
780 /* Evaluate switch function */
781 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
782 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
783 velec = _mm_mul_ps(velec,sw);
784 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
786 /* Update potential sum for this i atom from the interaction with this j atom. */
787 velec = _mm_and_ps(velec,cutoff_mask);
788 velec = _mm_andnot_ps(dummy_mask,velec);
789 velecsum = _mm_add_ps(velecsum,velec);
793 fscal = _mm_and_ps(fscal,cutoff_mask);
795 fscal = _mm_andnot_ps(dummy_mask,fscal);
797 /* Calculate temporary vectorial force */
798 tx = _mm_mul_ps(fscal,dx20);
799 ty = _mm_mul_ps(fscal,dy20);
800 tz = _mm_mul_ps(fscal,dz20);
802 /* Update vectorial force */
803 fix2 = _mm_add_ps(fix2,tx);
804 fiy2 = _mm_add_ps(fiy2,ty);
805 fiz2 = _mm_add_ps(fiz2,tz);
807 fjx0 = _mm_add_ps(fjx0,tx);
808 fjy0 = _mm_add_ps(fjy0,ty);
809 fjz0 = _mm_add_ps(fjz0,tz);
813 /**************************
814 * CALCULATE INTERACTIONS *
815 **************************/
817 if (gmx_mm_any_lt(rsq30,rcutoff2))
820 r30 = _mm_mul_ps(rsq30,rinv30);
821 r30 = _mm_andnot_ps(dummy_mask,r30);
823 /* Compute parameters for interactions between i and j atoms */
824 qq30 = _mm_mul_ps(iq3,jq0);
826 /* EWALD ELECTROSTATICS */
828 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
829 ewrt = _mm_mul_ps(r30,ewtabscale);
830 ewitab = _mm_cvttps_epi32(ewrt);
831 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
832 ewitab = _mm_slli_epi32(ewitab,2);
833 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
834 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
835 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
836 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
837 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
838 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
839 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
840 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
841 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
843 d = _mm_sub_ps(r30,rswitch);
844 d = _mm_max_ps(d,_mm_setzero_ps());
845 d2 = _mm_mul_ps(d,d);
846 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
848 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
850 /* Evaluate switch function */
851 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
852 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
853 velec = _mm_mul_ps(velec,sw);
854 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
856 /* Update potential sum for this i atom from the interaction with this j atom. */
857 velec = _mm_and_ps(velec,cutoff_mask);
858 velec = _mm_andnot_ps(dummy_mask,velec);
859 velecsum = _mm_add_ps(velecsum,velec);
863 fscal = _mm_and_ps(fscal,cutoff_mask);
865 fscal = _mm_andnot_ps(dummy_mask,fscal);
867 /* Calculate temporary vectorial force */
868 tx = _mm_mul_ps(fscal,dx30);
869 ty = _mm_mul_ps(fscal,dy30);
870 tz = _mm_mul_ps(fscal,dz30);
872 /* Update vectorial force */
873 fix3 = _mm_add_ps(fix3,tx);
874 fiy3 = _mm_add_ps(fiy3,ty);
875 fiz3 = _mm_add_ps(fiz3,tz);
877 fjx0 = _mm_add_ps(fjx0,tx);
878 fjy0 = _mm_add_ps(fjy0,ty);
879 fjz0 = _mm_add_ps(fjz0,tz);
883 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
884 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
885 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
886 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
888 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
890 /* Inner loop uses 258 flops */
893 /* End of innermost loop */
895 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
896 f+i_coord_offset,fshift+i_shift_offset);
899 /* Update potential energies */
900 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
901 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
903 /* Increment number of inner iterations */
904 inneriter += j_index_end - j_index_start;
906 /* Outer loop uses 26 flops */
909 /* Increment number of outer iterations */
912 /* Update outer/inner flops */
914 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*258);
917 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
918 * Electrostatics interaction: Ewald
919 * VdW interaction: LennardJones
920 * Geometry: Water4-Particle
921 * Calculate force/pot: Force
924 nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_F_sse2_single
925 (t_nblist * gmx_restrict nlist,
926 rvec * gmx_restrict xx,
927 rvec * gmx_restrict ff,
928 struct t_forcerec * gmx_restrict fr,
929 t_mdatoms * gmx_restrict mdatoms,
930 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
931 t_nrnb * gmx_restrict nrnb)
933 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
934 * just 0 for non-waters.
935 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
936 * jnr indices corresponding to data put in the four positions in the SIMD register.
938 int i_shift_offset,i_coord_offset,outeriter,inneriter;
939 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
940 int jnrA,jnrB,jnrC,jnrD;
941 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
942 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
943 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
945 real *shiftvec,*fshift,*x,*f;
946 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
948 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
950 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
952 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
954 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
956 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
957 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
958 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
959 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
960 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
961 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
962 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
963 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
966 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
969 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
970 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
972 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
974 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
975 real rswitch_scalar,d_scalar;
976 __m128 dummy_mask,cutoff_mask;
977 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
978 __m128 one = _mm_set1_ps(1.0);
979 __m128 two = _mm_set1_ps(2.0);
985 jindex = nlist->jindex;
987 shiftidx = nlist->shift;
989 shiftvec = fr->shift_vec[0];
990 fshift = fr->fshift[0];
991 facel = _mm_set1_ps(fr->ic->epsfac);
992 charge = mdatoms->chargeA;
993 nvdwtype = fr->ntype;
995 vdwtype = mdatoms->typeA;
997 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
998 ewtab = fr->ic->tabq_coul_FDV0;
999 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1000 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1002 /* Setup water-specific parameters */
1003 inr = nlist->iinr[0];
1004 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1005 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1006 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1007 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1009 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1010 rcutoff_scalar = fr->ic->rcoulomb;
1011 rcutoff = _mm_set1_ps(rcutoff_scalar);
1012 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1014 rswitch_scalar = fr->ic->rcoulomb_switch;
1015 rswitch = _mm_set1_ps(rswitch_scalar);
1016 /* Setup switch parameters */
1017 d_scalar = rcutoff_scalar-rswitch_scalar;
1018 d = _mm_set1_ps(d_scalar);
1019 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1020 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1021 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1022 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1023 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1024 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1026 /* Avoid stupid compiler warnings */
1027 jnrA = jnrB = jnrC = jnrD = 0;
1028 j_coord_offsetA = 0;
1029 j_coord_offsetB = 0;
1030 j_coord_offsetC = 0;
1031 j_coord_offsetD = 0;
1036 for(iidx=0;iidx<4*DIM;iidx++)
1038 scratch[iidx] = 0.0;
1041 /* Start outer loop over neighborlists */
1042 for(iidx=0; iidx<nri; iidx++)
1044 /* Load shift vector for this list */
1045 i_shift_offset = DIM*shiftidx[iidx];
1047 /* Load limits for loop over neighbors */
1048 j_index_start = jindex[iidx];
1049 j_index_end = jindex[iidx+1];
1051 /* Get outer coordinate index */
1053 i_coord_offset = DIM*inr;
1055 /* Load i particle coords and add shift vector */
1056 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1057 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1059 fix0 = _mm_setzero_ps();
1060 fiy0 = _mm_setzero_ps();
1061 fiz0 = _mm_setzero_ps();
1062 fix1 = _mm_setzero_ps();
1063 fiy1 = _mm_setzero_ps();
1064 fiz1 = _mm_setzero_ps();
1065 fix2 = _mm_setzero_ps();
1066 fiy2 = _mm_setzero_ps();
1067 fiz2 = _mm_setzero_ps();
1068 fix3 = _mm_setzero_ps();
1069 fiy3 = _mm_setzero_ps();
1070 fiz3 = _mm_setzero_ps();
1072 /* Start inner kernel loop */
1073 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1076 /* Get j neighbor index, and coordinate index */
1078 jnrB = jjnr[jidx+1];
1079 jnrC = jjnr[jidx+2];
1080 jnrD = jjnr[jidx+3];
1081 j_coord_offsetA = DIM*jnrA;
1082 j_coord_offsetB = DIM*jnrB;
1083 j_coord_offsetC = DIM*jnrC;
1084 j_coord_offsetD = DIM*jnrD;
1086 /* load j atom coordinates */
1087 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1088 x+j_coord_offsetC,x+j_coord_offsetD,
1091 /* Calculate displacement vector */
1092 dx00 = _mm_sub_ps(ix0,jx0);
1093 dy00 = _mm_sub_ps(iy0,jy0);
1094 dz00 = _mm_sub_ps(iz0,jz0);
1095 dx10 = _mm_sub_ps(ix1,jx0);
1096 dy10 = _mm_sub_ps(iy1,jy0);
1097 dz10 = _mm_sub_ps(iz1,jz0);
1098 dx20 = _mm_sub_ps(ix2,jx0);
1099 dy20 = _mm_sub_ps(iy2,jy0);
1100 dz20 = _mm_sub_ps(iz2,jz0);
1101 dx30 = _mm_sub_ps(ix3,jx0);
1102 dy30 = _mm_sub_ps(iy3,jy0);
1103 dz30 = _mm_sub_ps(iz3,jz0);
1105 /* Calculate squared distance and things based on it */
1106 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1107 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1108 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1109 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1111 rinv00 = sse2_invsqrt_f(rsq00);
1112 rinv10 = sse2_invsqrt_f(rsq10);
1113 rinv20 = sse2_invsqrt_f(rsq20);
1114 rinv30 = sse2_invsqrt_f(rsq30);
1116 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1117 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1118 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1119 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1121 /* Load parameters for j particles */
1122 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1123 charge+jnrC+0,charge+jnrD+0);
1124 vdwjidx0A = 2*vdwtype[jnrA+0];
1125 vdwjidx0B = 2*vdwtype[jnrB+0];
1126 vdwjidx0C = 2*vdwtype[jnrC+0];
1127 vdwjidx0D = 2*vdwtype[jnrD+0];
1129 fjx0 = _mm_setzero_ps();
1130 fjy0 = _mm_setzero_ps();
1131 fjz0 = _mm_setzero_ps();
1133 /**************************
1134 * CALCULATE INTERACTIONS *
1135 **************************/
1137 if (gmx_mm_any_lt(rsq00,rcutoff2))
1140 r00 = _mm_mul_ps(rsq00,rinv00);
1142 /* Compute parameters for interactions between i and j atoms */
1143 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1144 vdwparam+vdwioffset0+vdwjidx0B,
1145 vdwparam+vdwioffset0+vdwjidx0C,
1146 vdwparam+vdwioffset0+vdwjidx0D,
1149 /* LENNARD-JONES DISPERSION/REPULSION */
1151 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1152 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1153 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1154 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1155 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1157 d = _mm_sub_ps(r00,rswitch);
1158 d = _mm_max_ps(d,_mm_setzero_ps());
1159 d2 = _mm_mul_ps(d,d);
1160 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1162 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1164 /* Evaluate switch function */
1165 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1166 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1167 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1171 fscal = _mm_and_ps(fscal,cutoff_mask);
1173 /* Calculate temporary vectorial force */
1174 tx = _mm_mul_ps(fscal,dx00);
1175 ty = _mm_mul_ps(fscal,dy00);
1176 tz = _mm_mul_ps(fscal,dz00);
1178 /* Update vectorial force */
1179 fix0 = _mm_add_ps(fix0,tx);
1180 fiy0 = _mm_add_ps(fiy0,ty);
1181 fiz0 = _mm_add_ps(fiz0,tz);
1183 fjx0 = _mm_add_ps(fjx0,tx);
1184 fjy0 = _mm_add_ps(fjy0,ty);
1185 fjz0 = _mm_add_ps(fjz0,tz);
1189 /**************************
1190 * CALCULATE INTERACTIONS *
1191 **************************/
1193 if (gmx_mm_any_lt(rsq10,rcutoff2))
1196 r10 = _mm_mul_ps(rsq10,rinv10);
1198 /* Compute parameters for interactions between i and j atoms */
1199 qq10 = _mm_mul_ps(iq1,jq0);
1201 /* EWALD ELECTROSTATICS */
1203 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1204 ewrt = _mm_mul_ps(r10,ewtabscale);
1205 ewitab = _mm_cvttps_epi32(ewrt);
1206 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1207 ewitab = _mm_slli_epi32(ewitab,2);
1208 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1209 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1210 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1211 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1212 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1213 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1214 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1215 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1216 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1218 d = _mm_sub_ps(r10,rswitch);
1219 d = _mm_max_ps(d,_mm_setzero_ps());
1220 d2 = _mm_mul_ps(d,d);
1221 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1223 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1225 /* Evaluate switch function */
1226 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1227 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1228 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1232 fscal = _mm_and_ps(fscal,cutoff_mask);
1234 /* Calculate temporary vectorial force */
1235 tx = _mm_mul_ps(fscal,dx10);
1236 ty = _mm_mul_ps(fscal,dy10);
1237 tz = _mm_mul_ps(fscal,dz10);
1239 /* Update vectorial force */
1240 fix1 = _mm_add_ps(fix1,tx);
1241 fiy1 = _mm_add_ps(fiy1,ty);
1242 fiz1 = _mm_add_ps(fiz1,tz);
1244 fjx0 = _mm_add_ps(fjx0,tx);
1245 fjy0 = _mm_add_ps(fjy0,ty);
1246 fjz0 = _mm_add_ps(fjz0,tz);
1250 /**************************
1251 * CALCULATE INTERACTIONS *
1252 **************************/
1254 if (gmx_mm_any_lt(rsq20,rcutoff2))
1257 r20 = _mm_mul_ps(rsq20,rinv20);
1259 /* Compute parameters for interactions between i and j atoms */
1260 qq20 = _mm_mul_ps(iq2,jq0);
1262 /* EWALD ELECTROSTATICS */
1264 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1265 ewrt = _mm_mul_ps(r20,ewtabscale);
1266 ewitab = _mm_cvttps_epi32(ewrt);
1267 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1268 ewitab = _mm_slli_epi32(ewitab,2);
1269 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1270 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1271 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1272 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1273 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1274 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1275 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1276 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1277 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1279 d = _mm_sub_ps(r20,rswitch);
1280 d = _mm_max_ps(d,_mm_setzero_ps());
1281 d2 = _mm_mul_ps(d,d);
1282 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1284 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1286 /* Evaluate switch function */
1287 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1288 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1289 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1293 fscal = _mm_and_ps(fscal,cutoff_mask);
1295 /* Calculate temporary vectorial force */
1296 tx = _mm_mul_ps(fscal,dx20);
1297 ty = _mm_mul_ps(fscal,dy20);
1298 tz = _mm_mul_ps(fscal,dz20);
1300 /* Update vectorial force */
1301 fix2 = _mm_add_ps(fix2,tx);
1302 fiy2 = _mm_add_ps(fiy2,ty);
1303 fiz2 = _mm_add_ps(fiz2,tz);
1305 fjx0 = _mm_add_ps(fjx0,tx);
1306 fjy0 = _mm_add_ps(fjy0,ty);
1307 fjz0 = _mm_add_ps(fjz0,tz);
1311 /**************************
1312 * CALCULATE INTERACTIONS *
1313 **************************/
1315 if (gmx_mm_any_lt(rsq30,rcutoff2))
1318 r30 = _mm_mul_ps(rsq30,rinv30);
1320 /* Compute parameters for interactions between i and j atoms */
1321 qq30 = _mm_mul_ps(iq3,jq0);
1323 /* EWALD ELECTROSTATICS */
1325 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1326 ewrt = _mm_mul_ps(r30,ewtabscale);
1327 ewitab = _mm_cvttps_epi32(ewrt);
1328 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1329 ewitab = _mm_slli_epi32(ewitab,2);
1330 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1331 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1332 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1333 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1334 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1335 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1336 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1337 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
1338 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1340 d = _mm_sub_ps(r30,rswitch);
1341 d = _mm_max_ps(d,_mm_setzero_ps());
1342 d2 = _mm_mul_ps(d,d);
1343 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1345 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1347 /* Evaluate switch function */
1348 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1349 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
1350 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1354 fscal = _mm_and_ps(fscal,cutoff_mask);
1356 /* Calculate temporary vectorial force */
1357 tx = _mm_mul_ps(fscal,dx30);
1358 ty = _mm_mul_ps(fscal,dy30);
1359 tz = _mm_mul_ps(fscal,dz30);
1361 /* Update vectorial force */
1362 fix3 = _mm_add_ps(fix3,tx);
1363 fiy3 = _mm_add_ps(fiy3,ty);
1364 fiz3 = _mm_add_ps(fiz3,tz);
1366 fjx0 = _mm_add_ps(fjx0,tx);
1367 fjy0 = _mm_add_ps(fjy0,ty);
1368 fjz0 = _mm_add_ps(fjz0,tz);
1372 fjptrA = f+j_coord_offsetA;
1373 fjptrB = f+j_coord_offsetB;
1374 fjptrC = f+j_coord_offsetC;
1375 fjptrD = f+j_coord_offsetD;
1377 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1379 /* Inner loop uses 242 flops */
1382 if(jidx<j_index_end)
1385 /* Get j neighbor index, and coordinate index */
1386 jnrlistA = jjnr[jidx];
1387 jnrlistB = jjnr[jidx+1];
1388 jnrlistC = jjnr[jidx+2];
1389 jnrlistD = jjnr[jidx+3];
1390 /* Sign of each element will be negative for non-real atoms.
1391 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1392 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1394 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1395 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1396 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1397 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1398 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1399 j_coord_offsetA = DIM*jnrA;
1400 j_coord_offsetB = DIM*jnrB;
1401 j_coord_offsetC = DIM*jnrC;
1402 j_coord_offsetD = DIM*jnrD;
1404 /* load j atom coordinates */
1405 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1406 x+j_coord_offsetC,x+j_coord_offsetD,
1409 /* Calculate displacement vector */
1410 dx00 = _mm_sub_ps(ix0,jx0);
1411 dy00 = _mm_sub_ps(iy0,jy0);
1412 dz00 = _mm_sub_ps(iz0,jz0);
1413 dx10 = _mm_sub_ps(ix1,jx0);
1414 dy10 = _mm_sub_ps(iy1,jy0);
1415 dz10 = _mm_sub_ps(iz1,jz0);
1416 dx20 = _mm_sub_ps(ix2,jx0);
1417 dy20 = _mm_sub_ps(iy2,jy0);
1418 dz20 = _mm_sub_ps(iz2,jz0);
1419 dx30 = _mm_sub_ps(ix3,jx0);
1420 dy30 = _mm_sub_ps(iy3,jy0);
1421 dz30 = _mm_sub_ps(iz3,jz0);
1423 /* Calculate squared distance and things based on it */
1424 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1425 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1426 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1427 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1429 rinv00 = sse2_invsqrt_f(rsq00);
1430 rinv10 = sse2_invsqrt_f(rsq10);
1431 rinv20 = sse2_invsqrt_f(rsq20);
1432 rinv30 = sse2_invsqrt_f(rsq30);
1434 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1435 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1436 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1437 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1439 /* Load parameters for j particles */
1440 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1441 charge+jnrC+0,charge+jnrD+0);
1442 vdwjidx0A = 2*vdwtype[jnrA+0];
1443 vdwjidx0B = 2*vdwtype[jnrB+0];
1444 vdwjidx0C = 2*vdwtype[jnrC+0];
1445 vdwjidx0D = 2*vdwtype[jnrD+0];
1447 fjx0 = _mm_setzero_ps();
1448 fjy0 = _mm_setzero_ps();
1449 fjz0 = _mm_setzero_ps();
1451 /**************************
1452 * CALCULATE INTERACTIONS *
1453 **************************/
1455 if (gmx_mm_any_lt(rsq00,rcutoff2))
1458 r00 = _mm_mul_ps(rsq00,rinv00);
1459 r00 = _mm_andnot_ps(dummy_mask,r00);
1461 /* Compute parameters for interactions between i and j atoms */
1462 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1463 vdwparam+vdwioffset0+vdwjidx0B,
1464 vdwparam+vdwioffset0+vdwjidx0C,
1465 vdwparam+vdwioffset0+vdwjidx0D,
1468 /* LENNARD-JONES DISPERSION/REPULSION */
1470 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1471 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
1472 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1473 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1474 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1476 d = _mm_sub_ps(r00,rswitch);
1477 d = _mm_max_ps(d,_mm_setzero_ps());
1478 d2 = _mm_mul_ps(d,d);
1479 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1481 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1483 /* Evaluate switch function */
1484 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1485 fvdw = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1486 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1490 fscal = _mm_and_ps(fscal,cutoff_mask);
1492 fscal = _mm_andnot_ps(dummy_mask,fscal);
1494 /* Calculate temporary vectorial force */
1495 tx = _mm_mul_ps(fscal,dx00);
1496 ty = _mm_mul_ps(fscal,dy00);
1497 tz = _mm_mul_ps(fscal,dz00);
1499 /* Update vectorial force */
1500 fix0 = _mm_add_ps(fix0,tx);
1501 fiy0 = _mm_add_ps(fiy0,ty);
1502 fiz0 = _mm_add_ps(fiz0,tz);
1504 fjx0 = _mm_add_ps(fjx0,tx);
1505 fjy0 = _mm_add_ps(fjy0,ty);
1506 fjz0 = _mm_add_ps(fjz0,tz);
1510 /**************************
1511 * CALCULATE INTERACTIONS *
1512 **************************/
1514 if (gmx_mm_any_lt(rsq10,rcutoff2))
1517 r10 = _mm_mul_ps(rsq10,rinv10);
1518 r10 = _mm_andnot_ps(dummy_mask,r10);
1520 /* Compute parameters for interactions between i and j atoms */
1521 qq10 = _mm_mul_ps(iq1,jq0);
1523 /* EWALD ELECTROSTATICS */
1525 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1526 ewrt = _mm_mul_ps(r10,ewtabscale);
1527 ewitab = _mm_cvttps_epi32(ewrt);
1528 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1529 ewitab = _mm_slli_epi32(ewitab,2);
1530 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1531 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1532 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1533 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1534 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1535 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1536 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1537 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1538 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1540 d = _mm_sub_ps(r10,rswitch);
1541 d = _mm_max_ps(d,_mm_setzero_ps());
1542 d2 = _mm_mul_ps(d,d);
1543 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1545 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1547 /* Evaluate switch function */
1548 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1549 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1550 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1554 fscal = _mm_and_ps(fscal,cutoff_mask);
1556 fscal = _mm_andnot_ps(dummy_mask,fscal);
1558 /* Calculate temporary vectorial force */
1559 tx = _mm_mul_ps(fscal,dx10);
1560 ty = _mm_mul_ps(fscal,dy10);
1561 tz = _mm_mul_ps(fscal,dz10);
1563 /* Update vectorial force */
1564 fix1 = _mm_add_ps(fix1,tx);
1565 fiy1 = _mm_add_ps(fiy1,ty);
1566 fiz1 = _mm_add_ps(fiz1,tz);
1568 fjx0 = _mm_add_ps(fjx0,tx);
1569 fjy0 = _mm_add_ps(fjy0,ty);
1570 fjz0 = _mm_add_ps(fjz0,tz);
1574 /**************************
1575 * CALCULATE INTERACTIONS *
1576 **************************/
1578 if (gmx_mm_any_lt(rsq20,rcutoff2))
1581 r20 = _mm_mul_ps(rsq20,rinv20);
1582 r20 = _mm_andnot_ps(dummy_mask,r20);
1584 /* Compute parameters for interactions between i and j atoms */
1585 qq20 = _mm_mul_ps(iq2,jq0);
1587 /* EWALD ELECTROSTATICS */
1589 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1590 ewrt = _mm_mul_ps(r20,ewtabscale);
1591 ewitab = _mm_cvttps_epi32(ewrt);
1592 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1593 ewitab = _mm_slli_epi32(ewitab,2);
1594 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1595 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1596 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1597 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1598 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1599 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1600 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1601 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1602 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1604 d = _mm_sub_ps(r20,rswitch);
1605 d = _mm_max_ps(d,_mm_setzero_ps());
1606 d2 = _mm_mul_ps(d,d);
1607 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1609 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1611 /* Evaluate switch function */
1612 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1613 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1614 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1618 fscal = _mm_and_ps(fscal,cutoff_mask);
1620 fscal = _mm_andnot_ps(dummy_mask,fscal);
1622 /* Calculate temporary vectorial force */
1623 tx = _mm_mul_ps(fscal,dx20);
1624 ty = _mm_mul_ps(fscal,dy20);
1625 tz = _mm_mul_ps(fscal,dz20);
1627 /* Update vectorial force */
1628 fix2 = _mm_add_ps(fix2,tx);
1629 fiy2 = _mm_add_ps(fiy2,ty);
1630 fiz2 = _mm_add_ps(fiz2,tz);
1632 fjx0 = _mm_add_ps(fjx0,tx);
1633 fjy0 = _mm_add_ps(fjy0,ty);
1634 fjz0 = _mm_add_ps(fjz0,tz);
1638 /**************************
1639 * CALCULATE INTERACTIONS *
1640 **************************/
1642 if (gmx_mm_any_lt(rsq30,rcutoff2))
1645 r30 = _mm_mul_ps(rsq30,rinv30);
1646 r30 = _mm_andnot_ps(dummy_mask,r30);
1648 /* Compute parameters for interactions between i and j atoms */
1649 qq30 = _mm_mul_ps(iq3,jq0);
1651 /* EWALD ELECTROSTATICS */
1653 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1654 ewrt = _mm_mul_ps(r30,ewtabscale);
1655 ewitab = _mm_cvttps_epi32(ewrt);
1656 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1657 ewitab = _mm_slli_epi32(ewitab,2);
1658 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1659 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1660 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1661 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1662 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1663 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1664 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1665 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
1666 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1668 d = _mm_sub_ps(r30,rswitch);
1669 d = _mm_max_ps(d,_mm_setzero_ps());
1670 d2 = _mm_mul_ps(d,d);
1671 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1673 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1675 /* Evaluate switch function */
1676 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1677 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
1678 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1682 fscal = _mm_and_ps(fscal,cutoff_mask);
1684 fscal = _mm_andnot_ps(dummy_mask,fscal);
1686 /* Calculate temporary vectorial force */
1687 tx = _mm_mul_ps(fscal,dx30);
1688 ty = _mm_mul_ps(fscal,dy30);
1689 tz = _mm_mul_ps(fscal,dz30);
1691 /* Update vectorial force */
1692 fix3 = _mm_add_ps(fix3,tx);
1693 fiy3 = _mm_add_ps(fiy3,ty);
1694 fiz3 = _mm_add_ps(fiz3,tz);
1696 fjx0 = _mm_add_ps(fjx0,tx);
1697 fjy0 = _mm_add_ps(fjy0,ty);
1698 fjz0 = _mm_add_ps(fjz0,tz);
1702 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1703 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1704 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1705 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1707 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1709 /* Inner loop uses 246 flops */
1712 /* End of innermost loop */
1714 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1715 f+i_coord_offset,fshift+i_shift_offset);
1717 /* Increment number of inner iterations */
1718 inneriter += j_index_end - j_index_start;
1720 /* Outer loop uses 24 flops */
1723 /* Increment number of outer iterations */
1726 /* Update outer/inner flops */
1728 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*246);