2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwNone_GeomW4P1_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
72 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
74 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
83 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
85 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
86 real rswitch_scalar,d_scalar;
87 __m128 dummy_mask,cutoff_mask;
88 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
89 __m128 one = _mm_set1_ps(1.0);
90 __m128 two = _mm_set1_ps(2.0);
96 jindex = nlist->jindex;
98 shiftidx = nlist->shift;
100 shiftvec = fr->shift_vec[0];
101 fshift = fr->fshift[0];
102 facel = _mm_set1_ps(fr->epsfac);
103 charge = mdatoms->chargeA;
105 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
106 ewtab = fr->ic->tabq_coul_FDV0;
107 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
108 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
110 /* Setup water-specific parameters */
111 inr = nlist->iinr[0];
112 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
113 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
114 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
116 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
117 rcutoff_scalar = fr->rcoulomb;
118 rcutoff = _mm_set1_ps(rcutoff_scalar);
119 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
121 rswitch_scalar = fr->rcoulomb_switch;
122 rswitch = _mm_set1_ps(rswitch_scalar);
123 /* Setup switch parameters */
124 d_scalar = rcutoff_scalar-rswitch_scalar;
125 d = _mm_set1_ps(d_scalar);
126 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
127 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
128 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
129 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
130 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
131 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
133 /* Avoid stupid compiler warnings */
134 jnrA = jnrB = jnrC = jnrD = 0;
143 for(iidx=0;iidx<4*DIM;iidx++)
148 /* Start outer loop over neighborlists */
149 for(iidx=0; iidx<nri; iidx++)
151 /* Load shift vector for this list */
152 i_shift_offset = DIM*shiftidx[iidx];
154 /* Load limits for loop over neighbors */
155 j_index_start = jindex[iidx];
156 j_index_end = jindex[iidx+1];
158 /* Get outer coordinate index */
160 i_coord_offset = DIM*inr;
162 /* Load i particle coords and add shift vector */
163 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
164 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
166 fix1 = _mm_setzero_ps();
167 fiy1 = _mm_setzero_ps();
168 fiz1 = _mm_setzero_ps();
169 fix2 = _mm_setzero_ps();
170 fiy2 = _mm_setzero_ps();
171 fiz2 = _mm_setzero_ps();
172 fix3 = _mm_setzero_ps();
173 fiy3 = _mm_setzero_ps();
174 fiz3 = _mm_setzero_ps();
176 /* Reset potential sums */
177 velecsum = _mm_setzero_ps();
179 /* Start inner kernel loop */
180 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
183 /* Get j neighbor index, and coordinate index */
188 j_coord_offsetA = DIM*jnrA;
189 j_coord_offsetB = DIM*jnrB;
190 j_coord_offsetC = DIM*jnrC;
191 j_coord_offsetD = DIM*jnrD;
193 /* load j atom coordinates */
194 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
195 x+j_coord_offsetC,x+j_coord_offsetD,
198 /* Calculate displacement vector */
199 dx10 = _mm_sub_ps(ix1,jx0);
200 dy10 = _mm_sub_ps(iy1,jy0);
201 dz10 = _mm_sub_ps(iz1,jz0);
202 dx20 = _mm_sub_ps(ix2,jx0);
203 dy20 = _mm_sub_ps(iy2,jy0);
204 dz20 = _mm_sub_ps(iz2,jz0);
205 dx30 = _mm_sub_ps(ix3,jx0);
206 dy30 = _mm_sub_ps(iy3,jy0);
207 dz30 = _mm_sub_ps(iz3,jz0);
209 /* Calculate squared distance and things based on it */
210 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
211 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
212 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
214 rinv10 = gmx_mm_invsqrt_ps(rsq10);
215 rinv20 = gmx_mm_invsqrt_ps(rsq20);
216 rinv30 = gmx_mm_invsqrt_ps(rsq30);
218 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
219 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
220 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
222 /* Load parameters for j particles */
223 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
224 charge+jnrC+0,charge+jnrD+0);
226 fjx0 = _mm_setzero_ps();
227 fjy0 = _mm_setzero_ps();
228 fjz0 = _mm_setzero_ps();
230 /**************************
231 * CALCULATE INTERACTIONS *
232 **************************/
234 if (gmx_mm_any_lt(rsq10,rcutoff2))
237 r10 = _mm_mul_ps(rsq10,rinv10);
239 /* Compute parameters for interactions between i and j atoms */
240 qq10 = _mm_mul_ps(iq1,jq0);
242 /* EWALD ELECTROSTATICS */
244 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
245 ewrt = _mm_mul_ps(r10,ewtabscale);
246 ewitab = _mm_cvttps_epi32(ewrt);
247 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
248 ewitab = _mm_slli_epi32(ewitab,2);
249 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
250 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
251 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
252 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
253 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
254 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
255 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
256 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
257 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
259 d = _mm_sub_ps(r10,rswitch);
260 d = _mm_max_ps(d,_mm_setzero_ps());
261 d2 = _mm_mul_ps(d,d);
262 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
264 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
266 /* Evaluate switch function */
267 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
268 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
269 velec = _mm_mul_ps(velec,sw);
270 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
272 /* Update potential sum for this i atom from the interaction with this j atom. */
273 velec = _mm_and_ps(velec,cutoff_mask);
274 velecsum = _mm_add_ps(velecsum,velec);
278 fscal = _mm_and_ps(fscal,cutoff_mask);
280 /* Calculate temporary vectorial force */
281 tx = _mm_mul_ps(fscal,dx10);
282 ty = _mm_mul_ps(fscal,dy10);
283 tz = _mm_mul_ps(fscal,dz10);
285 /* Update vectorial force */
286 fix1 = _mm_add_ps(fix1,tx);
287 fiy1 = _mm_add_ps(fiy1,ty);
288 fiz1 = _mm_add_ps(fiz1,tz);
290 fjx0 = _mm_add_ps(fjx0,tx);
291 fjy0 = _mm_add_ps(fjy0,ty);
292 fjz0 = _mm_add_ps(fjz0,tz);
296 /**************************
297 * CALCULATE INTERACTIONS *
298 **************************/
300 if (gmx_mm_any_lt(rsq20,rcutoff2))
303 r20 = _mm_mul_ps(rsq20,rinv20);
305 /* Compute parameters for interactions between i and j atoms */
306 qq20 = _mm_mul_ps(iq2,jq0);
308 /* EWALD ELECTROSTATICS */
310 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
311 ewrt = _mm_mul_ps(r20,ewtabscale);
312 ewitab = _mm_cvttps_epi32(ewrt);
313 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
314 ewitab = _mm_slli_epi32(ewitab,2);
315 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
316 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
317 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
318 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
319 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
320 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
321 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
322 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
323 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
325 d = _mm_sub_ps(r20,rswitch);
326 d = _mm_max_ps(d,_mm_setzero_ps());
327 d2 = _mm_mul_ps(d,d);
328 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
330 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
332 /* Evaluate switch function */
333 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
334 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
335 velec = _mm_mul_ps(velec,sw);
336 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
338 /* Update potential sum for this i atom from the interaction with this j atom. */
339 velec = _mm_and_ps(velec,cutoff_mask);
340 velecsum = _mm_add_ps(velecsum,velec);
344 fscal = _mm_and_ps(fscal,cutoff_mask);
346 /* Calculate temporary vectorial force */
347 tx = _mm_mul_ps(fscal,dx20);
348 ty = _mm_mul_ps(fscal,dy20);
349 tz = _mm_mul_ps(fscal,dz20);
351 /* Update vectorial force */
352 fix2 = _mm_add_ps(fix2,tx);
353 fiy2 = _mm_add_ps(fiy2,ty);
354 fiz2 = _mm_add_ps(fiz2,tz);
356 fjx0 = _mm_add_ps(fjx0,tx);
357 fjy0 = _mm_add_ps(fjy0,ty);
358 fjz0 = _mm_add_ps(fjz0,tz);
362 /**************************
363 * CALCULATE INTERACTIONS *
364 **************************/
366 if (gmx_mm_any_lt(rsq30,rcutoff2))
369 r30 = _mm_mul_ps(rsq30,rinv30);
371 /* Compute parameters for interactions between i and j atoms */
372 qq30 = _mm_mul_ps(iq3,jq0);
374 /* EWALD ELECTROSTATICS */
376 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
377 ewrt = _mm_mul_ps(r30,ewtabscale);
378 ewitab = _mm_cvttps_epi32(ewrt);
379 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
380 ewitab = _mm_slli_epi32(ewitab,2);
381 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
382 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
383 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
384 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
385 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
386 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
387 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
388 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
389 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
391 d = _mm_sub_ps(r30,rswitch);
392 d = _mm_max_ps(d,_mm_setzero_ps());
393 d2 = _mm_mul_ps(d,d);
394 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
396 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
398 /* Evaluate switch function */
399 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
400 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
401 velec = _mm_mul_ps(velec,sw);
402 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
404 /* Update potential sum for this i atom from the interaction with this j atom. */
405 velec = _mm_and_ps(velec,cutoff_mask);
406 velecsum = _mm_add_ps(velecsum,velec);
410 fscal = _mm_and_ps(fscal,cutoff_mask);
412 /* Calculate temporary vectorial force */
413 tx = _mm_mul_ps(fscal,dx30);
414 ty = _mm_mul_ps(fscal,dy30);
415 tz = _mm_mul_ps(fscal,dz30);
417 /* Update vectorial force */
418 fix3 = _mm_add_ps(fix3,tx);
419 fiy3 = _mm_add_ps(fiy3,ty);
420 fiz3 = _mm_add_ps(fiz3,tz);
422 fjx0 = _mm_add_ps(fjx0,tx);
423 fjy0 = _mm_add_ps(fjy0,ty);
424 fjz0 = _mm_add_ps(fjz0,tz);
428 fjptrA = f+j_coord_offsetA;
429 fjptrB = f+j_coord_offsetB;
430 fjptrC = f+j_coord_offsetC;
431 fjptrD = f+j_coord_offsetD;
433 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
435 /* Inner loop uses 195 flops */
441 /* Get j neighbor index, and coordinate index */
442 jnrlistA = jjnr[jidx];
443 jnrlistB = jjnr[jidx+1];
444 jnrlistC = jjnr[jidx+2];
445 jnrlistD = jjnr[jidx+3];
446 /* Sign of each element will be negative for non-real atoms.
447 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
448 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
450 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
451 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
452 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
453 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
454 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
455 j_coord_offsetA = DIM*jnrA;
456 j_coord_offsetB = DIM*jnrB;
457 j_coord_offsetC = DIM*jnrC;
458 j_coord_offsetD = DIM*jnrD;
460 /* load j atom coordinates */
461 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
462 x+j_coord_offsetC,x+j_coord_offsetD,
465 /* Calculate displacement vector */
466 dx10 = _mm_sub_ps(ix1,jx0);
467 dy10 = _mm_sub_ps(iy1,jy0);
468 dz10 = _mm_sub_ps(iz1,jz0);
469 dx20 = _mm_sub_ps(ix2,jx0);
470 dy20 = _mm_sub_ps(iy2,jy0);
471 dz20 = _mm_sub_ps(iz2,jz0);
472 dx30 = _mm_sub_ps(ix3,jx0);
473 dy30 = _mm_sub_ps(iy3,jy0);
474 dz30 = _mm_sub_ps(iz3,jz0);
476 /* Calculate squared distance and things based on it */
477 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
478 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
479 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
481 rinv10 = gmx_mm_invsqrt_ps(rsq10);
482 rinv20 = gmx_mm_invsqrt_ps(rsq20);
483 rinv30 = gmx_mm_invsqrt_ps(rsq30);
485 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
486 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
487 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
489 /* Load parameters for j particles */
490 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
491 charge+jnrC+0,charge+jnrD+0);
493 fjx0 = _mm_setzero_ps();
494 fjy0 = _mm_setzero_ps();
495 fjz0 = _mm_setzero_ps();
497 /**************************
498 * CALCULATE INTERACTIONS *
499 **************************/
501 if (gmx_mm_any_lt(rsq10,rcutoff2))
504 r10 = _mm_mul_ps(rsq10,rinv10);
505 r10 = _mm_andnot_ps(dummy_mask,r10);
507 /* Compute parameters for interactions between i and j atoms */
508 qq10 = _mm_mul_ps(iq1,jq0);
510 /* EWALD ELECTROSTATICS */
512 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
513 ewrt = _mm_mul_ps(r10,ewtabscale);
514 ewitab = _mm_cvttps_epi32(ewrt);
515 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
516 ewitab = _mm_slli_epi32(ewitab,2);
517 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
518 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
519 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
520 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
521 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
522 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
523 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
524 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
525 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
527 d = _mm_sub_ps(r10,rswitch);
528 d = _mm_max_ps(d,_mm_setzero_ps());
529 d2 = _mm_mul_ps(d,d);
530 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
532 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
534 /* Evaluate switch function */
535 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
536 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
537 velec = _mm_mul_ps(velec,sw);
538 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
540 /* Update potential sum for this i atom from the interaction with this j atom. */
541 velec = _mm_and_ps(velec,cutoff_mask);
542 velec = _mm_andnot_ps(dummy_mask,velec);
543 velecsum = _mm_add_ps(velecsum,velec);
547 fscal = _mm_and_ps(fscal,cutoff_mask);
549 fscal = _mm_andnot_ps(dummy_mask,fscal);
551 /* Calculate temporary vectorial force */
552 tx = _mm_mul_ps(fscal,dx10);
553 ty = _mm_mul_ps(fscal,dy10);
554 tz = _mm_mul_ps(fscal,dz10);
556 /* Update vectorial force */
557 fix1 = _mm_add_ps(fix1,tx);
558 fiy1 = _mm_add_ps(fiy1,ty);
559 fiz1 = _mm_add_ps(fiz1,tz);
561 fjx0 = _mm_add_ps(fjx0,tx);
562 fjy0 = _mm_add_ps(fjy0,ty);
563 fjz0 = _mm_add_ps(fjz0,tz);
567 /**************************
568 * CALCULATE INTERACTIONS *
569 **************************/
571 if (gmx_mm_any_lt(rsq20,rcutoff2))
574 r20 = _mm_mul_ps(rsq20,rinv20);
575 r20 = _mm_andnot_ps(dummy_mask,r20);
577 /* Compute parameters for interactions between i and j atoms */
578 qq20 = _mm_mul_ps(iq2,jq0);
580 /* EWALD ELECTROSTATICS */
582 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
583 ewrt = _mm_mul_ps(r20,ewtabscale);
584 ewitab = _mm_cvttps_epi32(ewrt);
585 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
586 ewitab = _mm_slli_epi32(ewitab,2);
587 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
588 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
589 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
590 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
591 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
592 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
593 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
594 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
595 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
597 d = _mm_sub_ps(r20,rswitch);
598 d = _mm_max_ps(d,_mm_setzero_ps());
599 d2 = _mm_mul_ps(d,d);
600 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
602 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
604 /* Evaluate switch function */
605 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
606 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
607 velec = _mm_mul_ps(velec,sw);
608 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
610 /* Update potential sum for this i atom from the interaction with this j atom. */
611 velec = _mm_and_ps(velec,cutoff_mask);
612 velec = _mm_andnot_ps(dummy_mask,velec);
613 velecsum = _mm_add_ps(velecsum,velec);
617 fscal = _mm_and_ps(fscal,cutoff_mask);
619 fscal = _mm_andnot_ps(dummy_mask,fscal);
621 /* Calculate temporary vectorial force */
622 tx = _mm_mul_ps(fscal,dx20);
623 ty = _mm_mul_ps(fscal,dy20);
624 tz = _mm_mul_ps(fscal,dz20);
626 /* Update vectorial force */
627 fix2 = _mm_add_ps(fix2,tx);
628 fiy2 = _mm_add_ps(fiy2,ty);
629 fiz2 = _mm_add_ps(fiz2,tz);
631 fjx0 = _mm_add_ps(fjx0,tx);
632 fjy0 = _mm_add_ps(fjy0,ty);
633 fjz0 = _mm_add_ps(fjz0,tz);
637 /**************************
638 * CALCULATE INTERACTIONS *
639 **************************/
641 if (gmx_mm_any_lt(rsq30,rcutoff2))
644 r30 = _mm_mul_ps(rsq30,rinv30);
645 r30 = _mm_andnot_ps(dummy_mask,r30);
647 /* Compute parameters for interactions between i and j atoms */
648 qq30 = _mm_mul_ps(iq3,jq0);
650 /* EWALD ELECTROSTATICS */
652 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
653 ewrt = _mm_mul_ps(r30,ewtabscale);
654 ewitab = _mm_cvttps_epi32(ewrt);
655 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
656 ewitab = _mm_slli_epi32(ewitab,2);
657 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
658 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
659 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
660 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
661 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
662 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
663 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
664 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
665 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
667 d = _mm_sub_ps(r30,rswitch);
668 d = _mm_max_ps(d,_mm_setzero_ps());
669 d2 = _mm_mul_ps(d,d);
670 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
672 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
674 /* Evaluate switch function */
675 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
676 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
677 velec = _mm_mul_ps(velec,sw);
678 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
680 /* Update potential sum for this i atom from the interaction with this j atom. */
681 velec = _mm_and_ps(velec,cutoff_mask);
682 velec = _mm_andnot_ps(dummy_mask,velec);
683 velecsum = _mm_add_ps(velecsum,velec);
687 fscal = _mm_and_ps(fscal,cutoff_mask);
689 fscal = _mm_andnot_ps(dummy_mask,fscal);
691 /* Calculate temporary vectorial force */
692 tx = _mm_mul_ps(fscal,dx30);
693 ty = _mm_mul_ps(fscal,dy30);
694 tz = _mm_mul_ps(fscal,dz30);
696 /* Update vectorial force */
697 fix3 = _mm_add_ps(fix3,tx);
698 fiy3 = _mm_add_ps(fiy3,ty);
699 fiz3 = _mm_add_ps(fiz3,tz);
701 fjx0 = _mm_add_ps(fjx0,tx);
702 fjy0 = _mm_add_ps(fjy0,ty);
703 fjz0 = _mm_add_ps(fjz0,tz);
707 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
708 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
709 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
710 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
712 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
714 /* Inner loop uses 198 flops */
717 /* End of innermost loop */
719 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
720 f+i_coord_offset+DIM,fshift+i_shift_offset);
723 /* Update potential energies */
724 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
726 /* Increment number of inner iterations */
727 inneriter += j_index_end - j_index_start;
729 /* Outer loop uses 19 flops */
732 /* Increment number of outer iterations */
735 /* Update outer/inner flops */
737 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_VF,outeriter*19 + inneriter*198);
740 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
741 * Electrostatics interaction: Ewald
742 * VdW interaction: None
743 * Geometry: Water4-Particle
744 * Calculate force/pot: Force
747 nb_kernel_ElecEwSw_VdwNone_GeomW4P1_F_sse2_single
748 (t_nblist * gmx_restrict nlist,
749 rvec * gmx_restrict xx,
750 rvec * gmx_restrict ff,
751 t_forcerec * gmx_restrict fr,
752 t_mdatoms * gmx_restrict mdatoms,
753 nb_kernel_data_t * gmx_restrict kernel_data,
754 t_nrnb * gmx_restrict nrnb)
756 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
757 * just 0 for non-waters.
758 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
759 * jnr indices corresponding to data put in the four positions in the SIMD register.
761 int i_shift_offset,i_coord_offset,outeriter,inneriter;
762 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
763 int jnrA,jnrB,jnrC,jnrD;
764 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
765 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
766 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
768 real *shiftvec,*fshift,*x,*f;
769 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
771 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
773 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
775 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
777 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
778 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
779 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
780 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
781 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
782 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
783 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
786 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
788 __m128 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
789 real rswitch_scalar,d_scalar;
790 __m128 dummy_mask,cutoff_mask;
791 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
792 __m128 one = _mm_set1_ps(1.0);
793 __m128 two = _mm_set1_ps(2.0);
799 jindex = nlist->jindex;
801 shiftidx = nlist->shift;
803 shiftvec = fr->shift_vec[0];
804 fshift = fr->fshift[0];
805 facel = _mm_set1_ps(fr->epsfac);
806 charge = mdatoms->chargeA;
808 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
809 ewtab = fr->ic->tabq_coul_FDV0;
810 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
811 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
813 /* Setup water-specific parameters */
814 inr = nlist->iinr[0];
815 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
816 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
817 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
819 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
820 rcutoff_scalar = fr->rcoulomb;
821 rcutoff = _mm_set1_ps(rcutoff_scalar);
822 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
824 rswitch_scalar = fr->rcoulomb_switch;
825 rswitch = _mm_set1_ps(rswitch_scalar);
826 /* Setup switch parameters */
827 d_scalar = rcutoff_scalar-rswitch_scalar;
828 d = _mm_set1_ps(d_scalar);
829 swV3 = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
830 swV4 = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
831 swV5 = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
832 swF2 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
833 swF3 = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
834 swF4 = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
836 /* Avoid stupid compiler warnings */
837 jnrA = jnrB = jnrC = jnrD = 0;
846 for(iidx=0;iidx<4*DIM;iidx++)
851 /* Start outer loop over neighborlists */
852 for(iidx=0; iidx<nri; iidx++)
854 /* Load shift vector for this list */
855 i_shift_offset = DIM*shiftidx[iidx];
857 /* Load limits for loop over neighbors */
858 j_index_start = jindex[iidx];
859 j_index_end = jindex[iidx+1];
861 /* Get outer coordinate index */
863 i_coord_offset = DIM*inr;
865 /* Load i particle coords and add shift vector */
866 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
867 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
869 fix1 = _mm_setzero_ps();
870 fiy1 = _mm_setzero_ps();
871 fiz1 = _mm_setzero_ps();
872 fix2 = _mm_setzero_ps();
873 fiy2 = _mm_setzero_ps();
874 fiz2 = _mm_setzero_ps();
875 fix3 = _mm_setzero_ps();
876 fiy3 = _mm_setzero_ps();
877 fiz3 = _mm_setzero_ps();
879 /* Start inner kernel loop */
880 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
883 /* Get j neighbor index, and coordinate index */
888 j_coord_offsetA = DIM*jnrA;
889 j_coord_offsetB = DIM*jnrB;
890 j_coord_offsetC = DIM*jnrC;
891 j_coord_offsetD = DIM*jnrD;
893 /* load j atom coordinates */
894 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
895 x+j_coord_offsetC,x+j_coord_offsetD,
898 /* Calculate displacement vector */
899 dx10 = _mm_sub_ps(ix1,jx0);
900 dy10 = _mm_sub_ps(iy1,jy0);
901 dz10 = _mm_sub_ps(iz1,jz0);
902 dx20 = _mm_sub_ps(ix2,jx0);
903 dy20 = _mm_sub_ps(iy2,jy0);
904 dz20 = _mm_sub_ps(iz2,jz0);
905 dx30 = _mm_sub_ps(ix3,jx0);
906 dy30 = _mm_sub_ps(iy3,jy0);
907 dz30 = _mm_sub_ps(iz3,jz0);
909 /* Calculate squared distance and things based on it */
910 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
911 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
912 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
914 rinv10 = gmx_mm_invsqrt_ps(rsq10);
915 rinv20 = gmx_mm_invsqrt_ps(rsq20);
916 rinv30 = gmx_mm_invsqrt_ps(rsq30);
918 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
919 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
920 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
922 /* Load parameters for j particles */
923 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
924 charge+jnrC+0,charge+jnrD+0);
926 fjx0 = _mm_setzero_ps();
927 fjy0 = _mm_setzero_ps();
928 fjz0 = _mm_setzero_ps();
930 /**************************
931 * CALCULATE INTERACTIONS *
932 **************************/
934 if (gmx_mm_any_lt(rsq10,rcutoff2))
937 r10 = _mm_mul_ps(rsq10,rinv10);
939 /* Compute parameters for interactions between i and j atoms */
940 qq10 = _mm_mul_ps(iq1,jq0);
942 /* EWALD ELECTROSTATICS */
944 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
945 ewrt = _mm_mul_ps(r10,ewtabscale);
946 ewitab = _mm_cvttps_epi32(ewrt);
947 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
948 ewitab = _mm_slli_epi32(ewitab,2);
949 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
950 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
951 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
952 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
953 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
954 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
955 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
956 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
957 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
959 d = _mm_sub_ps(r10,rswitch);
960 d = _mm_max_ps(d,_mm_setzero_ps());
961 d2 = _mm_mul_ps(d,d);
962 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
964 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
966 /* Evaluate switch function */
967 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
968 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
969 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
973 fscal = _mm_and_ps(fscal,cutoff_mask);
975 /* Calculate temporary vectorial force */
976 tx = _mm_mul_ps(fscal,dx10);
977 ty = _mm_mul_ps(fscal,dy10);
978 tz = _mm_mul_ps(fscal,dz10);
980 /* Update vectorial force */
981 fix1 = _mm_add_ps(fix1,tx);
982 fiy1 = _mm_add_ps(fiy1,ty);
983 fiz1 = _mm_add_ps(fiz1,tz);
985 fjx0 = _mm_add_ps(fjx0,tx);
986 fjy0 = _mm_add_ps(fjy0,ty);
987 fjz0 = _mm_add_ps(fjz0,tz);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 if (gmx_mm_any_lt(rsq20,rcutoff2))
998 r20 = _mm_mul_ps(rsq20,rinv20);
1000 /* Compute parameters for interactions between i and j atoms */
1001 qq20 = _mm_mul_ps(iq2,jq0);
1003 /* EWALD ELECTROSTATICS */
1005 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1006 ewrt = _mm_mul_ps(r20,ewtabscale);
1007 ewitab = _mm_cvttps_epi32(ewrt);
1008 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1009 ewitab = _mm_slli_epi32(ewitab,2);
1010 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1011 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1012 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1013 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1014 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1015 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1016 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1017 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1018 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1020 d = _mm_sub_ps(r20,rswitch);
1021 d = _mm_max_ps(d,_mm_setzero_ps());
1022 d2 = _mm_mul_ps(d,d);
1023 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1025 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1027 /* Evaluate switch function */
1028 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1029 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1030 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1034 fscal = _mm_and_ps(fscal,cutoff_mask);
1036 /* Calculate temporary vectorial force */
1037 tx = _mm_mul_ps(fscal,dx20);
1038 ty = _mm_mul_ps(fscal,dy20);
1039 tz = _mm_mul_ps(fscal,dz20);
1041 /* Update vectorial force */
1042 fix2 = _mm_add_ps(fix2,tx);
1043 fiy2 = _mm_add_ps(fiy2,ty);
1044 fiz2 = _mm_add_ps(fiz2,tz);
1046 fjx0 = _mm_add_ps(fjx0,tx);
1047 fjy0 = _mm_add_ps(fjy0,ty);
1048 fjz0 = _mm_add_ps(fjz0,tz);
1052 /**************************
1053 * CALCULATE INTERACTIONS *
1054 **************************/
1056 if (gmx_mm_any_lt(rsq30,rcutoff2))
1059 r30 = _mm_mul_ps(rsq30,rinv30);
1061 /* Compute parameters for interactions between i and j atoms */
1062 qq30 = _mm_mul_ps(iq3,jq0);
1064 /* EWALD ELECTROSTATICS */
1066 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1067 ewrt = _mm_mul_ps(r30,ewtabscale);
1068 ewitab = _mm_cvttps_epi32(ewrt);
1069 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1070 ewitab = _mm_slli_epi32(ewitab,2);
1071 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1072 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1073 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1074 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1075 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1076 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1077 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1078 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
1079 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1081 d = _mm_sub_ps(r30,rswitch);
1082 d = _mm_max_ps(d,_mm_setzero_ps());
1083 d2 = _mm_mul_ps(d,d);
1084 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1086 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1088 /* Evaluate switch function */
1089 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1090 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
1091 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1095 fscal = _mm_and_ps(fscal,cutoff_mask);
1097 /* Calculate temporary vectorial force */
1098 tx = _mm_mul_ps(fscal,dx30);
1099 ty = _mm_mul_ps(fscal,dy30);
1100 tz = _mm_mul_ps(fscal,dz30);
1102 /* Update vectorial force */
1103 fix3 = _mm_add_ps(fix3,tx);
1104 fiy3 = _mm_add_ps(fiy3,ty);
1105 fiz3 = _mm_add_ps(fiz3,tz);
1107 fjx0 = _mm_add_ps(fjx0,tx);
1108 fjy0 = _mm_add_ps(fjy0,ty);
1109 fjz0 = _mm_add_ps(fjz0,tz);
1113 fjptrA = f+j_coord_offsetA;
1114 fjptrB = f+j_coord_offsetB;
1115 fjptrC = f+j_coord_offsetC;
1116 fjptrD = f+j_coord_offsetD;
1118 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1120 /* Inner loop uses 186 flops */
1123 if(jidx<j_index_end)
1126 /* Get j neighbor index, and coordinate index */
1127 jnrlistA = jjnr[jidx];
1128 jnrlistB = jjnr[jidx+1];
1129 jnrlistC = jjnr[jidx+2];
1130 jnrlistD = jjnr[jidx+3];
1131 /* Sign of each element will be negative for non-real atoms.
1132 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1133 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1135 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1136 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1137 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1138 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1139 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1140 j_coord_offsetA = DIM*jnrA;
1141 j_coord_offsetB = DIM*jnrB;
1142 j_coord_offsetC = DIM*jnrC;
1143 j_coord_offsetD = DIM*jnrD;
1145 /* load j atom coordinates */
1146 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1147 x+j_coord_offsetC,x+j_coord_offsetD,
1150 /* Calculate displacement vector */
1151 dx10 = _mm_sub_ps(ix1,jx0);
1152 dy10 = _mm_sub_ps(iy1,jy0);
1153 dz10 = _mm_sub_ps(iz1,jz0);
1154 dx20 = _mm_sub_ps(ix2,jx0);
1155 dy20 = _mm_sub_ps(iy2,jy0);
1156 dz20 = _mm_sub_ps(iz2,jz0);
1157 dx30 = _mm_sub_ps(ix3,jx0);
1158 dy30 = _mm_sub_ps(iy3,jy0);
1159 dz30 = _mm_sub_ps(iz3,jz0);
1161 /* Calculate squared distance and things based on it */
1162 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1163 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1164 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1166 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1167 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1168 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1170 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1171 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1172 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1174 /* Load parameters for j particles */
1175 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1176 charge+jnrC+0,charge+jnrD+0);
1178 fjx0 = _mm_setzero_ps();
1179 fjy0 = _mm_setzero_ps();
1180 fjz0 = _mm_setzero_ps();
1182 /**************************
1183 * CALCULATE INTERACTIONS *
1184 **************************/
1186 if (gmx_mm_any_lt(rsq10,rcutoff2))
1189 r10 = _mm_mul_ps(rsq10,rinv10);
1190 r10 = _mm_andnot_ps(dummy_mask,r10);
1192 /* Compute parameters for interactions between i and j atoms */
1193 qq10 = _mm_mul_ps(iq1,jq0);
1195 /* EWALD ELECTROSTATICS */
1197 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1198 ewrt = _mm_mul_ps(r10,ewtabscale);
1199 ewitab = _mm_cvttps_epi32(ewrt);
1200 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1201 ewitab = _mm_slli_epi32(ewitab,2);
1202 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1203 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1204 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1205 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1206 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1207 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1208 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1209 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1210 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1212 d = _mm_sub_ps(r10,rswitch);
1213 d = _mm_max_ps(d,_mm_setzero_ps());
1214 d2 = _mm_mul_ps(d,d);
1215 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1217 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1219 /* Evaluate switch function */
1220 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1221 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1222 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1226 fscal = _mm_and_ps(fscal,cutoff_mask);
1228 fscal = _mm_andnot_ps(dummy_mask,fscal);
1230 /* Calculate temporary vectorial force */
1231 tx = _mm_mul_ps(fscal,dx10);
1232 ty = _mm_mul_ps(fscal,dy10);
1233 tz = _mm_mul_ps(fscal,dz10);
1235 /* Update vectorial force */
1236 fix1 = _mm_add_ps(fix1,tx);
1237 fiy1 = _mm_add_ps(fiy1,ty);
1238 fiz1 = _mm_add_ps(fiz1,tz);
1240 fjx0 = _mm_add_ps(fjx0,tx);
1241 fjy0 = _mm_add_ps(fjy0,ty);
1242 fjz0 = _mm_add_ps(fjz0,tz);
1246 /**************************
1247 * CALCULATE INTERACTIONS *
1248 **************************/
1250 if (gmx_mm_any_lt(rsq20,rcutoff2))
1253 r20 = _mm_mul_ps(rsq20,rinv20);
1254 r20 = _mm_andnot_ps(dummy_mask,r20);
1256 /* Compute parameters for interactions between i and j atoms */
1257 qq20 = _mm_mul_ps(iq2,jq0);
1259 /* EWALD ELECTROSTATICS */
1261 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1262 ewrt = _mm_mul_ps(r20,ewtabscale);
1263 ewitab = _mm_cvttps_epi32(ewrt);
1264 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1265 ewitab = _mm_slli_epi32(ewitab,2);
1266 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1267 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1268 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1269 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1270 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1271 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1272 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1273 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1274 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1276 d = _mm_sub_ps(r20,rswitch);
1277 d = _mm_max_ps(d,_mm_setzero_ps());
1278 d2 = _mm_mul_ps(d,d);
1279 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1281 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1283 /* Evaluate switch function */
1284 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1285 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1286 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1290 fscal = _mm_and_ps(fscal,cutoff_mask);
1292 fscal = _mm_andnot_ps(dummy_mask,fscal);
1294 /* Calculate temporary vectorial force */
1295 tx = _mm_mul_ps(fscal,dx20);
1296 ty = _mm_mul_ps(fscal,dy20);
1297 tz = _mm_mul_ps(fscal,dz20);
1299 /* Update vectorial force */
1300 fix2 = _mm_add_ps(fix2,tx);
1301 fiy2 = _mm_add_ps(fiy2,ty);
1302 fiz2 = _mm_add_ps(fiz2,tz);
1304 fjx0 = _mm_add_ps(fjx0,tx);
1305 fjy0 = _mm_add_ps(fjy0,ty);
1306 fjz0 = _mm_add_ps(fjz0,tz);
1310 /**************************
1311 * CALCULATE INTERACTIONS *
1312 **************************/
1314 if (gmx_mm_any_lt(rsq30,rcutoff2))
1317 r30 = _mm_mul_ps(rsq30,rinv30);
1318 r30 = _mm_andnot_ps(dummy_mask,r30);
1320 /* Compute parameters for interactions between i and j atoms */
1321 qq30 = _mm_mul_ps(iq3,jq0);
1323 /* EWALD ELECTROSTATICS */
1325 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1326 ewrt = _mm_mul_ps(r30,ewtabscale);
1327 ewitab = _mm_cvttps_epi32(ewrt);
1328 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1329 ewitab = _mm_slli_epi32(ewitab,2);
1330 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1331 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1332 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1333 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1334 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1335 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1336 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1337 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
1338 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1340 d = _mm_sub_ps(r30,rswitch);
1341 d = _mm_max_ps(d,_mm_setzero_ps());
1342 d2 = _mm_mul_ps(d,d);
1343 sw = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1345 dsw = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1347 /* Evaluate switch function */
1348 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1349 felec = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv30,_mm_mul_ps(velec,dsw)) );
1350 cutoff_mask = _mm_cmplt_ps(rsq30,rcutoff2);
1354 fscal = _mm_and_ps(fscal,cutoff_mask);
1356 fscal = _mm_andnot_ps(dummy_mask,fscal);
1358 /* Calculate temporary vectorial force */
1359 tx = _mm_mul_ps(fscal,dx30);
1360 ty = _mm_mul_ps(fscal,dy30);
1361 tz = _mm_mul_ps(fscal,dz30);
1363 /* Update vectorial force */
1364 fix3 = _mm_add_ps(fix3,tx);
1365 fiy3 = _mm_add_ps(fiy3,ty);
1366 fiz3 = _mm_add_ps(fiz3,tz);
1368 fjx0 = _mm_add_ps(fjx0,tx);
1369 fjy0 = _mm_add_ps(fjy0,ty);
1370 fjz0 = _mm_add_ps(fjz0,tz);
1374 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1375 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1376 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1377 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1379 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1381 /* Inner loop uses 189 flops */
1384 /* End of innermost loop */
1386 gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1387 f+i_coord_offset+DIM,fshift+i_shift_offset);
1389 /* Increment number of inner iterations */
1390 inneriter += j_index_end - j_index_start;
1392 /* Outer loop uses 18 flops */
1395 /* Increment number of outer iterations */
1398 /* Update outer/inner flops */
1400 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4_F,outeriter*18 + inneriter*189);