2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_sse2_single.h"
50 #include "kernelutil_x86_sse2_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
54 * Electrostatics interaction: Ewald
55 * VdW interaction: CubicSplineTable
56 * Geometry: Water4-Particle
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_sse2_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
84 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
99 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
102 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
105 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
106 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
108 __m128i ifour = _mm_set1_epi32(4);
109 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
112 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114 __m128 dummy_mask,cutoff_mask;
115 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
116 __m128 one = _mm_set1_ps(1.0);
117 __m128 two = _mm_set1_ps(2.0);
123 jindex = nlist->jindex;
125 shiftidx = nlist->shift;
127 shiftvec = fr->shift_vec[0];
128 fshift = fr->fshift[0];
129 facel = _mm_set1_ps(fr->epsfac);
130 charge = mdatoms->chargeA;
131 nvdwtype = fr->ntype;
133 vdwtype = mdatoms->typeA;
135 vftab = kernel_data->table_vdw->data;
136 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
138 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
139 ewtab = fr->ic->tabq_coul_FDV0;
140 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
141 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
143 /* Setup water-specific parameters */
144 inr = nlist->iinr[0];
145 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
146 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
147 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
148 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
150 /* Avoid stupid compiler warnings */
151 jnrA = jnrB = jnrC = jnrD = 0;
160 for(iidx=0;iidx<4*DIM;iidx++)
165 /* Start outer loop over neighborlists */
166 for(iidx=0; iidx<nri; iidx++)
168 /* Load shift vector for this list */
169 i_shift_offset = DIM*shiftidx[iidx];
171 /* Load limits for loop over neighbors */
172 j_index_start = jindex[iidx];
173 j_index_end = jindex[iidx+1];
175 /* Get outer coordinate index */
177 i_coord_offset = DIM*inr;
179 /* Load i particle coords and add shift vector */
180 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
181 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
183 fix0 = _mm_setzero_ps();
184 fiy0 = _mm_setzero_ps();
185 fiz0 = _mm_setzero_ps();
186 fix1 = _mm_setzero_ps();
187 fiy1 = _mm_setzero_ps();
188 fiz1 = _mm_setzero_ps();
189 fix2 = _mm_setzero_ps();
190 fiy2 = _mm_setzero_ps();
191 fiz2 = _mm_setzero_ps();
192 fix3 = _mm_setzero_ps();
193 fiy3 = _mm_setzero_ps();
194 fiz3 = _mm_setzero_ps();
196 /* Reset potential sums */
197 velecsum = _mm_setzero_ps();
198 vvdwsum = _mm_setzero_ps();
200 /* Start inner kernel loop */
201 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
204 /* Get j neighbor index, and coordinate index */
209 j_coord_offsetA = DIM*jnrA;
210 j_coord_offsetB = DIM*jnrB;
211 j_coord_offsetC = DIM*jnrC;
212 j_coord_offsetD = DIM*jnrD;
214 /* load j atom coordinates */
215 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
216 x+j_coord_offsetC,x+j_coord_offsetD,
219 /* Calculate displacement vector */
220 dx00 = _mm_sub_ps(ix0,jx0);
221 dy00 = _mm_sub_ps(iy0,jy0);
222 dz00 = _mm_sub_ps(iz0,jz0);
223 dx10 = _mm_sub_ps(ix1,jx0);
224 dy10 = _mm_sub_ps(iy1,jy0);
225 dz10 = _mm_sub_ps(iz1,jz0);
226 dx20 = _mm_sub_ps(ix2,jx0);
227 dy20 = _mm_sub_ps(iy2,jy0);
228 dz20 = _mm_sub_ps(iz2,jz0);
229 dx30 = _mm_sub_ps(ix3,jx0);
230 dy30 = _mm_sub_ps(iy3,jy0);
231 dz30 = _mm_sub_ps(iz3,jz0);
233 /* Calculate squared distance and things based on it */
234 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
235 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
236 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
237 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
239 rinv00 = gmx_mm_invsqrt_ps(rsq00);
240 rinv10 = gmx_mm_invsqrt_ps(rsq10);
241 rinv20 = gmx_mm_invsqrt_ps(rsq20);
242 rinv30 = gmx_mm_invsqrt_ps(rsq30);
244 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
245 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
246 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
248 /* Load parameters for j particles */
249 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
250 charge+jnrC+0,charge+jnrD+0);
251 vdwjidx0A = 2*vdwtype[jnrA+0];
252 vdwjidx0B = 2*vdwtype[jnrB+0];
253 vdwjidx0C = 2*vdwtype[jnrC+0];
254 vdwjidx0D = 2*vdwtype[jnrD+0];
256 fjx0 = _mm_setzero_ps();
257 fjy0 = _mm_setzero_ps();
258 fjz0 = _mm_setzero_ps();
260 /**************************
261 * CALCULATE INTERACTIONS *
262 **************************/
264 r00 = _mm_mul_ps(rsq00,rinv00);
266 /* Compute parameters for interactions between i and j atoms */
267 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
268 vdwparam+vdwioffset0+vdwjidx0B,
269 vdwparam+vdwioffset0+vdwjidx0C,
270 vdwparam+vdwioffset0+vdwjidx0D,
273 /* Calculate table index by multiplying r with table scale and truncate to integer */
274 rt = _mm_mul_ps(r00,vftabscale);
275 vfitab = _mm_cvttps_epi32(rt);
276 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
277 vfitab = _mm_slli_epi32(vfitab,3);
279 /* CUBIC SPLINE TABLE DISPERSION */
280 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
281 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
282 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
283 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
284 _MM_TRANSPOSE4_PS(Y,F,G,H);
285 Heps = _mm_mul_ps(vfeps,H);
286 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
287 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
288 vvdw6 = _mm_mul_ps(c6_00,VV);
289 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
290 fvdw6 = _mm_mul_ps(c6_00,FF);
292 /* CUBIC SPLINE TABLE REPULSION */
293 vfitab = _mm_add_epi32(vfitab,ifour);
294 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
295 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
296 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
297 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
298 _MM_TRANSPOSE4_PS(Y,F,G,H);
299 Heps = _mm_mul_ps(vfeps,H);
300 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
301 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
302 vvdw12 = _mm_mul_ps(c12_00,VV);
303 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
304 fvdw12 = _mm_mul_ps(c12_00,FF);
305 vvdw = _mm_add_ps(vvdw12,vvdw6);
306 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
308 /* Update potential sum for this i atom from the interaction with this j atom. */
309 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
313 /* Calculate temporary vectorial force */
314 tx = _mm_mul_ps(fscal,dx00);
315 ty = _mm_mul_ps(fscal,dy00);
316 tz = _mm_mul_ps(fscal,dz00);
318 /* Update vectorial force */
319 fix0 = _mm_add_ps(fix0,tx);
320 fiy0 = _mm_add_ps(fiy0,ty);
321 fiz0 = _mm_add_ps(fiz0,tz);
323 fjx0 = _mm_add_ps(fjx0,tx);
324 fjy0 = _mm_add_ps(fjy0,ty);
325 fjz0 = _mm_add_ps(fjz0,tz);
327 /**************************
328 * CALCULATE INTERACTIONS *
329 **************************/
331 r10 = _mm_mul_ps(rsq10,rinv10);
333 /* Compute parameters for interactions between i and j atoms */
334 qq10 = _mm_mul_ps(iq1,jq0);
336 /* EWALD ELECTROSTATICS */
338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
339 ewrt = _mm_mul_ps(r10,ewtabscale);
340 ewitab = _mm_cvttps_epi32(ewrt);
341 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
342 ewitab = _mm_slli_epi32(ewitab,2);
343 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
344 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
345 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
346 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
347 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
348 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
349 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
350 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
351 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 velecsum = _mm_add_ps(velecsum,velec);
358 /* Calculate temporary vectorial force */
359 tx = _mm_mul_ps(fscal,dx10);
360 ty = _mm_mul_ps(fscal,dy10);
361 tz = _mm_mul_ps(fscal,dz10);
363 /* Update vectorial force */
364 fix1 = _mm_add_ps(fix1,tx);
365 fiy1 = _mm_add_ps(fiy1,ty);
366 fiz1 = _mm_add_ps(fiz1,tz);
368 fjx0 = _mm_add_ps(fjx0,tx);
369 fjy0 = _mm_add_ps(fjy0,ty);
370 fjz0 = _mm_add_ps(fjz0,tz);
372 /**************************
373 * CALCULATE INTERACTIONS *
374 **************************/
376 r20 = _mm_mul_ps(rsq20,rinv20);
378 /* Compute parameters for interactions between i and j atoms */
379 qq20 = _mm_mul_ps(iq2,jq0);
381 /* EWALD ELECTROSTATICS */
383 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
384 ewrt = _mm_mul_ps(r20,ewtabscale);
385 ewitab = _mm_cvttps_epi32(ewrt);
386 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
387 ewitab = _mm_slli_epi32(ewitab,2);
388 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
389 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
390 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
391 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
392 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
393 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
394 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
395 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
396 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
398 /* Update potential sum for this i atom from the interaction with this j atom. */
399 velecsum = _mm_add_ps(velecsum,velec);
403 /* Calculate temporary vectorial force */
404 tx = _mm_mul_ps(fscal,dx20);
405 ty = _mm_mul_ps(fscal,dy20);
406 tz = _mm_mul_ps(fscal,dz20);
408 /* Update vectorial force */
409 fix2 = _mm_add_ps(fix2,tx);
410 fiy2 = _mm_add_ps(fiy2,ty);
411 fiz2 = _mm_add_ps(fiz2,tz);
413 fjx0 = _mm_add_ps(fjx0,tx);
414 fjy0 = _mm_add_ps(fjy0,ty);
415 fjz0 = _mm_add_ps(fjz0,tz);
417 /**************************
418 * CALCULATE INTERACTIONS *
419 **************************/
421 r30 = _mm_mul_ps(rsq30,rinv30);
423 /* Compute parameters for interactions between i and j atoms */
424 qq30 = _mm_mul_ps(iq3,jq0);
426 /* EWALD ELECTROSTATICS */
428 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
429 ewrt = _mm_mul_ps(r30,ewtabscale);
430 ewitab = _mm_cvttps_epi32(ewrt);
431 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
432 ewitab = _mm_slli_epi32(ewitab,2);
433 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
434 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
435 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
436 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
437 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
438 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
439 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
440 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
441 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
443 /* Update potential sum for this i atom from the interaction with this j atom. */
444 velecsum = _mm_add_ps(velecsum,velec);
448 /* Calculate temporary vectorial force */
449 tx = _mm_mul_ps(fscal,dx30);
450 ty = _mm_mul_ps(fscal,dy30);
451 tz = _mm_mul_ps(fscal,dz30);
453 /* Update vectorial force */
454 fix3 = _mm_add_ps(fix3,tx);
455 fiy3 = _mm_add_ps(fiy3,ty);
456 fiz3 = _mm_add_ps(fiz3,tz);
458 fjx0 = _mm_add_ps(fjx0,tx);
459 fjy0 = _mm_add_ps(fjy0,ty);
460 fjz0 = _mm_add_ps(fjz0,tz);
462 fjptrA = f+j_coord_offsetA;
463 fjptrB = f+j_coord_offsetB;
464 fjptrC = f+j_coord_offsetC;
465 fjptrD = f+j_coord_offsetD;
467 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
469 /* Inner loop uses 179 flops */
475 /* Get j neighbor index, and coordinate index */
476 jnrlistA = jjnr[jidx];
477 jnrlistB = jjnr[jidx+1];
478 jnrlistC = jjnr[jidx+2];
479 jnrlistD = jjnr[jidx+3];
480 /* Sign of each element will be negative for non-real atoms.
481 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
482 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
484 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
485 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
486 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
487 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
488 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
489 j_coord_offsetA = DIM*jnrA;
490 j_coord_offsetB = DIM*jnrB;
491 j_coord_offsetC = DIM*jnrC;
492 j_coord_offsetD = DIM*jnrD;
494 /* load j atom coordinates */
495 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
496 x+j_coord_offsetC,x+j_coord_offsetD,
499 /* Calculate displacement vector */
500 dx00 = _mm_sub_ps(ix0,jx0);
501 dy00 = _mm_sub_ps(iy0,jy0);
502 dz00 = _mm_sub_ps(iz0,jz0);
503 dx10 = _mm_sub_ps(ix1,jx0);
504 dy10 = _mm_sub_ps(iy1,jy0);
505 dz10 = _mm_sub_ps(iz1,jz0);
506 dx20 = _mm_sub_ps(ix2,jx0);
507 dy20 = _mm_sub_ps(iy2,jy0);
508 dz20 = _mm_sub_ps(iz2,jz0);
509 dx30 = _mm_sub_ps(ix3,jx0);
510 dy30 = _mm_sub_ps(iy3,jy0);
511 dz30 = _mm_sub_ps(iz3,jz0);
513 /* Calculate squared distance and things based on it */
514 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
515 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
516 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
517 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
519 rinv00 = gmx_mm_invsqrt_ps(rsq00);
520 rinv10 = gmx_mm_invsqrt_ps(rsq10);
521 rinv20 = gmx_mm_invsqrt_ps(rsq20);
522 rinv30 = gmx_mm_invsqrt_ps(rsq30);
524 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
525 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
526 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
528 /* Load parameters for j particles */
529 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
530 charge+jnrC+0,charge+jnrD+0);
531 vdwjidx0A = 2*vdwtype[jnrA+0];
532 vdwjidx0B = 2*vdwtype[jnrB+0];
533 vdwjidx0C = 2*vdwtype[jnrC+0];
534 vdwjidx0D = 2*vdwtype[jnrD+0];
536 fjx0 = _mm_setzero_ps();
537 fjy0 = _mm_setzero_ps();
538 fjz0 = _mm_setzero_ps();
540 /**************************
541 * CALCULATE INTERACTIONS *
542 **************************/
544 r00 = _mm_mul_ps(rsq00,rinv00);
545 r00 = _mm_andnot_ps(dummy_mask,r00);
547 /* Compute parameters for interactions between i and j atoms */
548 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
549 vdwparam+vdwioffset0+vdwjidx0B,
550 vdwparam+vdwioffset0+vdwjidx0C,
551 vdwparam+vdwioffset0+vdwjidx0D,
554 /* Calculate table index by multiplying r with table scale and truncate to integer */
555 rt = _mm_mul_ps(r00,vftabscale);
556 vfitab = _mm_cvttps_epi32(rt);
557 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
558 vfitab = _mm_slli_epi32(vfitab,3);
560 /* CUBIC SPLINE TABLE DISPERSION */
561 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
562 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
563 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
564 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
565 _MM_TRANSPOSE4_PS(Y,F,G,H);
566 Heps = _mm_mul_ps(vfeps,H);
567 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
568 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
569 vvdw6 = _mm_mul_ps(c6_00,VV);
570 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
571 fvdw6 = _mm_mul_ps(c6_00,FF);
573 /* CUBIC SPLINE TABLE REPULSION */
574 vfitab = _mm_add_epi32(vfitab,ifour);
575 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
576 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
577 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
578 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
579 _MM_TRANSPOSE4_PS(Y,F,G,H);
580 Heps = _mm_mul_ps(vfeps,H);
581 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
582 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
583 vvdw12 = _mm_mul_ps(c12_00,VV);
584 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
585 fvdw12 = _mm_mul_ps(c12_00,FF);
586 vvdw = _mm_add_ps(vvdw12,vvdw6);
587 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
589 /* Update potential sum for this i atom from the interaction with this j atom. */
590 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
591 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
595 fscal = _mm_andnot_ps(dummy_mask,fscal);
597 /* Calculate temporary vectorial force */
598 tx = _mm_mul_ps(fscal,dx00);
599 ty = _mm_mul_ps(fscal,dy00);
600 tz = _mm_mul_ps(fscal,dz00);
602 /* Update vectorial force */
603 fix0 = _mm_add_ps(fix0,tx);
604 fiy0 = _mm_add_ps(fiy0,ty);
605 fiz0 = _mm_add_ps(fiz0,tz);
607 fjx0 = _mm_add_ps(fjx0,tx);
608 fjy0 = _mm_add_ps(fjy0,ty);
609 fjz0 = _mm_add_ps(fjz0,tz);
611 /**************************
612 * CALCULATE INTERACTIONS *
613 **************************/
615 r10 = _mm_mul_ps(rsq10,rinv10);
616 r10 = _mm_andnot_ps(dummy_mask,r10);
618 /* Compute parameters for interactions between i and j atoms */
619 qq10 = _mm_mul_ps(iq1,jq0);
621 /* EWALD ELECTROSTATICS */
623 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
624 ewrt = _mm_mul_ps(r10,ewtabscale);
625 ewitab = _mm_cvttps_epi32(ewrt);
626 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
627 ewitab = _mm_slli_epi32(ewitab,2);
628 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
629 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
630 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
631 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
632 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
633 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
634 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
635 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
636 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
638 /* Update potential sum for this i atom from the interaction with this j atom. */
639 velec = _mm_andnot_ps(dummy_mask,velec);
640 velecsum = _mm_add_ps(velecsum,velec);
644 fscal = _mm_andnot_ps(dummy_mask,fscal);
646 /* Calculate temporary vectorial force */
647 tx = _mm_mul_ps(fscal,dx10);
648 ty = _mm_mul_ps(fscal,dy10);
649 tz = _mm_mul_ps(fscal,dz10);
651 /* Update vectorial force */
652 fix1 = _mm_add_ps(fix1,tx);
653 fiy1 = _mm_add_ps(fiy1,ty);
654 fiz1 = _mm_add_ps(fiz1,tz);
656 fjx0 = _mm_add_ps(fjx0,tx);
657 fjy0 = _mm_add_ps(fjy0,ty);
658 fjz0 = _mm_add_ps(fjz0,tz);
660 /**************************
661 * CALCULATE INTERACTIONS *
662 **************************/
664 r20 = _mm_mul_ps(rsq20,rinv20);
665 r20 = _mm_andnot_ps(dummy_mask,r20);
667 /* Compute parameters for interactions between i and j atoms */
668 qq20 = _mm_mul_ps(iq2,jq0);
670 /* EWALD ELECTROSTATICS */
672 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
673 ewrt = _mm_mul_ps(r20,ewtabscale);
674 ewitab = _mm_cvttps_epi32(ewrt);
675 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
676 ewitab = _mm_slli_epi32(ewitab,2);
677 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
678 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
679 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
680 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
681 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
682 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
683 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
684 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
685 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
687 /* Update potential sum for this i atom from the interaction with this j atom. */
688 velec = _mm_andnot_ps(dummy_mask,velec);
689 velecsum = _mm_add_ps(velecsum,velec);
693 fscal = _mm_andnot_ps(dummy_mask,fscal);
695 /* Calculate temporary vectorial force */
696 tx = _mm_mul_ps(fscal,dx20);
697 ty = _mm_mul_ps(fscal,dy20);
698 tz = _mm_mul_ps(fscal,dz20);
700 /* Update vectorial force */
701 fix2 = _mm_add_ps(fix2,tx);
702 fiy2 = _mm_add_ps(fiy2,ty);
703 fiz2 = _mm_add_ps(fiz2,tz);
705 fjx0 = _mm_add_ps(fjx0,tx);
706 fjy0 = _mm_add_ps(fjy0,ty);
707 fjz0 = _mm_add_ps(fjz0,tz);
709 /**************************
710 * CALCULATE INTERACTIONS *
711 **************************/
713 r30 = _mm_mul_ps(rsq30,rinv30);
714 r30 = _mm_andnot_ps(dummy_mask,r30);
716 /* Compute parameters for interactions between i and j atoms */
717 qq30 = _mm_mul_ps(iq3,jq0);
719 /* EWALD ELECTROSTATICS */
721 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
722 ewrt = _mm_mul_ps(r30,ewtabscale);
723 ewitab = _mm_cvttps_epi32(ewrt);
724 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
725 ewitab = _mm_slli_epi32(ewitab,2);
726 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
727 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
728 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
729 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
730 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
731 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
732 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
733 velec = _mm_mul_ps(qq30,_mm_sub_ps(rinv30,velec));
734 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
736 /* Update potential sum for this i atom from the interaction with this j atom. */
737 velec = _mm_andnot_ps(dummy_mask,velec);
738 velecsum = _mm_add_ps(velecsum,velec);
742 fscal = _mm_andnot_ps(dummy_mask,fscal);
744 /* Calculate temporary vectorial force */
745 tx = _mm_mul_ps(fscal,dx30);
746 ty = _mm_mul_ps(fscal,dy30);
747 tz = _mm_mul_ps(fscal,dz30);
749 /* Update vectorial force */
750 fix3 = _mm_add_ps(fix3,tx);
751 fiy3 = _mm_add_ps(fiy3,ty);
752 fiz3 = _mm_add_ps(fiz3,tz);
754 fjx0 = _mm_add_ps(fjx0,tx);
755 fjy0 = _mm_add_ps(fjy0,ty);
756 fjz0 = _mm_add_ps(fjz0,tz);
758 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
759 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
760 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
761 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
763 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
765 /* Inner loop uses 183 flops */
768 /* End of innermost loop */
770 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
771 f+i_coord_offset,fshift+i_shift_offset);
774 /* Update potential energies */
775 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
776 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
778 /* Increment number of inner iterations */
779 inneriter += j_index_end - j_index_start;
781 /* Outer loop uses 26 flops */
784 /* Increment number of outer iterations */
787 /* Update outer/inner flops */
789 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*183);
792 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
793 * Electrostatics interaction: Ewald
794 * VdW interaction: CubicSplineTable
795 * Geometry: Water4-Particle
796 * Calculate force/pot: Force
799 nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_sse2_single
800 (t_nblist * gmx_restrict nlist,
801 rvec * gmx_restrict xx,
802 rvec * gmx_restrict ff,
803 t_forcerec * gmx_restrict fr,
804 t_mdatoms * gmx_restrict mdatoms,
805 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
806 t_nrnb * gmx_restrict nrnb)
808 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
809 * just 0 for non-waters.
810 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
811 * jnr indices corresponding to data put in the four positions in the SIMD register.
813 int i_shift_offset,i_coord_offset,outeriter,inneriter;
814 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
815 int jnrA,jnrB,jnrC,jnrD;
816 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
817 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
818 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
820 real *shiftvec,*fshift,*x,*f;
821 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
823 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
825 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
827 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
829 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
831 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
832 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
833 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
834 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
835 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
836 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
837 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
838 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
841 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
844 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
845 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
847 __m128i ifour = _mm_set1_epi32(4);
848 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
851 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
853 __m128 dummy_mask,cutoff_mask;
854 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
855 __m128 one = _mm_set1_ps(1.0);
856 __m128 two = _mm_set1_ps(2.0);
862 jindex = nlist->jindex;
864 shiftidx = nlist->shift;
866 shiftvec = fr->shift_vec[0];
867 fshift = fr->fshift[0];
868 facel = _mm_set1_ps(fr->epsfac);
869 charge = mdatoms->chargeA;
870 nvdwtype = fr->ntype;
872 vdwtype = mdatoms->typeA;
874 vftab = kernel_data->table_vdw->data;
875 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
877 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
878 ewtab = fr->ic->tabq_coul_F;
879 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
880 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
882 /* Setup water-specific parameters */
883 inr = nlist->iinr[0];
884 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
885 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
886 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
887 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
889 /* Avoid stupid compiler warnings */
890 jnrA = jnrB = jnrC = jnrD = 0;
899 for(iidx=0;iidx<4*DIM;iidx++)
904 /* Start outer loop over neighborlists */
905 for(iidx=0; iidx<nri; iidx++)
907 /* Load shift vector for this list */
908 i_shift_offset = DIM*shiftidx[iidx];
910 /* Load limits for loop over neighbors */
911 j_index_start = jindex[iidx];
912 j_index_end = jindex[iidx+1];
914 /* Get outer coordinate index */
916 i_coord_offset = DIM*inr;
918 /* Load i particle coords and add shift vector */
919 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
920 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
922 fix0 = _mm_setzero_ps();
923 fiy0 = _mm_setzero_ps();
924 fiz0 = _mm_setzero_ps();
925 fix1 = _mm_setzero_ps();
926 fiy1 = _mm_setzero_ps();
927 fiz1 = _mm_setzero_ps();
928 fix2 = _mm_setzero_ps();
929 fiy2 = _mm_setzero_ps();
930 fiz2 = _mm_setzero_ps();
931 fix3 = _mm_setzero_ps();
932 fiy3 = _mm_setzero_ps();
933 fiz3 = _mm_setzero_ps();
935 /* Start inner kernel loop */
936 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
939 /* Get j neighbor index, and coordinate index */
944 j_coord_offsetA = DIM*jnrA;
945 j_coord_offsetB = DIM*jnrB;
946 j_coord_offsetC = DIM*jnrC;
947 j_coord_offsetD = DIM*jnrD;
949 /* load j atom coordinates */
950 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
951 x+j_coord_offsetC,x+j_coord_offsetD,
954 /* Calculate displacement vector */
955 dx00 = _mm_sub_ps(ix0,jx0);
956 dy00 = _mm_sub_ps(iy0,jy0);
957 dz00 = _mm_sub_ps(iz0,jz0);
958 dx10 = _mm_sub_ps(ix1,jx0);
959 dy10 = _mm_sub_ps(iy1,jy0);
960 dz10 = _mm_sub_ps(iz1,jz0);
961 dx20 = _mm_sub_ps(ix2,jx0);
962 dy20 = _mm_sub_ps(iy2,jy0);
963 dz20 = _mm_sub_ps(iz2,jz0);
964 dx30 = _mm_sub_ps(ix3,jx0);
965 dy30 = _mm_sub_ps(iy3,jy0);
966 dz30 = _mm_sub_ps(iz3,jz0);
968 /* Calculate squared distance and things based on it */
969 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
970 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
971 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
972 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
974 rinv00 = gmx_mm_invsqrt_ps(rsq00);
975 rinv10 = gmx_mm_invsqrt_ps(rsq10);
976 rinv20 = gmx_mm_invsqrt_ps(rsq20);
977 rinv30 = gmx_mm_invsqrt_ps(rsq30);
979 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
980 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
981 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
983 /* Load parameters for j particles */
984 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
985 charge+jnrC+0,charge+jnrD+0);
986 vdwjidx0A = 2*vdwtype[jnrA+0];
987 vdwjidx0B = 2*vdwtype[jnrB+0];
988 vdwjidx0C = 2*vdwtype[jnrC+0];
989 vdwjidx0D = 2*vdwtype[jnrD+0];
991 fjx0 = _mm_setzero_ps();
992 fjy0 = _mm_setzero_ps();
993 fjz0 = _mm_setzero_ps();
995 /**************************
996 * CALCULATE INTERACTIONS *
997 **************************/
999 r00 = _mm_mul_ps(rsq00,rinv00);
1001 /* Compute parameters for interactions between i and j atoms */
1002 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1003 vdwparam+vdwioffset0+vdwjidx0B,
1004 vdwparam+vdwioffset0+vdwjidx0C,
1005 vdwparam+vdwioffset0+vdwjidx0D,
1008 /* Calculate table index by multiplying r with table scale and truncate to integer */
1009 rt = _mm_mul_ps(r00,vftabscale);
1010 vfitab = _mm_cvttps_epi32(rt);
1011 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1012 vfitab = _mm_slli_epi32(vfitab,3);
1014 /* CUBIC SPLINE TABLE DISPERSION */
1015 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1016 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1017 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1018 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1019 _MM_TRANSPOSE4_PS(Y,F,G,H);
1020 Heps = _mm_mul_ps(vfeps,H);
1021 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1022 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1023 fvdw6 = _mm_mul_ps(c6_00,FF);
1025 /* CUBIC SPLINE TABLE REPULSION */
1026 vfitab = _mm_add_epi32(vfitab,ifour);
1027 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1028 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1029 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1030 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1031 _MM_TRANSPOSE4_PS(Y,F,G,H);
1032 Heps = _mm_mul_ps(vfeps,H);
1033 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1034 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1035 fvdw12 = _mm_mul_ps(c12_00,FF);
1036 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1040 /* Calculate temporary vectorial force */
1041 tx = _mm_mul_ps(fscal,dx00);
1042 ty = _mm_mul_ps(fscal,dy00);
1043 tz = _mm_mul_ps(fscal,dz00);
1045 /* Update vectorial force */
1046 fix0 = _mm_add_ps(fix0,tx);
1047 fiy0 = _mm_add_ps(fiy0,ty);
1048 fiz0 = _mm_add_ps(fiz0,tz);
1050 fjx0 = _mm_add_ps(fjx0,tx);
1051 fjy0 = _mm_add_ps(fjy0,ty);
1052 fjz0 = _mm_add_ps(fjz0,tz);
1054 /**************************
1055 * CALCULATE INTERACTIONS *
1056 **************************/
1058 r10 = _mm_mul_ps(rsq10,rinv10);
1060 /* Compute parameters for interactions between i and j atoms */
1061 qq10 = _mm_mul_ps(iq1,jq0);
1063 /* EWALD ELECTROSTATICS */
1065 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1066 ewrt = _mm_mul_ps(r10,ewtabscale);
1067 ewitab = _mm_cvttps_epi32(ewrt);
1068 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1069 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1070 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1072 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1073 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1077 /* Calculate temporary vectorial force */
1078 tx = _mm_mul_ps(fscal,dx10);
1079 ty = _mm_mul_ps(fscal,dy10);
1080 tz = _mm_mul_ps(fscal,dz10);
1082 /* Update vectorial force */
1083 fix1 = _mm_add_ps(fix1,tx);
1084 fiy1 = _mm_add_ps(fiy1,ty);
1085 fiz1 = _mm_add_ps(fiz1,tz);
1087 fjx0 = _mm_add_ps(fjx0,tx);
1088 fjy0 = _mm_add_ps(fjy0,ty);
1089 fjz0 = _mm_add_ps(fjz0,tz);
1091 /**************************
1092 * CALCULATE INTERACTIONS *
1093 **************************/
1095 r20 = _mm_mul_ps(rsq20,rinv20);
1097 /* Compute parameters for interactions between i and j atoms */
1098 qq20 = _mm_mul_ps(iq2,jq0);
1100 /* EWALD ELECTROSTATICS */
1102 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1103 ewrt = _mm_mul_ps(r20,ewtabscale);
1104 ewitab = _mm_cvttps_epi32(ewrt);
1105 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1106 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1107 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1109 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1110 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1114 /* Calculate temporary vectorial force */
1115 tx = _mm_mul_ps(fscal,dx20);
1116 ty = _mm_mul_ps(fscal,dy20);
1117 tz = _mm_mul_ps(fscal,dz20);
1119 /* Update vectorial force */
1120 fix2 = _mm_add_ps(fix2,tx);
1121 fiy2 = _mm_add_ps(fiy2,ty);
1122 fiz2 = _mm_add_ps(fiz2,tz);
1124 fjx0 = _mm_add_ps(fjx0,tx);
1125 fjy0 = _mm_add_ps(fjy0,ty);
1126 fjz0 = _mm_add_ps(fjz0,tz);
1128 /**************************
1129 * CALCULATE INTERACTIONS *
1130 **************************/
1132 r30 = _mm_mul_ps(rsq30,rinv30);
1134 /* Compute parameters for interactions between i and j atoms */
1135 qq30 = _mm_mul_ps(iq3,jq0);
1137 /* EWALD ELECTROSTATICS */
1139 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1140 ewrt = _mm_mul_ps(r30,ewtabscale);
1141 ewitab = _mm_cvttps_epi32(ewrt);
1142 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1143 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1144 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1146 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1147 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1151 /* Calculate temporary vectorial force */
1152 tx = _mm_mul_ps(fscal,dx30);
1153 ty = _mm_mul_ps(fscal,dy30);
1154 tz = _mm_mul_ps(fscal,dz30);
1156 /* Update vectorial force */
1157 fix3 = _mm_add_ps(fix3,tx);
1158 fiy3 = _mm_add_ps(fiy3,ty);
1159 fiz3 = _mm_add_ps(fiz3,tz);
1161 fjx0 = _mm_add_ps(fjx0,tx);
1162 fjy0 = _mm_add_ps(fjy0,ty);
1163 fjz0 = _mm_add_ps(fjz0,tz);
1165 fjptrA = f+j_coord_offsetA;
1166 fjptrB = f+j_coord_offsetB;
1167 fjptrC = f+j_coord_offsetC;
1168 fjptrD = f+j_coord_offsetD;
1170 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1172 /* Inner loop uses 156 flops */
1175 if(jidx<j_index_end)
1178 /* Get j neighbor index, and coordinate index */
1179 jnrlistA = jjnr[jidx];
1180 jnrlistB = jjnr[jidx+1];
1181 jnrlistC = jjnr[jidx+2];
1182 jnrlistD = jjnr[jidx+3];
1183 /* Sign of each element will be negative for non-real atoms.
1184 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1185 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1187 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1188 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1189 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1190 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1191 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1192 j_coord_offsetA = DIM*jnrA;
1193 j_coord_offsetB = DIM*jnrB;
1194 j_coord_offsetC = DIM*jnrC;
1195 j_coord_offsetD = DIM*jnrD;
1197 /* load j atom coordinates */
1198 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1199 x+j_coord_offsetC,x+j_coord_offsetD,
1202 /* Calculate displacement vector */
1203 dx00 = _mm_sub_ps(ix0,jx0);
1204 dy00 = _mm_sub_ps(iy0,jy0);
1205 dz00 = _mm_sub_ps(iz0,jz0);
1206 dx10 = _mm_sub_ps(ix1,jx0);
1207 dy10 = _mm_sub_ps(iy1,jy0);
1208 dz10 = _mm_sub_ps(iz1,jz0);
1209 dx20 = _mm_sub_ps(ix2,jx0);
1210 dy20 = _mm_sub_ps(iy2,jy0);
1211 dz20 = _mm_sub_ps(iz2,jz0);
1212 dx30 = _mm_sub_ps(ix3,jx0);
1213 dy30 = _mm_sub_ps(iy3,jy0);
1214 dz30 = _mm_sub_ps(iz3,jz0);
1216 /* Calculate squared distance and things based on it */
1217 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1218 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1219 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1220 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1222 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1223 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1224 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1225 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1227 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1228 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1229 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1231 /* Load parameters for j particles */
1232 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1233 charge+jnrC+0,charge+jnrD+0);
1234 vdwjidx0A = 2*vdwtype[jnrA+0];
1235 vdwjidx0B = 2*vdwtype[jnrB+0];
1236 vdwjidx0C = 2*vdwtype[jnrC+0];
1237 vdwjidx0D = 2*vdwtype[jnrD+0];
1239 fjx0 = _mm_setzero_ps();
1240 fjy0 = _mm_setzero_ps();
1241 fjz0 = _mm_setzero_ps();
1243 /**************************
1244 * CALCULATE INTERACTIONS *
1245 **************************/
1247 r00 = _mm_mul_ps(rsq00,rinv00);
1248 r00 = _mm_andnot_ps(dummy_mask,r00);
1250 /* Compute parameters for interactions between i and j atoms */
1251 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1252 vdwparam+vdwioffset0+vdwjidx0B,
1253 vdwparam+vdwioffset0+vdwjidx0C,
1254 vdwparam+vdwioffset0+vdwjidx0D,
1257 /* Calculate table index by multiplying r with table scale and truncate to integer */
1258 rt = _mm_mul_ps(r00,vftabscale);
1259 vfitab = _mm_cvttps_epi32(rt);
1260 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1261 vfitab = _mm_slli_epi32(vfitab,3);
1263 /* CUBIC SPLINE TABLE DISPERSION */
1264 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1265 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1266 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1267 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1268 _MM_TRANSPOSE4_PS(Y,F,G,H);
1269 Heps = _mm_mul_ps(vfeps,H);
1270 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1271 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1272 fvdw6 = _mm_mul_ps(c6_00,FF);
1274 /* CUBIC SPLINE TABLE REPULSION */
1275 vfitab = _mm_add_epi32(vfitab,ifour);
1276 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1277 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1278 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1279 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1280 _MM_TRANSPOSE4_PS(Y,F,G,H);
1281 Heps = _mm_mul_ps(vfeps,H);
1282 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1283 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1284 fvdw12 = _mm_mul_ps(c12_00,FF);
1285 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1289 fscal = _mm_andnot_ps(dummy_mask,fscal);
1291 /* Calculate temporary vectorial force */
1292 tx = _mm_mul_ps(fscal,dx00);
1293 ty = _mm_mul_ps(fscal,dy00);
1294 tz = _mm_mul_ps(fscal,dz00);
1296 /* Update vectorial force */
1297 fix0 = _mm_add_ps(fix0,tx);
1298 fiy0 = _mm_add_ps(fiy0,ty);
1299 fiz0 = _mm_add_ps(fiz0,tz);
1301 fjx0 = _mm_add_ps(fjx0,tx);
1302 fjy0 = _mm_add_ps(fjy0,ty);
1303 fjz0 = _mm_add_ps(fjz0,tz);
1305 /**************************
1306 * CALCULATE INTERACTIONS *
1307 **************************/
1309 r10 = _mm_mul_ps(rsq10,rinv10);
1310 r10 = _mm_andnot_ps(dummy_mask,r10);
1312 /* Compute parameters for interactions between i and j atoms */
1313 qq10 = _mm_mul_ps(iq1,jq0);
1315 /* EWALD ELECTROSTATICS */
1317 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1318 ewrt = _mm_mul_ps(r10,ewtabscale);
1319 ewitab = _mm_cvttps_epi32(ewrt);
1320 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1321 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1322 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1324 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1325 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1329 fscal = _mm_andnot_ps(dummy_mask,fscal);
1331 /* Calculate temporary vectorial force */
1332 tx = _mm_mul_ps(fscal,dx10);
1333 ty = _mm_mul_ps(fscal,dy10);
1334 tz = _mm_mul_ps(fscal,dz10);
1336 /* Update vectorial force */
1337 fix1 = _mm_add_ps(fix1,tx);
1338 fiy1 = _mm_add_ps(fiy1,ty);
1339 fiz1 = _mm_add_ps(fiz1,tz);
1341 fjx0 = _mm_add_ps(fjx0,tx);
1342 fjy0 = _mm_add_ps(fjy0,ty);
1343 fjz0 = _mm_add_ps(fjz0,tz);
1345 /**************************
1346 * CALCULATE INTERACTIONS *
1347 **************************/
1349 r20 = _mm_mul_ps(rsq20,rinv20);
1350 r20 = _mm_andnot_ps(dummy_mask,r20);
1352 /* Compute parameters for interactions between i and j atoms */
1353 qq20 = _mm_mul_ps(iq2,jq0);
1355 /* EWALD ELECTROSTATICS */
1357 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1358 ewrt = _mm_mul_ps(r20,ewtabscale);
1359 ewitab = _mm_cvttps_epi32(ewrt);
1360 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1361 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1362 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1364 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1365 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1369 fscal = _mm_andnot_ps(dummy_mask,fscal);
1371 /* Calculate temporary vectorial force */
1372 tx = _mm_mul_ps(fscal,dx20);
1373 ty = _mm_mul_ps(fscal,dy20);
1374 tz = _mm_mul_ps(fscal,dz20);
1376 /* Update vectorial force */
1377 fix2 = _mm_add_ps(fix2,tx);
1378 fiy2 = _mm_add_ps(fiy2,ty);
1379 fiz2 = _mm_add_ps(fiz2,tz);
1381 fjx0 = _mm_add_ps(fjx0,tx);
1382 fjy0 = _mm_add_ps(fjy0,ty);
1383 fjz0 = _mm_add_ps(fjz0,tz);
1385 /**************************
1386 * CALCULATE INTERACTIONS *
1387 **************************/
1389 r30 = _mm_mul_ps(rsq30,rinv30);
1390 r30 = _mm_andnot_ps(dummy_mask,r30);
1392 /* Compute parameters for interactions between i and j atoms */
1393 qq30 = _mm_mul_ps(iq3,jq0);
1395 /* EWALD ELECTROSTATICS */
1397 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1398 ewrt = _mm_mul_ps(r30,ewtabscale);
1399 ewitab = _mm_cvttps_epi32(ewrt);
1400 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1401 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1402 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1404 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1405 felec = _mm_mul_ps(_mm_mul_ps(qq30,rinv30),_mm_sub_ps(rinvsq30,felec));
1409 fscal = _mm_andnot_ps(dummy_mask,fscal);
1411 /* Calculate temporary vectorial force */
1412 tx = _mm_mul_ps(fscal,dx30);
1413 ty = _mm_mul_ps(fscal,dy30);
1414 tz = _mm_mul_ps(fscal,dz30);
1416 /* Update vectorial force */
1417 fix3 = _mm_add_ps(fix3,tx);
1418 fiy3 = _mm_add_ps(fiy3,ty);
1419 fiz3 = _mm_add_ps(fiz3,tz);
1421 fjx0 = _mm_add_ps(fjx0,tx);
1422 fjy0 = _mm_add_ps(fjy0,ty);
1423 fjz0 = _mm_add_ps(fjz0,tz);
1425 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1426 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1427 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1428 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1430 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1432 /* Inner loop uses 160 flops */
1435 /* End of innermost loop */
1437 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1438 f+i_coord_offset,fshift+i_shift_offset);
1440 /* Increment number of inner iterations */
1441 inneriter += j_index_end - j_index_start;
1443 /* Outer loop uses 24 flops */
1446 /* Increment number of outer iterations */
1449 /* Update outer/inner flops */
1451 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*160);